diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index b17540633225f..79526adef2a79 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -18,6 +18,8 @@ source /etc/environment docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test function cpu_tests() { + set -e + # Run basic model test docker exec cpu-test bash -c " set -e diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 7a0c9dc902bae..26a202b09b8a2 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -20,6 +20,8 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 function cpu_tests() { + set -e + # offline inference docker exec cpu-test-avx2 bash -c " set -e diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_embedding_model_runner.py index 86918fee65c5e..7053075bf4d8f 100644 --- a/vllm/worker/cpu_embedding_model_runner.py +++ b/vllm/worker/cpu_embedding_model_runner.py @@ -95,6 +95,7 @@ def prepare_model_input( model_input.seq_lens) return dataclasses.replace(model_input, + virtual_engine=virtual_engine, pooling_metadata=pooling_metadata) def _prepare_pooling( diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 896e948948c7a..d040831870bd8 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -4,6 +4,7 @@ import torch from vllm.attention import AttentionMetadata +from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MultiModalKwargs from vllm.sequence import IntermediateTensors, SequenceGroupMetadata @@ -96,11 +97,21 @@ def prepare_model_input( encoder_input_positions_tensor, ) = self._prepare_encoder_model_input_tensors(seq_group_metadata_list, model_input) + # Sampling metadata is only required for the final pp group + generators = self.get_generators(finished_requests_ids) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + model_input.seq_lens, + model_input.query_lens, + self.device, + pin_memory=False, + generators=generators) return dataclasses.replace( model_input, + sampling_metadata=sampling_metadata, attn_metadata=attn_metadata, encoder_input_tokens=encoder_input_tokens_tensor, encoder_input_positions=encoder_input_positions_tensor, + virtual_engine=virtual_engine, ) def _prepare_encoder_model_input_tensors(