diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py
index 1d92d239986a3..9106b9d914df5 100644
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -5,6 +5,7 @@
 import ray
 
 from tests.nm_utils.utils_skip import should_skip_test_group
+
 from ..utils import RemoteOpenAIServer
 
 if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"):
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index bebd088a2280a..8e89430c3adb4 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -8,10 +8,10 @@
 """
 import pytest
 
-from ..models.utils import check_outputs_equal
-
 from tests.nm_utils.utils_skip import should_skip_test_group
 
+from ..models.utils import check_outputs_equal
+
 if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"):
     pytest.skip(
         "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group",
diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
index da6ed7580d5ba..b99eab4f3aaa4 100644
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -20,9 +20,9 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.utils import cuda_device_count_stateless
 
-from tests.nm_utils.utils_skip import should_skip_test_group
 from ..models.utils import check_outputs_equal
 
 if should_skip_test_group(group_name="TEST_DISTRIBUTED"):
diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py
index 2f069813826f5..7e83a4005753c 100644
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -19,12 +19,11 @@
 
 import pytest
 
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.utils import cuda_device_count_stateless
 
 from ..models.utils import check_outputs_equal
 
-from tests.nm_utils.utils_skip import should_skip_test_group
-
 if should_skip_test_group(group_name="TEST_DISTRIBUTED"):
     pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group",
                 allow_module_level=True)
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 983eeea5a1ed6..2967c39924c1b 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -1,18 +1,10 @@
-<<<<<<< HEAD
-import os
-
 import pytest
 import ray
 
-from tests.nm_utils.utils_skip import should_skip_test_group
-from vllm.utils import cuda_device_count_stateless
-=======
-import ray
-
 import vllm.envs as envs
+from tests.nm_utils.utils_skip import should_skip_test_group
 from vllm.utils import (cuda_device_count_stateless, is_hip,
                         update_environment_variables)
->>>>>>> dd793d1d ([Hardware][AMD][CI/Build][Doc] Upgrade to ROCm 6.1, Dockerfile improvements, test fixes (#5422))
 
 if should_skip_test_group(group_name="TEST_DISTRIBUTED"):
     pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group",
diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py
index 358728146089a..e74a88c38f685 100644
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -7,6 +7,7 @@
 from huggingface_hub import snapshot_download
 
 from tests.nm_utils.utils_skip import should_skip_test_group
+
 from ...utils import RemoteOpenAIServer
 
 if should_skip_test_group(group_name="TEST_ENTRYPOINTS"):
diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py
index 7b6b89ebb6872..b709f2863e617 100644
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -11,6 +11,7 @@
 import torch
 
 from tests.nm_utils.utils_skip import should_skip_test_group
+
 from .utils import check_outputs_equal
 
 if should_skip_test_group(group_name="TEST_MODELS"):
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index 03630d4ef2cb3..aa9c61e8aea43 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -10,6 +10,7 @@
 import pytest
 
 from tests.nm_utils.utils_skip import should_skip_test_group
+
 from .utils import check_outputs_equal
 
 if should_skip_test_group(group_name="TEST_MODELS"):
@@ -59,6 +60,7 @@ def test_models(
         name_1="vllm",
     )
 
+
 @pytest.mark.skip("Slow and not useful (just prints model).")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 245c99e1a05b0..9b7bffd4ae06f 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1168,110 +1168,6 @@ def execute_model(
         return [output]
 
 
-class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
-    """
-    GPU model runner with sampling step.
-    """
-    _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = (
-        ModelInputForGPUWithSamplingMetadata)
-
-    def make_model_input_from_broadcasted_tensor_dict(
-        self,
-        tensor_dict: Dict[str, Any],
-    ) -> ModelInputForGPUWithSamplingMetadata:
-        return (
-            ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict(
-                tensor_dict,
-                attn_backend=self.attn_backend,
-            ))
-
-    def prepare_model_input(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> ModelInputForGPUWithSamplingMetadata:
-        """Prepare the model input based on a given sequence group, including
-        metadata for the sampling step.
-
-        The API assumes seq_group_metadata_list is sorted by prefill -> decode.
-
-        The result tensors and data structure also batches input in prefill
-        -> decode order. For example,
-
-        - input_tokens[:num_prefill_tokens] contains prefill tokens.
-        - input_tokens[num_prefill_tokens:] contains decode tokens.
-
-        If cuda graph is required, this API automatically pads inputs.
-        """
-        model_input = self._prepare_model_input_tensors(
-            seq_group_metadata_list)
-        sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list,
-                                                     model_input.seq_lens,
-                                                     model_input.query_lens,
-                                                     self.device,
-                                                     self.pin_memory)
-        is_prompt = (seq_group_metadata_list[0].is_prompt
-                     if seq_group_metadata_list else None)
-        return dataclasses.replace(model_input,
-                                   sampling_metadata=sampling_metadata,
-                                   is_prompt=is_prompt)
-
-    @torch.inference_mode()
-    def execute_model(
-        self,
-        model_input: ModelInputForGPUWithSamplingMetadata,
-        kv_caches: List[torch.Tensor],
-    ) -> SamplerOutput:
-        if self.lora_config:
-            assert model_input.lora_requests is not None
-            assert model_input.lora_mapping is not None
-            self.set_active_loras(model_input.lora_requests,
-                                  model_input.lora_mapping)
-
-        # Currently cuda graph is only supported by the decode phase.
-        assert model_input.attn_metadata is not None
-        prefill_meta = model_input.attn_metadata.prefill_metadata
-        decode_meta = model_input.attn_metadata.decode_metadata
-        if prefill_meta is None and decode_meta.use_cuda_graph:
-            assert model_input.input_tokens is not None
-            graph_batch_size = model_input.input_tokens.shape[0]
-            model_executable = self.graph_runners[graph_batch_size]
-        else:
-            model_executable = self.model
-
-        multi_modal_kwargs = model_input.multi_modal_kwargs or {}
-        hidden_states = model_executable(
-            input_ids=model_input.input_tokens,
-            positions=model_input.input_positions,
-            kv_caches=kv_caches,
-            attn_metadata=model_input.attn_metadata,
-            **multi_modal_kwargs,
-        )
-
-        # Compute the logits.
-        logits = self.model.compute_logits(hidden_states,
-                                           model_input.sampling_metadata)
-
-        # Only perform sampling in the driver worker.
-        if not self.is_driver_worker:
-            return None
-
-        # Sample the next token.
-        output: SamplerOutput = self.model.sample(
-            logits=logits,
-            sampling_metadata=model_input.sampling_metadata,
-        )
-
-        if self.return_hidden_states:
-            # we only need to pass hidden states of most recent token
-            if model_input.is_prompt:
-                assert model_input.sampling_metadata is not None
-                hidden_states = hidden_states.index_select(
-                    0, model_input.sampling_metadata.selected_token_indices)
-                output.hidden_states = hidden_states
-
-        return output
-
-
 # NOTE: this is nn.Module so the profiler can properly capture/group
 # kernels calls made within the graph
 class CUDAGraphRunner(nn.Module):