This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

format
robertgshaw2-neuralmagic committed Jul 1, 2024
1 parent 27a711a commit 53655b2
Showing 9 changed files with 10 additions and 118 deletions.
1 change: 1 addition & 0 deletions tests/async_engine/test_openapi_server_ray.py
@@ -5,6 +5,7 @@
import ray

from tests.nm_utils.utils_skip import should_skip_test_group

from ..utils import RemoteOpenAIServer

if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"):
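Each test module touched by this commit uses the same module-level guard: import should_skip_test_group, then skip the whole module at import time when the group's environment flag disables it. A minimal sketch of the pattern, using the TEST_BASIC_CORRECTNESS variant whose full skip call is visible in the next file (the async-engine variant above is truncated):

import pytest

from tests.nm_utils.utils_skip import should_skip_test_group

# Skip every test in this module when the group is disabled via its env flag.
if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"):
    pytest.skip(
        "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group",
        allow_module_level=True)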
4 changes: 2 additions & 2 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -8,10 +8,10 @@
"""
import pytest

from ..models.utils import check_outputs_equal

from tests.nm_utils.utils_skip import should_skip_test_group

from ..models.utils import check_outputs_equal

if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"):
pytest.skip(
"TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group",
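The reorderings in this file and the two distributed test files below move the absolute tests.nm_utils import ahead of the relative .. imports. Assuming the "format" commit is enforcing isort-style grouping (no tool config is shown, and the rendered diff has lost its add/remove markers), the resulting import order would be:

import pytest

from tests.nm_utils.utils_skip import should_skip_test_group

from ..models.utils import check_outputs_equal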
2 changes: 1 addition & 1 deletion tests/distributed/test_basic_distributed_correctness.py
@@ -20,9 +20,9 @@

import pytest

from tests.nm_utils.utils_skip import should_skip_test_group
from vllm.utils import cuda_device_count_stateless

from tests.nm_utils.utils_skip import should_skip_test_group
from ..models.utils import check_outputs_equal

if should_skip_test_group(group_name="TEST_DISTRIBUTED"):
3 changes: 1 addition & 2 deletions tests/distributed/test_chunked_prefill_distributed.py
@@ -19,12 +19,11 @@

import pytest

from tests.nm_utils.utils_skip import should_skip_test_group
from vllm.utils import cuda_device_count_stateless

from ..models.utils import check_outputs_equal

from tests.nm_utils.utils_skip import should_skip_test_group

if should_skip_test_group(group_name="TEST_DISTRIBUTED"):
pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group",
allow_module_level=True)
10 changes: 1 addition & 9 deletions tests/distributed/test_utils.py
@@ -1,18 +1,10 @@
<<<<<<< HEAD
import os

import pytest
import ray

from tests.nm_utils.utils_skip import should_skip_test_group
from vllm.utils import cuda_device_count_stateless
=======
import ray

import vllm.envs as envs
from tests.nm_utils.utils_skip import should_skip_test_group
from vllm.utils import (cuda_device_count_stateless, is_hip,
update_environment_variables)
>>>>>>> dd793d1d ([Hardware][AMD][CI/Build][Doc] Upgrade to ROCm 6.1, Dockerfile improvements, test fixes (#5422))

if should_skip_test_group(group_name="TEST_DISTRIBUTED"):
pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group",
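This hunk removes leftover merge-conflict markers (<<<<<<< HEAD, =======, >>>>>>> dd793d1d) from the ROCm 6.1 merge, collapsing the conflicted block into a single import section (1 addition, 9 deletions). The merged result itself is truncated in this view; a purely hypothetical resolution that keeps one consolidated set of imports might look like the following, though the commit's actual content may differ:

import pytest
import ray

from tests.nm_utils.utils_skip import should_skip_test_group
from vllm.utils import cuda_device_count_stateless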
1 change: 1 addition & 0 deletions tests/entrypoints/openai/test_models.py
@@ -7,6 +7,7 @@
from huggingface_hub import snapshot_download

from tests.nm_utils.utils_skip import should_skip_test_group

from ...utils import RemoteOpenAIServer

if should_skip_test_group(group_name="TEST_ENTRYPOINTS"):
1 change: 1 addition & 0 deletions tests/models/test_big_models.py
@@ -11,6 +11,7 @@
import torch

from tests.nm_utils.utils_skip import should_skip_test_group

from .utils import check_outputs_equal

if should_skip_test_group(group_name="TEST_MODELS"):
2 changes: 2 additions & 0 deletions tests/models/test_models.py
@@ -10,6 +10,7 @@
import pytest

from tests.nm_utils.utils_skip import should_skip_test_group

from .utils import check_outputs_equal

if should_skip_test_group(group_name="TEST_MODELS"):
@@ -59,6 +60,7 @@ def test_models(
name_1="vllm",
)


@pytest.mark.skip("Slow and not useful (just prints model).")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
104 changes: 0 additions & 104 deletions vllm/worker/model_runner.py
@@ -1168,110 +1168,6 @@ def execute_model(
return [output]


class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
"""
GPU model runner with sampling step.
"""
_model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = (
ModelInputForGPUWithSamplingMetadata)

def make_model_input_from_broadcasted_tensor_dict(
self,
tensor_dict: Dict[str, Any],
) -> ModelInputForGPUWithSamplingMetadata:
return (
ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict(
tensor_dict,
attn_backend=self.attn_backend,
))

def prepare_model_input(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
) -> ModelInputForGPUWithSamplingMetadata:
"""Prepare the model input based on a given sequence group, including
metadata for the sampling step.
The API assumes seq_group_metadata_list is sorted by prefill -> decode.
The result tensors and data structure also batches input in prefill
-> decode order. For example,
- input_tokens[:num_prefill_tokens] contains prefill tokens.
- input_tokens[num_prefill_tokens:] contains decode tokens.
If cuda graph is required, this API automatically pads inputs.
"""
model_input = self._prepare_model_input_tensors(
seq_group_metadata_list)
sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list,
model_input.seq_lens,
model_input.query_lens,
self.device,
self.pin_memory)
is_prompt = (seq_group_metadata_list[0].is_prompt
if seq_group_metadata_list else None)
return dataclasses.replace(model_input,
sampling_metadata=sampling_metadata,
is_prompt=is_prompt)

@torch.inference_mode()
def execute_model(
self,
model_input: ModelInputForGPUWithSamplingMetadata,
kv_caches: List[torch.Tensor],
) -> SamplerOutput:
if self.lora_config:
assert model_input.lora_requests is not None
assert model_input.lora_mapping is not None
self.set_active_loras(model_input.lora_requests,
model_input.lora_mapping)

# Currently cuda graph is only supported by the decode phase.
assert model_input.attn_metadata is not None
prefill_meta = model_input.attn_metadata.prefill_metadata
decode_meta = model_input.attn_metadata.decode_metadata
if prefill_meta is None and decode_meta.use_cuda_graph:
assert model_input.input_tokens is not None
graph_batch_size = model_input.input_tokens.shape[0]
model_executable = self.graph_runners[graph_batch_size]
else:
model_executable = self.model

multi_modal_kwargs = model_input.multi_modal_kwargs or {}
hidden_states = model_executable(
input_ids=model_input.input_tokens,
positions=model_input.input_positions,
kv_caches=kv_caches,
attn_metadata=model_input.attn_metadata,
**multi_modal_kwargs,
)

# Compute the logits.
logits = self.model.compute_logits(hidden_states,
model_input.sampling_metadata)

# Only perform sampling in the driver worker.
if not self.is_driver_worker:
return None

# Sample the next token.
output: SamplerOutput = self.model.sample(
logits=logits,
sampling_metadata=model_input.sampling_metadata,
)

if self.return_hidden_states:
# we only need to pass hidden states of most recent token
if model_input.is_prompt:
assert model_input.sampling_metadata is not None
hidden_states = hidden_states.index_select(
0, model_input.sampling_metadata.selected_token_indices)
output.hidden_states = hidden_states

return output


# NOTE: this is nn.Module so the profiler can properly capture/group
# kernels calls made within the graph
class CUDAGraphRunner(nn.Module):
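The removed prepare_model_input docstring describes the prefill -> decode batch layout that the sampling metadata relies on. A small illustrative sketch of that slicing convention, using made-up token values rather than vLLM code:

import torch

# Inputs are batched prefill -> decode, as the removed docstring describes:
# input_tokens[:num_prefill_tokens] holds prefill tokens,
# input_tokens[num_prefill_tokens:] holds decode tokens.
num_prefill_tokens = 4
input_tokens = torch.tensor([11, 12, 13, 14,   # prompt (prefill) tokens
                             21, 22, 23])      # one decode token per running sequence
prefill_tokens = input_tokens[:num_prefill_tokens]  # tensor([11, 12, 13, 14])
decode_tokens = input_tokens[num_prefill_tokens:]   # tensor([21, 22, 23])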

2 comments on commit 53655b2

@github-actions

smaller_is_better

Benchmark suite: VLLM Serving - Dense (benchmark_serving, nr-qps-pair "300,1", sharegpt dataset, sparsity None, NVIDIA L4 x 1, vLLM 0.5.1, Python 3.10.12, torch 2.3.0+cu121). Current: 53655b2, Previous: 569c905.

Metric        Model                                 max-model-len  Current     Previous    Ratio
mean_ttft_ms  meta-llama/Meta-Llama-3-8B-Instruct   4096           188.96 ms   183.75 ms   1.03
mean_tpot_ms  meta-llama/Meta-Llama-3-8B-Instruct   4096           84.34 ms    83.87 ms    1.01
mean_ttft_ms  facebook/opt-350m                     2048           24.19 ms    24.65 ms    0.98
mean_tpot_ms  facebook/opt-350m                     2048           6.00 ms     6.00 ms     1.00

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

smaller_is_better

Benchmark suite: VLLM Serving - Dense (benchmark_serving, nr-qps-pair "300,1", sharegpt dataset, sparsity None, NVIDIA L4 x 1, vLLM 0.5.1, Python 3.10.12, torch 2.3.0+cu121). Current: 53655b2, Previous: 569c905.

Metric        Model                                 max-model-len  Current     Previous    Ratio
mean_ttft_ms  facebook/opt-350m                     2048           23.57 ms    24.65 ms    0.96
mean_tpot_ms  facebook/opt-350m                     2048           5.94 ms     6.00 ms     0.99
mean_ttft_ms  meta-llama/Meta-Llama-3-8B-Instruct   4096           184.97 ms   183.75 ms   1.01
mean_tpot_ms  meta-llama/Meta-Llama-3-8B-Instruct   4096           83.82 ms    83.87 ms    1.00

This comment was automatically generated by workflow using github-action-benchmark.
