This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

format
robertgshaw2-neuralmagic committed Jul 1, 2024
1 parent 27a711a commit 53655b2
Showing 9 changed files with 10 additions and 118 deletions.
1 change: 1 addition & 0 deletions tests/async_engine/test_openapi_server_ray.py
@@ -5,6 +5,7 @@
import ray

from tests.nm_utils.utils_skip import should_skip_test_group

from ..utils import RemoteOpenAIServer

if should_skip_test_group(group_name="TEST_ASYNC_ENGINE"):
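Each test module touched by this commit uses the same module-level guard: import should_skip_test_group, then skip the whole module at import time when the group's environment flag disables it. A minimal sketch of the pattern, using the TEST_BASIC_CORRECTNESS variant whose full skip call is visible in the next file (the async-engine variant above is truncated):

import pytest

from tests.nm_utils.utils_skip import should_skip_test_group

# Skip every test in this module when the group is disabled via its env flag.
if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"):
    pytest.skip(
        "TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group",
        allow_module_level=True)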
4 changes: 2 additions & 2 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -8,10 +8,10 @@
"""
import pytest

from ..models.utils import check_outputs_equal

from tests.nm_utils.utils_skip import should_skip_test_group

from ..models.utils import check_outputs_equal

if should_skip_test_group(group_name="TEST_BASIC_CORRECTNESS"):
pytest.skip(
"TEST_BASIC_CORRECTNESS=DISABLE, skipping basic correctness test group",
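The reorderings in this file and the two distributed test files below move the absolute tests.nm_utils import ahead of the relative .. imports. Assuming the "format" commit is enforcing isort-style grouping (no tool config is shown, and the rendered diff has lost its add/remove markers), the resulting import order would be:

import pytest

from tests.nm_utils.utils_skip import should_skip_test_group

from ..models.utils import check_outputs_equal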
2 changes: 1 addition & 1 deletion tests/distributed/test_basic_distributed_correctness.py
@@ -20,9 +20,9 @@

import pytest

from tests.nm_utils.utils_skip import should_skip_test_group
from vllm.utils import cuda_device_count_stateless

from tests.nm_utils.utils_skip import should_skip_test_group
from ..models.utils import check_outputs_equal

if should_skip_test_group(group_name="TEST_DISTRIBUTED"):
3 changes: 1 addition & 2 deletions tests/distributed/test_chunked_prefill_distributed.py
@@ -19,12 +19,11 @@

import pytest

from tests.nm_utils.utils_skip import should_skip_test_group
from vllm.utils import cuda_device_count_stateless

from ..models.utils import check_outputs_equal

from tests.nm_utils.utils_skip import should_skip_test_group

if should_skip_test_group(group_name="TEST_DISTRIBUTED"):
pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group",
allow_module_level=True)
10 changes: 1 addition & 9 deletions tests/distributed/test_utils.py
@@ -1,18 +1,10 @@
<<<<<<< HEAD
import os

import pytest
import ray

from tests.nm_utils.utils_skip import should_skip_test_group
from vllm.utils import cuda_device_count_stateless
=======
import ray

import vllm.envs as envs
from tests.nm_utils.utils_skip import should_skip_test_group
from vllm.utils import (cuda_device_count_stateless, is_hip,
update_environment_variables)
>>>>>>> dd793d1d ([Hardware][AMD][CI/Build][Doc] Upgrade to ROCm 6.1, Dockerfile improvements, test fixes (#5422))

if should_skip_test_group(group_name="TEST_DISTRIBUTED"):
pytest.skip("TEST_DISTRIBUTED=DISABLE, skipping distributed test group",
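This hunk removes leftover merge-conflict markers (<<<<<<< HEAD, =======, >>>>>>> dd793d1d) from the ROCm 6.1 merge, collapsing the conflicted block into a single import section (1 addition, 9 deletions). The merged result itself is truncated in this view; a purely hypothetical resolution that keeps one consolidated set of imports might look like the following, though the commit's actual content may differ:

import pytest
import ray

from tests.nm_utils.utils_skip import should_skip_test_group
from vllm.utils import cuda_device_count_stateless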
1 change: 1 addition & 0 deletions tests/entrypoints/openai/test_models.py
@@ -7,6 +7,7 @@
from huggingface_hub import snapshot_download

from tests.nm_utils.utils_skip import should_skip_test_group

from ...utils import RemoteOpenAIServer

if should_skip_test_group(group_name="TEST_ENTRYPOINTS"):
1 change: 1 addition & 0 deletions tests/models/test_big_models.py
@@ -11,6 +11,7 @@
import torch

from tests.nm_utils.utils_skip import should_skip_test_group

from .utils import check_outputs_equal

if should_skip_test_group(group_name="TEST_MODELS"):
2 changes: 2 additions & 0 deletions tests/models/test_models.py
@@ -10,6 +10,7 @@
import pytest

from tests.nm_utils.utils_skip import should_skip_test_group

from .utils import check_outputs_equal

if should_skip_test_group(group_name="TEST_MODELS"):
@@ -59,6 +60,7 @@ def test_models(
name_1="vllm",
)


@pytest.mark.skip("Slow and not useful (just prints model).")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
104 changes: 0 additions & 104 deletions vllm/worker/model_runner.py
@@ -1168,110 +1168,6 @@ def execute_model(
return [output]


class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
"""
GPU model runner with sampling step.
"""
_model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = (
ModelInputForGPUWithSamplingMetadata)

def make_model_input_from_broadcasted_tensor_dict(
self,
tensor_dict: Dict[str, Any],
) -> ModelInputForGPUWithSamplingMetadata:
return (
ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict(
tensor_dict,
attn_backend=self.attn_backend,
))

def prepare_model_input(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
) -> ModelInputForGPUWithSamplingMetadata:
"""Prepare the model input based on a given sequence group, including
metadata for the sampling step.
The API assumes seq_group_metadata_list is sorted by prefill -> decode.
The result tensors and data structure also batches input in prefill
-> decode order. For example,
- input_tokens[:num_prefill_tokens] contains prefill tokens.
- input_tokens[num_prefill_tokens:] contains decode tokens.
If cuda graph is required, this API automatically pads inputs.
"""
model_input = self._prepare_model_input_tensors(
seq_group_metadata_list)
sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list,
model_input.seq_lens,
model_input.query_lens,
self.device,
self.pin_memory)
is_prompt = (seq_group_metadata_list[0].is_prompt
if seq_group_metadata_list else None)
return dataclasses.replace(model_input,
sampling_metadata=sampling_metadata,
is_prompt=is_prompt)

@torch.inference_mode()
def execute_model(
self,
model_input: ModelInputForGPUWithSamplingMetadata,
kv_caches: List[torch.Tensor],
) -> SamplerOutput:
if self.lora_config:
assert model_input.lora_requests is not None
assert model_input.lora_mapping is not None
self.set_active_loras(model_input.lora_requests,
model_input.lora_mapping)

# Currently cuda graph is only supported by the decode phase.
assert model_input.attn_metadata is not None
prefill_meta = model_input.attn_metadata.prefill_metadata
decode_meta = model_input.attn_metadata.decode_metadata
if prefill_meta is None and decode_meta.use_cuda_graph:
assert model_input.input_tokens is not None
graph_batch_size = model_input.input_tokens.shape[0]
model_executable = self.graph_runners[graph_batch_size]
else:
model_executable = self.model

multi_modal_kwargs = model_input.multi_modal_kwargs or {}
hidden_states = model_executable(
input_ids=model_input.input_tokens,
positions=model_input.input_positions,
kv_caches=kv_caches,
attn_metadata=model_input.attn_metadata,
**multi_modal_kwargs,
)

# Compute the logits.
logits = self.model.compute_logits(hidden_states,
model_input.sampling_metadata)

# Only perform sampling in the driver worker.
if not self.is_driver_worker:
return None

# Sample the next token.
output: SamplerOutput = self.model.sample(
logits=logits,
sampling_metadata=model_input.sampling_metadata,
)

if self.return_hidden_states:
# we only need to pass hidden states of most recent token
if model_input.is_prompt:
assert model_input.sampling_metadata is not None
hidden_states = hidden_states.index_select(
0, model_input.sampling_metadata.selected_token_indices)
output.hidden_states = hidden_states

return output


# NOTE: this is nn.Module so the profiler can properly capture/group
# kernels calls made within the graph
class CUDAGraphRunner(nn.Module):
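The removed prepare_model_input docstring describes the prefill -> decode batch layout that the sampling metadata relies on. A small illustrative sketch of that slicing convention, using made-up token values rather than vLLM code:

import torch

# Inputs are batched prefill -> decode, as the removed docstring describes:
# input_tokens[:num_prefill_tokens] holds prefill tokens,
# input_tokens[num_prefill_tokens:] holds decode tokens.
num_prefill_tokens = 4
input_tokens = torch.tensor([11, 12, 13, 14,   # prompt (prefill) tokens
                             21, 22, 23])      # one decode token per running sequence
prefill_tokens = input_tokens[:num_prefill_tokens]  # tensor([11, 12, 13, 14])
decode_tokens = input_tokens[num_prefill_tokens:]   # tensor([21, 22, 23])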

2 comments on commit 53655b2

@github-actions

smaller_is_better

Benchmark suite: VLLM Serving - Dense (benchmark_serving, nr-qps-pair "300,1", sharegpt dataset, sparsity None, NVIDIA L4 x 1, vLLM 0.5.1, Python 3.10.12, torch 2.3.0+cu121). Current: 53655b2, Previous: 569c905.

Metric        Model                                 max-model-len  Current     Previous    Ratio
mean_ttft_ms  meta-llama/Meta-Llama-3-8B-Instruct   4096           188.96 ms   183.75 ms   1.03
mean_tpot_ms  meta-llama/Meta-Llama-3-8B-Instruct   4096           84.34 ms    83.87 ms    1.01
mean_ttft_ms  facebook/opt-350m                     2048           24.19 ms    24.65 ms    0.98
mean_tpot_ms  facebook/opt-350m                     2048           6.00 ms     6.00 ms     1.00

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

smaller_is_better

Benchmark suite: VLLM Serving - Dense (benchmark_serving, nr-qps-pair "300,1", sharegpt dataset, sparsity None, NVIDIA L4 x 1, vLLM 0.5.1, Python 3.10.12, torch 2.3.0+cu121). Current: 53655b2, Previous: 569c905.

Metric        Model                                 max-model-len  Current     Previous    Ratio
mean_ttft_ms  facebook/opt-350m                     2048           23.57 ms    24.65 ms    0.96
mean_tpot_ms  facebook/opt-350m                     2048           5.94 ms     6.00 ms     0.99
mean_ttft_ms  meta-llama/Meta-Llama-3-8B-Instruct   4096           184.97 ms   183.75 ms   1.01
mean_tpot_ms  meta-llama/Meta-Llama-3-8B-Instruct   4096           83.82 ms    83.87 ms    1.00

This comment was automatically generated by workflow using github-action-benchmark.
