Merge branch 'main' into benchmarking-separation

neuralmagic · Jul 16, 2024 · fa52f36 · fa52f36 · github-actions · Jul 16, 2024
2 parents 1752b5c + 9daca33
commit fa52f36
Show file tree

Hide file tree

Showing 230 changed files with 9,304 additions and 3,611 deletions.
diff --git a/.buildkite/download-images.sh b/.buildkite/download-images.sh
@@ -8,10 +8,6 @@ set -o pipefail
 # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
 mkdir -p images
 cd images
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
 

diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
+model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.86
+  - name: "exact_match,flexible-extract"
+    value: 0.86
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
+model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.624
+  - name: "exact_match,flexible-extract"
+    value: 0.624
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
+model_name: "Qwen/Qwen2-57B-A14B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.792
+  - name: "exact_match,flexible-extract"
+    value: 0.824
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -1,2 +1,3 @@
 Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
+Qwen2-57B-A14-Instruct.yaml
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
@@ -12,8 +12,10 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
+  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
+  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
 
 # offline inference
 docker exec cpu-test bash -c "python3 examples/offline_inference.py"
@@ -23,4 +25,4 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow protobuf
   cd ../
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -74,6 +74,17 @@ steps:
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
 
+- label: Pipeline Parallelism Test
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  commands:
+  - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+  - TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+  - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+
+
 - label: Engine Test
   mirror_hardwares: [amd]
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
@@ -107,12 +118,15 @@ steps:
 
 - label: Kernels Test %N
   #mirror_hardwares: [amd]
-  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  commands:
+    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
 - label: Models Test
   #mirror_hardwares: [amd]
   commands:
+    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
     - pytest -v -s models -m \"not vlm\"
 
 - label: Vision Language Models Test
@@ -223,7 +237,7 @@ steps:
   - pytest -v -s distributed/test_custom_all_reduce.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.5/flashinfer-0.0.5+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
   - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - pytest -v -s -x lora/test_mixtral.py
diff --git a/.github/actions/nm-cp-assets/action.yml b/.github/actions/nm-cp-assets/action.yml
@@ -1,18 +1,17 @@
 name: cp assets
 description: "cp whl and tarfile to Google storage 'neuralmagic-public-pypi/dist'"
+inputs:
+  whl:
+    description: "the wheel asset file path"
+    required: true
+  targz:
+    description: "the tar.gz asset file path"
+    required: true
 runs:
   using: composite
   steps:
     - id: cp_assets
       run: |
-        WHL=$(find assets -type f -name "*nm_vllm*.whl")
-        WHL_FILE=$(basename ${WHL})
-        echo "whl: ${WHL}"
-        echo "whl_file: ${WHL_FILE}"
-        TAR=$(find assets -path "*nm-vllm*.tar.gz" -type f -name "nm-vllm*.tar.gz")
-        TAR_FILE=$(basename ${TAR})
-        echo "tar: ${TAR}"
-        echo "tar_file: ${TAR_FILE}"
-        gcloud storage cp ${WHL} gs://neuralmagic-public-pypi/dist/${WHL_FILE}
-        gcloud storage cp ${TAR} gs://neuralmagic-public-pypi/dist/${TAR_FILE}
+        gcloud storage cp ${WHL} gs://neuralmagic-public-pypi/dist/${{ inputs.whl }}
+        gcloud storage cp ${TAR} gs://neuralmagic-public-pypi/dist/${{ inputs.targz }}
       shell: bash
diff --git a/.github/actions/nm-get-docker-tags/action.yml b/.github/actions/nm-get-docker-tags/action.yml
@@ -4,8 +4,8 @@ inputs:
   wf_category:
     description: "type of nm-vllm to install for the docker image: NIGHTLY or RELEASE"
     required: true
-  wheel:
-    description: "wheel name, if latest use the latest from nm pypi"
+  whl:
+    description: "name of nm-vllm wheel to install for the docker image"
     required: true
 outputs:
   tag:
@@ -22,16 +22,23 @@ runs:
   steps:
   - id: tags
     run: |
-      BUILD_VERSION=`echo "${{ inputs.wheel }}" | cut -d'-' -f2`
+      BUILD_VERSION=`echo "${{ inputs.whl }}" | cut -d'-' -f2`
       if [[ "${{ inputs.wf_category }}" == "RELEASE" ]]; then
-          TAG="v${build_version}"
-          EXTRA_TAG=latest
+          if [[ "${BUILD_VERSION}" =~ ^[0-9]+.[0-9]+.[0-9]+$ ]]; then
+              TAG="v${BUILD_VERSION}"
+              EXTRA_TAG=latest
+          else
+              echo "ERROR: wheel version ${BUILD_VERSION} doesn't match RELEASE format. Check input."
+              exit 1
+          fi
       else
-          TAG=`echo "${build_version}" | cut -d'.' -f4`
-          EXTRA_TAG=nightly
-      fi
-      if [[ "${{ inputs.wheel }}" == "latest" ]]; then
-          BUILD_VERSION="latest"
+          if [[ "${BUILD_VERSION}" =~ ^[0-9]+.[0-9]+.[0-9]+.[0-9]{8}$ ]]; then
+              TAG=`echo "${BUILD_VERSION}" | cut -d'.' -f4`
+              EXTRA_TAG=nightly
+          else
+              echo "ERROR: wheel version ${BUILD_VERSION} doesn't match NIGHTLY format. Check input."
+              exit 1
+          fi
       fi
       echo "tag=${TAG}" >> $GITHUB_OUTPUT
       echo "extra_tag=${EXTRA_TAG}" >> $GITHUB_OUTPUT

diff --git a/.github/actions/nm_whl_tar_gz_names/action.yml b/.github/actions/nm_whl_tar_gz_names/action.yml
@@ -0,0 +1,25 @@
+name: get wheel and tar.gz names
+description: "retrieve the whl and tarfile names from existing assets"
+outputs:
+  whl:
+    description: "the wheel asset file path"
+    value: ${{ steps.whl_targz_names.outputs.whl }}
+  targz:
+    description: "the tar.gz asset file path"
+    value: ${{ steps.whl_targz_names.outputs.targz }}
+runs:
+  using: composite
+  steps:
+    - id: whl_targz_names
+      run: |
+        WHL=$(find assets -type f -name "*nm_vllm*.whl")
+        WHL_FILE=$(basename ${WHL})
+        echo "whl: ${WHL}"
+        echo "whl_file: ${WHL_FILE}"
+        TAR=$(find assets -path "*nm-vllm*.tar.gz" -type f -name "nm-vllm*.tar.gz")
+        TAR_FILE=$(basename ${TAR})
+        echo "tar: ${TAR}"
+        echo "tar_file: ${TAR_FILE}"
+        echo "whl=${WHL_FILE}" >> $GITHUB_OUTPUT
+        echo "targz=${TAR_FILE}" >> $GITHUB_OUTPUT
+      shell: bash
diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml
@@ -58,10 +58,10 @@ on:
         required: true
 
       # benchmark related parameters
-      benchmark_label:
-        description: "requested benchmark label (specifies instance)"
+      benchmark_labels:
+        description: "stringified Json array of benchmark labels"
         type: string
-        default: ""
+        required: true
       benchmark_config_list_file:
         description: "benchmark configs file, e.g. 'nm_benchmark_nightly_configs_list.txt'"
         type: string
@@ -136,9 +136,13 @@ jobs:
     BENCHMARK:
         needs: [BUILD]
         if: success()
+        strategy:
+            fail-fast: false
+            matrix:
+                benchmark_label: ${{ fromJson(inputs.benchmark_labels) }}
         uses: ./.github/workflows/nm-benchmark.yml
         with:
-            label: ${{ inputs.benchmark_label }}
+            label: ${{ matrix.benchmark_label }}
             benchmark_config_list_file: ${{ inputs.benchmark_config_list_file }}
             timeout: ${{ inputs.benchmark_timeout }}
             gitref: ${{ github.ref }}
@@ -164,21 +168,22 @@ jobs:
     UPLOAD:
         needs: [TEST, BENCHMARK, LM-EVAL]
         if: ${{ inputs.push_to_pypi }}
-        uses: ./.github/workflows/nm-upload-assets-to-gcp.yml
+        uses: ./.github/workflows/nm-upload-assets.yml
         with:
             label: gcp-k8s-util
             timeout: ${{ inputs.build_timeout }}
             gitref: ${{ github.ref }}
+            wf_category: ${{ inputs.wf_category }}
         secrets: inherit
 
     # update docker
     DOCKER:
         needs: [BUILD, UPLOAD]
-        if: ${{ inputs.push_to_pypi }}
+        if: ${{ inputs.wf_category != 'REMOTE' }}
         uses: ./.github/workflows/publish-docker.yml
         with:
             push_to_repository: ${{ inputs.push_to_pypi }}
             gitref: ${{ inputs.gitref }}
             wf_category: ${{ inputs.wf_category }}
-            wheel: ${{ needs.BUILD.outputs.whl }}
+            whl: ${{ needs.BUILD.outputs.whl }}
         secrets: inherit
diff --git a/.github/workflows/nm-build.yml b/.github/workflows/nm-build.yml
@@ -31,6 +31,10 @@ on:
         description: "python version, e.g. 3.10.12"
         type: string
         required: true
+    outputs:
+      whl:
+        description: 'basename for generated whl'
+        value: ${{ jobs.BUILD.outputs.whl }}
 
   # makes workflow manually callable
   workflow_dispatch:
@@ -104,7 +108,7 @@ jobs:
 
             - name: set python
               id: set_python
-              uses: neuralmagic/nm-actions/actions/set-python@main
+              uses: neuralmagic/nm-actions/actions/set-python@v1.0.0
               with:
                 python: ${{ inputs.python }}
                 venv: ${{ env.VENV_BASE }}

diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml
@@ -37,9 +37,9 @@ jobs:
                             {"python":"3.9.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"},
                             {"python":"3.10.12","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"},
                             {"python":"3.11.4","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}]'
-            test_timeout: 480
+            test_timeout: 720
 
-            benchmark_label: gcp-k8s-l4-solo
+            benchmark_labels: '["gcp-k8s-l4-solo", "k8s-h100-solo"]'
             benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt
             benchmark_timeout: 480
             push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"

diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml
@@ -25,7 +25,7 @@ jobs:
                             {"python":"3.11.4","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}]'
             test_timeout: 480
 
-            benchmark_label: gcp-k8s-l4-solo
+            benchmark_labels: '["gcp-k8s-l4-solo", "k8s-h100-solo"]'
             benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt
             benchmark_timeout: 480
 

diff --git a/.github/workflows/nm-test.yml b/.github/workflows/nm-test.yml
@@ -72,10 +72,7 @@ jobs:
                 python-version: ${{ inputs.python }}
 
             - name: install automation components
-              run: |
-                sudo apt-get update --fix-missing
-                sudo apt-get install -y git-all
-                sudo apt-get install -y curl
+              uses: neuralmagic/nm-actions/actions/install-automation-components@v1.0.0
 
             - name: checkout
               id: checkout
@@ -94,7 +91,7 @@ jobs:
                 nvcc_threads: 0
 
             - name: install testmo
-              uses: neuralmagic/nm-actions/actions/install-testmo@main
+              uses: neuralmagic/nm-actions/actions/install-testmo@v1.0.0
 
             - name: create testmo run
               id: create_testmo_run
Benchmark suite	Current: `fa52f36`	Previous: `0a36369`	Ratio
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA H100 80GB HBM3 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`32.82793972951671` ms
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA H100 80GB HBM3 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`11.661015613410393` ms
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA H100 80GB HBM3 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`39.72160837147385` ms
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA H100 80GB HBM3 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`7.0571835390463935` ms
Benchmark suite	Current: `fa52f36`	Previous: `0a36369`	Ratio
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`184.78112051343487` ms	`185.6497189733265` ms	`1.00`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`84.07154724240117` ms	`84.4072092276112` ms	`1.00`
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`24.82930486325737` ms	`23.88663713000028` ms	`1.04`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`6.175839846742194` ms	`5.988013016028265` ms	`1.03`