Merge branch 'sgl-project:main' into main

Luodian authored Aug 14, 2024
2 parents 3962718 + 0909bb0 commit 0667043
Showing 102 changed files with 2,535 additions and 2,669 deletions.
1 change: 0 additions & 1 deletion .github/ISSUE_TEMPLATE/1-bug-report.yml
@@ -33,6 +33,5 @@ body:
      description: |
        Please provide necessary environment information here with `python3 -m sglang.check_env`.
      placeholder: Environment here.
      render: Shell
    validations:
      required: true
46 changes: 46 additions & 0 deletions .github/workflows/accuracy-test.yml
@@ -0,0 +1,46 @@
name: Accuracy Test

on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

concurrency:
  group: accuracy-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  accuracy-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: accuracy

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install dependencies
        run: |
          source $HOME/venv/bin/activate
          echo "$HOME/venv/bin" >> $GITHUB_PATH
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e .
      - name: Evaluate Accuracy
        run: |
          cd test/srt
          python3 test_eval_accuracy_large.py
        timeout-minutes: 20
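For reference, the steps of this job can be approximated locally with a shell sketch like the one below; the virtualenv location, a repository-root working directory, a CUDA GPU, and model access are assumptions, and the job itself targets a runner labeled `accuracy`:

```bash
# Approximate local reproduction of the accuracy job above (run from the repo root)
python3 -m venv "$HOME/venv" && source "$HOME/venv/bin/activate"
pip install --upgrade pip
pip install -e "python[all]"
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
git clone https://github.com/merrymercy/human-eval.git && pip install -e human-eval
cd test/srt && python3 test_eval_accuracy_large.py
```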
22 changes: 22 additions & 0 deletions .github/workflows/cancel-pr-workflow.yml
@@ -0,0 +1,22 @@
name: Cancel PR Workflows on Merge

on:
  pull_request:
    types:
      - closed

permissions:
  actions: write

jobs:
  cancel:
    if: github.event.pull_request.merged == true
    runs-on: ubuntu-latest
    steps:
      - name: Cancel Previous Runs
        uses: styfle/[email protected]
        with:
          workflow_id: all
          access_token: ${{ secrets.GITHUB_TOKEN }}
          ignore_sha: true
          pr_number: ${{ github.event.pull_request.number }}
33 changes: 13 additions & 20 deletions .github/workflows/e2e-test.yml
@@ -20,7 +20,7 @@ concurrency:
jobs:
  e2e-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: self-hosted
    runs-on: e2e

    steps:
      - name: Checkout code
@@ -37,22 +37,15 @@ jobs:
      - name: Benchmark Serving Throughput
        run: |
          python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --port 8413 --disable-radix-cache &
          echo "Waiting for server to start..."
          for i in {1..120}; do
            if curl -s http://127.0.0.1:8413/health; then
              echo "Server is up!"
              break
            fi
            if [ $i -eq 120 ]; then
              echo "Server failed to start within 120 seconds"
              exit 1
            fi
            sleep 1
          done
          cd $HOME && python3 -m sglang.bench_serving --backend sglang --port 8413 --dataset-name random --num-prompts 3000 --random-input 256 --random-output 512
          echo "Stopping server..."
          kill -9 $(ps aux | grep sglang | grep Meta-Llama-3.1-8B-Instruct | grep -- "--port 8413" | grep -v grep | awk '{print $2}')
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
      - name: Benchmark Serving Throughput (w/o RadixAttention)
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
      - name: Benchmark Serving Throughput (w/ ChunkedPrefill)
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill
42 changes: 42 additions & 0 deletions .github/workflows/moe-test.yml
@@ -0,0 +1,42 @@
name: MoE Test

on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

concurrency:
  group: moe-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  moe-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: accuracy

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install dependencies
        run: |
          source $HOME/venv/bin/activate
          echo "$HOME/venv/bin" >> $GITHUB_PATH
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

      - name: Benchmark MOE Serving Throughput
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
38 changes: 25 additions & 13 deletions .github/workflows/release-docker.yml
@@ -1,4 +1,4 @@
name: Release Docker
name: Release Docker Images
on:
  push:
    branches:
@@ -14,39 +14,51 @@ jobs:
    environment: 'prod'
    strategy:
      matrix:
        cuda_version: ['12.1.1', '12.4.1']
        cuda_version: ['11.8.0', '12.1.1', '12.4.1']
        build_type: ['all', 'srt']
    steps:
      - name: Delete huge unnecessary tools folder
        run: rm -rf /opt/hostedtoolcache

      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build and Push
        run: |
          version=$(cat python/sglang/version.py | cut -d'"' -f2)
          if [ "${{ matrix.cuda_version }}" = "12.1.1" ]; then
          if [ "${{ matrix.cuda_version }}" = "11.8.0" ]; then
            cuda_tag="cu118"
          elif [ "${{ matrix.cuda_version }}" = "12.1.1" ]; then
            cuda_tag="cu121"
          elif [ "${{ matrix.cuda_version }}" = "12.4.1" ]; then
            cuda_tag="cu124"
          else
            echo "Unsupported CUDA version"
            exit 1
          fi
          tag=v${version}-${cuda_tag}
          docker build . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} -t lmsysorg/sglang:${tag} --no-cache
          docker push lmsysorg/sglang:${tag}
          if [ "${{ matrix.build_type }}" = "all" ]; then
            tag_suffix=""
          elif [ "${{ matrix.build_type }}" = "srt" ]; then
            tag_suffix="-srt"
          else
            echo "Unsupported build type"
            exit 1
          fi
          docker build . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
          docker push lmsysorg/sglang:${tag}${tag_suffix}
          if [ "${{ matrix.cuda_version }}" = "12.1.1" ]; then
            docker tag lmsysorg/sglang:${tag} lmsysorg/sglang:latest
            docker push lmsysorg/sglang:latest
            docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:latest${tag_suffix}
            docker push lmsysorg/sglang:latest${tag_suffix}
          fi
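To make the tag scheme concrete, the naming logic above reduces to the standalone sketch below for a single matrix entry; the version value is an assumption here (the workflow reads it from `python/sglang/version.py`):

```bash
# Hedged sketch of the tag naming used in the Build and Push step
version=0.2.12            # assumption; normally read from python/sglang/version.py
cuda_version=12.4.1
build_type=srt
case "$cuda_version" in
  11.8.0) cuda_tag="cu118" ;;
  12.1.1) cuda_tag="cu121" ;;
  12.4.1) cuda_tag="cu124" ;;
esac
if [ "$build_type" = "srt" ]; then tag_suffix="-srt"; else tag_suffix=""; fi
echo "lmsysorg/sglang:v${version}-${cuda_tag}${tag_suffix}"   # -> lmsysorg/sglang:v0.2.12-cu124-srt
```

Only the cu121 image is additionally re-tagged as `latest`/`latest-srt`, per the final `if` block above.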
10 changes: 5 additions & 5 deletions .github/workflows/unit-test.yml
@@ -20,7 +20,7 @@ concurrency:
jobs:
  unit-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: self-hosted
    runs-on: unit

    steps:
      - name: Checkout code
@@ -37,12 +37,12 @@ jobs:
          pip install accelerate
          pip install sentence_transformers
      - name: Test Frontend Language
      - name: Test Backend Runtime
        run: |
          cd test/lang
          cd test/srt
          python3 run_suite.py --suite minimal
      - name: Test Backend Runtime
      - name: Test Frontend Language
        run: |
          cd test/srt
          cd test/lang
          python3 run_suite.py --suite minimal
55 changes: 31 additions & 24 deletions README.md
@@ -55,7 +55,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
### Method 2: From source
```
# Use the last release branch
git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
cd sglang
pip install --upgrade pip
@@ -76,9 +76,17 @@ docker run --gpus all \
--env "HF_TOKEN=<secret>" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
```

### Method 4: Using docker compose

> This method is recommended if you plan to serve it as a service.
> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).

1. Copy the [compose.yaml](./docker/compose.yaml) to your local machine.
2. Run `docker compose up -d` in your terminal (a minimal sketch is shown below).
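A minimal sketch of those two steps, assuming Docker with the compose plugin is installed and a local clone of the repository provides `compose.yaml`:

```bash
# Start sglang via docker compose from a local clone of the repository
cd sglang/docker
docker compose up -d      # launch the server container in the background
docker compose logs -f    # optionally follow the server logs
```

If the compose file was copied elsewhere instead, run the same commands from the directory that contains `compose.yaml`.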

### Common Notes
- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
@@ -139,23 +147,23 @@ print(response)
It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
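As a quick illustration, the Chat Completions endpoint of a locally launched server can also be called directly. The sketch below assumes a server started as shown above on port 30000; the `"default"` model name is an assumption and may need to be replaced with the served model's name:

```bash
curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "default",
        "messages": [{"role": "user", "content": "List three countries and their capitals."}],
        "max_tokens": 64
      }'
```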

### Additional Server Arguments
- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
- Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
```
- Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
```
- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
- If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
```
- If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
```
- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
```
# Node 0
@@ -165,23 +173,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
```
- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- To enable fp8 quantization, you can add `--quantization fp8` on an fp16 checkpoint or directly load an fp8 checkpoint without specifying any arguments.
- To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes. A combined example is sketched below.
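For illustration, the two flags above can be combined in a single launch command; this sketch reuses the model path from the earlier examples, and whether fp8 and torch.compile actually help depends on the model and hardware:

```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --quantization fp8 --enable-torch-compile
```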

### Use Models From ModelScope
To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
```
export SGLANG_USE_MODELSCOPE=true
```
Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
```
SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
```

- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.

### Supported Models

- Llama / Llama 2 / Llama 3 / Llama 3.1
- Mistral / Mixtral
- Mistral / Mixtral / Mistral NeMo
- Gemma / Gemma 2
- Qwen / Qwen 2 / Qwen 2 MoE
- DeepSeek / DeepSeek 2
@@ -199,11 +197,20 @@ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen
- Grok
- ChatGLM
- InternLM 2
- Mistral NeMo

Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

### Run Llama 3.1 405B
#### Use Models From ModelScope
To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
```
export SGLANG_USE_MODELSCOPE=true
```
Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
```
SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
```

#### Run Llama 3.1 405B

```bash
## Run 405B (fp8) on a single node
@@ -231,7 +238,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
```

## Frontend: Structured Generation Language (SGLang)
The frontend language can be used with local models or API models.
The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may find it easier to use for complex prompting workflows.

### Quick Start
The example below shows how to use sglang to answer a multi-turn question.