
Commit

Merge branch 'main' of https://github.com/xiangyuT/BigDL into refine_pp_serving_run_0711

xiangyuT committed Jul 11, 2024
2 parents c15c40e + 105e124 commit 6f725d6
Showing 37 changed files with 1,112 additions and 82 deletions.
123 changes: 119 additions & 4 deletions .github/workflows/llm_performance_tests.yml
@@ -679,6 +679,29 @@ jobs:
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Prepare igpu perf test for transformers 4.38 (32-32 int4+fp16)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_438.yaml
- name: Test on igpu for transformers 4.38 (32-32 int4+fp16)
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
cd python\llm\dev\benchmark\all-in-one
move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_438.yaml config.yaml
set PYTHONIOENCODING=utf-8
python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
if %ERRORLEVEL% neq 0 if %ERRORLEVEL% neq -1073740791 (exit /b 1)
call conda deactivate
- name: Concat csv and generate html (32-32 int4+fp16)
shell: cmd
@@ -703,7 +726,7 @@ jobs:
shell: bash
run: |
sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
- name: Test on igpu (1024-128 int4+fp16)
@@ -747,6 +770,29 @@ jobs:
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Prepare igpu perf test for transformers 4.38 (1024-128 int4+fp16)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_438.yaml
- name: Test on igpu for transformers 4.38 (1024-128 int4+fp16)
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
cd python\llm\dev\benchmark\all-in-one
move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_438.yaml config.yaml
set PYTHONIOENCODING=utf-8
python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Concat csv and generate html (1024-128 int4+fp16)
shell: cmd
@@ -770,7 +816,7 @@ jobs:
shell: bash
run: |
sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
- name: Test on igpu (2048-256 int4+fp16)
@@ -814,6 +860,29 @@ jobs:
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Prepare igpu perf test for transformers 4.38 (2048-256 int4+fp16)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_438.yaml
- name: Test on igpu for transformers 4.38 (2048-256 int4+fp16)
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
cd python\llm\dev\benchmark\all-in-one
move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_438.yaml config.yaml
set PYTHONIOENCODING=utf-8
python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Concat csv and generate html (2048-256 int4+fp16)
shell: cmd
@@ -837,7 +906,7 @@ jobs:
shell: bash
run: |
sed -i 's/2048-256/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml
- name: Test on igpu (load_low_bit 1024-128 int4+fp16)
@@ -882,6 +951,29 @@ jobs:
call conda deactivate
- name: Prepare igpu perf test for transformers 4.38 (load_low_bit 1024-128 int4+fp16)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_438.yaml
- name: Test on igpu for transformers 4.38 (load_low_bit 1024-128 int4+fp16)
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
cd python\llm\dev\benchmark\all-in-one
move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_loadlowbit_438.yaml config.yaml
set PYTHONIOENCODING=utf-8
python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16_loadlowbit\log\%LOG_FILE% 2>&1
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Concat csv and generate html (load_low_bit 1024-128 int4+fp16)
shell: cmd
run: |
@@ -903,7 +995,7 @@ jobs:
- name: Prepare igpu perf test (1024-128)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128.yaml
- name: Test on igpu (1024-128)
@@ -948,6 +1040,29 @@ jobs:
call conda deactivate
- name: Prepare igpu perf test for transformers 4.38 (1024-128)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_438.yaml
- name: Test on igpu for transformers 4.38 (1024-128)
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
cd python\llm\dev\benchmark\all-in-one
move ..\..\..\test\benchmark\igpu-perf\1024-128_438.yaml config.yaml
set PYTHONIOENCODING=utf-8
python run.py >> %CSV_SAVE_PATH%\1024-128\log\%LOG_FILE% 2>&1
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Concat csv and generate html (1024-128)
shell: cmd
run: |
29 changes: 29 additions & 0 deletions .github/workflows/manually_build_for_testing.yml
@@ -19,6 +19,7 @@ on:
- ipex-llm-inference-cpp-xpu
- ipex-llm-serving-cpu
- ipex-llm-serving-xpu
- ipex-llm-serving-xpu-tgi
- ipex-llm-finetune-lora-cpu
- ipex-llm-finetune-qlora-cpu
- ipex-llm-finetune-qlora-cpu-k8s
@@ -174,6 +175,34 @@ jobs:
sudo docker push 10.239.45.10/arda/${image}:${TAG}
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
ipex-llm-serving-xpu-tgi:
if: ${{ github.event.inputs.artifact == 'ipex-llm-serving-xpu-tgi' || github.event.inputs.artifact == 'all' }}
runs-on: [self-hosted, Shire]

steps:
- uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
with:
ref: ${{ github.event.inputs.sha }}
- name: docker login
run: |
docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
- name: ipex-llm-serving-xpu-tgi
run: |
echo "##############################################################"
echo "####### ipex-llm-serving-xpu-tgi ########"
echo "##############################################################"
export image=intelanalytics/ipex-llm-serving-xpu-tgi
cd docker/llm/serving/xpu-tgi
sudo docker build \
--no-cache=true \
--build-arg http_proxy=${HTTP_PROXY} \
--build-arg https_proxy=${HTTPS_PROXY} \
--build-arg no_proxy=${NO_PROXY} \
-t ${image}:${TAG} -f ./Dockerfile .
sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
sudo docker push 10.239.45.10/arda/${image}:${TAG}
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
ipex-llm-inference-cpp-xpu:
if: ${{ github.event.inputs.artifact == 'ipex-llm-inference-cpp-xpu' || github.event.inputs.artifact == 'all' }}
runs-on: [self-hosted, Shire]
54 changes: 47 additions & 7 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -42,6 +42,8 @@

LLAVA_IDS = ['liuhaotian/llava-v1.5-7b']

PHI3VISION_IDS = ['microsoft/phi-3-vision-128k-instruct']

results = []
excludes = []

@@ -914,6 +916,13 @@ def run_transformer_int4_gpu_win(repo_id,
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
elif repo_id in PHI3VISION_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
else:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1021,6 +1030,14 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
torch_dtype=torch.float16).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
elif repo_id in PHI3VISION_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
torch_dtype=torch.float16).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
else:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
@@ -1125,6 +1142,13 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
model = model.to('xpu')
elif repo_id in PHI3VISION_IDS:
model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
model = model.to('xpu')
else:
model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1228,6 +1252,13 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
model = model.half().to('xpu')
elif repo_id in PHI3VISION_IDS:
model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
model = model.half().to('xpu')
else:
model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
use_cache=True, cpu_embedding=cpu_embedding).eval()
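
The same Phi-3-vision branch is added to each of the four Windows iGPU helpers above; the variants differ only in fp16 handling and in whether from_pretrained or load_low_bit is used. A minimal sketch of the shared core follows (the helper name and the ipex_llm import path are assumptions; the kwargs are taken from the diff):

# Hedged sketch of the branch taken when repo_id is in PHI3VISION_IDS; not part of the commit.
from ipex_llm.transformers import AutoModelForCausalLM  # import path assumed
from transformers import AutoTokenizer

def load_phi3_vision_for_benchmark(model_path, low_bit, cpu_embedding):
    # Phi-3-vision is loaded with eager attention and with the vision embedding
    # module kept out of low-bit conversion; everything else matches the generic path.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        optimize_model=True,
        load_in_low_bit=low_bit,
        _attn_implementation="eager",
        modules_to_not_convert=["vision_embed_tokens"],
        trust_remote_code=True,
        use_cache=True,
        cpu_embedding=cpu_embedding,
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    return model.to('xpu'), tokenizer

Per the diff, the int4+fp16 variant additionally passes torch_dtype=torch.float16, and the fp16 load_low_bit variant calls model.half() before moving the model to 'xpu'.
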
@@ -1980,11 +2011,20 @@ def run_pipeline_parallel_gpu(repo_id,
df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype'])
df.index += max(line_counter-1, 0)
if api not in ["transformer_int4_gpu", "transformer_int4_fp16_gpu"]:
if line_counter == 0:
df.to_csv(csv_name, mode='a')
else:
df.to_csv(csv_name, mode='a', header=None)
line_counter += len(df.index)
if "pipeline" in api or "deepspeed" in api:
if torch.distributed.get_rank() == 0:
df.index += max(line_counter - 1, 0)
if line_counter == 0:
df.to_csv(csv_name, mode='a')
else:
df.to_csv(csv_name, mode='a', header=None)
line_counter += len(df.index)
else:
df.index += max(line_counter - 1, 0)
if api not in ["transformer_int4_gpu", "transformer_int4_fp16_gpu"]:
if line_counter == 0:
df.to_csv(csv_name, mode='a')
else:
df.to_csv(csv_name, mode='a', header=None)
line_counter += len(df.index)
results = []
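
The final run.py hunk changes how results are appended to the CSV: for the multi-process "pipeline" and "deepspeed" APIs, only rank 0 writes, so each run lands in the file once instead of once per process. A simplified sketch of that logic (the helper name is illustrative, and the per-API skip for transformer_int4_gpu / transformer_int4_fp16_gpu kept by the diff is omitted here):

# Hedged sketch of the distributed-aware CSV append introduced above.
import torch.distributed as dist

def append_results(df, csv_name, api, line_counter):
    # For pipeline-parallel / DeepSpeed runs every process reaches this point,
    # so only rank 0 is allowed to touch the shared CSV.
    if ("pipeline" in api or "deepspeed" in api) and dist.get_rank() != 0:
        return line_counter
    df.index += max(line_counter - 1, 0)                        # keep row numbering continuous
    df.to_csv(csv_name, mode='a', header=(line_counter == 0))   # header only on the first append
    return line_counter + len(df.index)
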
8 changes: 7 additions & 1 deletion python/llm/dev/benchmark/all-in-one/save.py
@@ -23,7 +23,7 @@
import sys
import gc

from run import LLAMA_IDS, CHATGLM_IDS, LLAVA_IDS, get_model_path
from run import LLAMA_IDS, CHATGLM_IDS, LLAVA_IDS, PHI3VISION_IDS, get_model_path

current_dir = os.path.dirname(os.path.realpath(__file__))

@@ -51,6 +51,12 @@ def save_model_in_low_bit(repo_id,
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
trust_remote_code=True, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
elif repo_id in PHI3VISION_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
trust_remote_code=True, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
else:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
trust_remote_code=True, use_cache=True).eval()
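
save.py mirrors the run.py change: Phi-3-vision is converted with the same eager-attention and modules_to_not_convert settings before the low-bit checkpoint is written, and the load_low_bit benchmarks above then reload that checkpoint from the "<model_path>-<low_bit>" directory. A sketch of the round trip; the ipex_llm import path, the save_low_bit call, and the example values are assumptions rather than part of this diff:

# Hedged sketch: persist a low-bit Phi-3-vision checkpoint (save.py side), then
# reload it the way the load_low_bit benchmarks do (run.py side).
from ipex_llm.transformers import AutoModelForCausalLM  # import path assumed
from transformers import AutoTokenizer

model_path, low_bit = "/models/phi-3-vision-128k-instruct", "sym_int4"  # illustrative values

# save.py: convert once with the Phi-3-vision kwargs shown above and persist the weights
model = AutoModelForCausalLM.from_pretrained(
    model_path, optimize_model=True, load_in_low_bit=low_bit,
    _attn_implementation="eager", modules_to_not_convert=["vision_embed_tokens"],
    trust_remote_code=True, use_cache=True,
).eval()
model.save_low_bit(model_path + '-' + low_bit)  # save_low_bit is assumed, not shown in this diff

# run.py (load_low_bit benchmarks): reload the pre-converted checkpoint directly
model = AutoModelForCausalLM.load_low_bit(
    model_path + '-' + low_bit, optimize_model=True, trust_remote_code=True,
    _attn_implementation="eager", modules_to_not_convert=["vision_embed_tokens"],
    use_cache=True,
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path + '-' + low_bit, trust_remote_code=True)
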
(Diffs for the remaining 33 changed files are not shown.)
