
Commit

Merge branch 'main' of https://github.com/xiangyuT/BigDL into refine_pp_serving_run_0711

xiangyuT committed Jul 11, 2024
2 parents c15c40e + 105e124 commit 6f725d6
Showing 37 changed files with 1,112 additions and 82 deletions.
123 changes: 119 additions & 4 deletions .github/workflows/llm_performance_tests.yml
@@ -679,6 +679,29 @@ jobs:
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Prepare igpu perf test for transformers 4.38 (32-32 int4+fp16)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_438.yaml
- name: Test on igpu for transformers 4.38 (32-32 int4+fp16)
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
cd python\llm\dev\benchmark\all-in-one
move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_438.yaml config.yaml
set PYTHONIOENCODING=utf-8
python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
if %ERRORLEVEL% neq 0 if %ERRORLEVEL% neq -1073740791 (exit /b 1)
call conda deactivate
- name: Concat csv and generate html (32-32 int4+fp16)
shell: cmd
@@ -703,7 +726,7 @@ jobs:
shell: bash
run: |
sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
- name: Test on igpu (1024-128 int4+fp16)
@@ -747,6 +770,29 @@ jobs:
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Prepare igpu perf test for transformers 4.38 (1024-128 int4+fp16)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_438.yaml
- name: Test on igpu for transformers 4.38 (1024-128 int4+fp16)
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
cd python\llm\dev\benchmark\all-in-one
move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_438.yaml config.yaml
set PYTHONIOENCODING=utf-8
python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Concat csv and generate html (1024-128 int4+fp16)
shell: cmd
@@ -770,7 +816,7 @@ jobs:
shell: bash
run: |
sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
- name: Test on igpu (2048-256 int4+fp16)
@@ -814,6 +860,29 @@ jobs:
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Prepare igpu perf test for transformers 4.38 (2048-256 int4+fp16)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_438.yaml
- name: Test on igpu for transformers 4.38 (2048-256 int4+fp16)
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
cd python\llm\dev\benchmark\all-in-one
move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_438.yaml config.yaml
set PYTHONIOENCODING=utf-8
python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Concat csv and generate html (2048-256 int4+fp16)
shell: cmd
@@ -837,7 +906,7 @@ jobs:
shell: bash
run: |
sed -i 's/2048-256/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit.yaml
- name: Test on igpu (load_low_bit 1024-128 int4+fp16)
@@ -882,6 +951,29 @@ jobs:
call conda deactivate
- name: Prepare igpu perf test for transformers 4.38 (load_low_bit 1024-128 int4+fp16)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_loadlowbit_438.yaml
- name: Test on igpu for transformers 4.38 (load_low_bit 1024-128 int4+fp16)
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
cd python\llm\dev\benchmark\all-in-one
move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_loadlowbit_438.yaml config.yaml
set PYTHONIOENCODING=utf-8
python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16_loadlowbit\log\%LOG_FILE% 2>&1
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Concat csv and generate html (load_low_bit 1024-128 int4+fp16)
shell: cmd
run: |
@@ -903,7 +995,7 @@ jobs:
- name: Prepare igpu perf test (1024-128)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128.yaml
- name: Test on igpu (1024-128)
@@ -948,6 +1040,29 @@ jobs:
call conda deactivate
- name: Prepare igpu perf test for transformers 4.38 (1024-128)
shell: bash
run: |
sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_438.yaml
- name: Test on igpu for transformers 4.38 (1024-128)
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
cd python\llm\dev\benchmark\all-in-one
move ..\..\..\test\benchmark\igpu-perf\1024-128_438.yaml config.yaml
set PYTHONIOENCODING=utf-8
python run.py >> %CSV_SAVE_PATH%\1024-128\log\%LOG_FILE% 2>&1
if %ERRORLEVEL% neq 0 (exit /b 1)
call conda deactivate
- name: Concat csv and generate html (1024-128)
shell: cmd
run: |
29 changes: 29 additions & 0 deletions .github/workflows/manually_build_for_testing.yml
@@ -19,6 +19,7 @@ on:
- ipex-llm-inference-cpp-xpu
- ipex-llm-serving-cpu
- ipex-llm-serving-xpu
- ipex-llm-serving-xpu-tgi
- ipex-llm-finetune-lora-cpu
- ipex-llm-finetune-qlora-cpu
- ipex-llm-finetune-qlora-cpu-k8s
@@ -174,6 +175,34 @@ jobs:
sudo docker push 10.239.45.10/arda/${image}:${TAG}
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
ipex-llm-serving-xpu-tgi:
if: ${{ github.event.inputs.artifact == 'ipex-llm-serving-xpu-tgi' || github.event.inputs.artifact == 'all' }}
runs-on: [self-hosted, Shire]

steps:
- uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
with:
ref: ${{ github.event.inputs.sha }}
- name: docker login
run: |
docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
- name: ipex-llm-serving-xpu-tgi
run: |
echo "##############################################################"
echo "####### ipex-llm-serving-xpu-tgi ########"
echo "##############################################################"
export image=intelanalytics/ipex-llm-serving-xpu-tgi
cd docker/llm/serving/xpu-tgi
sudo docker build \
--no-cache=true \
--build-arg http_proxy=${HTTP_PROXY} \
--build-arg https_proxy=${HTTPS_PROXY} \
--build-arg no_proxy=${NO_PROXY} \
-t ${image}:${TAG} -f ./Dockerfile .
sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
sudo docker push 10.239.45.10/arda/${image}:${TAG}
sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
ipex-llm-inference-cpp-xpu:
if: ${{ github.event.inputs.artifact == 'ipex-llm-inference-cpp-xpu' || github.event.inputs.artifact == 'all' }}
runs-on: [self-hosted, Shire]
54 changes: 47 additions & 7 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -42,6 +42,8 @@

LLAVA_IDS = ['liuhaotian/llava-v1.5-7b']

PHI3VISION_IDS = ['microsoft/phi-3-vision-128k-instruct']

results = []
excludes = []

@@ -914,6 +916,13 @@ def run_transformer_int4_gpu_win(repo_id,
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
elif repo_id in PHI3VISION_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
else:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1021,6 +1030,14 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
torch_dtype=torch.float16).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
elif repo_id in PHI3VISION_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
torch_dtype=torch.float16).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = model.to('xpu')
else:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding,
@@ -1125,6 +1142,13 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
model = model.to('xpu')
elif repo_id in PHI3VISION_IDS:
model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
model = model.to('xpu')
else:
model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
use_cache=True, cpu_embedding=cpu_embedding).eval()
@@ -1228,6 +1252,13 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
model = model.half().to('xpu')
elif repo_id in PHI3VISION_IDS:
model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
use_cache=True, cpu_embedding=cpu_embedding).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path+'-'+low_bit, trust_remote_code=True)
model = model.half().to('xpu')
else:
model = AutoModelForCausalLM.load_low_bit(model_path+'-'+low_bit, optimize_model=True, trust_remote_code=True,
use_cache=True, cpu_embedding=cpu_embedding).eval()
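
The same Phi-3-vision branch is added to each of the four Windows iGPU helpers above; the variants differ only in fp16 handling and in whether from_pretrained or load_low_bit is used. A minimal sketch of the shared core follows (the helper name and the ipex_llm import path are assumptions; the kwargs are taken from the diff):

# Hedged sketch of the branch taken when repo_id is in PHI3VISION_IDS; not part of the commit.
from ipex_llm.transformers import AutoModelForCausalLM  # import path assumed
from transformers import AutoTokenizer

def load_phi3_vision_for_benchmark(model_path, low_bit, cpu_embedding):
    # Phi-3-vision is loaded with eager attention and with the vision embedding
    # module kept out of low-bit conversion; everything else matches the generic path.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        optimize_model=True,
        load_in_low_bit=low_bit,
        _attn_implementation="eager",
        modules_to_not_convert=["vision_embed_tokens"],
        trust_remote_code=True,
        use_cache=True,
        cpu_embedding=cpu_embedding,
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    return model.to('xpu'), tokenizer

Per the diff, the int4+fp16 variant additionally passes torch_dtype=torch.float16, and the fp16 load_low_bit variant calls model.half() before moving the model to 'xpu'.
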
@@ -1980,11 +2011,20 @@ def run_pipeline_parallel_gpu(repo_id,
df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype'])
df.index += max(line_counter-1, 0)
if api not in ["transformer_int4_gpu", "transformer_int4_fp16_gpu"]:
if line_counter == 0:
df.to_csv(csv_name, mode='a')
else:
df.to_csv(csv_name, mode='a', header=None)
line_counter += len(df.index)
if "pipeline" in api or "deepspeed" in api:
if torch.distributed.get_rank() == 0:
df.index += max(line_counter - 1, 0)
if line_counter == 0:
df.to_csv(csv_name, mode='a')
else:
df.to_csv(csv_name, mode='a', header=None)
line_counter += len(df.index)
else:
df.index += max(line_counter - 1, 0)
if api not in ["transformer_int4_gpu", "transformer_int4_fp16_gpu"]:
if line_counter == 0:
df.to_csv(csv_name, mode='a')
else:
df.to_csv(csv_name, mode='a', header=None)
line_counter += len(df.index)
results = []
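
The final run.py hunk changes how results are appended to the CSV: for the multi-process "pipeline" and "deepspeed" APIs, only rank 0 writes, so each run lands in the file once instead of once per process. A simplified sketch of that logic (the helper name is illustrative, and the per-API skip for transformer_int4_gpu / transformer_int4_fp16_gpu kept by the diff is omitted here):

# Hedged sketch of the distributed-aware CSV append introduced above.
import torch.distributed as dist

def append_results(df, csv_name, api, line_counter):
    # For pipeline-parallel / DeepSpeed runs every process reaches this point,
    # so only rank 0 is allowed to touch the shared CSV.
    if ("pipeline" in api or "deepspeed" in api) and dist.get_rank() != 0:
        return line_counter
    df.index += max(line_counter - 1, 0)                        # keep row numbering continuous
    df.to_csv(csv_name, mode='a', header=(line_counter == 0))   # header only on the first append
    return line_counter + len(df.index)
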
8 changes: 7 additions & 1 deletion python/llm/dev/benchmark/all-in-one/save.py
@@ -23,7 +23,7 @@
import sys
import gc

from run import LLAMA_IDS, CHATGLM_IDS, LLAVA_IDS, get_model_path
from run import LLAMA_IDS, CHATGLM_IDS, LLAVA_IDS, PHI3VISION_IDS, get_model_path

current_dir = os.path.dirname(os.path.realpath(__file__))

@@ -51,6 +51,12 @@ def save_model_in_low_bit(repo_id,
model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
trust_remote_code=True, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
elif repo_id in PHI3VISION_IDS:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
_attn_implementation="eager",
modules_to_not_convert=["vision_embed_tokens"],
trust_remote_code=True, use_cache=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
else:
model = AutoModelForCausalLM.from_pretrained(model_path, optimize_model=True, load_in_low_bit=low_bit,
trust_remote_code=True, use_cache=True).eval()
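
save.py mirrors the run.py change: Phi-3-vision is converted with the same eager-attention and modules_to_not_convert settings before the low-bit checkpoint is written, and the load_low_bit benchmarks above then reload that checkpoint from the "<model_path>-<low_bit>" directory. A sketch of the round trip; the ipex_llm import path, the save_low_bit call, and the example values are assumptions rather than part of this diff:

# Hedged sketch: persist a low-bit Phi-3-vision checkpoint (save.py side), then
# reload it the way the load_low_bit benchmarks do (run.py side).
from ipex_llm.transformers import AutoModelForCausalLM  # import path assumed
from transformers import AutoTokenizer

model_path, low_bit = "/models/phi-3-vision-128k-instruct", "sym_int4"  # illustrative values

# save.py: convert once with the Phi-3-vision kwargs shown above and persist the weights
model = AutoModelForCausalLM.from_pretrained(
    model_path, optimize_model=True, load_in_low_bit=low_bit,
    _attn_implementation="eager", modules_to_not_convert=["vision_embed_tokens"],
    trust_remote_code=True, use_cache=True,
).eval()
model.save_low_bit(model_path + '-' + low_bit)  # save_low_bit is assumed, not shown in this diff

# run.py (load_low_bit benchmarks): reload the pre-converted checkpoint directly
model = AutoModelForCausalLM.load_low_bit(
    model_path + '-' + low_bit, optimize_model=True, trust_remote_code=True,
    _attn_implementation="eager", modules_to_not_convert=["vision_embed_tokens"],
    use_cache=True,
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path + '-' + low_bit, trust_remote_code=True)
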
(Diffs for the remaining 33 changed files are not shown.)
