From 1bffe3394fdebb4e326abc34ab4b504c91d5d69a Mon Sep 17 00:00:00 2001
From: HuayiL <442488254@qq.com>
Date: Thu, 1 Aug 2024 20:16:07 +0800
Subject: [PATCH] Add scripts to create benchmark in ci. (#896)

* add scripts and config files for the perf test.

* add ci related files which make this ci decoupled with one_iter.

* fix python black
---
 dipu/scripts/ci/ci_benchmark.sh       | 224 ++++++++++++++++++++++++++
 dipu/scripts/ci/ci_run_perf.py        | 224 ++++++++++++++++++++++++++
 dipu/scripts/ci/test_perf_config.json |  20 +++
 3 files changed, 468 insertions(+)
 create mode 100644 dipu/scripts/ci/ci_benchmark.sh
 create mode 100644 dipu/scripts/ci/ci_run_perf.py
 create mode 100644 dipu/scripts/ci/test_perf_config.json

diff --git a/dipu/scripts/ci/ci_benchmark.sh b/dipu/scripts/ci/ci_benchmark.sh
new file mode 100644
index 000000000..2600fbd6a
--- /dev/null
+++ b/dipu/scripts/ci/ci_benchmark.sh
@@ -0,0 +1,224 @@
+#!/bin/bash
+
+# Ensure repository $1 exists under the current directory at ref $2 (branch,
+# tag or commit hash). mmcv/mmengine are fetched from open-mmlab, everything
+# else from DeepLink-org.
+function check_and_clone_repository() {
+    local repo_name=$1
+    local branch_name=$2
+    local current_path repo_path clone_url current_branch
+    current_path=$(pwd)
+    repo_path="$current_path/$repo_name"
+    if [ "$repo_name" == "mmcv" ] || [ "$repo_name" == "mmengine" ]; then
+        clone_url="https://github.com/open-mmlab/$repo_name.git"
+    else
+        clone_url="https://github.com/DeepLink-org/$repo_name.git"
+    fi
+    if [ -d "$repo_path" ]; then
+        cd "$repo_name"
+        # "<branch>_<commit>_<tag>" so one substring test matches any ref kind.
+        current_branch=$(git rev-parse --abbrev-ref HEAD)_$(git rev-parse HEAD)_$(git describe --tags 2>/dev/null || echo "none")
+        if [[ "$current_branch" =~ "$branch_name" ]]; then
+            echo "$repo_name $branch_name is right"
+            cd ..
+        else
+            git checkout main && git pull && git checkout "$branch_name"
+            cd ..
+        fi
+    else
+        cd "$current_path" && rm -rf "$repo_name"
+        # "git clone -b" only accepts branch/tag names; the fallback covers commit hashes.
+        git clone -b "${branch_name}" "${clone_url}" || (git clone "${clone_url}" && cd "$repo_name" && git checkout "${branch_name}" && cd ..)
+    fi
+}
+
+# Clone (or update) every repository the benchmark needs at its pinned ref.
+# NOTE(review): mmcv is built by build_needed_repo_* but is not cloned here;
+# presumably the CI workspace already provides it - confirm.
+function clone_needed_repo() {
+    set -e
+    # clone some repositories
+    SMART_VERSION=dev_for_mmcv2.0
+    TRANSFORMERS=main
+    LIGHTLLM=main
+    DEEPLINKEXT=2a47138de420a0147e8de70685e628d3732135d7
+    ALPACALORA=sco_benchmark_finetune
+
+    check_and_clone_repository "SMART" ${SMART_VERSION}
+    check_and_clone_repository "transformers" ${TRANSFORMERS}
+    check_and_clone_repository "lightllm" ${LIGHTLLM}
+    check_and_clone_repository "DeepLinkExt" ${DEEPLINKEXT}
+    check_and_clone_repository "alpaca-lora" ${ALPACALORA}
+    cd ..
+}
+
+# Build mmcv (with DIOPI ops) and DeepLinkExt in place, then install alpaca-lora requirements.
+function build_needed_repo_cuda() {
+    cd mmcv
+    MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i
+    cd ..
+    cd DeepLinkExt
+    python setup.py build_ext -i
+    cd ..
+    cd alpaca-lora
+    pip install -r requirements.txt
+    cd ..
+}
+
+# Build mmcv (with DIOPI ops) in place for the camb backend.
+function build_needed_repo_camb() {
+    cd mmcv
+    MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i
+    cd ..
+}
+
+# Build mmcv (with DIOPI ops) in place for the ascend backends.
+function build_needed_repo_ascend() {
+    cd mmcv
+    MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i
+    cd ..
+}
+
+# Nothing is built for kunlunxin.
+function build_needed_repo_kunlunxin() {
+    echo "skip"
+}
+
+
+# Export PYTHONPATH and HuggingFace cache variables for backend $1, with the
+# repositories rooted at $2. Only "cuda" is handled; other backends are a no-op.
+function export_repo_pythonpath(){
+    local basic_path="$2"
+    if [ "$1" = "cuda" ]; then
+        echo "Executing CUDA operation in pythonpath..."
+        export PYTHONPATH=${basic_path}:$PYTHONPATH
+        export PYTHONPATH=${basic_path}/transformers/src:$PYTHONPATH
+        export PYTHONPATH=${basic_path}/lightllm:$PYTHONPATH
+
+        # set the environment variable for the transformers repository
+        export HF_HOME=${basic_path}/huggingface
+        export HUGGINGFACE_HUB_CACHE=/mnt/lustre/share_data/PAT/datasets/hub
+
+        export PYTHONPATH=${basic_path}/mmcv:$PYTHONPATH
+        export PYTHONPATH=${basic_path}/SMART/tools/one_iter_tool/one_iter:$PYTHONPATH
+        echo "python path: $PYTHONPATH"
+    fi
+}
+
+
+# Recreate ./data and symlink the shared datasets/checkpoints for backend $1.
+# Exits with status 1 on an unknown backend.
+function build_dataset(){
+    # link dataset
+    if [ "$1" = "cuda" ]; then
+        echo "Executing CUDA operation in build dataset..."
+        rm -rf data
+        mkdir data
+        ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
+        ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017  data/coco
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
+        ln -s /mnt/lustre/share_data/PAT/datasets/Kinetics400 data/kinetics400
+        ln -s /mnt/lustre/share_data/PAT/datasets/icdar2015 data/icdar2015
+        ln -s /mnt/lustre/share_data/PAT/datasets/mjsynth data/mjsynth
+        ln -s /mnt/lustre/share_data/PAT/datasets/kitti data/kitti
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/stable-diffusion-v1-5 data/stable-diffusion-v1-5
+        ln -s /mnt/lustre/share_data/PAT/datasets/llama_1B_oneiter data/llama_1B_oneiter
+
+    elif [ "$1" = "camb" ]; then
+        echo "Executing CAMB operation in build dataset..."
+        rm -rf data
+        mkdir data
+        ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
+        ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017  data/coco
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet3d/mmdet3d_kitti data/kitti
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmaction/Kinetics400 data/kinetics400
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/icdar2015 data/icdar2015
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/mjsynth data/mjsynth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth
+
+    elif [ "$1" = "ascend" ]; then
+        echo "Executing ASCEND operation in build dataset..."
+        rm -rf data
+        mkdir data
+        ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
+        ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017  data/coco
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
+        ln -s /mnt/lustre/share_data/PAT/datasets/kitti data/kitti
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmaction/Kinetics400 data/kinetics400
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/icdar2015 data/icdar2015
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/mjsynth data/mjsynth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth
+    elif [ "$1" = "ascend910b" ]; then
+        echo "Executing ASCEND operation in build dataset..."
+        rm -rf data
+        mkdir data
+        ln -s /mnt/cache/share/datasets/Imagenet data/imagenet
+        ln -s /mnt/cache/share/datasets/mscoco2017  data/coco
+        ln -s /mnt/cache/share/datasets/mmseg/cityscapes data/cityscapes
+        ln -s /mnt/cache/share/datasets/kitti data/kitti
+        ln -s /mnt/cache/share/datasets/mmaction/Kinetics400 data/kinetics400
+        ln -s /mnt/cache/share/datasets/mmocr/icdar2015 data/icdar2015
+        ln -s /mnt/cache/share/datasets/mmocr/mjsynth data/mjsynth
+        ln -s /mnt/cache/share/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
+        ln -s /mnt/cache/share/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
+        ln -s /mnt/cache/share/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
+        ln -s /mnt/cache/share/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
+        ln -s /mnt/cache/share/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
+        ln -s /mnt/cache/share/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth
+    elif [ "$1" = "kunlunxin" ]; then
+        echo "Executing KUNLUNXIN operation in build dataset..."
+        rm -rf data
+        mkdir data
+        ln -s /mnt/cache/share/datasets/imagenet data/imagenet
+
+    else
+        echo "Invalid parameter. Please specify 'cuda' 'camb' 'ascend' 'ascend910b' or 'kunlunxin'."
+        exit 1
+    fi
+}
+
+
+# Dispatch on the sub-command given as $1 ($2 is the repository root for export_pythonpath_*).
+case "$1" in
+    clone)
+        clone_needed_repo;;
+    build_cuda)
+        build_needed_repo_cuda
+        build_dataset cuda;;
+    build_camb)
+        build_needed_repo_camb
+        build_dataset camb;;
+    build_ascend)
+        build_needed_repo_ascend
+        build_dataset ascend;;
+    build_ascend910b)
+        build_needed_repo_ascend
+        build_dataset ascend910b;;
+    build_kunlunxin)
+        build_needed_repo_kunlunxin
+        build_dataset kunlunxin;;
+    export_pythonpath_camb)
+        export_repo_pythonpath camb "$2";;
+    export_pythonpath_cuda)
+        export_repo_pythonpath cuda "$2";;
+    export_pythonpath_ascend)
+        export_repo_pythonpath ascend "$2";;
+    export_pythonpath_kunlunxin)
+        export_repo_pythonpath kunlunxin "$2";;
+    *)
+        echo -e "[ERROR] Incorrect option:" "$1"
+        # "return" works when the script is sourced, "exit" when it is executed.
+        return 1 2>/dev/null || exit 1
+esac
diff --git a/dipu/scripts/ci/ci_run_perf.py b/dipu/scripts/ci/ci_run_perf.py
new file mode 100644
index 000000000..4e9f18fe0
--- /dev/null
+++ b/dipu/scripts/ci/ci_run_perf.py
@@ -0,0 +1,224 @@
+import os
+import sys
+from multiprocessing import Pool
+import subprocess as sp
+import time
+import yaml
+import multiprocessing
+import argparse
+import logging
+import json
+import re
+
+log_format = "%(asctime)s - %(levelname)s: %(message)s"
+logging.basicConfig(level=logging.INFO, format=log_format, datefmt="%Y-%m-%d %H:%M:%S")
+
+
+def run_cmd(cmd: str) -> None:
+    """Run ``cmd`` through the shell; raise if it exits with a non-zero status."""
+    cp = sp.run(cmd, shell=True, encoding="utf-8")
+    if cp.returncode != 0:
+        # stderr is not captured here, so ``cp.stderr`` is always None; report
+        # the exit status instead.
+        raise RuntimeError(
+            f"Something went wrong when running command [{cmd}]: "
+            f"exit code {cp.returncode}"
+        )
+
+
+def parse_device_task(device):
+    """Return the config for ``device`` from test_perf_config.json.
+
+    The file is read from the directory containing this script. An empty dict
+    is returned when ``device`` has no entry.
+    """
+    current_path = os.path.dirname(os.path.realpath(__file__))
+    config_path = os.path.join(current_path, "test_perf_config.json")
+    with open(config_path) as json_config:
+        json_content = json.load(json_config)
+    return json_content.get(device, {})
+
+
+def process_test_perf(log_file, clear_log, task: dict) -> None:
+    """Run one benchmark task through ``srun`` and print the parsed result.
+
+    Args:
+        log_file: Log file name, relative to the current working directory.
+        clear_log: Truncate ``log_file`` when true, append to it otherwise.
+        task: One entry of the ``tasks`` list in test_perf_config.json.
+
+    Raises:
+        RuntimeError: If the benchmark command exits with a non-zero status.
+    """
+    # READ CONFIG
+    task_name = task["name"]
+    storage_path = os.getcwd() + "/perf_data/" + task_name
+    partition = task["partition"]
+    # NOTE(review): the job name given on the command line is not used here.
+    job_name = "trial"
+    gpu_requests = task["gpu_requests"]
+    relative_workdir = task["relative_workdir"]
+    task_script = task["script"]
+    filter_pattern = task["filter"]
+    op_args = task["op_args"]
+
+    os.environ["ONE_ITER_TOOL_STORAGE_PATH"] = storage_path
+    os.environ["DIPU_FORCE_FALLBACK_OPS_LIST"] = task.get("fallback_op_list", "")
+
+    logging.info(f"task_name = {task_name}")
+
+    os.makedirs(storage_path, exist_ok=True)
+
+    # GENERATE RUN COMMAND
+    cmd_run_test_perf = f"srun --job-name={job_name} --partition={partition}  --gres={gpu_requests} python {task_script} {op_args}"
+    # ``device`` is the module-level name assigned in the ``__main__`` block.
+    if device == "sco":
+        current_path = os.getcwd()
+        parent_directory = os.path.dirname(current_path)
+        cmd_run_test_perf = f"""srun --job-name={job_name} bash -c "cd {parent_directory} && source scripts/ci/ci_one_iter.sh export_pythonpath_cuda {current_path} && source /mnt/cache/share/deeplinkci/github/dipu_env && cd mmlab_pack && source environment_exported && export ONE_ITER_TOOL_STORAGE_PATH={storage_path} && python {current_path}/{task_script}" """
+
+    print(cmd_run_test_perf)
+
+    current_path = os.getcwd()
+    log_path = f"{current_path}/{log_file}"
+    # Redirect stdout first, then point stderr at it: "2>&1 > file" would leave
+    # stderr on the terminal and out of the log.
+    redirect = ">" if clear_log else ">>"
+    os.chdir(relative_workdir)
+    try:
+        run_cmd(f"{cmd_run_test_perf} {redirect} {log_path} 2>&1")
+    finally:
+        # Restore the working directory even when the command fails.
+        os.chdir(current_path)
+
+    print("MATCH_PATTERN:", filter_pattern)
+    with open(log_path) as log:
+        log_content = log.read()
+    match_result = re.search(filter_pattern, log_content)
+    run_perf = 0.0
+
+    if match_result:
+        # Pull the first number out of the matched text.
+        number = re.search(r"\d+(\.\d+)?", match_result.group(0))
+        if number:
+            run_perf = float(number.group(0))
+    print("RUNNING PERF:{}".format(run_perf))
+
+
+def run_perf_task(device_config):
+    """Run every task of ``device_config`` in a process pool.
+
+    Returns early when no task is configured and exits the interpreter with
+    status 1 when a task fails.
+    """
+    tasks = device_config.get("tasks", [])
+    if not tasks:
+        # Covers the empty "sco"/"camb" entries and unknown devices.
+        logging.warning("no perf tasks configured, nothing to run.")
+        return
+
+    error_flag = multiprocessing.Value("i", 0)  # set to 1 when a task fails
+
+    device = device_config.get("name")
+
+    logging.info("we use {}!".format(device))
+    logging.info(f"main process id (ppid): {os.getpid()} {os.getppid()}")
+    logging.info(f"python path: {os.environ.get('PYTHONPATH', None)}")
+
+    os.environ["DIPU_DUMP_OP_ARGS"] = "0"
+    os.environ["DIPU_DEBUG_ALLOCATOR"] = "0"
+    os.environ["ONE_ITER_TOOL_DEVICE"] = "dipu"
+
+    current_path = os.path.dirname(os.path.realpath(__file__))
+    env_file_path = os.path.join(current_path, "environment_exported")
+    env_variables = os.environ
+    keywords_to_filter = ["DIPU", "ONE_ITER"]
+    if os.path.exists(env_file_path):
+        os.remove(env_file_path)
+
+    # NOTE(review): the stale file is removed next to this script, but the new
+    # one is written to the current working directory - confirm this is intended.
+    with open("environment_exported", "w") as file:
+        file.write("pwd\n")
+        for key, value in env_variables.items():
+            if any(keyword in key for keyword in keywords_to_filter):
+                file.write(f'export {key}="{value}"\n')
+
+    logging.info(f"tasks nums: {len(tasks)}")
+
+    os.makedirs("perf_data", exist_ok=True)
+
+    # ``max_parall`` is the module-level name assigned in the ``__main__`` block.
+    pool = Pool(max_parall)
+
+    def on_error(error):
+        # ``pool`` and ``error_flag`` are locals, so hand them to the callback.
+        handle_error(error, pool, error_flag)
+
+    log_files = []
+    try:
+        for i, task in enumerate(tasks):
+            # NOTE(review): tasks i and i + max_parall share one log file.
+            log_file = f"child_{i % max_parall}_log.txt"
+            log_files.append(log_file)
+            pool.apply_async(
+                process_test_perf,
+                args=(
+                    log_file,
+                    True,
+                    task,
+                ),
+                error_callback=on_error,
+            )
+        logging.info("Waiting for all subprocesses done...")
+        pool.close()
+        pool.join()
+        for log_file in log_files:
+            print_file(log_file)
+        if error_flag.value != 0:
+            sys.exit(1)
+        logging.info("All subprocesses done.")
+    except Exception as e:
+        logging.error(e)
+        sys.exit(1)
+
+
+def handle_error(error, pool=None, error_flag=None) -> None:
+    """Log a failed task, record the failure and stop the remaining tasks.
+
+    Args:
+        error: The exception raised by the failed task.
+        pool: Pool to terminate; skipped when None.
+        error_flag: Shared integer set to 1; skipped when None.
+    """
+    logging.error(f"Error: {error}")
+    if error_flag is not None:
+        error_flag.value = 1
+    if pool is not None:
+        logging.error("Kill all!")
+        pool.terminate()
+
+
+def print_file(file_name):
+    """Log the whole content of ``file_name`` at INFO level."""
+    with open(file_name) as f:
+        lines = f.read()
+        logging.info(lines)
+
+
+if __name__ == "__main__":
+    # set some params
+    max_parall = 8
+    parser = argparse.ArgumentParser(description="set some params.")
+    parser.add_argument("device", type=str, help="the device to use")
+    parser.add_argument("job_name", type=str, help="the name of the job")
+    args = parser.parse_args()
+
+    device = args.device
+    job_name = args.job_name
+
+    device_config = parse_device_task(device)
+    print(device_config)
+
+    logging.info(f"device: {device}, job_name: {job_name}")
+    run_perf_task(device_config)
diff --git a/dipu/scripts/ci/test_perf_config.json b/dipu/scripts/ci/test_perf_config.json
new file mode 100644
index 000000000..3ec72fc91
--- /dev/null
+++ b/dipu/scripts/ci/test_perf_config.json
@@ -0,0 +1,20 @@
+{
+    "cuda": {
+        "name": "cuda",
+        "tasks": [
+            {
+                "name": "llama2_7B",
+                "partition": "pat_rd",
+                "gpu_requests": "gpu:8",
+                "script": "run_llama_finetune_perf.py",
+                "filter": "'train_runtime': \\d+(\\.\\d+)?[,]",
+                "relative_workdir": "./alpaca-lora",
+                "op_args": "--base_model /mnt/lustrenew/share_data/PAT/datasets/llama2/7B/"
+            }
+        ]
+    },
+    "sco": {
+    },
+    "camb": {
+    }
+}