From 1bffe3394fdebb4e326abc34ab4b504c91d5d69a Mon Sep 17 00:00:00 2001
From: HuayiL <442488254@qq.com>
Date: Thu, 1 Aug 2024 20:16:07 +0800
Subject: [PATCH] Add scripts to create benchmark in ci. (#896)

* add scripts and config files for the perf test.

* add CI-related files so that this CI is decoupled from one_iter.

* fix python black
---
 dipu/scripts/ci/ci_benchmark.sh       | 204 ++++++++++++++++++++++++++
 dipu/scripts/ci/ci_run_perf.py        | 182 +++++++++++++++++++++++
 dipu/scripts/ci/test_perf_config.json |  20 +++
 3 files changed, 406 insertions(+)
 create mode 100644 dipu/scripts/ci/ci_benchmark.sh
 create mode 100644 dipu/scripts/ci/ci_run_perf.py
 create mode 100644 dipu/scripts/ci/test_perf_config.json

diff --git a/dipu/scripts/ci/ci_benchmark.sh b/dipu/scripts/ci/ci_benchmark.sh
new file mode 100644
index 000000000..2600fbd6a
--- /dev/null
+++ b/dipu/scripts/ci/ci_benchmark.sh
@@ -0,0 +1,204 @@
+#!/bin/bash
+
+function check_and_clone_repository() {
+    # Make sure ./$1 is a checkout of repository $1 at ref $2 (branch, tag or
+    # commit): reuse an existing clone when it already matches, else update or
+    # clone it. The caller's working directory is left unchanged.
+    local repo_name="$1" branch_name="$2" org="DeepLink-org" repo_path clone_url current_branch
+    repo_path="$(pwd)/$repo_name"
+    # mmcv and mmengine are fetched from open-mmlab, everything else from DeepLink-org.
+    if [ "$repo_name" == "mmcv" ] || [ "$repo_name" == "mmengine" ]; then
+        org="open-mmlab"
+    fi
+    clone_url="https://github.com/$org/$repo_name.git"
+    if [ -d "$repo_path" ]; then
+        # "<branch>_<sha>_<tag>": the requested ref matches if it appears in any part.
+        current_branch=$(git -C "$repo_path" rev-parse --abbrev-ref HEAD)_$(git -C "$repo_path" rev-parse HEAD)_$(git -C "$repo_path" describe --tags 2>/dev/null || echo "none")
+        if [[ "$current_branch" =~ "$branch_name" ]]; then
+            echo "$repo_name $branch_name is right"
+        else
+            # `set -e` ignores failures inside an && chain except the last command, so fail explicitly.
+            { git -C "$repo_path" checkout main && git -C "$repo_path" pull && git -C "$repo_path" checkout "$branch_name"; } || return 1
+        fi
+    else
+        rm -rf "$repo_path"
+        git clone -b "$branch_name" "$clone_url" || (git clone "$clone_url" && git -C "$repo_path" checkout "$branch_name")
+    fi
+}
+
+function clone_needed_repo() {
+    set -e  # abort on the first failing clone/checkout; the option stays on after this function returns
+    # Refs (branch names or a commit hash) to check out for each repository.
+    SMART_VERSION=dev_for_mmcv2.0
+    TRANSFORMERS=main
+    LIGHTLLM=main
+    DEEPLINKEXT=2a47138de420a0147e8de70685e628d3732135d7
+    ALPACALORA=sco_benchmark_finetune
+    # Clone (or update) every repository inside the current directory.
+    check_and_clone_repository "SMART" ${SMART_VERSION}
+    check_and_clone_repository "transformers" ${TRANSFORMERS}
+    check_and_clone_repository "lightllm" ${LIGHTLLM}
+    check_and_clone_repository "DeepLinkExt" ${DEEPLINKEXT}
+    check_and_clone_repository "alpaca-lora" ${ALPACALORA}
+    cd ..  # NOTE: the function finishes one level above the directory it was called from
+}
+
+function build_needed_repo_cuda() {
+    # Build mmcv and DeepLinkExt in place and install alpaca-lora's requirements.
+    # Each step runs in its own subshell, so a directory that cannot be entered
+    # skips only that step and never shifts the working directory of the others.
+    # All steps are attempted; the return status is non-zero if any of them failed.
+    local status=0
+    (cd mmcv && MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i) || status=1
+    (cd DeepLinkExt && python setup.py build_ext -i) || status=1
+    (cd alpaca-lora && pip install -r requirements.txt) || status=1
+    return $status
+}
+
+function build_needed_repo_camb() {
+    # Build mmcv's extensions in place. The subshell leaves the caller's working
+    # directory untouched, and nothing is built when ./mmcv cannot be entered.
+    (cd mmcv && MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i)
+}
+
+function build_needed_repo_ascend() {
+    # Build mmcv's extensions in place. The subshell leaves the caller's working
+    # directory untouched, and nothing is built when ./mmcv cannot be entered.
+    (cd mmcv && MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i)
+}
+
+function build_needed_repo_kunlunxin() {
+    printf '%s\n' "skip"  # no build step for kunlunxin; only report that it is skipped
+}
+
+
+function export_repo_pythonpath(){
+    # Put the benchmark repositories found under $2 on PYTHONPATH and export
+    # HF_HOME / HUGGINGFACE_HUB_CACHE. Only acts when $1 is "cuda".
+    basic_path="$2"
+    [ "$1" = "cuda" ] || return 0
+    echo "Executing CUDA operation in pythonpath..."
+    local subdir
+    # Entries are prepended one by one, so the last one listed is searched first.
+    for subdir in "" /transformers/src /lightllm /mmcv /SMART/tools/one_iter_tool/one_iter; do
+        export PYTHONPATH=${basic_path}${subdir}:$PYTHONPATH
+    done
+
+    # set the environment variable for the transformers repository
+    export HF_HOME=${basic_path}/huggingface
+    export HUGGINGFACE_HUB_CACHE=/mnt/lustre/share_data/PAT/datasets/hub
+    echo "python path: $PYTHONPATH"
+}
+
+
+function build_dataset(){
+    # Recreate ./data and symlink the datasets/checkpoints for platform $1; exits 1 on an unknown platform.
+    if [ "$1" = "cuda" ]; then
+        echo "Executing CUDA operation in build dataset..."
+        rm -rf data
+        mkdir data
+        ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
+        ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017  data/coco
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
+        ln -s /mnt/lustre/share_data/PAT/datasets/Kinetics400 data/kinetics400
+        ln -s /mnt/lustre/share_data/PAT/datasets/icdar2015 data/icdar2015
+        ln -s /mnt/lustre/share_data/PAT/datasets/mjsynth data/mjsynth
+        ln -s /mnt/lustre/share_data/PAT/datasets/kitti data/kitti
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/stable-diffusion-v1-5 data/stable-diffusion-v1-5
+        ln -s /mnt/lustre/share_data/PAT/datasets/llama_1B_oneiter  data/llama_1B_oneiter
+
+    elif [ "$1" = "camb" ]; then
+        echo "Executing CAMB operation in build dataset..."
+        rm -rf data
+        mkdir data
+        ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
+        ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017  data/coco
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet3d/mmdet3d_kitti data/kitti
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmaction/Kinetics400 data/kinetics400
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/icdar2015 data/icdar2015
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/mjsynth data/mjsynth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth
+
+    elif [ "$1" = "ascend" ]; then
+        echo "Executing ASCEND operation in build dataset..."
+        rm -rf data
+        mkdir data
+        ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
+        ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017  data/coco
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
+        ln -s /mnt/lustre/share_data/PAT/datasets/kitti data/kitti
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmaction/Kinetics400 data/kinetics400
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/icdar2015 data/icdar2015
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/mjsynth data/mjsynth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
+        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth
+    elif [ "$1" = "ascend910b" ]; then
+        echo "Executing ASCEND910B operation in build dataset..."
+        rm -rf data
+        mkdir data
+        ln -s /mnt/cache/share/datasets/Imagenet data/imagenet
+        ln -s /mnt/cache/share/datasets/mscoco2017  data/coco
+        ln -s /mnt/cache/share/datasets/mmseg/cityscapes data/cityscapes
+        ln -s /mnt/cache/share/datasets/kitti data/kitti
+        ln -s /mnt/cache/share/datasets/mmaction/Kinetics400 data/kinetics400
+        ln -s /mnt/cache/share/datasets/mmocr/icdar2015 data/icdar2015
+        ln -s /mnt/cache/share/datasets/mmocr/mjsynth data/mjsynth
+        ln -s /mnt/cache/share/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
+        ln -s /mnt/cache/share/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
+        ln -s /mnt/cache/share/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
+        ln -s /mnt/cache/share/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
+        ln -s /mnt/cache/share/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
+        ln -s /mnt/cache/share/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth
+    elif [ "$1" = "kunlunxin" ]; then
+        echo "Executing KUNLUNXIN operation in build dataset..."
+        rm -rf data
+        mkdir data
+        ln -s /mnt/cache/share/datasets/imagenet data/imagenet
+
+    else
+        echo "Invalid parameter. Please specify 'cuda' 'camb' 'ascend' 'ascend910b' or 'kunlunxin'."
+        exit 1
+    fi
+}
+
+
+# Dispatch on the sub-command in $1; an unknown one fails (return when sourced, exit when executed).
+case "$1" in
+    clone) clone_needed_repo;;
+    build_cuda)
+        build_needed_repo_cuda
+        build_dataset cuda;;
+    build_camb)
+        build_needed_repo_camb
+        build_dataset camb;;
+    build_ascend)
+        build_needed_repo_ascend
+        build_dataset ascend;;
+    build_ascend910b)
+        build_needed_repo_ascend
+        build_dataset ascend910b;;
+    build_kunlunxin)
+        build_needed_repo_kunlunxin
+        build_dataset kunlunxin;;
+    export_pythonpath_camb)
+        export_repo_pythonpath camb "$2";;
+    export_pythonpath_cuda)
+        export_repo_pythonpath cuda "$2";;
+    export_pythonpath_ascend)
+        export_repo_pythonpath ascend "$2";;
+    export_pythonpath_kunlunxin)
+        export_repo_pythonpath kunlunxin "$2";;
+    *)
+        echo -e "[ERROR] Incorrect option:" "$1"; return 1 2>/dev/null || exit 1;;
+esac
diff --git a/dipu/scripts/ci/ci_run_perf.py b/dipu/scripts/ci/ci_run_perf.py
new file mode 100644
index 000000000..4e9f18fe0
--- /dev/null
+++ b/dipu/scripts/ci/ci_run_perf.py
@@ -0,0 +1,182 @@
+import os
+import sys
+from multiprocessing import Pool
+import subprocess as sp
+import time
+import yaml
+import multiprocessing
+import argparse
+import logging
+import json
+
+log_format = "%(asctime)s - %(levelname)s: %(message)s"
+logging.basicConfig(level=logging.INFO, format=log_format, datefmt="%Y-%m-%d %H:%M:%S")
+
+
+def run_cmd(cmd: str) -> None:
+    """Run ``cmd`` through the shell and raise ``Exception`` if it exits non-zero."""
+    cp = sp.run(cmd, shell=True, encoding="utf-8")
+    if cp.returncode != 0:
+        # Output is not captured, so cp.stderr is always None: report the status.
+        error = f"Command [{cmd}] failed with exit code {cp.returncode}"
+        raise Exception(error)
+
+
+def parse_device_task(device):
+    """Return the entry stored for ``device`` in test_perf_config.json.
+
+    The file is read from the directory holding this script.  An empty dict
+    is returned when the file has no entry for ``device``.
+    """
+    script_dir = os.path.dirname(os.path.realpath(__file__))
+    with open(script_dir + "/test_perf_config.json") as config_file:
+        all_devices = json.load(config_file)
+    return all_devices[device] if device in all_devices else dict()
+
+
+def process_test_perf(log_file, clear_log, task: dict) -> None:
+    """Run one perf task via ``srun`` and print the figure parsed from its log.
+
+    ``task`` is one entry of the ``tasks`` list in test_perf_config.json.  The
+    command output goes to ``log_file`` (relative to the current directory),
+    truncated first when ``clear_log`` is true and appended to otherwise.
+    Relies on the module-level ``device`` that the ``__main__`` block sets.
+    """
+    import re
+
+    task_name = task["name"]
+    storage_path = os.getcwd() + "/perf_data/" + task_name
+    partition = task["partition"]
+    job_name = "trial"
+    gpu_requests = task["gpu_requests"]
+    relative_workdir = task["relative_workdir"]
+    task_script = task["script"]
+    filter_pattern = task["filter"]
+    op_args = task["op_args"]
+
+    os.environ["ONE_ITER_TOOL_STORAGE_PATH"] = storage_path
+    os.environ["DIPU_FORCE_FALLBACK_OPS_LIST"] = task.get("fallback_op_list", "")
+
+    logging.info(f"task_name = {task_name}")
+    os.makedirs(storage_path, exist_ok=True)
+
+    # GENERATE RUN COMMAND
+    cmd_run_test_perf = f"srun --job-name={job_name} --partition={partition}  --gres={gpu_requests} python {task_script} {op_args}"
+    if device == "sco":
+        current_path = os.getcwd()
+        parent_directory = os.path.dirname(current_path)
+        cmd_run_test_perf = f"""srun --job-name={job_name} bash -c "cd {parent_directory} && source scripts/ci/ci_one_iter.sh export_pythonpath_cuda {current_path} && source /mnt/cache/share/deeplinkci/github/dipu_env && cd mmlab_pack && source environment_exported && export ONE_ITER_TOOL_STORAGE_PATH={storage_path} && python {current_path}/{task_script}" """
+    print(cmd_run_test_perf)
+
+    current_path = os.getcwd()
+    log_path = f"{current_path}/{log_file}"
+    redirect = ">" if clear_log else ">>"
+    os.chdir(relative_workdir)
+    try:
+        # The file redirection must precede 2>&1, or stderr bypasses the log.
+        run_cmd(f"{cmd_run_test_perf} {redirect} {log_path} 2>&1")
+    finally:
+        os.chdir(current_path)  # restore the cwd even when the command fails
+
+    print("MATCH_PATTERN:", filter_pattern)
+    with open(log_path) as log:
+        match_result = re.search(filter_pattern, log.read())
+    run_perf = 0.0
+    if match_result:
+        # The first number inside the matched text is the reported figure.
+        number = re.search(r"\d+(\.\d+)?", match_result.group(0))
+        run_perf = float(number.group(0))
+    print("RUNNING PERF:{}".format(run_perf))
+
+
+def run_perf_task(device_config):
+    """Run every task of ``device_config`` in a worker pool and echo the logs.
+
+    ``device_config`` is one device entry of test_perf_config.json.  Exits with
+    status 1 when the entry is empty or a task fails.  Reads ``max_parall``.
+    """
+    # handle_error() reads both names as module globals, so they cannot be locals.
+    global pool, error_flag
+    error_flag = multiprocessing.Value("i", 0)  # set to 1 once a task fails
+    if "name" not in device_config or "tasks" not in device_config:
+        logging.error(f"no usable perf config for this device: {device_config}")
+        sys.exit(1)
+    device = device_config["name"]
+    tasks = device_config["tasks"]
+
+    logging.info("we use {}!".format(device))
+    logging.info(f"main process id (ppid): {os.getpid()} {os.getppid()}")
+    logging.info(f"python path: {os.environ.get('PYTHONPATH', None)}")
+    logging.info(f"tasks nums: {len(tasks)}")
+
+    os.environ["DIPU_DUMP_OP_ARGS"] = "0"
+    os.environ["DIPU_DEBUG_ALLOCATOR"] = "0"
+    os.environ["ONE_ITER_TOOL_DEVICE"] = "dipu"
+
+    # NOTE(review): removed next to this script, rewritten in the cwd - confirm.
+    current_path = os.path.dirname(os.path.realpath(__file__))
+    env_file_path = os.path.join(current_path, "environment_exported")
+    if os.path.exists(env_file_path):
+        os.remove(env_file_path)
+    with open("environment_exported", "w") as file:
+        file.write("pwd\n")
+        for key, value in os.environ.items():
+            if "DIPU" in key or "ONE_ITER" in key:
+                file.write(f'export {key}="{value}"\n')
+    os.makedirs("perf_data", exist_ok=True)
+
+    pool = Pool(max_parall)
+    log_files = []
+    try:
+        for i, task in enumerate(tasks):
+            # One log per task: a shared name would be truncated by a later task.
+            log_file = f"child_{i}_log.txt"
+            log_files.append(log_file)
+            pool.apply_async(
+                process_test_perf,
+                args=(log_file, True, task),
+                error_callback=handle_error,
+            )
+        logging.info("Waiting for all subprocesses done...")
+        pool.close()
+        pool.join()
+        for log_file in log_files:
+            print_file(log_file)
+        if error_flag.value != 0:
+            sys.exit(1)
+        logging.info("All subprocesses done.")
+    except Exception as e:
+        logging.error(e)
+        sys.exit(1)
+
+
+def handle_error(error: str) -> None:
+    """Pool error callback: log ``error``, stop the pool and flag the failure."""
+    logging.error(f"Error: {error}")
+    if pool is not None:
+        logging.error("Kill all!")
+        pool.terminate()
+    error_flag.value = 1
+
+
+def print_file(file_name):
+    """Emit the entire content of ``file_name`` as one INFO log record."""
+    with open(file_name) as log_file:
+        logging.info(log_file.read())
+
+
+if __name__ == "__main__":
+    # max_parall and device are module-level names that run_perf_task() and
+    # process_test_perf() read directly, so they must be bound here.
+    max_parall = 8  # size of the worker pool
+
+    arg_parser = argparse.ArgumentParser(description="set some params.")
+    arg_parser.add_argument("device", type=str, help="the device to use")
+    arg_parser.add_argument("job_name", type=str, help="the name of the job")
+    cli = arg_parser.parse_args()
+    device, job_name = cli.device, cli.job_name
+
+    device_config = parse_device_task(device)
+    print(device_config)
+
+    logging.info(f"device: {device}, job_name: {job_name}")
+    run_perf_task(device_config)
diff --git a/dipu/scripts/ci/test_perf_config.json b/dipu/scripts/ci/test_perf_config.json
new file mode 100644
index 000000000..3ec72fc91
--- /dev/null
+++ b/dipu/scripts/ci/test_perf_config.json
@@ -0,0 +1,20 @@
+{
+    "cuda": {
+        "name": "cuda",
+        "tasks": [
+            {
+                "name": "llama2_7B",
+                "partition": "pat_rd",
+                "gpu_requests": "gpu:8",
+                "script": "run_llama_finetune_perf.py",
+                "filter": "'train_runtime': \\d+(\\.\\d+)?[,]",
+                "relative_workdir": "./alpaca-lora",
+                "op_args": "--base_model /mnt/lustrenew/share_data/PAT/datasets/llama2/7B/"
+            }   
+        ]
+    },
+    "sco": {
+    },
+    "camb": {
+    }
+}