Add scripts to create benchmark in CI. (#896)
* Add scripts and config files for the perf test.

* Add CI-related files that decouple this CI pipeline from one_iter.

* Fix Python black formatting.
Wrench-Git authored Aug 1, 2024
1 parent fb62c85 commit 1bffe33
Showing 3 changed files with 406 additions and 0 deletions.
204 changes: 204 additions & 0 deletions dipu/scripts/ci/ci_benchmark.sh
@@ -0,0 +1,204 @@
#!/bin/bash

# Clone repo_name at branch_name (a branch, tag, or commit). Reuse an existing
# checkout if it already points at that revision; otherwise switch or re-clone.
function check_and_clone_repository() {
    repo_name=$1
    branch_name=$2
    current_path=$(pwd)
    repo_path="$current_path/$repo_name"
    if [ "$repo_name" == "mmcv" ] || [ "$repo_name" == "mmengine" ]; then
        clone_url="https://github.com/open-mmlab/$repo_name.git"
    else
        clone_url="https://github.com/DeepLink-org/$repo_name.git"
    fi
    if [ -d "$repo_path" ]; then
        cd "$repo_name"
        # Concatenate branch name, commit hash, and tag so the substring match
        # below works whichever form branch_name takes.
        current_branch=$(git rev-parse --abbrev-ref HEAD)_$(git rev-parse HEAD)_$(git describe --tags 2>/dev/null || echo "none")
        if [[ "$current_branch" =~ "$branch_name" ]]; then
            echo "$repo_name is already at $branch_name"
            cd ..
        else
            git checkout main && git pull && git checkout "$branch_name"
            cd ..
        fi
    else
        cd "$current_path" && rm -rf "$repo_name"
        # "git clone -b" only accepts branches and tags; fall back to a plain
        # clone plus checkout when branch_name is a commit hash.
        git clone -b "${branch_name}" "${clone_url}" || (git clone "${clone_url}" && cd "$repo_name" && git checkout "${branch_name}" && cd ..)
    fi
}
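
# Example (hypothetical invocation): pin DeepLinkExt to a fixed commit; the
# fallback clone-then-checkout path above handles commit hashes, which
# "git clone -b" cannot fetch directly:
#
#   check_and_clone_repository "DeepLinkExt" 2a47138de420a0147e8de70685e628d3732135d7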

function clone_needed_repo() {
    set -e
    # Pinned revisions for the benchmark dependencies.
    SMART_VERSION=dev_for_mmcv2.0
    TRANSFORMERS=main
    LIGHTLLM=main
    DEEPLINKEXT=2a47138de420a0147e8de70685e628d3732135d7
    ALPACALORA=sco_benchmark_finetune

    check_and_clone_repository "SMART" ${SMART_VERSION}
    check_and_clone_repository "transformers" ${TRANSFORMERS}
    check_and_clone_repository "lightllm" ${LIGHTLLM}
    check_and_clone_repository "DeepLinkExt" ${DEEPLINKEXT}
    check_and_clone_repository "alpaca-lora" ${ALPACALORA}
    cd ..
}

function build_needed_repo_cuda() {
    cd mmcv
    MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i
    cd ..
    cd DeepLinkExt
    python setup.py build_ext -i
    cd ..
    cd alpaca-lora
    pip install -r requirements.txt
    cd ..
}

function build_needed_repo_camb() {
    cd mmcv
    MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i
    cd ..
}

function build_needed_repo_ascend() {
    cd mmcv
    MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i
    cd ..
}

function build_needed_repo_kunlunxin() {
    echo "skip"
}


# Must be sourced (not executed) so the exported variables persist in the
# caller's shell. Only cuda currently sets paths; other devices are a no-op.
function export_repo_pythonpath() {
    basic_path="$2"
    if [ "$1" = "cuda" ]; then
        echo "Executing CUDA operation in pythonpath..."
        export PYTHONPATH=${basic_path}:$PYTHONPATH
        export PYTHONPATH=${basic_path}/transformers/src:$PYTHONPATH
        export PYTHONPATH=${basic_path}/lightllm:$PYTHONPATH

        # Set the cache locations for the transformers repository.
        export HF_HOME=${basic_path}/huggingface
        export HUGGINGFACE_HUB_CACHE=/mnt/lustre/share_data/PAT/datasets/hub

        export PYTHONPATH=${basic_path}/mmcv:$PYTHONPATH
        export PYTHONPATH=${basic_path}/SMART/tools/one_iter_tool/one_iter:$PYTHONPATH
        echo "python path: $PYTHONPATH"
    fi
}


function build_dataset() {
    # Link datasets and pretrained checkpoints for the chosen device.
    if [ "$1" = "cuda" ]; then
        echo "Executing CUDA operation in build dataset..."
        rm -rf data
        mkdir data
        ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
        ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017 data/coco
        ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
        ln -s /mnt/lustre/share_data/PAT/datasets/Kinetics400 data/kinetics400
        ln -s /mnt/lustre/share_data/PAT/datasets/icdar2015 data/icdar2015
        ln -s /mnt/lustre/share_data/PAT/datasets/mjsynth data/mjsynth
        ln -s /mnt/lustre/share_data/PAT/datasets/kitti data/kitti
        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
        ln -s /mnt/lustre/share_data/PAT/datasets/stable-diffusion-v1-5 data/stable-diffusion-v1-5
        ln -s /mnt/lustre/share_data/PAT/datasets/llama_1B_oneiter data/llama_1B_oneiter

    elif [ "$1" = "camb" ]; then
        echo "Executing CAMB operation in build dataset..."
        rm -rf data
        mkdir data
        ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
        ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017 data/coco
        ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet3d/mmdet3d_kitti data/kitti
        ln -s /mnt/lustre/share_data/PAT/datasets/mmaction/Kinetics400 data/kinetics400
        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/icdar2015 data/icdar2015
        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/mjsynth data/mjsynth
        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
        ln -s /mnt/lustre/share_data/PAT/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth

    elif [ "$1" = "ascend" ]; then
        echo "Executing ASCEND operation in build dataset..."
        rm -rf data
        mkdir data
        ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
        ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017 data/coco
        ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
        ln -s /mnt/lustre/share_data/PAT/datasets/kitti data/kitti
        ln -s /mnt/lustre/share_data/PAT/datasets/mmaction/Kinetics400 data/kinetics400
        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/icdar2015 data/icdar2015
        ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/mjsynth data/mjsynth
        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
        ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
        ln -s /mnt/lustre/share_data/PAT/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
        ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth
    elif [ "$1" = "ascend910b" ]; then
        echo "Executing ASCEND910B operation in build dataset..."
        rm -rf data
        mkdir data
        ln -s /mnt/cache/share/datasets/Imagenet data/imagenet
        ln -s /mnt/cache/share/datasets/mscoco2017 data/coco
        ln -s /mnt/cache/share/datasets/mmseg/cityscapes data/cityscapes
        ln -s /mnt/cache/share/datasets/kitti data/kitti
        ln -s /mnt/cache/share/datasets/mmaction/Kinetics400 data/kinetics400
        ln -s /mnt/cache/share/datasets/mmocr/icdar2015 data/icdar2015
        ln -s /mnt/cache/share/datasets/mmocr/mjsynth data/mjsynth
        ln -s /mnt/cache/share/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
        ln -s /mnt/cache/share/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
        ln -s /mnt/cache/share/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
        ln -s /mnt/cache/share/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
        ln -s /mnt/cache/share/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
        ln -s /mnt/cache/share/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth
    elif [ "$1" = "kunlunxin" ]; then
        echo "Executing KUNLUNXIN operation in build dataset..."
        rm -rf data
        mkdir data
        ln -s /mnt/cache/share/datasets/imagenet data/imagenet

    else
        echo "Invalid parameter. Please specify 'cuda', 'camb', 'ascend', 'ascend910b', or 'kunlunxin'."
        exit 1
    fi
}


case $1 in
    clone)
        clone_needed_repo;;
    build_cuda)
        build_needed_repo_cuda
        build_dataset cuda;;
    build_camb)
        build_needed_repo_camb
        build_dataset camb;;
    build_ascend)
        build_needed_repo_ascend
        build_dataset ascend;;
    build_ascend910b)
        build_needed_repo_ascend
        build_dataset ascend910b;;
    build_kunlunxin)
        build_needed_repo_kunlunxin
        build_dataset kunlunxin;;
    export_pythonpath_camb)
        export_repo_pythonpath camb $2;;
    export_pythonpath_cuda)
        export_repo_pythonpath cuda $2;;
    export_pythonpath_ascend)
        export_repo_pythonpath ascend $2;;
    export_pythonpath_kunlunxin)
        export_repo_pythonpath kunlunxin $2;;
    *)
        echo -e "[ERROR] Incorrect option: $1";;
esac
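
A rough sketch of how a CI job might chain these stages (hypothetical invocation; the workflow files that call this script are not part of this commit):

    # Fetch the pinned dependencies, build them, and link datasets for CUDA.
    bash dipu/scripts/ci/ci_benchmark.sh clone
    bash dipu/scripts/ci/ci_benchmark.sh build_cuda
    # Sourced, not executed, so the PYTHONPATH exports persist for later steps.
    source dipu/scripts/ci/ci_benchmark.sh export_pythonpath_cuda $(pwd)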
182 changes: 182 additions & 0 deletions dipu/scripts/ci/ci_run_perf.py
@@ -0,0 +1,182 @@
import argparse
import json
import logging
import multiprocessing
import os
import re
import subprocess as sp
from multiprocessing import Pool

log_format = "%(asctime)s - %(levelname)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format, datefmt="%Y-%m-%d %H:%M:%S")


def run_cmd(cmd: str) -> None:
    """Run a shell command and raise if it exits non-zero."""
    cp = sp.run(cmd, shell=True, encoding="utf-8")
    if cp.returncode != 0:
        # stderr is not captured here (it goes to the log file via shell
        # redirection), so report the command and exit code only.
        raise Exception(f"Command [{cmd}] failed with return code {cp.returncode}")


def parse_device_task(device):
    # Read the per-device task list from the JSON config next to this script.
    device_config = dict()
    current_path = os.path.dirname(os.path.realpath(__file__))
    config_path = os.path.join(current_path, "test_perf_config.json")
    with open(config_path) as json_config:
        json_content = json.loads(json_config.read())
    if device in json_content:
        device_config = json_content[device]
    return device_config


def process_test_perf(log_file, clear_log, task: dict) -> None:
    # READ CONFIG
    task_name = task["name"]
    storage_path = os.path.join(os.getcwd(), "perf_data", task_name)
    partition = task["partition"]
    job_name = "trial"
    gpu_requests = task["gpu_requests"]
    relative_workdir = task["relative_workdir"]
    task_script = task["script"]
    filter_pattern = task["filter"]
    op_args = task["op_args"]

    os.environ["ONE_ITER_TOOL_STORAGE_PATH"] = storage_path
    os.environ["DIPU_FORCE_FALLBACK_OPS_LIST"] = task.get("fallback_op_list", "")

    logging.info(f"task_name = {task_name}")

    if not os.path.exists(storage_path):
        os.makedirs(storage_path)

    # GENERATE RUN COMMAND
    # `device` is a module-level global set in __main__; the fork-based worker
    # processes inherit it.
    cmd_run_test_perf = f"srun --job-name={job_name} --partition={partition} --gres={gpu_requests} python {task_script} {op_args}"
    if device == "sco":
        current_path = os.getcwd()
        parent_directory = os.path.dirname(current_path)
        cmd_run_test_perf = f"""srun --job-name={job_name} bash -c "cd {parent_directory} && source scripts/ci/ci_one_iter.sh export_pythonpath_cuda {current_path} && source /mnt/cache/share/deeplinkci/github/dipu_env && cd mmlab_pack && source environment_exported && export ONE_ITER_TOOL_STORAGE_PATH={storage_path} && python {current_path}/{task_script}" """

    print(cmd_run_test_perf)

    current_path = os.getcwd()
    os.chdir(relative_workdir)
    # Redirect as "> log 2>&1" (not "2>&1 > log") so stderr also lands in the log file.
    if clear_log:
        run_cmd(cmd_run_test_perf + f" > {current_path}/{log_file} 2>&1")
    else:
        run_cmd(cmd_run_test_perf + f" >> {current_path}/{log_file} 2>&1")
    os.chdir(current_path)

    print("MATCH_PATTERN:", filter_pattern)

    log_content = open(f"{current_path}/{log_file}").read()
    pattern = re.compile(filter_pattern)
    match_result = pattern.search(log_content)
    run_perf = 0.0

    if match_result:
        # Pull the first number out of the matched text.
        float_pattern = re.compile(r"\d+(\.\d+)?")
        run_perf = float(float_pattern.search(match_result.group(0)).group(0))
    print("RUNNING PERF:{}".format(run_perf))


def run_perf_task(device_config):
    # pool and error_flag are looked up at module scope by handle_error (the
    # pool's error callback), so they must be declared global here.
    global pool, error_flag
    error_flag = multiprocessing.Value("i", 0)  # set to 1 if any subprocess fails

    device = device_config["name"]

    logging.info("we use {}!".format(device))
    logging.info(f"main process ids (pid, ppid): {os.getpid()} {os.getppid()}")
    logging.info(f"python path: {os.environ.get('PYTHONPATH', None)}")

    os.environ["DIPU_DUMP_OP_ARGS"] = "0"
    os.environ["DIPU_DEBUG_ALLOCATOR"] = "0"
    os.environ["ONE_ITER_TOOL_DEVICE"] = "dipu"

    # Write the DIPU/ONE_ITER environment variables to a file that the srun
    # command re-sources inside the job. The file is sourced from the current
    # working directory, so resolve it against cwd rather than the script
    # directory.
    env_file_path = os.path.join(os.getcwd(), "environment_exported")
    env_variables = os.environ
    keywords_to_filter = ["DIPU", "ONE_ITER"]
    if os.path.exists(env_file_path):
        os.remove(env_file_path)

    with open(env_file_path, "w") as file:
        file.write("pwd\n")
        for key, value in env_variables.items():
            if any(keyword in key for keyword in keywords_to_filter):
                file.write(f'export {key}="{value}"\n')

    tasks = device_config["tasks"]
    logging.info(f"tasks nums: {len(tasks)}")

    if not os.path.exists("perf_data"):
        os.mkdir("perf_data")

    pool = Pool(max_parall)
    log_files = []
    try:
        for i, task in enumerate(tasks):
            log_file = f"child_{i % max_parall}_log.txt"
            log_files.append(log_file)
            pool.apply_async(
                process_test_perf,
                args=(
                    log_file,
                    True,
                    task,
                ),
                error_callback=handle_error,
            )
        logging.info("Waiting for all subprocesses done...")
        pool.close()
        pool.join()
        for log_file in log_files:
            print_file(log_file)
        if error_flag.value != 0:
            exit(1)
        logging.info("All subprocesses done.")
    except Exception as e:
        logging.error(e)
        exit(1)


def handle_error(error: str) -> None:
    # Runs in the parent process; pool and error_flag are the module-level
    # globals assigned in run_perf_task.
    logging.error(f"Error: {error}")
    if pool is not None:
        logging.error("Kill all!")
        pool.terminate()
    error_flag.value = 1


def print_file(file_name):
    with open(file_name) as f:
        lines = f.read()
        logging.info(lines)


if __name__ == "__main__":
    # set some params
    max_parall = 8
    parser = argparse.ArgumentParser(description="set some params.")
    parser.add_argument("device", type=str, help="the device to use")
    parser.add_argument("job_name", type=str, help="the name of the job")
    args = parser.parse_args()

    device = args.device
    job_name = args.job_name

    device_config = parse_device_task(device)
    print(device_config)

    logging.info(f"device: {device}, job_name: {job_name}")
    run_perf_task(device_config)
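
For reference, a minimal sketch of what dipu/scripts/ci/test_perf_config.json might contain, inferred from the keys this script reads in parse_device_task and process_test_perf. The actual config file is not shown in this commit view, and every value below is a placeholder:

    {
      "cuda": {
        "name": "cuda",
        "tasks": [
          {
            "name": "example_task",
            "partition": "example_partition",
            "gpu_requests": "gpu:1",
            "relative_workdir": ".",
            "script": "example_perf_test.py",
            "op_args": "",
            "filter": "throughput: \\d+(\\.\\d+)?",
            "fallback_op_list": ""
          }
        ]
      }
    }

The "filter" regex is matched against the task's log output, and the first number inside the match is reported as the running perf. With such a config in place, the script would be invoked as, e.g.:

    python dipu/scripts/ci/ci_run_perf.py cuda trial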