Add scripts to create benchmarks in CI #896

Merged: 5 commits, Aug 1, 2024
204 changes: 204 additions & 0 deletions dipu/scripts/ci/ci_benchmark.sh
@@ -0,0 +1,204 @@
#!/bin/bash

function check_and_clone_repository() {
repo_name=$1
branch_name=$2
current_path=$(pwd)
repo_path="$current_path/$repo_name"
if [ "$repo_name" == "mmcv" ] || [ "$repo_name" == "mmengine" ]; then
clone_url="https://github.com/open-mmlab/$repo_name.git"
else
clone_url="https://github.com/DeepLink-org/$repo_name.git"
fi
if [ -d "$repo_path" ]; then
cd $repo_name
# Combine branch name, commit hash and tag so that branch_name can match any of them.
current_ref=$(git rev-parse --abbrev-ref HEAD)_$(git rev-parse HEAD)_$(git describe --tags 2>/dev/null || echo "none")
if [[ "$current_ref" =~ "$branch_name" ]]; then
echo "$repo_name is already on $branch_name"
cd ..
else
git checkout main && git pull && git checkout "$branch_name"
cd ..
fi
else
cd "$current_path" && rm -rf "$repo_name"
# Clone the requested branch directly; fall back to a full clone plus checkout
# when branch_name is a commit hash rather than a branch.
git clone -b "${branch_name}" "${clone_url}" || (git clone "${clone_url}" && cd "$repo_name" && git checkout "${branch_name}" && cd ..)
fi
}

function clone_needed_repo() {
set -e
# clone some repositories
SMART_VERSION=dev_for_mmcv2.0
TRANSFORMERS=main
LIGHTLLM=main
DEEPLINKEXT=2a47138de420a0147e8de70685e628d3732135d7
ALPACALORA=sco_benchmark_finetune

check_and_clone_repository "SMART" ${SMART_VERSION}
check_and_clone_repository "transformers" ${TRANSFORMERS}
check_and_clone_repository "lightllm" ${LIGHTLLM}
check_and_clone_repository "DeepLinkExt" ${DEEPLINKEXT}
check_and_clone_repository "alpaca-lora" ${ALPACALORA}
cd ..
}

function build_needed_repo_cuda() {
cd mmcv
MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i
cd ..
cd DeepLinkExt
python setup.py build_ext -i
cd ..
cd alpaca-lora
pip install -r requirements.txt
cd ..
}

function build_needed_repo_camb() {
cd mmcv
MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i
cd ..
}

function build_needed_repo_ascend() {
cd mmcv
MMCV_WITH_DIOPI=1 MMCV_WITH_OPS=1 python setup.py build_ext -i
cd ..
}

function build_needed_repo_kunlunxin() {
echo "skip"
}


function export_repo_pythonpath(){
basic_path="$2"
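# Note: only the cuda branch sets PYTHONPATH at the moment; the camb, ascend
# and kunlunxin case arms below also call this function, but it is a no-op for them.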
if [ "$1" = "cuda" ]; then
echo "Executing CUDA operation in pythonpath..."
export PYTHONPATH=${basic_path}:$PYTHONPATH
export PYTHONPATH=${basic_path}/transformers/src:$PYTHONPATH
export PYTHONPATH=${basic_path}/lightllm:$PYTHONPATH

# set the environment variable for the transformers repository
export HF_HOME=${basic_path}/huggingface
export HUGGINGFACE_HUB_CACHE=/mnt/lustre/share_data/PAT/datasets/hub

export PYTHONPATH=${basic_path}/mmcv:$PYTHONPATH
export PYTHONPATH=${basic_path}/SMART/tools/one_iter_tool/one_iter:$PYTHONPATH
echo "python path: $PYTHONPATH"
fi
}


function build_dataset(){
# link dataset
if [ "$1" = "cuda" ]; then
echo "Executing CUDA operation in build dataset..."
rm -rf data
mkdir data
ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017 data/coco
ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
ln -s /mnt/lustre/share_data/PAT/datasets/Kinetics400 data/kinetics400
ln -s /mnt/lustre/share_data/PAT/datasets/icdar2015 data/icdar2015
ln -s /mnt/lustre/share_data/PAT/datasets/mjsynth data/mjsynth
ln -s /mnt/lustre/share_data/PAT/datasets/kitti data/kitti
ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
ln -s /mnt/lustre/share_data/PAT/datasets/stable-diffusion-v1-5 data/stable-diffusion-v1-5
ln -s /mnt/lustre/share_data/PAT/datasets/llama_1B_oneiter data/llama_1B_oneiter

elif [ "$1" = "camb" ]; then
echo "Executing CAMB operation in build dataset..."
rm -rf data
mkdir data
ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017 data/coco
ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
ln -s /mnt/lustre/share_data/PAT/datasets/mmdet3d/mmdet3d_kitti data/kitti
ln -s /mnt/lustre/share_data/PAT/datasets/mmaction/Kinetics400 data/kinetics400
ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/icdar2015 data/icdar2015
ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/mjsynth data/mjsynth
ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
ln -s /mnt/lustre/share_data/PAT/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth

elif [ "$1" = "ascend" ]; then
echo "Executing ASCEND operation in build dataset..."
rm -rf data
mkdir data
ln -s /mnt/lustre/share_data/PAT/datasets/Imagenet data/imagenet
ln -s /mnt/lustre/share_data/PAT/datasets/mscoco2017 data/coco
ln -s /mnt/lustre/share_data/PAT/datasets/mmseg/cityscapes data/cityscapes
ln -s /mnt/lustre/share_data/PAT/datasets/kitti data/kitti
ln -s /mnt/lustre/share_data/PAT/datasets/mmaction/Kinetics400 data/kinetics400
ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/icdar2015 data/icdar2015
ln -s /mnt/lustre/share_data/PAT/datasets/mmocr/mjsynth data/mjsynth
ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
ln -s /mnt/lustre/share_data/PAT/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
ln -s /mnt/lustre/share_data/PAT/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
ln -s /mnt/lustre/share_data/PAT/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth
elif [ "$1" = "ascend910b" ]; then
echo "Executing ASCEND operation in build dataset..."
rm -rf data
mkdir data
ln -s /mnt/cache/share/datasets/Imagenet data/imagenet
ln -s /mnt/cache/share/datasets/mscoco2017 data/coco
ln -s /mnt/cache/share/datasets/mmseg/cityscapes data/cityscapes
ln -s /mnt/cache/share/datasets/kitti data/kitti
ln -s /mnt/cache/share/datasets/mmaction/Kinetics400 data/kinetics400
ln -s /mnt/cache/share/datasets/mmocr/icdar2015 data/icdar2015
ln -s /mnt/cache/share/datasets/mmocr/mjsynth data/mjsynth
ln -s /mnt/cache/share/datasets/mmdet/checkpoint/swin_large_patch4_window12_384_22k.pth data/swin_large_patch4_window12_384_22k.pth
ln -s /mnt/cache/share/datasets/pretrain/torchvision/resnet50-0676ba61.pth data/resnet50-0676ba61.pth
ln -s /mnt/cache/share/datasets/mmdet/pretrain/vgg16_caffe-292e1171.pth data/vgg16_caffe-292e1171.pth
ln -s /mnt/cache/share/datasets/mmdet/pretrain/darknet53-a628ea1b.pth data/darknet53-a628ea1b.pth
ln -s /mnt/cache/share/datasets/mmpose/pretrain/hrnet_w32-36af842e.pth data/hrnet_w32-36af842e.pth
ln -s /mnt/cache/share/datasets/pretrain/mmcv/resnet50_v1c-2cccc1ad.pth data/resnet50_v1c-2cccc1ad.pth
elif [ "$1" = "kunlunxin" ]; then
echo "Executing KUNLUNXIN operation in build dataset..."
rm -rf data
mkdir data
ln -s /mnt/cache/share/datasets/imagenet data/imagenet

else
echo "Invalid parameter. Please specify 'cuda' 'camb' 'ascend' or 'kunlunxin'."
exit 1
fi
}


case $1 in
clone)
clone_needed_repo;;
build_cuda)
build_needed_repo_cuda
build_dataset cuda;;
build_camb)
build_needed_repo_camb
build_dataset camb;;
build_ascend)
build_needed_repo_ascend
build_dataset ascend;;
build_ascend910b)
build_needed_repo_ascend
build_dataset ascend910b;;
build_kunlunxin)
build_needed_repo_kunlunxin
build_dataset kunlunxin;;
export_pythonpath_camb)
export_repo_pythonpath camb $2;;
export_pythonpath_cuda)
export_repo_pythonpath cuda $2;;
export_pythonpath_ascend)
export_repo_pythonpath ascend $2;;
export_pythonpath_kunlunxin)
export_repo_pythonpath kunlunxin $2;;
*)
echo "[ERROR] Incorrect option: $1";
esac
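
For reference, the script dispatches on its first positional argument: `bash ci_benchmark.sh clone` fetches the dependent repositories, the `build_*` variants (`build_cuda`, `build_camb`, `build_ascend`, `build_ascend910b`, `build_kunlunxin`) build them and link the datasets, and the `export_pythonpath_*` variants take the working base path as a second argument and are meant to be sourced so the exports persist in the caller's shell, e.g. `source ci_benchmark.sh export_pythonpath_cuda /path/to/workdir` (the path here is an illustrative placeholder).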
178 changes: 178 additions & 0 deletions dipu/scripts/ci/ci_run_perf.py
@@ -0,0 +1,178 @@
import argparse
import json
import logging
import multiprocessing
import os
import re
import subprocess as sp
from multiprocessing import Pool

log_format = "%(asctime)s - %(levelname)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format, datefmt="%Y-%m-%d %H:%M:%S")


def run_cmd(cmd: str) -> None:
    cp = sp.run(cmd, shell=True, encoding="utf-8")
    if cp.returncode != 0:
        # stderr is not captured by sp.run here (cp.stderr would be None),
        # so report the return code instead.
        error = f"Command [{cmd}] failed with return code {cp.returncode}"
        raise Exception(error)


def parse_device_task(device):
    # Read the per-device config from the JSON file next to this script.
    device_config = dict()
    current_path = os.path.dirname(os.path.realpath(__file__))
    config_path = os.path.join(current_path, "test_perf_config.json")
    with open(config_path) as json_config:
        json_content = json.load(json_config)
    if device in json_content:
        device_config = json_content[device]
    return device_config
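
# For reference, parse_device_task implies a test_perf_config.json shaped
# roughly like the sketch below: top-level keys are device names, and the
# per-task fields are the ones consumed by process_test_perf further down.
# All values are illustrative placeholders, not the real CI configuration.
#
# {
#     "cuda": {
#         "name": "cuda",
#         "tasks": [
#             {
#                 "name": "example_infer_task",
#                 "partition": "some_partition",
#                 "gpu_requests": "gpu:1",
#                 "relative_workdir": "mmlab_pack",
#                 "script": "path/to/test_script.py",
#                 "op_args": "",
#                 "filter": "throughput: \\d+(\\.\\d+)?",
#                 "fallback_op_list": ""
#             }
#         ]
#     }
# }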

def process_test_perf(log_file, clear_log, task: dict) -> None:
# READ CONFIG

task_name = task["name"]
storage_path = os.getcwd() + "/perf_data/" + task_name
partition = task["partition"]
job_name = "trial"
gpu_requests = task["gpu_requests"]
relative_workdir = task["relative_workdir"]
task_script = task["script"]
filter_pattern = task["filter"]
op_args = task["op_args"]

os.environ["ONE_ITER_TOOL_STORAGE_PATH"] = storage_path
os.environ["DIPU_FORCE_FALLBACK_OPS_LIST"] = task["fallback_op_list"] if "fallback_op_list" in task else ""

logging.info(f"task_name = {task_name}")

if not os.path.exists(storage_path):
os.makedirs(storage_path)

    # Generate the run command. Note that "device" below is the module-level
    # variable parsed from the CLI in __main__.
    cmd_run_test_perf = f"srun --job-name={job_name} --partition={partition} --gres={gpu_requests} python {task_script} {op_args}"
    if device == "sco":
current_path = os.getcwd()
parent_directory = os.path.dirname(current_path)
cmd_run_test_perf = f"""srun --job-name={job_name} bash -c "cd {parent_directory} && source scripts/ci/ci_one_iter.sh export_pythonpath_cuda {current_path} && source /mnt/cache/share/deeplinkci/github/dipu_env && cd mmlab_pack && source environment_exported && export ONE_ITER_TOOL_STORAGE_PATH={storage_path} && python {current_path}/{task_script}" """

print(cmd_run_test_perf)

current_path = os.getcwd()
os.chdir(relative_workdir)
    if clear_log:
        # Redirect stdout to the log first, then send stderr to the same place;
        # the order "2>&1 > file" would leave stderr on the terminal.
        run_cmd(cmd_run_test_perf + f" > {current_path}/{log_file} 2>&1")
    else:
        run_cmd(cmd_run_test_perf + f" >> {current_path}/{log_file} 2>&1")
os.chdir(current_path)

print("MATCH_PATTERN:",filter_pattern)
import re
log_content = open(f"{current_path}/{log_file}").read()
pattern = re.compile(filter_pattern)
match_result = pattern.search(log_content)
run_perf = 0.0

    if match_result:
        match_result = match_result.group(0)
        # Raw string avoids an invalid escape sequence warning.
        float_pattern = re.compile(r"\d+(\.\d+)?")
        run_perf = float(float_pattern.search(match_result).group(0))
        print(f"RUNNING PERF: {run_perf}")

def run_perf_task(device_config):
    # pool and error_flag are made module-level globals so that handle_error
    # (the Pool error callback) can reach them.
    global pool, error_flag
    error_flag = multiprocessing.Value("i", 0)  # set to 1 when any task fails

device = device_config["name"]

logging.info("we use {}!".format(device))
logging.info(f"main process id (ppid): {os.getpid()} {os.getppid()}")
logging.info(f"python path: {os.environ.get('PYTHONPATH', None)}")

os.environ["DIPU_DUMP_OP_ARGS"] = "0"
os.environ["DIPU_DEBUG_ALLOCATOR"] = "0"
os.environ["ONE_ITER_TOOL_DEVICE"] = "dipu"

    # environment_exported is written to (and later sourced from) the current
    # working directory, so look for a stale copy there as well.
    env_file_path = "environment_exported"
    env_variables = os.environ
    keywords_to_filter = ["DIPU", "ONE_ITER"]
    if os.path.exists(env_file_path):
        os.remove(env_file_path)

    with open(env_file_path, "w") as file:
file.write("pwd\n")
for key, value in env_variables.items():
if any(keyword in key for keyword in keywords_to_filter):
file.write(f'export {key}="{value}"\n')

tasks = device_config["tasks"]
logging.info(f"tasks nums: {len(tasks)}")

if not os.path.exists("perf_data"):
os.mkdir("perf_data")

pool = Pool(max_parall)
log_files = []
try:
for i in range(len(tasks)):
task = tasks[i]
log_file = f"child_{i % max_parall}_log.txt"
log_files.append(log_file)
pool.apply_async(
process_test_perf,
args=(
log_file,
True,
task,
),
error_callback=handle_error,
)
logging.info("Waiting for all subprocesses done...")
pool.close()
pool.join()
for log_file in log_files:
print_file(log_file)
if error_flag.value != 0:
exit(1)
logging.info("All subprocesses done.")
except Exception as e:
logging.error(e)
exit(1)

def handle_error(error: BaseException) -> None:
    # Pool error callback: log the failure, stop the remaining workers, and
    # mark the whole run as failed.
    logging.error(f"Error: {error}")
    if pool is not None:
        logging.error("Kill all!")
        pool.terminate()
    error_flag.value = 1


def print_file(file_name):
with open(file_name) as f:
lines = f.read()
logging.info(lines)


if __name__ == "__main__":
# set some params
max_parall = 8
parser = argparse.ArgumentParser(description="set some params.")
parser.add_argument("device", type=str, help="the device to use")
parser.add_argument("job_name", type=str, help="the name of the job")
args = parser.parse_args()

device = args.device
job_name = args.job_name

device_config = parse_device_task(device)
print(device_config)

logging.info(
f"device: {device}, job_name: {job_name}"
)
run_perf_task(device_config)
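
For reference, the runner is invoked as `python ci_run_perf.py <device> <job_name>`, where `<device>` must match a top-level key in `test_perf_config.json`, e.g. `python ci_run_perf.py cuda perf-check` (the job name is an illustrative placeholder; it is only logged).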