From b6296868c3d491737c4d2b3dcfe04742b5fea443 Mon Sep 17 00:00:00 2001
From: Ayush Sawant
Date: Mon, 4 Dec 2023 14:24:40 +0530
Subject: [PATCH] Download support for custom Huggingface models (#40)

* Download custom Huggingface models and progress bars

* changes default repo_version to None in pytest
---
 .github/workflows/lint.yaml      |  2 +-
 llm/download.py                  | 44 ++++++++++++++---
 llm/kubeflow_inference_run.py    | 20 ++++++--
 llm/run.sh                       |  2 +-
 llm/tests/test_download.py       | 73 +++++++++++++++++++++++++++-
 llm/utils/generate_data_model.py |  3 ++
 llm/utils/hf_utils.py            | 25 +++++++---
 llm/utils/marsgen.py             | 83 ++++++++++++++++++++++++++++----
 llm/utils/system_utils.py        | 26 +++++++++-
 llm/utils/tsutils.py             |  2 +-
 10 files changed, 245 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
index cb5a016..f3aab36 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
@@ -25,7 +25,7 @@ jobs:
         python-version: 3.11
 
     - name: Install Python dependencies
-      run: pip install pytest black pylint torchserve==0.8.2 torch==2.0.1 transformers==4.33.0 -r llm/requirements.txt
+      run: pip install --no-cache-dir pytest black pylint torchserve==0.8.2 torch==2.0.1 transformers==4.33.0 -r llm/requirements.txt
 
     - name: Run pylint
       run: pylint ./llm
diff --git a/llm/download.py b/llm/download.py
index a103b74..4dab97b 100644
--- a/llm/download.py
+++ b/llm/download.py
@@ -199,7 +199,7 @@ class with relevant information.
 
         hf.hf_token_check(gen_model.repo_info.repo_id, gen_model.repo_info.hf_token)
 
-        if gen_model.repo_info.repo_version == "":
+        if not gen_model.repo_info.repo_version:
             gen_model.repo_info.repo_version = model["repo_version"]
 
         gen_model.repo_info.repo_version = hf.get_repo_commit_id(
@@ -225,15 +225,36 @@ class with relevant information.
             os.path.dirname(__file__),
             HANDLER,
         )
-        if gen_model.repo_info.repo_version == "":
+        if not gen_model.repo_info.repo_version:
             gen_model.repo_info.repo_version = "1.0"
+        elif gen_model.repo_info.repo_id:
+            hf.hf_token_check(gen_model.repo_info.repo_id, gen_model.repo_info.hf_token)
+            gen_model.repo_info.repo_version = hf.get_repo_commit_id(
+                repo_id=gen_model.repo_info.repo_id,
+                revision=gen_model.repo_info.repo_version,
+                token=gen_model.repo_info.hf_token,
+            )
+        gen_model.is_custom = True
+        if gen_model.mar_utils.handler_path == "":
+            gen_model.mar_utils.handler_path = os.path.join(
+                os.path.dirname(__file__),
+                HANDLER,
+            )
     else:
         print(
-            "## Please check your model name, it should be one of the following : "
+            "## If you want to create a model archive file with the supported models, "
+            "make sure your model name is present in the list below: "
         )
         print(list(models.keys()))
         print(
-            "If it is a custom model and you have model files include no_download flag : "
+            "If you want to create a model archive file for a custom model, there "
+            "are two methods:\n"
+            "1. If you have already downloaded the custom model files, include "
+            "the --no_download flag and provide the model_path directory which contains "
+            "the model files.\n"
+            "2. If you need to download the model files, provide the HuggingFace "
+            "repository ID along with a model_path directory where the model "
+            "files are to be downloaded."
         )
         sys.exit(1)
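The two hunks above split `download.py` into three entry paths: a supported model listed in `model_config.json`, a custom model downloaded from HuggingFace when `--repo_id` is given, and a custom model whose files are already on disk (`--no_download`). A minimal sketch of that dispatch, assuming the names used in this patch (simplified, not the verbatim implementation):

```python
def resolve_repo_version(gen_model, models, hf):
    """Sketch of how repo_version is resolved for each of the three paths."""
    if gen_model.model_name in models:
        # Supported model: fall back to the configured repo_version.
        if not gen_model.repo_info.repo_version:
            gen_model.repo_info.repo_version = models[gen_model.model_name]["repo_version"]
    elif gen_model.repo_info.repo_id:
        # Custom model to download: validate token access, then pin the revision.
        hf.hf_token_check(gen_model.repo_info.repo_id, gen_model.repo_info.hf_token)
        gen_model.repo_info.repo_version = hf.get_repo_commit_id(
            repo_id=gen_model.repo_info.repo_id,
            revision=gen_model.repo_info.repo_version,
            token=gen_model.repo_info.hf_token,
        )
        gen_model.is_custom = True
    else:
        # Custom model with local files only: default the version, skip HuggingFace.
        gen_model.repo_info.repo_version = gen_model.repo_info.repo_version or "1.0"
        gen_model.is_custom = True
```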
@@ -295,13 +316,13 @@ class with relevant information.
     else:
         if check_if_folder_empty(gen_model.mar_utils.model_path):
             print(
-                f"\n##Error: {gen_model.model_name} model files not found"
-                f" in the provided path: {gen_model.mar_utils.model_path}"
+                f"\n## Error: {gen_model.model_name} model files for the custom"
+                f" model not found in the provided path: {gen_model.mar_utils.model_path}"
             )
             sys.exit(1)
         else:
             print(
-                f"\n## Generating MAR file for custom model files: {gen_model.model_name}"
+                f"\n## Generating MAR file for custom model files: {gen_model.model_name}\n"
             )
 
     create_folder_if_not_exists(gen_model.mar_utils.mar_output)
@@ -348,6 +369,13 @@ def run_script(params: argparse.Namespace) -> bool:
         metavar="mn",
         help="name of the model",
     )
+    parser.add_argument(
+        "--repo_id",
+        type=str,
+        default="",
+        metavar="ri",
+        help="HuggingFace repository ID (in case of a custom model download)",
+    )
     parser.add_argument(
         "--no_download", action="store_false", help="flag to not download"
     )
@@ -376,7 +404,7 @@ def run_script(params: argparse.Namespace) -> bool:
     parser.add_argument(
         "--repo_version",
         type=str,
-        default="",
+        default=None,
         metavar="rv",
         help="commit id of the HuggingFace Repo",
     )
diff --git a/llm/kubeflow_inference_run.py b/llm/kubeflow_inference_run.py
index 83ce6cf..85e5be0 100644
--- a/llm/kubeflow_inference_run.py
+++ b/llm/kubeflow_inference_run.py
@@ -8,6 +8,7 @@
 import os
 import time
 from typing import List, Dict
+import tqdm
 import utils.tsutils as ts
 import utils.hf_utils as hf
 from utils.system_utils import check_if_path_exists, get_all_files_in_directory
@@ -300,8 +301,15 @@ def health_check(model_name: str, deploy_name: str, model_timeout: int) -> None:
     model_input = os.path.join(os.path.dirname(__file__), PATH_TO_SAMPLE)
 
     retry_count = 0
-    sleep_time = 30
+    sleep_time = 15
     success = False
+    total_tries = model_timeout / sleep_time
+    progress_bar = tqdm.tqdm(
+        total=total_tries,
+        unit="check",
+        desc="Waiting for Model to be ready",
+        bar_format="{desc}: |{bar}| {n_fmt}/{total_fmt} checks",
+    )
     while not success and retry_count * sleep_time < model_timeout:
         success = execute_inference_on_inputs(
             [model_input], model_name, deploy_name, retry=True
@@ -310,12 +318,16 @@ def health_check(model_name: str, deploy_name: str, model_timeout: int) -> None:
         if not success:
             time.sleep(sleep_time)
             retry_count += 1
+            progress_bar.update(1)
 
     if success:
+        progress_bar.update(total_tries - retry_count)
+        progress_bar.close()
-        print("## Health check passed. Model deployed.\n\n")
+        print("\n## Health check passed. Model deployed.\n")
     else:
+        progress_bar.close()
         print(
-            f"## Failed health check after multiple retries for model - {model_name} \n"
+            f"\n## Failed health check after multiple retries for model - {model_name}\n"
         )
         sys.exit(1)
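`health_check` now renders its polling loop as a tqdm bar: one tick per probe, with the remainder filled in on success. The same pattern in a self-contained sketch (the `check` callable is a stand-in for `execute_inference_on_inputs`):

```python
import time
import tqdm

def wait_until_ready(check, timeout_sec: int = 1500, sleep_sec: int = 15) -> bool:
    """Poll check() until it returns True or timeout_sec elapses,
    drawing one tick per attempt on a tqdm bar."""
    total_tries = timeout_sec // sleep_sec
    bar = tqdm.tqdm(
        total=total_tries,
        unit="check",
        desc="Waiting for Model to be ready",
        bar_format="{desc}: |{bar}| {n_fmt}/{total_fmt} checks",
    )
    try:
        for _ in range(total_tries):
            if check():
                bar.update(bar.total - bar.n)  # fill the bar on success
                return True
            time.sleep(sleep_sec)
            bar.update(1)
        return False
    finally:
        bar.close()
```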
@@ -377,7 +389,7 @@ def execute(params: argparse.Namespace) -> None:
     create_pvc(core_api, deploy_name, storage)
     create_isvc(deploy_name, model_info, deployment_resources, model_params)
 
-    print("wait for model registration to complete, will take some time")
+    print("\nWaiting for model registration to complete; this will take some time.\n")
 
     health_check(model_info["model_name"], deploy_name, model_timeout)
 
     if input_path:
diff --git a/llm/run.sh b/llm/run.sh
index 4578316..8ea15dd 100644
--- a/llm/run.sh
+++ b/llm/run.sh
@@ -4,7 +4,7 @@ wdir=$(dirname "$SCRIPT")
 
 CPU_POD="8"
 MEM_POD="32Gi"
-MODEL_TIMEOUT_IN_SEC="1200"
+MODEL_TIMEOUT_IN_SEC="1500"
 
 function helpFunction()
 {
diff --git a/llm/tests/test_download.py b/llm/tests/test_download.py
index f45be9c..4ea9a63 100644
--- a/llm/tests/test_download.py
+++ b/llm/tests/test_download.py
@@ -29,7 +29,7 @@ def set_args(
     model_name="",
     output="",
     model_path="",
-    repo_version="",
+    repo_version=None,
     handler_path="",
 ):
     """
@@ -50,6 +50,7 @@ def set_args(
     args.model_path = model_path
     args.no_download = True
     args.repo_version = repo_version
+    args.repo_id = ""
     args.handler_path = handler_path
     args.hf_token = None
     args.debug = False
@@ -250,7 +251,7 @@ def test_short_repo_version_success():
     assert result is True
 
 
-def test_custom_model_success():
+def test_custom_model_with_modelfiles_success():
     """
     This function tests the custom model case.
     This is done by clearing the 'model_config.json' and
@@ -285,8 +286,76 @@ def test_custom_model_no_model_files_failure():
     args.no_download = False
     try:
         download.run_script(args)
+    except SystemExit as e:
+        custom_model_restore()
+        assert e.code == 1
+    else:
+        assert False
+
+
+def test_custom_model_with_repo_id_success():
+    """
+    This function tests the custom model case where
+    model files are to be downloaded for the provided
+    repo ID.
+    This is done by clearing the 'model_config.json' and
+    generating the 'GPT2' MAR file.
+    Expected result: Success.
+    """
+    model_path = custom_model_setup()
+    args = set_args(MODEL_NAME, OUTPUT, model_path)
+    args.repo_id = "gpt2"
+    try:
+        result = download.run_script(args)
+        custom_model_restore()
+    except SystemExit:
+        assert False
+    else:
+        assert result is True
+
+
+def test_custom_model_wrong_repo_id_failure():
+    """
+    This function tests the custom model case when
+    the model repo ID is wrong.
+    Expected result: Failure.
+    """
+    model_path = custom_model_setup()
+    model_store_path = os.path.join(
+        os.path.dirname(__file__), MODEL_NAME, "model-store"
+    )
+    empty_folder(model_path)
+    empty_folder(model_store_path)
+    args = set_args(MODEL_NAME, OUTPUT, model_path)
+    args.repo_id = "wrong_repo_id"
+    try:
+        download.run_script(args)
+    except SystemExit as e:
+        custom_model_restore()
+        assert e.code == 1
+    else:
+        assert False
+
+
+def test_custom_model_wrong_repo_version_failure():
+    """
+    This function tests the custom model case when
+    the model repo version is wrong.
+    Expected result: Failure.
+    """
+    model_path = custom_model_setup()
+    model_store_path = os.path.join(
+        os.path.dirname(__file__), MODEL_NAME, "model-store"
+    )
+    empty_folder(model_path)
+    empty_folder(model_store_path)
+    args = set_args(MODEL_NAME, OUTPUT, model_path)
+    args.repo_id = "gpt2"
+    args.repo_version = "wrong_version"
+    try:
+        download.run_script(args)
     except SystemExit as e:
+        custom_model_restore()
         assert e.code == 1
     else:
         assert False
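The new failure tests assert the `sys.exit(1)` path with a try/except/else block. For reference, the same check written with `pytest.raises`, reusing this module's helpers (`set_args`, `custom_model_setup`, `custom_model_restore`); a sketch, not part of the patch:

```python
import pytest

def test_custom_model_wrong_repo_id_failure_alt():
    """Equivalent formulation of the wrong-repo-ID test using pytest.raises."""
    model_path = custom_model_setup()
    args = set_args(MODEL_NAME, OUTPUT, model_path)
    args.repo_id = "wrong_repo_id"
    with pytest.raises(SystemExit) as excinfo:
        download.run_script(args)
    custom_model_restore()
    assert excinfo.value.code == 1
```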
diff --git a/llm/utils/generate_data_model.py b/llm/utils/generate_data_model.py
index 7ec5fc3..3c0af1c 100644
--- a/llm/utils/generate_data_model.py
+++ b/llm/utils/generate_data_model.py
@@ -25,6 +25,8 @@ class MarUtils:
     mar_output = str()
     model_path = str()
     handler_path = str()
+    extra_files = str()
+    requirements_file = str()
 
 
 @dataclasses.dataclass
@@ -100,6 +102,7 @@ class with values set based on the arguments.
         self.mar_utils.handler_path = params.handler_path
 
+        self.repo_info.repo_id = params.repo_id
         self.repo_info.repo_version = params.repo_version
         self.repo_info.hf_token = params.hf_token
diff --git a/llm/utils/hf_utils.py b/llm/utils/hf_utils.py
index a2f6c97..6a9ca59 100644
--- a/llm/utils/hf_utils.py
+++ b/llm/utils/hf_utils.py
@@ -7,6 +7,8 @@
 from huggingface_hub.utils import (
     RepositoryNotFoundError,
     RevisionNotFoundError,
+    HfHubHTTPError,
+    HFValidationError,
 )
 from utils.generate_data_model import GenerateDataModel
 
@@ -34,11 +36,17 @@ class with relevant information.
             token=gen_model.repo_info.hf_token,
         )
         return repo_files
-    except (RepositoryNotFoundError, RevisionNotFoundError, KeyError):
+    except (
+        HfHubHTTPError,
+        HFValidationError,
+        RepositoryNotFoundError,
+        RevisionNotFoundError,
+        KeyError,
+    ):
         print(
             (
-                "## Error: Please check either repo_id, repo_version "
-                "or huggingface token is not correct"
+                "\n## Error: Please check that the repo_id, repo_version "
+                "and HuggingFace token are correct\n"
             )
         )
         sys.exit(1)
@@ -68,11 +76,16 @@ def get_repo_commit_id(repo_id: str, revision: str, token: str) -> str:
             token=token,
         )
         return commit_info[0].commit_id
-    except (RepositoryNotFoundError, RevisionNotFoundError):
+    except (
+        HfHubHTTPError,
+        HFValidationError,
+        RepositoryNotFoundError,
+        RevisionNotFoundError,
+    ):
         print(
             (
-                "## Error: Please check either repo_id, repo_version "
-                "or huggingface token is not correct"
+                "\n## Error: Please check that the repo_id, repo_version "
+                "and HuggingFace token are correct\n"
             )
         )
         sys.exit(1)
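Both helpers now also catch `HFValidationError` (malformed repo IDs) and `HfHubHTTPError` (other Hub HTTP failures) alongside the missing-repo and missing-revision errors. For reference, a standalone sketch of the underlying `huggingface_hub` call that `get_repo_commit_id` wraps, resolving a floating branch or tag to an immutable commit id (the `pin_revision` name is invented for illustration):

```python
from huggingface_hub import HfApi
from huggingface_hub.utils import (
    HfHubHTTPError,
    HFValidationError,
    RepositoryNotFoundError,
    RevisionNotFoundError,
)

def pin_revision(repo_id: str, revision: str = "main", token=None) -> str:
    """Resolve a branch/tag to the latest commit id so downloads are reproducible."""
    try:
        commits = HfApi().list_repo_commits(repo_id, revision=revision, token=token)
        return commits[0].commit_id  # newest commit first
    except (
        HfHubHTTPError,
        HFValidationError,
        RepositoryNotFoundError,
        RevisionNotFoundError,
    ) as err:
        raise SystemExit(f"## Error: could not resolve {repo_id}@{revision}: {err}")

# pin_revision("gpt2") returns a 40-character commit SHA for the default branch.
```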
diff --git a/llm/utils/marsgen.py b/llm/utils/marsgen.py
index 3ebd1bc..7e5ab01 100644
--- a/llm/utils/marsgen.py
+++ b/llm/utils/marsgen.py
@@ -5,13 +5,57 @@
 import os
 import sys
 import subprocess
+import threading
+import time
 from typing import Dict
+import tqdm
-from utils.system_utils import check_if_path_exists, get_all_files_in_directory
+from utils.system_utils import (
+    check_if_path_exists,
+    get_all_files_in_directory,
+    get_files_sizes,
+)
 from utils.generate_data_model import GenerateDataModel
 
 REQUIREMENTS_FILE = "model_requirements.txt"
 
 
+def monitor_marfile_size(
+    file_path: str, approx_marfile_size: float, stop_monitoring: threading.Event
+) -> None:
+    """
+    Monitor the generation of a Model Archive File and display progress.
+
+    Args:
+        file_path (str): The path to the Model Archive File.
+        approx_marfile_size (float): The approximate size of the Model Archive File in bytes.
+        stop_monitoring (threading.Event): Event that signals when to stop monitoring.
+    Returns:
+        None
+    """
+    print("Generating Model Archive file...\n")
+    previous_file_size = 0
+    progress_bar = tqdm.tqdm(
+        total=approx_marfile_size,
+        unit="B",
+        unit_scale=True,
+        desc="Creating Model Archive",
+    )
+    while not stop_monitoring.is_set():
+        try:
+            current_file_size = os.path.getsize(file_path)
+        except FileNotFoundError:
+            current_file_size = 0
+        size_change = current_file_size - previous_file_size
+        previous_file_size = current_file_size
+        progress_bar.update(size_change)
+        time.sleep(2)
+    progress_bar.update(approx_marfile_size - previous_file_size)
+    progress_bar.close()
+    print(
+        f"\nModel Archive file size: {os.path.getsize(file_path) / (1024 ** 3):.2f} GB\n"
+    )
+
+
 def generate_mars(
     gen_model: GenerateDataModel,
     model_config: str,
@@ -53,23 +97,25 @@ def generate_mars(
         print(list(models.keys()))
         sys.exit(1)
 
-    extra_files = None
+    gen_model.mar_utils.extra_files = None
     extra_files_list = get_all_files_in_directory(gen_model.mar_utils.model_path)
     extra_files_list = [
         os.path.join(gen_model.mar_utils.model_path, file)
         for file in extra_files_list
     ]
-    extra_files = ",".join(extra_files_list)
+    gen_model.mar_utils.extra_files = ",".join(extra_files_list)
 
-    requirements_file = os.path.join(os.path.dirname(__file__), REQUIREMENTS_FILE)
-    check_if_path_exists(requirements_file)
+    gen_model.mar_utils.requirements_file = os.path.join(
+        os.path.dirname(__file__), REQUIREMENTS_FILE
+    )
+    check_if_path_exists(gen_model.mar_utils.requirements_file)
 
     model_archiver_args = {
         "model_name": gen_model.model_name,
         "version": gen_model.repo_info.repo_version,
         "handler": gen_model.mar_utils.handler_path,
-        "extra_files": extra_files,
-        "requirements_file": requirements_file,
+        "extra_files": gen_model.mar_utils.extra_files,
+        "requirements_file": gen_model.mar_utils.requirements_file,
         "export_path": model_store_dir,
     }
     cmd = model_archiver_command_builder(
@@ -81,9 +127,26 @@ def generate_mars(
     print(f"## In directory: {os.getcwd()} | Executing command: {cmd}\n")
 
     try:
+        # Event used to tell the monitoring thread to stop.
+        stop_monitoring = threading.Event()
+
+        # Approximate size of the output Model Archive file.
+        approx_marfile_size = get_files_sizes(extra_files_list) / 1.15
+
+        # Create a thread to monitor the MAR file size during generation and show a progress bar.
+        mar_size_thread = threading.Thread(
+            target=monitor_marfile_size,
+            args=(
+                os.path.join(model_store_dir, f"{gen_model.model_name}.mar"),
+                approx_marfile_size,
+                stop_monitoring,
+            ),
+        )
+        mar_size_thread.start()
         subprocess.check_call(cmd, shell=True)
-        marfile = f"{gen_model.model_name}.mar"
-        print(f"## {marfile} is generated.\n")
+        stop_monitoring.set()
+        mar_size_thread.join()
+        print(f"## {gen_model.model_name}.mar is generated.\n")
     except subprocess.CalledProcessError as exc:
         print("## Creation failed !\n")
         if debug:
@@ -165,7 +228,7 @@ def model_archiver_command_builder(
     cmd += f" --export-path {model_archiver_args['export_path']}"
     if force:
         cmd += " --force"
-    print("\n## Generating mar file, will take few mins.\n")
+    print("## Generating MAR file, will take a few minutes.\n")
     if debug:
         print(cmd)
     return cmd
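The archiver subprocess and the size monitor coordinate through a single `threading.Event`: the main thread starts the monitor, runs `torch-model-archiver`, then sets the event and joins. A toy, self-contained reproduction of that handshake (the `slow_write` workload is invented to stand in for the archiver):

```python
import os
import tempfile
import threading
import time

def slow_write(path: str, chunks: int = 5) -> None:
    """Stand-in for torch-model-archiver: grows a file over time."""
    with open(path, "wb") as f:
        for _ in range(chunks):
            f.write(os.urandom(1024 * 1024))  # 1 MiB per tick
            f.flush()
            time.sleep(1)

def watch(path: str, stop: threading.Event) -> None:
    """Poll the output file size until the main thread sets `stop`."""
    while not stop.is_set():
        size = os.path.getsize(path) if os.path.exists(path) else 0
        print(f"current size: {size} bytes")
        time.sleep(1)

out = os.path.join(tempfile.mkdtemp(), "model.mar")
stop = threading.Event()
watcher = threading.Thread(target=watch, args=(out, stop))
watcher.start()
slow_write(out)   # the real code runs subprocess.check_call(cmd) here
stop.set()        # signal the watcher to finish...
watcher.join()    # ...and wait for it to exit
```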
diff --git a/llm/utils/system_utils.py b/llm/utils/system_utils.py
index 63224ae..8e81610 100644
--- a/llm/utils/system_utils.py
+++ b/llm/utils/system_utils.py
@@ -33,7 +33,7 @@ def create_folder_if_not_exists(path: str) -> None:
         None
     """
     os.makedirs(path, exist_ok=True)
-    print(f"The new directory is created! - {path}")
+    print(f"The new directory is created! - {path}\n")
 
 
 def delete_directory(directory_path: str) -> None:
@@ -51,7 +51,7 @@ def delete_directory(directory_path: str) -> None:
         return
     try:
         shutil.rmtree(directory_path)
-        print(f"Deleted all contents from '{directory_path}'")
+        print(f"Deleted all contents from '{directory_path}'\n")
     except OSError as e:
         print(f"Error deleting contents from '{directory_path}': {str(e)}")
@@ -103,3 +103,25 @@ def check_if_folder_empty(path: str) -> bool:
     """
     dir_items = os.listdir(path)
     return len(dir_items) == 0
+
+
+def get_files_sizes(file_paths: List[str]) -> float:
+    """
+    Calculate the total size of the specified files.
+
+    Args:
+        file_paths (list): A list of file paths for which the sizes should be calculated.
+
+    Returns:
+        total_size (float): The sum of sizes (in bytes) of all the specified files.
+    """
+    total_size = 0
+
+    for file_path in file_paths:
+        try:
+            size = os.path.getsize(file_path)
+            total_size += size
+        except FileNotFoundError:
+            print(f"File not found: {file_path}")
+
+    return total_size
diff --git a/llm/utils/tsutils.py b/llm/utils/tsutils.py
index 3a2ad98..1017d54 100644
--- a/llm/utils/tsutils.py
+++ b/llm/utils/tsutils.py
@@ -78,7 +78,7 @@ def get_model_params(model_name: str) -> Dict[str, str]:
     else:
         model_params["is_custom"] = True
         print(
-            f"## Using custom MAR file : {model_name}.mar\n\n"
+            f"\n## Using custom MAR file: {model_name}.mar\n\n"
             "WARNING: This model has not been validated on any GPUs\n\n"
         )
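Taken together, `get_all_files_in_directory` and the new `get_files_sizes` produce the estimate that drives the marsgen progress bar; the `1.15` divisor is this patch's rough allowance for archive compression. A short usage sketch, assuming it runs from the `llm/` directory (the model directory path is hypothetical):

```python
import os
from utils.system_utils import get_all_files_in_directory, get_files_sizes

model_path = "models/gpt2"  # hypothetical directory holding downloaded model files
files = [os.path.join(model_path, f) for f in get_all_files_in_directory(model_path)]

total_bytes = get_files_sizes(files)
approx_mar_bytes = total_bytes / 1.15  # rough estimate of the packaged .mar size
print(f"Estimated archive size: {approx_mar_bytes / (1024 ** 3):.2f} GB")
```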