From bf5d9282537b2883334b087e71aec63fd2ae31c1 Mon Sep 17 00:00:00 2001
From: Abhishek Thakur
Date: Tue, 7 Nov 2023 17:37:19 +0100
Subject: [PATCH] ngc backend

---
Reviewer notes (annotation only: this text sits between the "---"
separator and the first "diff --git" header, i.e. outside the commit
message and outside every hunk).

What the patch does
  * Deletes Dockerfile.api.
  * Makefile: adds an `ngc` target that builds autotrain-advanced:latest,
    then tags and pushes it as
    nvcr.io/ycymhzotssoi/autotrain-advanced:latest.
  * src/autotrain/api.py, run_training(): when TASK_ID == 9,
    LLMTrainingParams.parse_raw(params) is tried first; on any Exception
    LLMTrainingParams.parse_obj(params) is used instead.
  * src/autotrain/backend.py:
      - imports subprocess;
      - adds "dgx-a100" -> "dgxa100.80g.1.norm" to the dict literal in
        __post_init__;
      - _create_space() now handles backends starting with "dgx-" first:
        it builds an env_vars dict, calls NGCRunner(...).create() and
        returns without creating a Space;
      - adds the NGCRunner dataclass. create() assembles one
        `ngc base-command job run ...` command string, appends one
        `--env-var K:V` per entry of env_vars, and executes it with
        subprocess.run(cmd, shell=True, check=True).
  * src/autotrain/cli/run_llm.py: the spaces/ep- validation now raises
    only when both username and repo_id are None; the "space training"
    branch is also taken for backends starting with "dgx".

Points to confirm during review
  * NGCRunner.create() appends every `--env-var {k}:{v}` unquoted to a
    string that is run through the shell. The values include HF_TOKEN
    and the JSON text stored under PARAMS. NOTE(review): how those values
    survive shell word-splitting/expansion is not shown here - confirm,
    or consider passing an argument list without shell=True.
  * PARAMS is json.dumps(self.params.json()). Presumably .json() already
    returns a JSON string, so the value is encoded twice; this may be why
    api.py needs the parse_raw/parse_obj fallback - TODO confirm.
  * NGC_ACE and NGC_ORG are read with os.environ.get() and never checked;
    if either is unset, str.format() renders it as the text "None".
  * job_name is self.params.repo_id.replace("/", "-"), evaluated for every
    params type; it raises AttributeError when repo_id is None. Whether
    repo_id is always populated before _create_space() runs is not visible
    in these hunks - confirm.
  * On the "dgx-" path _create_space() returns None, while the existing
    path ends with `return repo_id`. Callers are not visible here - confirm
    none of them rely on the return value.
  * The "dgx-a100" instance name is declared twice (the __post_init__ dict
    and NGCRunner.instance_map); NGCRunner indexes its own map with
    self.backend, so any other "dgx-..." backend raises KeyError.
  * --total-runtime is fixed at 3600s in the command string.

 Dockerfile.api               |  2 --
 Makefile                     |  5 +++
 src/autotrain/api.py         |  5 ++-
 src/autotrain/backend.py     | 65 ++++++++++++++++++++++++++++++++++++
 src/autotrain/cli/run_llm.py |  6 ++--
 5 files changed, 77 insertions(+), 6 deletions(-)
 delete mode 100644 Dockerfile.api

diff --git a/Dockerfile.api b/Dockerfile.api
deleted file mode 100644
index 94dd3358c9..0000000000
--- a/Dockerfile.api
+++ /dev/null
@@ -1,2 +0,0 @@
-FROM huggingface/autotrain-advanced:latest
-CMD autotrain setup && autotrain api --port 7860 --host 0.0.0.0
diff --git a/Makefile b/Makefile
index cc8e0146ce..e66f41ab45 100644
--- a/Makefile
+++ b/Makefile
@@ -21,6 +21,11 @@ docker:
 	docker tag autotrain-advanced:latest huggingface/autotrain-advanced:latest
 	docker push huggingface/autotrain-advanced:latest
 
+ngc:
+	docker build -t autotrain-advanced:latest .
+	docker tag autotrain-advanced:latest nvcr.io/ycymhzotssoi/autotrain-advanced:latest
+	docker push nvcr.io/ycymhzotssoi/autotrain-advanced:latest
+
 pip:
 	rm -rf build/
 	rm -rf dist/
diff --git a/src/autotrain/api.py b/src/autotrain/api.py
index 22a7fc46a2..29334eced3 100644
--- a/src/autotrain/api.py
+++ b/src/autotrain/api.py
@@ -38,7 +38,10 @@ def run_training():
     params = json.loads(PARAMS)
     logger.info(params)
     if TASK_ID == 9:
-        params = LLMTrainingParams.parse_raw(params)
+        try:
+            params = LLMTrainingParams.parse_raw(params)
+        except Exception:
+            params = LLMTrainingParams.parse_obj(params)
         params.project_name = "/tmp/model"
         params.save(output_dir=params.project_name)
         cmd = ["accelerate", "launch", "--num_machines", "1", "--num_processes", "1"]
diff --git a/src/autotrain/backend.py b/src/autotrain/backend.py
index 9b5a38cede..4e1d49d6bd 100644
--- a/src/autotrain/backend.py
+++ b/src/autotrain/backend.py
@@ -1,6 +1,7 @@
 import io
 import json
 import os
+import subprocess
 from dataclasses import dataclass
 from typing import Union
 
@@ -262,6 +263,7 @@ def __post_init__(self):
             "t4s": "t4-small",
             "cpu": "cpu-upgrade",
             "cpuf": "cpu-basic",
+            "dgx-a100": "dgxa100.80g.1.norm",
         }
         if not isinstance(self.params, GenericParams):
             if self.params.repo_id is not None:
@@ -361,6 +363,30 @@ def _add_secrets(self, api, repo_id):
             api.add_space_secret(repo_id=repo_id, key="OUTPUT_MODEL_REPO", value=self.params.repo_id)
 
     def _create_space(self):
+        if self.backend.startswith("dgx-"):
+            env_vars = {
+                "HF_TOKEN": self.params.token,
+                "AUTOTRAIN_USERNAME": self.username,
+                "PROJECT_NAME": self.params.project_name,
+                "TASK_ID": str(self.task_id),
+                "PARAMS": json.dumps(self.params.json()),
+            }
+            if isinstance(self.params, DreamBoothTrainingParams):
+                env_vars["DATA_PATH"] = self.params.image_path
+            else:
+                env_vars["DATA_PATH"] = self.params.data_path
+
+            if not isinstance(self.params, GenericParams):
+                env_vars["MODEL"] = self.params.model
+                env_vars["OUTPUT_MODEL_REPO"] = self.params.repo_id
+
+            ngc_runner = NGCRunner(
+                job_name=self.params.repo_id.replace("/", "-"),
+                env_vars=env_vars,
+                backend=self.backend,
+            )
+            ngc_runner.create()
+            return
         api = HfApi(token=self.params.token)
         repo_id = f"{self.username}/autotrain-{self.params.project_name}"
         api.create_repo(
@@ -387,3 +413,42 @@ def _create_space(self):
             repo_type="space",
         )
         return repo_id
+
+
+@dataclass
+class NGCRunner:
+    job_name: str
+    env_vars: dict
+    backend: str
+
+    def __post_init__(self):
+        self.ngc_ace = os.environ.get("NGC_ACE")
+        self.ngc_org = os.environ.get("NGC_ORG")
+        self.instance_map = {
+            "dgx-a100": "dgxa100.80g.1.norm",
+        }
+        logger.info("Creating NGC Job")
+        logger.info(f"NGC_ACE: {self.ngc_ace}")
+        logger.info(f"NGC_ORG: {self.ngc_org}")
+        logger.info(f"job_name: {self.job_name}")
+        logger.info(f"backend: {self.backend}")
+
+    def create(self):
+        cmd = "ngc base-command job run --name {job_name}"
+        cmd += " --priority NORMAL --order 50 --preempt RUNONCE --min-timeslice 0s"
+        cmd += " --total-runtime 3600s --ace {ngc_ace} --org {ngc_org} --instance {instance}"
+        cmd += " --commandline 'set -x; conda run --no-capture-output -p /app/env autotrain api --port 7860 --host 0.0.0.0' -p 7860 --result /results"
+        cmd += " --image '{ngc_org}/autotrain-advanced:latest'"
+
+        cmd = cmd.format(
+            job_name=self.job_name,
+            ngc_ace=self.ngc_ace,
+            ngc_org=self.ngc_org,
+            instance=self.instance_map[self.backend],
+        )
+
+        for k, v in self.env_vars.items():
+            cmd += f" --env-var {k}:{v}"
+
+        # run using subprocess, wait for completion
+        subprocess.run(cmd, shell=True, check=True)
diff --git a/src/autotrain/cli/run_llm.py b/src/autotrain/cli/run_llm.py
index 9f0e151c24..216e11677c 100644
--- a/src/autotrain/cli/run_llm.py
+++ b/src/autotrain/cli/run_llm.py
@@ -447,8 +447,8 @@ def __init__(self, args):
         if self.args.backend.startswith("spaces") or self.args.backend.startswith("ep-"):
             if not self.args.push_to_hub:
                 raise ValueError("Push to hub must be specified for spaces backend")
-            if self.args.repo_id is None:
-                raise ValueError("Repo id must be specified for spaces backend")
+            if self.args.username is None and self.args.repo_id is None:
+                raise ValueError("Repo id or username must be specified for spaces backend")
             if self.args.token is None:
                 raise ValueError("Token must be specified for spaces backend")
 
@@ -534,7 +534,7 @@ def run(self):
             )
 
             # space training
-            if self.args.backend.startswith("spaces"):
+            if self.args.backend.startswith("spaces") or self.args.backend.startswith("dgx"):
                 logger.info("Creating space...")
                 sr = SpaceRunner(
                     params=params,