Skip to content

Commit

Permalink
Set 48h timeout, limit NVCF instance types
Browse files Browse the repository at this point in the history
Signed-off-by: J.R. Morgan <[email protected]>
  • Loading branch information
liveaverage committed Mar 6, 2024
1 parent 8a7aca4 commit c9aeb46
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 12 deletions.
9 changes: 5 additions & 4 deletions src/autotrain/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,8 @@ def __post_init__(self):
self.nvcf_token = os.environ.get("NVCF_API_TOKEN")

self.instance_map = {
- "nvcf-l40": {"backend": "GFN", "id": "67bb8939-c932-429a-a446-8ae898311856"},
+ "nvcf-l40": {"id": "67bb8939-c932-429a-a446-8ae898311856"},
+ "nvcf-h100x1": {"id": "848348f8-a4e2-4242-bce9-6baa1bd70a66"},
}

logger.info("Starting NVCF training")
Expand All @@ -422,7 +423,7 @@ def _convert_dict_to_object(self, dictionary):
return dictionary

def _conf_nvcf(self, token, nvcf_type, url, method="POST", payload=None):
- logger.info(f"{self.job_name}: {method} - Configuring NVCF {nvcf_type}.")
+ logger.info(f"{self.job_name}: {method} - Configuring NVCF {nvcf_type}.")
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}

try:
Expand Down Expand Up @@ -472,7 +473,7 @@ def _poll_nvcf(self, url, token, method="get", timeout=86400, interval=30, op="p
while time.time() - start_time < timeout:
try:
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}
- if method == "get":
+ if method.upper() == "GET":
response = requests.get(url, headers=headers)
else:
raise ValueError(f"Unsupported HTTP method: {method}")
Expand Down Expand Up @@ -527,4 +528,4 @@ def create(self):

nvcf_url_reqpoll = f"{self.nvcf_api}/v2/nvcf/pexec/status/{nvcf_fn_req}"
logger.info(f"{self.job_name}: Polling : {nvcf_url_reqpoll}")
- self._poll_nvcf(url=nvcf_url_reqpoll, token=self.nvcf_token, method="GET", timeout=1200, interval=20)
+ self._poll_nvcf(url=nvcf_url_reqpoll, token=self.nvcf_token, method="GET", timeout=172800, interval=20)
6 changes: 1 addition & 5 deletions src/autotrain/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,8 @@ def __post_init__(self):
"DGX 2xA100": "dgx-2a100",
"DGX 4xA100": "dgx-4a100",
"DGX 8xA100": "dgx-8a100",
- "NVCF 1xA100": "nvcf-a100",
- "NVCF 8xA100": "nvcf-8a100",
- "NVCF 1xA10G": "nvcf-a10g",
+ "NVCF 1xH100": "nvcf-h100x1",
"NVCF 1xL40": "nvcf-l40",
- "NVCF 1xL40G": "nvcf-l40g",
- "NVCF 1xT10": "nvcf-t10",
"Local": "local",
"EP US-East-1 1xA10g": "ep-aws-useast1-m",
"EP US-East-1 1xA100": "ep-aws-useast1-xl",
Expand Down
4 changes: 1 addition & 3 deletions src/autotrain/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -189,10 +189,8 @@
{% endif %}
{% if enable_nvcf == 1 %}
<optgroup label="NVIDIA Cloud Functions">
- <option value="NVCF 1xA10G">1xA10G</option>
<option value="NVCF 1xL40">1xL40</option>
- <option value="NVCF 1xL40G">1xL40G</option>
- <option value="NVCF 1xT10">1xT10</option>
+ <option value="NVCF 1xH100">1xH100</option>
</optgroup>
{% endif %}
{% endif %}
Expand Down

0 comments on commit c9aeb46

Please sign in to comment.