From 01c5b05acd5728894e2066f35e03c98474efbbca Mon Sep 17 00:00:00 2001 From: tooyosi Date: Wed, 11 Sep 2024 15:40:09 +0100 Subject: [PATCH 1/2] add retry constraints and set wait_for_success to True for training jobs --- bajor/batch/train_finetuning.py | 3 ++- bajor/batch/train_from_scratch.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bajor/batch/train_finetuning.py b/bajor/batch/train_finetuning.py index 9f82068..9cd2e28 100644 --- a/bajor/batch/train_finetuning.py +++ b/bajor/batch/train_finetuning.py @@ -112,6 +112,7 @@ def create_batch_job(job_id, manifest_container_path, pool_id): setup_pytorch_kernel_cache_dir = 'mkdir -p $AZ_BATCH_NODE_SHARED_DIR/.cache/torch/kernels' job.job_preparation_task = batchmodels.JobPreparationTask( command_line=f'/bin/bash -c \"set -ex; {setup_pytorch_kernel_cache_dir}; {create_results_dir}; {copy_code_to_shared_dir}\"', + constraints=batchmodels.TaskConstraints(max_task_retry_count=3), # # A busted preparation task means the main task won't launch...ever! 
# and leave the node in a scaled state costing $$ ££ @@ -123,7 +124,7 @@ def create_batch_job(job_id, manifest_container_path, pool_id): # Short term: avoid waiting for this prep task to complete before starting the main task # https://learn.microsoft.com/en-us/python/api/azure-batch/azure.batch.models.JobPreparationTask?view=azure-python#constructor # https://learn.microsoft.com/en-us/azure/batch/batch-job-task-error-checking#job-preparation-tasks - wait_for_success=False) + wait_for_success=True) # Job release task that runs after the job completes diff --git a/bajor/batch/train_from_scratch.py b/bajor/batch/train_from_scratch.py index cea3e8e..be92576 100644 --- a/bajor/batch/train_from_scratch.py +++ b/bajor/batch/train_from_scratch.py @@ -104,6 +104,7 @@ def create_batch_job(job_id, manifest_container_path, pool_id): copy_code_to_shared_dir = 'cp -Rf $AZ_BATCH_NODE_MOUNTS_DIR/$TRAINING_CONTAINER_MOUNT_DIR/$CODE_DIR_PATH/* $AZ_BATCH_NODE_SHARED_DIR/' job.job_preparation_task = batchmodels.JobPreparationTask( command_line=f'/bin/bash -c \"set -ex; {create_results_dir}; {copy_code_to_shared_dir}\"', + constraints=batchmodels.TaskConstraints(max_task_retry_count=3), # # A busted preparation task means the main task won't launch...ever! 
# and leave the node in a scaled state costing $$ ££ @@ -115,7 +116,7 @@ def create_batch_job(job_id, manifest_container_path, pool_id): # Short term: avoid waiting for this prep task to complete before starting the main task # https://learn.microsoft.com/en-us/python/api/azure-batch/azure.batch.models.JobPreparationTask?view=azure-python#constructor # https://learn.microsoft.com/en-us/azure/batch/batch-job-task-error-checking#job-preparation-tasks - wait_for_success=False) + wait_for_success=True) # Job release task that runs after the job completes From a2ec5fa020ff6933152b595a94b9f1894ba2726b Mon Sep 17 00:00:00 2001 From: tooyosi Date: Wed, 11 Sep 2024 15:42:08 +0100 Subject: [PATCH 2/2] add retry constraints to prediction job --- bajor/batch/predictions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bajor/batch/predictions.py b/bajor/batch/predictions.py index 5bc878c..95f4462 100644 --- a/bajor/batch/predictions.py +++ b/bajor/batch/predictions.py @@ -84,6 +84,7 @@ def create_batch_job(job_id, manifest_url, pool_id): copy_code_to_shared_dir = 'cp -Rf $AZ_BATCH_NODE_MOUNTS_DIR/$PREDICTIONS_CONTAINER_MOUNT_DIR/$CODE_DIR_PATH/* $AZ_BATCH_NODE_SHARED_DIR/' job.job_preparation_task = batchmodels.JobPreparationTask( command_line=f'/bin/bash -c \"set -ex; {create_results_dir}; {copy_code_to_shared_dir}\"', + constraints=batchmodels.TaskConstraints(max_task_retry_count=3), # # A busted preparation task means the main task won't launch...ever! # and leave the node in a scaled state costing $$ ££