diff --git a/configs/test/batch/batch.yaml b/configs/test/batch/batch.yaml index 8ddfa25e56..c28b0715ed 100644 --- a/configs/test/batch/batch.yaml +++ b/configs/test/batch/batch.yaml @@ -39,6 +39,7 @@ mapping: subnetwork: 'projects/google.com:clusterfuzz/regions/gce-region/subnetworks/subnetworkname' preemptible: false machine_type: n1-standard-1 + retry: true LINUX-PREEMPTIBLE: clusterfuzz_release: 'prod' docker_image: 'gcr.io/clusterfuzz-images/base:a2f4dd6-202202070654' diff --git a/src/clusterfuzz/_internal/google_cloud_utils/batch.py b/src/clusterfuzz/_internal/google_cloud_utils/batch.py index 8fb8b8b639..553ebb2994 100644 --- a/src/clusterfuzz/_internal/google_cloud_utils/batch.py +++ b/src/clusterfuzz/_internal/google_cloud_utils/batch.py @@ -33,7 +33,7 @@ _local = threading.local() -RETRY_COUNT = 0 +DEFAULT_RETRY_COUNT = 0 TASK_BUNCH_SIZE = 20 @@ -60,6 +60,7 @@ 'gce_region', 'priority', 'max_run_duration', + 'retry', ]) @@ -139,7 +140,13 @@ def _get_task_spec(batch_workload_spec): runnable.container.volumes = ['/var/scratch0:/mnt/scratch0'] task_spec = batch.TaskSpec() task_spec.runnables = [runnable] - task_spec.max_retry_count = RETRY_COUNT + if batch_workload_spec.retry: + # Tasks in general have 6 hours to run (except pruning which has 24). + # Our signed URLs last 24 hours. Therefore, the maximum number of retries + # is 4. This is a temporary solution anyway. + task_spec.max_retry_count = 4 + else: + task_spec.max_retry_count = DEFAULT_RETRY_COUNT task_spec.max_run_duration = batch_workload_spec.max_run_duration return task_spec @@ -282,6 +289,7 @@ def _get_spec_from_config(command, job_name): project_name = batch_config.get('project') docker_image = instance_spec['docker_image'] user_data = instance_spec['user_data'] + should_retry = instance_spec.get('retry', False) clusterfuzz_release = instance_spec.get('clusterfuzz_release', 'prod') # Lower numbers are lower priority. 
From: @@ -290,6 +298,8 @@ def _get_spec_from_config(command, job_name): priority = 0 if low_priority else 1 max_run_duration = f'{_get_task_duration(command)}s' + if command == 'corpus_pruning': + should_retry = False # It is naturally retried the next day. spec = BatchWorkloadSpec( clusterfuzz_release=clusterfuzz_release, @@ -309,5 +319,6 @@ def _get_spec_from_config(command, job_name): machine_type=instance_spec['machine_type'], priority=priority, max_run_duration=max_run_duration, + retry=should_retry, ) return spec diff --git a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/batch_test.py b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/batch_test.py index 5f430ced78..5b9135103a 100644 --- a/src/clusterfuzz/_internal/tests/core/google_cloud_utils/batch_test.py +++ b/src/clusterfuzz/_internal/tests/core/google_cloud_utils/batch_test.py @@ -51,6 +51,7 @@ def test_nonpreemptible(self): preemptible=False, machine_type='n1-standard-1', priority=1, + retry=True, max_run_duration='21600s', ) @@ -77,6 +78,7 @@ def test_fuzz_get_spec_from_config(self): preemptible=True, machine_type='n1-standard-1', priority=0, + retry=False, max_run_duration='21600s', )