Skip to content

Commit

Permalink
Retry failing tasks in batch. (#4468)
Browse files Browse the repository at this point in the history
Except for fuzz and corpus_pruning.
  • Loading branch information
jonathanmetzman authored Dec 1, 2024
1 parent ecb90a1 commit 10461e6
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 2 deletions.
1 change: 1 addition & 0 deletions configs/test/batch/batch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ mapping:
subnetwork: 'projects/google.com:clusterfuzz/regions/gce-region/subnetworks/subnetworkname'
preemptible: false
machine_type: n1-standard-1
retry: true
LINUX-PREEMPTIBLE:
clusterfuzz_release: 'prod'
docker_image: 'gcr.io/clusterfuzz-images/base:a2f4dd6-202202070654'
Expand Down
15 changes: 13 additions & 2 deletions src/clusterfuzz/_internal/google_cloud_utils/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

_local = threading.local()

RETRY_COUNT = 0
DEFAULT_RETRY_COUNT = 0

TASK_BUNCH_SIZE = 20

Expand All @@ -60,6 +60,7 @@
'gce_region',
'priority',
'max_run_duration',
'retry',
])


Expand Down Expand Up @@ -139,7 +140,13 @@ def _get_task_spec(batch_workload_spec):
runnable.container.volumes = ['/var/scratch0:/mnt/scratch0']
task_spec = batch.TaskSpec()
task_spec.runnables = [runnable]
task_spec.max_retry_count = RETRY_COUNT
if batch_workload_spec.retry:
# Tasks in general have 6 hours to run (except pruning which has 24).
# Our signed URLs last 24 hours. Therefore, the maximum number of retries
# is 4. This is a temporary solution anyway.
task_spec.max_retry_count = 4
else:
task_spec.max_retry_count = DEFAULT_RETRY_COUNT
task_spec.max_run_duration = batch_workload_spec.max_run_duration
return task_spec

Expand Down Expand Up @@ -282,6 +289,7 @@ def _get_spec_from_config(command, job_name):
project_name = batch_config.get('project')
docker_image = instance_spec['docker_image']
user_data = instance_spec['user_data']
should_retry = instance_spec.get('retry', False)
clusterfuzz_release = instance_spec.get('clusterfuzz_release', 'prod')

# Lower numbers are lower priority. From:
Expand All @@ -290,6 +298,8 @@ def _get_spec_from_config(command, job_name):
priority = 0 if low_priority else 1

max_run_duration = f'{_get_task_duration(command)}s'
if command == 'corpus_pruning':
should_retry = False # It is naturally retried the next day.

spec = BatchWorkloadSpec(
clusterfuzz_release=clusterfuzz_release,
Expand All @@ -309,5 +319,6 @@ def _get_spec_from_config(command, job_name):
machine_type=instance_spec['machine_type'],
priority=priority,
max_run_duration=max_run_duration,
retry=should_retry,
)
return spec
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def test_nonpreemptible(self):
preemptible=False,
machine_type='n1-standard-1',
priority=1,
retry=True,
max_run_duration='21600s',
)

Expand All @@ -77,6 +78,7 @@ def test_fuzz_get_spec_from_config(self):
preemptible=True,
machine_type='n1-standard-1',
priority=0,
retry=False,
max_run_duration='21600s',
)

Expand Down

0 comments on commit 10461e6

Please sign in to comment.