diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
index 005e9f50e8..863e5cd734 100644
--- a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
+++ b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
@@ -27,6 +27,7 @@
 from clusterfuzz._internal.bot.webserver import http_server
 from clusterfuzz._internal.metrics import logs
 from clusterfuzz._internal.metrics import monitoring_metrics
+from clusterfuzz._internal.protos import uworker_msg_pb2
 from clusterfuzz._internal.system import environment
 
 # Define an alias to appease pylint.
@@ -74,12 +75,15 @@ class _MetricRecorder(contextlib.AbstractContextManager):
   Members:
     start_time_ns (int): The time at which this recorder was constructed, in
       nanoseconds since the Unix epoch.
+    utask_main_failure: The uworker_msg_pb2.ErrorType value returned by
+      utask_main, if any; used on exit to emit the task outcome metrics.
   """
 
   def __init__(self, subtask: _Subtask):
     self.start_time_ns = time.time_ns()
     self._subtask = subtask
     self._labels = None
+    self.utask_main_failure = None
 
     if subtask == _Subtask.PREPROCESS:
       self._preprocess_start_time_ns = self.start_time_ns
@@ -138,6 +142,30 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
       monitoring_metrics.UTASK_SUBTASK_E2E_DURATION_SECS.add(
           e2e_duration_secs, self._labels)
 
+    # The only case where a task might fail without throwing, is in
+    # utask_main, by returning an ErrorType proto which indicates
+    # failure.
+    outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
+    monitoring_metrics.TASK_OUTCOME_COUNT.increment({
+        **self._labels, 'outcome': outcome
+    })
+    if outcome == 'success':
+      error_condition = 'N/A'
+    elif _exc_type:
+      error_condition = 'UNHANDLED_EXCEPTION'
+    else:
+      error_condition = uworker_msg_pb2.ErrorType.Name(  # pylint: disable=no-member
+          self.utask_main_failure)
+    # Drop the job label (on a copy, so self._labels stays intact) to keep
+    # this more explicit error-condition metric within the 30k distinct
+    # labels limit recommended by gcp.
+    trimmed_labels = dict(self._labels)
+    del trimmed_labels['job']
+    trimmed_labels['outcome'] = outcome
+    trimmed_labels['error_condition'] = error_condition
+    monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
+        trimmed_labels)
+
 
 def ensure_uworker_env_type_safety(uworker_env):
   """Converts all values in |uworker_env| to str types.
@@ -226,6 +254,8 @@ def uworker_main_no_io(utask_module, serialized_uworker_input):
       return None
 
     # NOTE: Keep this in sync with `uworker_main()`.
+    if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR:  # pylint: disable=no-member
+      recorder.utask_main_failure = uworker_output.error_type
     uworker_output.bot_name = environment.get_value('BOT_NAME', '')
     uworker_output.platform_id = environment.get_platform_id()
 
@@ -306,6 +336,9 @@ def uworker_main(input_download_url) -> None:
     logs.info('Starting utask_main: %s.' % utask_module)
     uworker_output = utask_module.utask_main(uworker_input)
 
+    if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR:  # pylint: disable=no-member
+      recorder.utask_main_failure = uworker_output.error_type
+
     # NOTE: Keep this in sync with `uworker_main_no_io()`.
     uworker_output.bot_name = environment.get_value('BOT_NAME', '')
     uworker_output.platform_id = environment.get_platform_id()
diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
index 58489eae45..362c76cdee 100644
--- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
+++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
@@ -241,6 +241,7 @@
         monitor.StringField('job'),
     ],
 )
+
 TASK_RATE_LIMIT_COUNT = monitor.CounterMetric(
     'task/rate_limit',
     description=('Counter for rate limit events.'),
@@ -250,6 +251,30 @@
         monitor.StringField('argument'),
     ])
 
+TASK_OUTCOME_COUNT = monitor.CounterMetric(
+    'task/outcome',
+    description=('Counter metric for task outcome (success/failure).'),
+    field_spec=[
+        monitor.StringField('task'),
+        monitor.StringField('job'),
+        monitor.StringField('subtask'),
+        monitor.StringField('mode'),
+        monitor.StringField('platform'),
+        monitor.StringField('outcome'),
+    ])
+
+TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric(
+    'task/outcome_by_error_type',
+    description=('Counter metric for task outcome, with error type.'),
+    field_spec=[
+        monitor.StringField('task'),
+        monitor.StringField('subtask'),
+        monitor.StringField('mode'),
+        monitor.StringField('platform'),
+        monitor.StringField('outcome'),
+        monitor.StringField('error_condition'),
+    ])
+
 UTASK_SUBTASK_E2E_DURATION_SECS = monitor.CumulativeDistributionMetric(
     'utask/subtask_e2e_duration_secs',
     description=(