From 3fbcbefe0be7c13a70eaea8cce0f1cbbd4a1a161 Mon Sep 17 00:00:00 2001
From: Vitor Guidi
Date: Wed, 11 Dec 2024 10:44:56 +1100
Subject: [PATCH] [Monitoring] Adding a metric for task outcome (#4458)

### Motivation

We currently have no metric that tracks the error rate for each task. This PR
adds one: the error rate can be obtained by summing the metric with
outcome=error and dividing by the total count across all outcomes. This is
useful for SLI alerting.

Part of #4271
---
 .../_internal/bot/tasks/utasks/__init__.py    | 33 +++++++++++++++++++
 .../_internal/metrics/monitoring_metrics.py   | 24 ++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
index 7b7d01637d..680d1306bb 100644
--- a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
+++ b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
@@ -27,6 +27,7 @@
 from clusterfuzz._internal.bot.webserver import http_server
 from clusterfuzz._internal.metrics import logs
 from clusterfuzz._internal.metrics import monitoring_metrics
+from clusterfuzz._internal.protos import uworker_msg_pb2
 from clusterfuzz._internal.system import environment
 
 # Define an alias to appease pylint.
@@ -74,12 +75,15 @@ class _MetricRecorder(contextlib.AbstractContextManager):
   Members:
     start_time_ns (int): The time at which this recorder was constructed, in
       nanoseconds since the Unix epoch.
+    utask_main_failure: stores the uworker_output.ErrorType value returned
+      by utask_main, which is used to emit an outcome metric.
   """
 
   def __init__(self, subtask: _Subtask):
     self.start_time_ns = time.time_ns()
     self._subtask = subtask
     self._labels = None
+    self.utask_main_failure = None
 
     if subtask == _Subtask.PREPROCESS:
       self._preprocess_start_time_ns = self.start_time_ns
@@ -138,6 +142,30 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
       monitoring_metrics.UTASK_SUBTASK_E2E_DURATION_SECS.add(
           e2e_duration_secs, self._labels)
 
+    # The only case where a task can fail without raising an exception is in
+    # utask_main, which signals failure by returning an ErrorType proto other
+    # than NO_ERROR.
+    outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
+    monitoring_metrics.TASK_OUTCOME_COUNT.increment({
+        **self._labels, 'outcome': outcome
+    })
+    if outcome == 'success':
+      error_condition = 'N/A'
+    elif _exc_type:
+      error_condition = 'UNHANDLED_EXCEPTION'
+    else:
+      error_condition = uworker_msg_pb2.ErrorType.Name(  # pylint: disable=no-member
+          self.utask_main_failure)
+    # Drop the job label for this second metric so that adding the
+    # error_condition label keeps the number of distinct label combinations
+    # within the ~30k limit recommended by GCP.
+    trimmed_labels = self._labels.copy()
+    del trimmed_labels['job']
+    trimmed_labels['outcome'] = outcome
+    trimmed_labels['error_condition'] = error_condition
+    monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
+        trimmed_labels)
+
 
 def ensure_uworker_env_type_safety(uworker_env):
   """Converts all values in |uworker_env| to str types.
@@ -226,6 +254,8 @@ def uworker_main_no_io(utask_module, serialized_uworker_input):
       return None
 
     # NOTE: Keep this in sync with `uworker_main()`.
+    if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR:  # pylint: disable=no-member
+      recorder.utask_main_failure = uworker_output.error_type
     uworker_output.bot_name = environment.get_value('BOT_NAME', '')
     uworker_output.platform_id = environment.get_platform_id()
 
@@ -306,6 +336,9 @@ def uworker_main(input_download_url) -> None:
     logs.info('Starting utask_main: %s.' % utask_module)
     uworker_output = utask_module.utask_main(uworker_input)
 
+    if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR:  # pylint: disable=no-member
+      recorder.utask_main_failure = uworker_output.error_type
+
     # NOTE: Keep this in sync with `uworker_main_no_io()`.
     uworker_output.bot_name = environment.get_value('BOT_NAME', '')
     uworker_output.platform_id = environment.get_platform_id()
diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
index 72de35362d..bf1653c50b 100644
--- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
+++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
@@ -240,6 +240,30 @@
         monitor.StringField('argument'),
     ])
 
+TASK_OUTCOME_COUNT = monitor.CounterMetric(
+    'task/outcome',
+    description=('Counter metric for task outcome (success/error).'),
+    field_spec=[
+        monitor.StringField('task'),
+        monitor.StringField('job'),
+        monitor.StringField('subtask'),
+        monitor.StringField('mode'),
+        monitor.StringField('platform'),
+        monitor.StringField('outcome'),
+    ])
+
+TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric(
+    'task/outcome_by_error_type',
+    description=('Counter metric for task outcome, with error type.'),
+    field_spec=[
+        monitor.StringField('task'),
+        monitor.StringField('subtask'),
+        monitor.StringField('mode'),
+        monitor.StringField('platform'),
+        monitor.StringField('outcome'),
+        monitor.StringField('error_condition'),
+    ])
+
 UTASK_SUBTASK_E2E_DURATION_SECS = monitor.CumulativeDistributionMetric(
     'utask/subtask_e2e_duration_secs',
     description=(
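
As a note on the motivation above: the error rate is simply the count of error outcomes divided by the total count across all outcomes for a given task. A minimal sketch of that arithmetic in Python, using hypothetical counter values (the `counts` dict, its task names, and the `error_rate` helper are illustrative and not part of this patch):

```python
# Hypothetical exported values of the task/outcome counter, keyed by the
# (task, outcome) labels. The numbers are made up for illustration only.
counts = {
    ('fuzz', 'success'): 970,
    ('fuzz', 'error'): 30,
    ('progression', 'success'): 50,
}


def error_rate(task):
  """Returns the error count divided by the total count for one task label."""
  total = sum(v for (t, _), v in counts.items() if t == task)
  errors = counts.get((task, 'error'), 0)
  return errors / total if total else 0.0


print(f"fuzz error rate: {error_rate('fuzz'):.2%}")                # -> 3.00%
print(f"progression error rate: {error_rate('progression'):.2%}")  # -> 0.00%
```

In practice this ratio would be computed in the monitoring backend over the exported time series rather than in Python, but the arithmetic for the SLI is the same.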