From 514cec033711c4efb86d337bc2035702072455c3 Mon Sep 17 00:00:00 2001 From: Vitor Guidi Date: Wed, 11 Dec 2024 10:44:56 +1100 Subject: [PATCH] [Monitoring] Adding a metric for task outcome (#4458) ### Motivation We currently have no metric that tracks the error rate for each task. This PR implements that, and the error rate can be obtained by summing up the metric with outcome=failure, divided by the overall sum. This is useful for SLI alerting. Part of #4271 --- .../_internal/bot/tasks/utasks/__init__.py | 33 +++++++++++++++++++ .../_internal/metrics/monitoring_metrics.py | 25 ++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py index 005e9f50e8..863e5cd734 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py @@ -27,6 +27,7 @@ from clusterfuzz._internal.bot.webserver import http_server from clusterfuzz._internal.metrics import logs from clusterfuzz._internal.metrics import monitoring_metrics +from clusterfuzz._internal.protos import uworker_msg_pb2 from clusterfuzz._internal.system import environment # Define an alias to appease pylint. @@ -74,12 +75,15 @@ class _MetricRecorder(contextlib.AbstractContextManager): Members: start_time_ns (int): The time at which this recorder was constructed, in nanoseconds since the Unix epoch. + utask_main_failure: this class stores the uworker_output.ErrorType + object returned by utask_main, and uses it to emit a metric. 
""" def __init__(self, subtask: _Subtask): self.start_time_ns = time.time_ns() self._subtask = subtask self._labels = None + self.utask_main_failure = None if subtask == _Subtask.PREPROCESS: self._preprocess_start_time_ns = self.start_time_ns @@ -138,6 +142,30 @@ def __exit__(self, _exc_type, _exc_value, _traceback): monitoring_metrics.UTASK_SUBTASK_E2E_DURATION_SECS.add( e2e_duration_secs, self._labels) + # The only case where a task might fail without throwing, is in + # utask_main, by returning an ErrorType proto which indicates + # failure. + outcome = 'error' if _exc_type or self.utask_main_failure else 'success' + monitoring_metrics.TASK_OUTCOME_COUNT.increment({ + **self._labels, 'outcome': outcome + }) + if outcome == "success": + error_condition = 'N/A' + elif _exc_type: + error_condition = 'UNHANDLED_EXCEPTION' + else: + error_condition = uworker_msg_pb2.ErrorType.Name( # pylint: disable=no-member + self.utask_main_failure) + # Get rid of job as a label, so we can have another metric to make + # error conditions more explicit, respecting the 30k distinct + # labels limit recommended by gcp. + trimmed_labels = self._labels + del trimmed_labels['job'] + trimmed_labels['outcome'] = outcome + trimmed_labels['error_condition'] = error_condition + monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment( + trimmed_labels) + def ensure_uworker_env_type_safety(uworker_env): """Converts all values in |uworker_env| to str types. @@ -226,6 +254,8 @@ def uworker_main_no_io(utask_module, serialized_uworker_input): return None # NOTE: Keep this in sync with `uworker_main()`. + if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR: # pylint: disable=no-member + recorder.utask_main_failure = uworker_output.error_type uworker_output.bot_name = environment.get_value('BOT_NAME', '') uworker_output.platform_id = environment.get_platform_id() @@ -306,6 +336,9 @@ def uworker_main(input_download_url) -> None: logs.info('Starting utask_main: %s.' 
% utask_module) uworker_output = utask_module.utask_main(uworker_input) + if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR: # pylint: disable=no-member + recorder.utask_main_failure = uworker_output.error_type + # NOTE: Keep this in sync with `uworker_main_no_io()`. uworker_output.bot_name = environment.get_value('BOT_NAME', '') uworker_output.platform_id = environment.get_platform_id() diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py index 58489eae45..362c76cdee 100644 --- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py +++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py @@ -241,6 +241,7 @@ monitor.StringField('job'), ], ) + TASK_RATE_LIMIT_COUNT = monitor.CounterMetric( 'task/rate_limit', description=('Counter for rate limit events.'), @@ -250,6 +251,30 @@ monitor.StringField('argument'), ]) +TASK_OUTCOME_COUNT = monitor.CounterMetric( + 'task/outcome', + description=('Counter metric for task outcome (success/failure).'), + field_spec=[ + monitor.StringField('task'), + monitor.StringField('job'), + monitor.StringField('subtask'), + monitor.StringField('mode'), + monitor.StringField('platform'), + monitor.StringField('outcome'), + ]) + +TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric( + 'task/outcome_by_error_type', + description=('Counter metric for task outcome, with error type.'), + field_spec=[ + monitor.StringField('task'), + monitor.StringField('subtask'), + monitor.StringField('mode'), + monitor.StringField('platform'), + monitor.StringField('outcome'), + monitor.StringField('error_condition'), + ]) + UTASK_SUBTASK_E2E_DURATION_SECS = monitor.CumulativeDistributionMetric( 'utask/subtask_e2e_duration_secs', description=(