From f4dc5ca81845d6677ce5b165e05ded9782f6e5a4 Mon Sep 17 00:00:00 2001
From: Vitor Guidi <vitorguidi@gmail.com>
Date: Mon, 23 Dec 2024 15:11:06 -0300
Subject: [PATCH] [Monitoring] Enrich UNTRIAGED_TESTCASE_AGE metric to track
 testcases stuck in analyze (#4547)

### Motivation

We currently have no way to tell if analyze task was successfully
executed. The TESTCASE_UPLOAD_TRIAGE_DURATION metric from #4364 would
only track duration for tasks that did finish.

An analyze_pending field is added to the Testcase entity in datastore,
which is set to False by default, to True for manually uploaded
testcases, and to False once analyze task postprocess runs.

It also increments the UNTRIAGED_TESTCASE_AGE metric from #4381 with a
status label, so we can know at what step the testcase is stuck, thus
allowing us to alert if analyze is taking longer to finish than
expected.

The alert itself could be, for instance, P50 age of untriaged testcase
(status=analyze_pending) > 3h.

Also, this retroactively addresses comments from #4481:

* Fixes docstring for emit_testcase_triage_duration_metric
* Removes assertions
* Renames TESTCASE_UPLOAD_TRIAGE_DURATION to TESTCASE_TRIAGE_DURATION,
since it now accounts for fuzzer generated testcases
* Use a boolean "from_fuzzer" field, instead of "origin" string, in
TESTCASE_TRIAGE_DURATION
---
 .../bot/tasks/utasks/analyze_task.py          |  2 +
 .../_internal/common/testcase_utils.py        | 40 ++++------------
 src/clusterfuzz/_internal/cron/triage.py      | 46 +++++++++++--------
 .../_internal/datastore/data_handler.py       |  3 +-
 .../_internal/datastore/data_types.py         |  4 ++
 .../_internal/metrics/monitoring_metrics.py   |  9 ++--
 6 files changed, 49 insertions(+), 55 deletions(-)

diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
index 785974f735..1f0b7e8f58 100644
--- a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
+++ b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
@@ -552,6 +552,8 @@ def _update_testcase(output):
   if analyze_task_output.platform_id:
     testcase.platform_id = analyze_task_output.platform_id
 
+  testcase.analyze_pending = False
+
   testcase.put()
 
 
diff --git a/src/clusterfuzz/_internal/common/testcase_utils.py b/src/clusterfuzz/_internal/common/testcase_utils.py
index 73fe9edfd4..4314dcd55f 100644
--- a/src/clusterfuzz/_internal/common/testcase_utils.py
+++ b/src/clusterfuzz/_internal/common/testcase_utils.py
@@ -31,59 +31,39 @@
 
 
 def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
-  '''Finds out if a testcase is fuzzer generated or manually uploaded,
-      and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
-  testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
-  if not testcase_upload_metadata:
-    logs.warning(f'No upload metadata found for testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
-    return
-  if not testcase_upload_metadata.timestamp:
-    logs.warning(
-        f'No timestamp for testcase {testcase_upload_metadata.testcase_id},'
-        ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
-    return
-  assert step in [
-      'analyze_launched', 'analyze_completed', 'minimize_completed',
-      'regression_completed', 'impact_completed', 'issue_updated'
-  ]
-
+  """Finds out if a testcase is fuzzer generated or manually uploaded,
+      and emits the TESTCASE_TRIAGE_DURATION metric."""
   testcase = data_handler.get_testcase_by_id(testcase_id)
 
   if not testcase:
     logs.warning(f'No testcase found with id {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
   if not testcase.job_type:
     logs.warning(f'No job_type associated to testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
   from_fuzzer = not get_testcase_upload_metadata(testcase_id)
 
-  assert step in [
-      'analyze_launched', 'analyze_completed', 'minimize_completed',
-      'regression_completed', 'impact_completed', 'issue_updated'
-  ]
-
   if not testcase.get_age_in_seconds():
     logs.warning(f'No timestamp associated to testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
-  testcase_age_in_hours = testcase.get_age_in_seconds() / 3600
+  testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60)
 
-  logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
+  logs.info('Emiting TESTCASE_TRIAGE_DURATION metric for testcase '
             f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
-            'in step {step}.')
+            f'in step {step}, from_fuzzer: {from_fuzzer}.')
 
-  monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
+  monitoring_metrics.TESTCASE_TRIAGE_DURATION.add(
       testcase_age_in_hours,
       labels={
           'job': testcase.job_type,
           'step': step,
-          'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
+          'from_fuzzer': from_fuzzer
       })
 
 
diff --git a/src/clusterfuzz/_internal/cron/triage.py b/src/clusterfuzz/_internal/cron/triage.py
index 1f3b375922..ccfb03e8ae 100644
--- a/src/clusterfuzz/_internal/cron/triage.py
+++ b/src/clusterfuzz/_internal/cron/triage.py
@@ -256,6 +256,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker):
 
 def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase):
   testcase_age = testcase.get_age_in_seconds()
+  if not testcase_age:
+    return
   monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add(
       testcase_age,
       labels={
@@ -336,27 +338,30 @@ def _emit_untriaged_testcase_count_metric():
         })
 
 
-def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
+PENDING_ANALYZE = 'pending_analyze'
+PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
+PENDING_PROGRESSION = 'pending_progression'
+PENDING_GROUPING = 'pending_grouping'
+PENDING_FILING = 'pending_filing'
+
+
+def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase,
+                                        step: str):
   """Emmits a metric to track age of untriaged testcases."""
-  if not testcase.timestamp:
+  if not testcase.get_age_in_seconds():
     return
 
   logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} '
-            f'(age = {testcase.get_age_in_seconds()})')
+            f'(age = {testcase.get_age_in_seconds()}), step = {step}')
   monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
       testcase.get_age_in_seconds() / 3600,
       labels={
           'job': testcase.job_type,
           'platform': testcase.platform,
+          'step': step,
       })
 
 
-PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
-PENDING_PROGRESSION = 'pending_progression'
-PENDING_GROUPING = 'pending_grouping'
-PENDING_FILING = 'pending_filing'
-
-
 def main():
   """Files bugs."""
   try:
@@ -409,7 +414,7 @@ def main():
     if testcase.get_metadata('progression_pending'):
       _set_testcase_stuck_state(testcase, True)
       logs.info(f'Skipping testcase {testcase_id}, progression pending')
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION)
       _increment_untriaged_testcase_count(testcase.job_type,
                                           PENDING_PROGRESSION)
       continue
@@ -432,10 +437,12 @@ def main():
     # Require that all tasks like minimizaton, regression testing, etc have
     # finished.
     if not critical_tasks_completed:
-      _emit_untriaged_testcase_age_metric(testcase)
+      status = PENDING_CRITICAL_TASKS
+      if testcase.analyze_pending:
+        status = PENDING_ANALYZE
+      _emit_untriaged_testcase_age_metric(testcase, status)
       _set_testcase_stuck_state(testcase, True)
-      _increment_untriaged_testcase_count(testcase.job_type,
-                                          PENDING_CRITICAL_TASKS)
+      _increment_untriaged_testcase_count(testcase.job_type, status)
       logs.info(
           f'Skipping testcase {testcase_id}, critical tasks still pending.')
       continue
@@ -452,7 +459,7 @@ def main():
     # metadata works well.
     if not testcase.group_id and not dates.time_has_expired(
         testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
       _set_testcase_stuck_state(testcase, True)
       _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
       logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
@@ -460,7 +467,7 @@ def main():
 
     if not testcase.get_metadata('ran_grouper'):
       # Testcase should be considered by the grouper first before filing.
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
       _set_testcase_stuck_state(testcase, True)
       _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
       logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
@@ -490,16 +497,15 @@ def main():
     # Clean up old triage messages that would be not applicable now.
     testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)
 
-    # A testcase is untriaged, until immediately before a bug is opened
-    _emit_untriaged_testcase_age_metric(testcase)
-    _set_testcase_stuck_state(testcase, False)
-    _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
-
     # File the bug first and then create filed bug metadata.
     if not _file_issue(testcase, issue_tracker, throttler):
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_FILING)
+      _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
       logs.info(f'Issue filing failed for testcase id {testcase_id}')
       continue
 
+    _set_testcase_stuck_state(testcase, False)
+
     _create_filed_bug_metadata(testcase)
     issue_filer.notify_issue_update(testcase, 'new')
 
diff --git a/src/clusterfuzz/_internal/datastore/data_handler.py b/src/clusterfuzz/_internal/datastore/data_handler.py
index abe8d5c677..6b378609c4 100644
--- a/src/clusterfuzz/_internal/datastore/data_handler.py
+++ b/src/clusterfuzz/_internal/datastore/data_handler.py
@@ -921,7 +921,7 @@ def critical_tasks_completed(testcase):
     return testcase.minimized_keys and testcase.regression
 
   return bool(testcase.minimized_keys and testcase.regression and
-              testcase.is_impact_set_flag)
+              testcase.is_impact_set_flag and not testcase.analyze_pending)
 
 
 # ------------------------------------------------------------------------------
@@ -1379,6 +1379,7 @@ def create_user_uploaded_testcase(key,
 
   testcase.timestamp = utils.utcnow()
   testcase.created = testcase.timestamp
+  testcase.analyze_pending = True
   testcase.uploader_email = uploader_email
   testcase.put()
 
diff --git a/src/clusterfuzz/_internal/datastore/data_types.py b/src/clusterfuzz/_internal/datastore/data_types.py
index 8aa009ad36..cbc63c7f4b 100644
--- a/src/clusterfuzz/_internal/datastore/data_types.py
+++ b/src/clusterfuzz/_internal/datastore/data_types.py
@@ -583,6 +583,10 @@ class Testcase(Model):
   # Tracks if a testcase is stuck during triage.
   stuck_in_triage = ndb.BooleanProperty(default=False)
 
+  # Tracks if analyze task is pending.
+  # Defaults to false, since most testcases are fuzzer produced.
+  analyze_pending = ndb.BooleanProperty(default=False)
+
   def is_chromium(self):
     return self.project_name in ('chromium', 'chromium-testing')
 
diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
index cb54c9da54..dfd92e100e 100644
--- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
+++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
@@ -231,17 +231,17 @@
     ],
 )
 
-TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
-    'uploaded_testcase_analysis/triage_duration_secs',
+TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
+    'testcase_analysis/triage_duration_hours',
     description=('Time elapsed between testcase upload and completion'
-                 ' of relevant tasks in the testcase upload lifecycle.'
+                 ' of relevant tasks in the testcase lifecycle.'
                  ' Origin can be either from a fuzzer, or a manual'
                  ' upload. Measured in hours.'),
     bucketer=monitor.GeometricBucketer(),
     field_spec=[
         monitor.StringField('step'),
         monitor.StringField('job'),
-        monitor.StringField('origin'),
+        monitor.BooleanField('from_fuzzer'),
     ],
 )
 
@@ -365,6 +365,7 @@
     field_spec=[
         monitor.StringField('job'),
         monitor.StringField('platform'),
+        monitor.StringField('step'),
     ])
 
 UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(