[Monitoring] Extend TESTCASE_UPLOAD_TRIAGE_DURATION to account for fuzzer generated test cases (#4481)

### Motivation

[#4364](#4364) implemented tracking of the time it takes ClusterFuzz to
complete several steps of the manually uploaded testcase lifecycle.

As per Chrome's request, the metric now carries an 'origin' label, which
indicates whether the testcase was 'manually_uploaded' or generated by a
'fuzzer'.
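
In the updated helper (see the testcase_utils.py diff below), the origin is
inferred from whether upload metadata exists for the testcase; a minimal
sketch of that decision, reusing the names from the diff:

```python
# Sketch of the origin derivation in emit_testcase_triage_duration_metric
# (mirrors the diff below): a testcase without upload metadata is treated
# as fuzzer generated.
from_fuzzer = not get_testcase_upload_metadata(testcase_id)
origin = 'fuzzer' if from_fuzzer else 'manually_uploaded'
```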

The code was also simplified by reusing the get_age_in_seconds method
from the Testcase entity.
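
Concretely, the helper now converts that age to hours instead of
recomputing the delta from the upload timestamp (a one-line sketch
mirroring the diff below):

```python
# Age in hours, reusing Testcase.get_age_in_seconds() (see diff below).
testcase_age_in_hours = testcase.get_age_in_seconds() / 3600
```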

It also adds a 'stuck_in_triage' boolean field to the Testcase entity,
making it easier to identify testcases stuck during triage so that
follow-up actions can be taken.
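
For instance, stuck testcases could then be listed with a plain Datastore
query on the new field (a hypothetical follow-up, not part of this change):

```python
# Hypothetical follow-up (not part of this change): list testcases that the
# triage cron has flagged as stuck.
from clusterfuzz._internal.datastore import data_types

stuck = data_types.Testcase.query(
    data_types.Testcase.stuck_in_triage == True)  # pylint: disable=singleton-comparison
for testcase in stuck:
  print(testcase.key.id(), testcase.job_type)
```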

Part of #4271
vitorguidi authored Dec 16, 2024
1 parent d3d1b76 commit 8da25b5
Showing 4 changed files with 47 additions and 8 deletions.
src/clusterfuzz/_internal/common/testcase_utils.py (24 changes: 19 additions & 5 deletions)
@@ -31,6 +31,8 @@


def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
'''Finds out if a testcase is fuzzer generated or manually uploaded,
and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
if not testcase_upload_metadata:
logs.warning(f'No upload metadata found for testcase {testcase_id},'
@@ -45,9 +47,6 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]
elapsed_time_since_upload = datetime.datetime.utcnow()
elapsed_time_since_upload -= testcase_upload_metadata.timestamp
elapsed_time_since_upload = elapsed_time_since_upload.total_seconds() / 3600

testcase = data_handler.get_testcase_by_id(testcase_id)

@@ -61,15 +60,30 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return

from_fuzzer = not get_testcase_upload_metadata(testcase_id)

assert step in [
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]

if not testcase.get_age_in_seconds():
logs.warning(f'No timestamp associated to testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return

testcase_age_in_hours = testcase.get_age_in_seconds() / 3600

logs.info('Emitting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
f'{testcase_id} (age = {elapsed_time_since_upload}) '
f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
f'in step {step}.')

monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
elapsed_time_since_upload,
testcase_age_in_hours,
labels={
'job': testcase.job_type,
'step': step,
'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
})


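For reference, a hedged sketch of how a task could invoke the helper above
once a triage step finishes; the call site is illustrative and not part of
this diff:

```python
# Illustrative call site (not part of this diff): emit the duration metric
# once the analyze step completes, for both manually uploaded and
# fuzzer-generated testcases.
from clusterfuzz._internal.common import testcase_utils

testcase_utils.emit_testcase_triage_duration_metric(
    testcase_id=testcase.key.id(), step='analyze_completed')
```
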
src/clusterfuzz/_internal/cron/triage.py (20 changes: 19 additions & 1 deletion)
@@ -309,6 +309,13 @@ def _file_issue(testcase, issue_tracker, throttler):
return filed


def _set_testcase_stuck_state(testcase: data_types.Testcase, state: bool):
if testcase.stuck_in_triage == state:
return
testcase.stuck_in_triage = state
testcase.put()


untriaged_testcases = {}


@@ -322,7 +329,8 @@ def _increment_untriaged_testcase_count(job, status):
def _emit_untriaged_testcase_count_metric():
for (job, status) in untriaged_testcases:
monitoring_metrics.UNTRIAGED_TESTCASE_COUNT.set(
untriaged_testcases[(job, status)], labels={
untriaged_testcases[(job, status)],
labels={
'job': job,
'status': status,
})
@@ -385,18 +393,21 @@ def main():

# Skip if testcase's job is removed.
if testcase.job_type not in all_jobs:
_set_testcase_stuck_state(testcase, False)
logs.info(f'Skipping testcase {testcase_id}, since its job was removed '
f' ({testcase.job_type})')
continue

# Skip if testcase's job is in exclusions list.
if testcase.job_type in excluded_jobs:
_set_testcase_stuck_state(testcase, False)
logs.info(f'Skipping testcase {testcase_id}, since its job is in the'
f' exclusion list ({testcase.job_type})')
continue

# Skip if we are running progression task at this time.
if testcase.get_metadata('progression_pending'):
_set_testcase_stuck_state(testcase, True)
logs.info(f'Skipping testcase {testcase_id}, progression pending')
_emit_untriaged_testcase_age_metric(testcase)
_increment_untriaged_testcase_count(testcase.job_type,
@@ -405,13 +416,15 @@

# If the testcase has a bug filed already, no triage is needed.
if _is_bug_filed(testcase):
_set_testcase_stuck_state(testcase, False)
logs.info(
f'Skipping testcase {testcase_id}, since a bug was already filed.')
continue

# Check if the crash is important, i.e. it is either a reproducible crash
# or an unreproducible crash happening frequently.
if not _is_crash_important(testcase):
_set_testcase_stuck_state(testcase, False)
logs.info(
f'Skipping testcase {testcase_id}, since the crash is not important.')
continue
@@ -420,6 +433,7 @@
# finished.
if not critical_tasks_completed:
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type,
PENDING_CRITICAL_TASKS)
logs.info(
@@ -439,13 +453,15 @@
if not testcase.group_id and not dates.time_has_expired(
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue

if not testcase.get_metadata('ran_grouper'):
# Testcase should be considered by the grouper first before filing.
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue
@@ -466,6 +482,7 @@
# If there are similar issues to this test case already filed or recently
# closed, skip filing a duplicate bug.
if _check_and_update_similar_bug(testcase, issue_tracker):
_set_testcase_stuck_state(testcase, False)
logs.info(f'Skipping testcase {testcase_id}, since a similar bug'
' was already filed.')
continue
@@ -475,6 +492,7 @@

# A testcase is untriaged, until immediately before a bug is opened
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, False)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)

# File the bug first and then create filed bug metadata.
src/clusterfuzz/_internal/datastore/data_types.py (5 changes: 5 additions & 0 deletions)
@@ -580,6 +580,9 @@ class Testcase(Model):
# corpus.
trusted = ndb.BooleanProperty(default=False)

# Tracks if a testcase is stuck during triage.
stuck_in_triage = ndb.BooleanProperty(default=False)

def is_chromium(self):
return self.project_name in ('chromium', 'chromium-testing')

@@ -686,6 +689,8 @@ def get_created_time(self) -> ndb.DateTimeProperty:

def get_age_in_seconds(self):
current_time = datetime.datetime.utcnow()
if not self.get_created_time():
return None
testcase_age = current_time - self.get_created_time()
return testcase_age.total_seconds()

src/clusterfuzz/_internal/metrics/monitoring_metrics.py (6 changes: 4 additions & 2 deletions)
@@ -234,12 +234,14 @@
TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
'uploaded_testcase_analysis/triage_duration_secs',
description=('Time elapsed between testcase upload and completion'
' of relevant tasks in the testcase upload lifecycle, '
'in hours.'),
' of relevant tasks in the testcase upload lifecycle.'
' Origin can be either from a fuzzer, or a manual'
' upload. Measured in hours.'),
bucketer=monitor.GeometricBucketer(),
field_spec=[
monitor.StringField('step'),
monitor.StringField('job'),
monitor.StringField('origin'),
],
)

