From 5655b43fbc8603f5c0a597f61066c359a757d8de Mon Sep 17 00:00:00 2001 From: Vitor Guidi Date: Thu, 26 Dec 2024 11:56:38 -0300 Subject: [PATCH] [Chrome deployment] Split utasks properly into success and error, add analyze step to testcase triage metric (#4558) This merges #4547 and #4516 to the chrome branch. --- .../_internal/bot/tasks/utasks/__init__.py | 31 +++++++++---- .../bot/tasks/utasks/analyze_task.py | 2 + .../_internal/common/testcase_utils.py | 40 ++++------------ src/clusterfuzz/_internal/cron/triage.py | 46 +++++++++++-------- .../_internal/datastore/data_handler.py | 3 +- .../_internal/datastore/data_types.py | 4 ++ .../_internal/metrics/monitoring_metrics.py | 13 +++--- 7 files changed, 73 insertions(+), 66 deletions(-) diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py index b69bfebd96..2b9af56a70 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py @@ -84,6 +84,17 @@ def __init__(self, subtask: _Subtask): self._subtask = subtask self._labels = None self.utask_main_failure = None + self._utask_success_conditions = [ + None, # This can be a successful return value in, e.g., fuzz task + uworker_msg_pb2.ErrorType.NO_ERROR, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED, # pylint: disable=no-member + ] if subtask == _Subtask.PREPROCESS: 
self._preprocess_start_time_ns = self.start_time_ns @@ -125,6 +136,12 @@ def set_task_details(self, # Ensure we always have a value after this method returns. assert self._preprocess_start_time_ns is not None + def _infer_uworker_main_outcome(self, exc_type, uworker_error) -> bool: + """Returns True if task succeeded, False otherwise.""" + if exc_type or uworker_error not in self._utask_success_conditions: + return False + return True + def __exit__(self, _exc_type, _exc_value, _traceback): # Ignore exception details, let Python continue unwinding the stack. @@ -145,11 +162,12 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # The only case where a task might fail without throwing, is in # utask_main, by returning an ErrorType proto which indicates # failure. - outcome = 'error' if _exc_type or self.utask_main_failure else 'success' + task_succeeded = self._infer_uworker_main_outcome(_exc_type, + self.utask_main_failure) monitoring_metrics.TASK_OUTCOME_COUNT.increment({ - **self._labels, 'outcome': outcome + **self._labels, 'task_succeeded': task_succeeded }) - if outcome == "success": + if task_succeeded: error_condition = 'N/A' elif _exc_type: error_condition = 'UNHANDLED_EXCEPTION' @@ -161,16 +179,11 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # labels limit recommended by gcp. trimmed_labels = self._labels del trimmed_labels['job'] - trimmed_labels['outcome'] = outcome + trimmed_labels['task_succeeded'] = task_succeeded trimmed_labels['error_condition'] = error_condition monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment( trimmed_labels) - if error_condition != 'UNHANDLED_EXCEPTION': - task = self._labels['task'] - subtask = self._labels['subtask'] - logs.info(f'Task {task}, at subtask {subtask}, finished successfully.') - def ensure_uworker_env_type_safety(uworker_env): """Converts all values in |uworker_env| to str types. 
diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py index 785974f735..1f0b7e8f58 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py @@ -552,6 +552,8 @@ def _update_testcase(output): if analyze_task_output.platform_id: testcase.platform_id = analyze_task_output.platform_id + testcase.analyze_pending = False + testcase.put() diff --git a/src/clusterfuzz/_internal/common/testcase_utils.py b/src/clusterfuzz/_internal/common/testcase_utils.py index 73fe9edfd4..4314dcd55f 100644 --- a/src/clusterfuzz/_internal/common/testcase_utils.py +++ b/src/clusterfuzz/_internal/common/testcase_utils.py @@ -31,59 +31,39 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str): - '''Finds out if a testcase is fuzzer generated or manually uploaded, - and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.''' - testcase_upload_metadata = get_testcase_upload_metadata(testcase_id) - if not testcase_upload_metadata: - logs.warning(f'No upload metadata found for testcase {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') - return - if not testcase_upload_metadata.timestamp: - logs.warning( - f'No timestamp for testcase {testcase_upload_metadata.testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') - return - assert step in [ - 'analyze_launched', 'analyze_completed', 'minimize_completed', - 'regression_completed', 'impact_completed', 'issue_updated' - ] - + """Finds out if a testcase is fuzzer generated or manually uploaded, + and emits the TESTCASE_TRIAGE_DURATION metric.""" testcase = data_handler.get_testcase_by_id(testcase_id) if not testcase: logs.warning(f'No testcase found with id {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') return if not testcase.job_type: logs.warning(f'No 
job_type associated to testcase {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') return from_fuzzer = not get_testcase_upload_metadata(testcase_id) - assert step in [ - 'analyze_launched', 'analyze_completed', 'minimize_completed', - 'regression_completed', 'impact_completed', 'issue_updated' - ] - if not testcase.get_age_in_seconds(): logs.warning(f'No timestamp associated to testcase {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') return - testcase_age_in_hours = testcase.get_age_in_seconds() / 3600 + testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60) - logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase ' + logs.info('Emitting TESTCASE_TRIAGE_DURATION metric for testcase ' f'{testcase_id} (age = {testcase_age_in_hours} hours.) ' - 'in step {step}.') + f'in step {step}, from_fuzzer: {from_fuzzer}.') - monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add( + monitoring_metrics.TESTCASE_TRIAGE_DURATION.add( testcase_age_in_hours, labels={ 'job': testcase.job_type, 'step': step, - 'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded' + 'from_fuzzer': from_fuzzer }) diff --git a/src/clusterfuzz/_internal/cron/triage.py b/src/clusterfuzz/_internal/cron/triage.py index 0b96e45d94..f3c4887fe3 100644 --- a/src/clusterfuzz/_internal/cron/triage.py +++ b/src/clusterfuzz/_internal/cron/triage.py @@ -256,6 +256,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker): def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase): testcase_age = testcase.get_age_in_seconds() + if not testcase_age: + return monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add( testcase_age, labels={ @@ -336,27 +338,30 @@ def _emit_untriaged_testcase_count_metric(): }) -def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
+PENDING_CRITICAL_TASKS = 'pending_critical_tasks' +PENDING_PROGRESSION = 'pending_progression' +PENDING_GROUPING = 'pending_grouping' +PENDING_FILING = 'pending_filing' + + +def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase, + step: str): """Emmits a metric to track age of untriaged testcases.""" - if not testcase.timestamp: + if not testcase.get_age_in_seconds(): return logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} ' - f'(age = {testcase.get_age_in_seconds()})') + f'(age = {testcase.get_age_in_seconds()}), step = {step}') monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add( testcase.get_age_in_seconds() / 3600, labels={ 'job': testcase.job_type, 'platform': testcase.platform, + 'step': step, }) -PENDING_CRITICAL_TASKS = 'pending_critical_tasks' -PENDING_PROGRESSION = 'pending_progression' -PENDING_GROUPING = 'pending_grouping' -PENDING_FILING = 'pending_filing' - - def main(): """Files bugs.""" try: @@ -409,7 +414,7 @@ def main(): if testcase.get_metadata('progression_pending'): _set_testcase_stuck_state(testcase, True) logs.info(f'Skipping testcase {testcase_id}, progression pending') - _emit_untriaged_testcase_age_metric(testcase) + _emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION) _increment_untriaged_testcase_count(testcase.job_type, PENDING_PROGRESSION) logs.info(f'Skipping testcase {testcase_id}, progression pending') @@ -433,10 +438,12 @@ def main(): # Require that all tasks like minimizaton, regression testing, etc have # finished. 
if not critical_tasks_completed: - _emit_untriaged_testcase_age_metric(testcase) + status = PENDING_CRITICAL_TASKS + if testcase.analyze_pending: + status = PENDING_ANALYZE + _emit_untriaged_testcase_age_metric(testcase, status) _set_testcase_stuck_state(testcase, True) - _increment_untriaged_testcase_count(testcase.job_type, - PENDING_CRITICAL_TASKS) + _increment_untriaged_testcase_count(testcase.job_type, status) logs.info( f'Skipping testcase {testcase_id}, critical tasks still pending.') continue @@ -453,7 +460,7 @@ def main(): # metadata works well. if not testcase.group_id and not dates.time_has_expired( testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT): - _emit_untriaged_testcase_age_metric(testcase) + _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING) _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') @@ -461,7 +468,7 @@ def main(): if not testcase.get_metadata('ran_grouper'): # Testcase should be considered by the grouper first before filing. - _emit_untriaged_testcase_age_metric(testcase) + _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING) _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') @@ -491,16 +498,15 @@ def main(): # Clean up old triage messages that would be not applicable now. testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False) - # A testcase is untriaged, until immediately before a bug is opened - _emit_untriaged_testcase_age_metric(testcase) - _set_testcase_stuck_state(testcase, False) - _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING) - # File the bug first and then create filed bug metadata. 
if not _file_issue(testcase, issue_tracker, throttler): + _emit_untriaged_testcase_age_metric(testcase, PENDING_FILING) + _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING) logs.info(f'Issue filing failed for testcase id {testcase_id}') continue + _set_testcase_stuck_state(testcase, False) + _create_filed_bug_metadata(testcase) issue_filer.notify_issue_update(testcase, 'new') diff --git a/src/clusterfuzz/_internal/datastore/data_handler.py b/src/clusterfuzz/_internal/datastore/data_handler.py index ff9d608cfc..cf1aafb5ff 100644 --- a/src/clusterfuzz/_internal/datastore/data_handler.py +++ b/src/clusterfuzz/_internal/datastore/data_handler.py @@ -921,7 +921,7 @@ def critical_tasks_completed(testcase): return testcase.minimized_keys and testcase.regression return bool(testcase.minimized_keys and testcase.regression and - testcase.is_impact_set_flag) + testcase.is_impact_set_flag and not testcase.analyze_pending) # ------------------------------------------------------------------------------ @@ -1380,6 +1380,7 @@ def create_user_uploaded_testcase(key, testcase.timestamp = utils.utcnow() testcase.created = testcase.timestamp + testcase.analyze_pending = True testcase.uploader_email = uploader_email testcase.put() diff --git a/src/clusterfuzz/_internal/datastore/data_types.py b/src/clusterfuzz/_internal/datastore/data_types.py index bd4c3e2e44..ed0b3c16a3 100644 --- a/src/clusterfuzz/_internal/datastore/data_types.py +++ b/src/clusterfuzz/_internal/datastore/data_types.py @@ -583,6 +583,10 @@ class Testcase(Model): # Tracks if a testcase is stuck during triage. stuck_in_triage = ndb.BooleanProperty(default=False) + # Tracks if analyze task is pending. + # Defaults to false, since most testcases are fuzzer produced. 
+ analyze_pending = ndb.BooleanProperty(default=False) + def is_chromium(self): return self.project_name in ('chromium', 'chromium-testing') diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py index cac65fd4ec..dfd92e100e 100644 --- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py +++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py @@ -231,17 +231,17 @@ ], ) -TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric( - 'uploaded_testcase_analysis/triage_duration_secs', +TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric( + 'testcase_analysis/triage_duration_hours', description=('Time elapsed between testcase upload and completion' - ' of relevant tasks in the testcase upload lifecycle.' + ' of relevant tasks in the testcase lifecycle.' ' Origin can be either from a fuzzer, or a manual' ' upload. Measured in hours.'), bucketer=monitor.GeometricBucketer(), field_spec=[ monitor.StringField('step'), monitor.StringField('job'), - monitor.StringField('origin'), + monitor.BooleanField('from_fuzzer'), ], ) @@ -263,7 +263,7 @@ monitor.StringField('subtask'), monitor.StringField('mode'), monitor.StringField('platform'), - monitor.StringField('outcome'), + monitor.BooleanField('task_succeeded'), ]) TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric( @@ -274,7 +274,7 @@ monitor.StringField('subtask'), monitor.StringField('mode'), monitor.StringField('platform'), - monitor.StringField('outcome'), + monitor.BooleanField('task_succeeded'), monitor.StringField('error_condition'), ]) @@ -365,6 +365,7 @@ field_spec=[ monitor.StringField('job'), monitor.StringField('platform'), + monitor.StringField('step'), ]) UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(