From 246122fed21c12f5ba892db31a58ab49e3ba5c86 Mon Sep 17 00:00:00 2001 From: Vitor Guidi Date: Fri, 27 Dec 2024 12:40:16 -0300 Subject: [PATCH] Merge remainder monitoring PRs to oss fuzz (#4563) This merges #4512 , #4516 , #4547 , #4497, #4560 and #4561 --- infra/terraform/monitoring.tf | 1622 +++++++++++++++++ .../_internal/bot/tasks/utasks/__init__.py | 61 +- .../bot/tasks/utasks/analyze_task.py | 2 + .../_internal/common/testcase_utils.py | 40 +- src/clusterfuzz/_internal/cron/triage.py | 46 +- .../_internal/datastore/data_handler.py | 3 +- .../_internal/datastore/data_types.py | 4 + .../_internal/metrics/monitoring_metrics.py | 13 +- 8 files changed, 1684 insertions(+), 107 deletions(-) create mode 100644 infra/terraform/monitoring.tf diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf new file mode 100644 index 0000000000..7d9a540d76 --- /dev/null +++ b/infra/terraform/monitoring.tf @@ -0,0 +1,1622 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { + dashboard_json = < bool: + """Returns True if task succeeded, False otherwise.""" + if exc_type or uworker_error not in self._utask_success_conditions: + return False + return True def __exit__(self, _exc_type, _exc_value, _traceback): # Ignore exception details, let Python continue unwinding the stack. @@ -201,12 +162,12 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # The only case where a task might fail without throwing, is in # utask_main, by returning an ErrorType proto which indicates # failure. - outcome = self._infer_uworker_main_outcome(_exc_type, - self.utask_main_failure) + task_succeeded = self._infer_uworker_main_outcome(_exc_type, + self.utask_main_failure) monitoring_metrics.TASK_OUTCOME_COUNT.increment({ - **self._labels, 'outcome': outcome + **self._labels, 'task_succeeded': task_succeeded }) - if outcome == "success": + if task_succeeded: error_condition = 'N/A' elif _exc_type: error_condition = 'UNHANDLED_EXCEPTION' @@ -218,7 +179,7 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # labels limit recommended by gcp. trimmed_labels = self._labels del trimmed_labels['job'] - trimmed_labels['outcome'] = outcome + trimmed_labels['task_succeeded'] = task_succeeded trimmed_labels['error_condition'] = error_condition monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment( trimmed_labels) diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py index 785974f735..1f0b7e8f58 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py @@ -552,6 +552,8 @@ def _update_testcase(output): if analyze_task_output.platform_id: testcase.platform_id = analyze_task_output.platform_id + testcase.analyze_pending = False + testcase.put() diff --git a/src/clusterfuzz/_internal/common/testcase_utils.py b/src/clusterfuzz/_internal/common/testcase_utils.py index 73fe9edfd4..4314dcd55f 100644 --- a/src/clusterfuzz/_internal/common/testcase_utils.py +++ b/src/clusterfuzz/_internal/common/testcase_utils.py @@ -31,59 +31,39 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str): - '''Finds out if a testcase is fuzzer generated or manually uploaded, - and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.''' - testcase_upload_metadata = get_testcase_upload_metadata(testcase_id) - if not testcase_upload_metadata: - logs.warning(f'No upload metadata found for testcase {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') - return - if not testcase_upload_metadata.timestamp: - logs.warning( - f'No timestamp for testcase {testcase_upload_metadata.testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') - return - assert step in [ - 'analyze_launched', 'analyze_completed', 'minimize_completed', - 'regression_completed', 'impact_completed', 'issue_updated' - ] - + """Finds out if a testcase is fuzzer generated or manually uploaded, + and emits the TESTCASE_TRIAGE_DURATION metric.""" testcase = data_handler.get_testcase_by_id(testcase_id) if not testcase: logs.warning(f'No testcase found with id {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') return if not testcase.job_type: logs.warning(f'No job_type associated to testcase {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') return from_fuzzer = not get_testcase_upload_metadata(testcase_id) - assert step in [ - 'analyze_launched', 'analyze_completed', 'minimize_completed', - 'regression_completed', 'impact_completed', 'issue_updated' - ] - if not testcase.get_age_in_seconds(): logs.warning(f'No timestamp associated to testcase {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') return - testcase_age_in_hours = testcase.get_age_in_seconds() / 3600 + testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60) - logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase ' + logs.info('Emiting TESTCASE_TRIAGE_DURATION metric for testcase ' f'{testcase_id} (age = {testcase_age_in_hours} hours.) ' - 'in step {step}.') + f'in step {step}, from_fuzzer: {from_fuzzer}.') - monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add( + monitoring_metrics.TESTCASE_TRIAGE_DURATION.add( testcase_age_in_hours, labels={ 'job': testcase.job_type, 'step': step, - 'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded' + 'from_fuzzer': from_fuzzer }) diff --git a/src/clusterfuzz/_internal/cron/triage.py b/src/clusterfuzz/_internal/cron/triage.py index 1f3b375922..ccfb03e8ae 100644 --- a/src/clusterfuzz/_internal/cron/triage.py +++ b/src/clusterfuzz/_internal/cron/triage.py @@ -256,6 +256,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker): def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase): testcase_age = testcase.get_age_in_seconds() + if not testcase_age: + return monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add( testcase_age, labels={ @@ -336,27 +338,30 @@ def _emit_untriaged_testcase_count_metric(): }) -def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase): +PENDING_ANALYZE = 'pending_analyze' +PENDING_CRITICAL_TASKS = 'pending_critical_tasks' +PENDING_PROGRESSION = 'pending_progression' +PENDING_GROUPING = 'pending_grouping' +PENDING_FILING = 'pending_filing' + + +def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase, + step: str): """Emmits a metric to track age of untriaged testcases.""" - if not testcase.timestamp: + if not testcase.get_age_in_seconds(): return logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} ' - f'(age = {testcase.get_age_in_seconds()})') + f'(age = {testcase.get_age_in_seconds()}), step = {step}') monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add( testcase.get_age_in_seconds() / 3600, labels={ 'job': testcase.job_type, 'platform': testcase.platform, + 'step': step, }) -PENDING_CRITICAL_TASKS = 'pending_critical_tasks' -PENDING_PROGRESSION = 'pending_progression' -PENDING_GROUPING = 'pending_grouping' -PENDING_FILING = 'pending_filing' - - def main(): """Files bugs.""" try: @@ -409,7 +414,7 @@ def main(): if testcase.get_metadata('progression_pending'): _set_testcase_stuck_state(testcase, True) logs.info(f'Skipping testcase {testcase_id}, progression pending') - _emit_untriaged_testcase_age_metric(testcase) + _emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION) _increment_untriaged_testcase_count(testcase.job_type, PENDING_PROGRESSION) continue @@ -432,10 +437,12 @@ def main(): # Require that all tasks like minimizaton, regression testing, etc have # finished. if not critical_tasks_completed: - _emit_untriaged_testcase_age_metric(testcase) + status = PENDING_CRITICAL_TASKS + if testcase.analyze_pending: + status = PENDING_ANALYZE + _emit_untriaged_testcase_age_metric(testcase, status) _set_testcase_stuck_state(testcase, True) - _increment_untriaged_testcase_count(testcase.job_type, - PENDING_CRITICAL_TASKS) + _increment_untriaged_testcase_count(testcase.job_type, status) logs.info( f'Skipping testcase {testcase_id}, critical tasks still pending.') continue @@ -452,7 +459,7 @@ def main(): # metadata works well. if not testcase.group_id and not dates.time_has_expired( testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT): - _emit_untriaged_testcase_age_metric(testcase) + _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING) _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') @@ -460,7 +467,7 @@ def main(): if not testcase.get_metadata('ran_grouper'): # Testcase should be considered by the grouper first before filing. - _emit_untriaged_testcase_age_metric(testcase) + _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING) _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') @@ -490,16 +497,15 @@ def main(): # Clean up old triage messages that would be not applicable now. testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False) - # A testcase is untriaged, until immediately before a bug is opened - _emit_untriaged_testcase_age_metric(testcase) - _set_testcase_stuck_state(testcase, False) - _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING) - # File the bug first and then create filed bug metadata. if not _file_issue(testcase, issue_tracker, throttler): + _emit_untriaged_testcase_age_metric(testcase, PENDING_FILING) + _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING) logs.info(f'Issue filing failed for testcase id {testcase_id}') continue + _set_testcase_stuck_state(testcase, False) + _create_filed_bug_metadata(testcase) issue_filer.notify_issue_update(testcase, 'new') diff --git a/src/clusterfuzz/_internal/datastore/data_handler.py b/src/clusterfuzz/_internal/datastore/data_handler.py index abe8d5c677..6b378609c4 100644 --- a/src/clusterfuzz/_internal/datastore/data_handler.py +++ b/src/clusterfuzz/_internal/datastore/data_handler.py @@ -921,7 +921,7 @@ def critical_tasks_completed(testcase): return testcase.minimized_keys and testcase.regression return bool(testcase.minimized_keys and testcase.regression and - testcase.is_impact_set_flag) + testcase.is_impact_set_flag and not testcase.analyze_pending) # ------------------------------------------------------------------------------ @@ -1379,6 +1379,7 @@ def create_user_uploaded_testcase(key, testcase.timestamp = utils.utcnow() testcase.created = testcase.timestamp + testcase.analyze_pending = True testcase.uploader_email = uploader_email testcase.put() diff --git a/src/clusterfuzz/_internal/datastore/data_types.py b/src/clusterfuzz/_internal/datastore/data_types.py index 8aa009ad36..cbc63c7f4b 100644 --- a/src/clusterfuzz/_internal/datastore/data_types.py +++ b/src/clusterfuzz/_internal/datastore/data_types.py @@ -583,6 +583,10 @@ class Testcase(Model): # Tracks if a testcase is stuck during triage. stuck_in_triage = ndb.BooleanProperty(default=False) + # Tracks if analyze task is pending. + # Defaults to false, since most testcases are fuzzer produced. + analyze_pending = ndb.BooleanProperty(default=False) + def is_chromium(self): return self.project_name in ('chromium', 'chromium-testing') diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py index cac65fd4ec..dfd92e100e 100644 --- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py +++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py @@ -231,17 +231,17 @@ ], ) -TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric( - 'uploaded_testcase_analysis/triage_duration_secs', +TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric( + 'testcase_analysis/triage_duration_hours', description=('Time elapsed between testcase upload and completion' - ' of relevant tasks in the testcase upload lifecycle.' + ' of relevant tasks in the testcase lifecycle.' ' Origin can be either from a fuzzer, or a manual' ' upload. Measured in hours.'), bucketer=monitor.GeometricBucketer(), field_spec=[ monitor.StringField('step'), monitor.StringField('job'), - monitor.StringField('origin'), + monitor.BooleanField('from_fuzzer'), ], ) @@ -263,7 +263,7 @@ monitor.StringField('subtask'), monitor.StringField('mode'), monitor.StringField('platform'), - monitor.StringField('outcome'), + monitor.BooleanField('task_succeeded'), ]) TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric( @@ -274,7 +274,7 @@ monitor.StringField('subtask'), monitor.StringField('mode'), monitor.StringField('platform'), - monitor.StringField('outcome'), + monitor.BooleanField('task_succeeded'), monitor.StringField('error_condition'), ]) @@ -365,6 +365,7 @@ field_spec=[ monitor.StringField('job'), monitor.StringField('platform'), + monitor.StringField('step'), ]) UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(