[Chrome deployment] Split utasks properly into success and error, add analyze step to testcase triage metric #4558

Merged: 2 commits, Dec 26, 2024
31 changes: 22 additions & 9 deletions src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
@@ -84,6 +84,17 @@ def __init__(self, subtask: _Subtask):
self._subtask = subtask
self._labels = None
self.utask_main_failure = None
self._utask_success_conditions = [
None, # This can be a successful return value in, e.g., fuzz task
uworker_msg_pb2.ErrorType.NO_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED, # pylint: disable=no-member
]

if subtask == _Subtask.PREPROCESS:
self._preprocess_start_time_ns = self.start_time_ns
@@ -125,6 +136,12 @@ def set_task_details(self,
# Ensure we always have a value after this method returns.
assert self._preprocess_start_time_ns is not None

def _infer_uworker_main_outcome(self, exc_type, uworker_error) -> bool:
"""Returns True if task succeeded, False otherwise."""
if exc_type or uworker_error not in self._utask_success_conditions:
return False
return True

def __exit__(self, _exc_type, _exc_value, _traceback):
# Ignore exception details, let Python continue unwinding the stack.

@@ -145,11 +162,12 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
# The only case where a task might fail without throwing, is in
# utask_main, by returning an ErrorType proto which indicates
# failure.
outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
task_succeeded = self._infer_uworker_main_outcome(_exc_type,
self.utask_main_failure)
monitoring_metrics.TASK_OUTCOME_COUNT.increment({
**self._labels, 'outcome': outcome
**self._labels, 'task_succeeded': task_succeeded
})
if outcome == "success":
if task_succeeded:
error_condition = 'N/A'
elif _exc_type:
error_condition = 'UNHANDLED_EXCEPTION'
@@ -161,16 +179,11 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
# labels limit recommended by gcp.
trimmed_labels = self._labels
del trimmed_labels['job']
trimmed_labels['outcome'] = outcome
trimmed_labels['task_succeeded'] = task_succeeded
trimmed_labels['error_condition'] = error_condition
monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
trimmed_labels)

if error_condition != 'UNHANDLED_EXCEPTION':
task = self._labels['task']
subtask = self._labels['subtask']
logs.info(f'Task {task}, at subtask {subtask}, finished successfully.')


def ensure_uworker_env_type_safety(uworker_env):
"""Converts all values in |uworker_env| to str types.
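To make the new success inference concrete, here is a minimal sketch of the same allow-list logic, assuming stand-in integer codes in place of the real uworker_msg_pb2.ErrorType values (which only exist inside a ClusterFuzz checkout):

# Stand-in error codes; the real allow-list lives in _utask_success_conditions above.
NO_ERROR = 0           # stand-in for ErrorType.NO_ERROR
ANALYZE_NO_CRASH = 1   # a benign outcome that still counts as success
SOME_REAL_FAILURE = 2  # any error type that is not on the allow-list

_SUCCESS_CONDITIONS = [None, NO_ERROR, ANALYZE_NO_CRASH]


def infer_outcome(exc_type, uworker_error) -> bool:
  """An unhandled exception, or an error outside the allow-list, is a failure."""
  return exc_type is None and uworker_error in _SUCCESS_CONDITIONS


assert infer_outcome(None, NO_ERROR)              # clean run
assert infer_outcome(None, ANALYZE_NO_CRASH)      # expected non-crash still succeeds
assert not infer_outcome(None, SOME_REAL_FAILURE)
assert not infer_outcome(ValueError, NO_ERROR)    # exceptions always count as failure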
2 changes: 2 additions & 0 deletions src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
@@ -552,6 +552,8 @@ def _update_testcase(output):
if analyze_task_output.platform_id:
testcase.platform_id = analyze_task_output.platform_id

testcase.analyze_pending = False

testcase.put()


40 changes: 10 additions & 30 deletions src/clusterfuzz/_internal/common/testcase_utils.py
@@ -31,59 +31,39 @@


def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
'''Finds out if a testcase is fuzzer generated or manually uploaded,
and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
if not testcase_upload_metadata:
logs.warning(f'No upload metadata found for testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return
if not testcase_upload_metadata.timestamp:
logs.warning(
f'No timestamp for testcase {testcase_upload_metadata.testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
return
assert step in [
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]

"""Finds out if a testcase is fuzzer generated or manually uploaded,
and emits the TESTCASE_TRIAGE_DURATION metric."""
testcase = data_handler.get_testcase_by_id(testcase_id)

if not testcase:
logs.warning(f'No testcase found with id {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
return

if not testcase.job_type:
logs.warning(f'No job_type associated to testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
return

from_fuzzer = not get_testcase_upload_metadata(testcase_id)

assert step in [
'analyze_launched', 'analyze_completed', 'minimize_completed',
'regression_completed', 'impact_completed', 'issue_updated'
]

if not testcase.get_age_in_seconds():
logs.warning(f'No timestamp associated to testcase {testcase_id},'
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
' failed to emit TESTCASE_TRIAGE_DURATION metric.')
return

testcase_age_in_hours = testcase.get_age_in_seconds() / 3600
testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60)

logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
logs.info('Emitting TESTCASE_TRIAGE_DURATION metric for testcase '
f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
'in step {step}.')
f'in step {step}, from_fuzzer: {from_fuzzer}.')

monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
monitoring_metrics.TESTCASE_TRIAGE_DURATION.add(
testcase_age_in_hours,
labels={
'job': testcase.job_type,
'step': step,
'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
'from_fuzzer': from_fuzzer
})


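For illustration, a hypothetical call site for the renamed helper; the testcase id is made up, and the step string must be one of the values accepted by the assert in the function above:

from clusterfuzz._internal.common import testcase_utils

# Record triage latency once analyze finishes for a (hypothetical) testcase.
testcase_utils.emit_testcase_triage_duration_metric(
    testcase_id=12345, step='analyze_completed')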
46 changes: 26 additions & 20 deletions src/clusterfuzz/_internal/cron/triage.py
@@ -256,6 +256,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker):

def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase):
testcase_age = testcase.get_age_in_seconds()
if not testcase_age:
return
monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add(
testcase_age,
labels={
@@ -336,27 +338,30 @@ def _emit_untriaged_testcase_count_metric():
})


def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
PENDING_ANALYZE = 'pending_analyze'
PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
PENDING_PROGRESSION = 'pending_progression'
PENDING_GROUPING = 'pending_grouping'
PENDING_FILING = 'pending_filing'


def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase,
step: str):
"""Emmits a metric to track age of untriaged testcases."""
if not testcase.timestamp:
if not testcase.get_age_in_seconds():
return

logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} '
f'(age = {testcase.get_age_in_seconds()})')
f'(age = {testcase.get_age_in_seconds()}), step = {step}')
monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
testcase.get_age_in_seconds() / 3600,
labels={
'job': testcase.job_type,
'platform': testcase.platform,
'step': step,
})


PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
PENDING_PROGRESSION = 'pending_progression'
PENDING_GROUPING = 'pending_grouping'
PENDING_FILING = 'pending_filing'


def main():
"""Files bugs."""
try:
@@ -409,7 +414,7 @@ def main():
if testcase.get_metadata('progression_pending'):
_set_testcase_stuck_state(testcase, True)
logs.info(f'Skipping testcase {testcase_id}, progression pending')
_emit_untriaged_testcase_age_metric(testcase)
_emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION)
_increment_untriaged_testcase_count(testcase.job_type,
PENDING_PROGRESSION)
logs.info(f'Skipping testcase {testcase_id}, progression pending')
@@ -433,10 +438,12 @@
# Require that all tasks like minimizaton, regression testing, etc have
# finished.
if not critical_tasks_completed:
_emit_untriaged_testcase_age_metric(testcase)
status = PENDING_CRITICAL_TASKS
if testcase.analyze_pending:
status = PENDING_ANALYZE
_emit_untriaged_testcase_age_metric(testcase, status)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type,
PENDING_CRITICAL_TASKS)
_increment_untriaged_testcase_count(testcase.job_type, status)
logs.info(
f'Skipping testcase {testcase_id}, critical tasks still pending.')
continue
@@ -453,15 +460,15 @@
# metadata works well.
if not testcase.group_id and not dates.time_has_expired(
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
_emit_untriaged_testcase_age_metric(testcase)
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
continue

if not testcase.get_metadata('ran_grouper'):
# Testcase should be considered by the grouper first before filing.
_emit_untriaged_testcase_age_metric(testcase)
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
_set_testcase_stuck_state(testcase, True)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
@@ -491,16 +498,15 @@ def main():
# Clean up old triage messages that would be not applicable now.
testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)

# A testcase is untriaged, until immediately before a bug is opened
_emit_untriaged_testcase_age_metric(testcase)
_set_testcase_stuck_state(testcase, False)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)

# File the bug first and then create filed bug metadata.
if not _file_issue(testcase, issue_tracker, throttler):
_emit_untriaged_testcase_age_metric(testcase, PENDING_FILING)
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
logs.info(f'Issue filing failed for testcase id {testcase_id}')
continue

_set_testcase_stuck_state(testcase, False)

_create_filed_bug_metadata(testcase)
issue_filer.notify_issue_update(testcase, 'new')

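Distilled from the diff above, this is roughly how the cron now maps an untriaged testcase to a step label. The helper below is illustrative only (it is not part of the PR) and omits details such as the grouping time window and issue-filing throttling:

PENDING_ANALYZE = 'pending_analyze'
PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
PENDING_PROGRESSION = 'pending_progression'
PENDING_GROUPING = 'pending_grouping'
PENDING_FILING = 'pending_filing'


def classify_untriaged_step(testcase, critical_tasks_completed: bool) -> str:
  """Returns the 'step' label reported for an untriaged testcase."""
  if testcase.get_metadata('progression_pending'):
    return PENDING_PROGRESSION
  if not critical_tasks_completed:
    # Analyze is split out so a stuck or lost analyze task is visible on its own.
    if testcase.analyze_pending:
      return PENDING_ANALYZE
    return PENDING_CRITICAL_TASKS
  if not testcase.get_metadata('ran_grouper'):
    return PENDING_GROUPING
  return PENDING_FILING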
3 changes: 2 additions & 1 deletion src/clusterfuzz/_internal/datastore/data_handler.py
@@ -921,7 +921,7 @@ def critical_tasks_completed(testcase):
return testcase.minimized_keys and testcase.regression

return bool(testcase.minimized_keys and testcase.regression and
testcase.is_impact_set_flag)
testcase.is_impact_set_flag and not testcase.analyze_pending)


# ------------------------------------------------------------------------------
@@ -1380,6 +1380,7 @@ def create_user_uploaded_testcase(key,

testcase.timestamp = utils.utcnow()
testcase.created = testcase.timestamp
testcase.analyze_pending = True
testcase.uploader_email = uploader_email
testcase.put()

4 changes: 4 additions & 0 deletions src/clusterfuzz/_internal/datastore/data_types.py
@@ -583,6 +583,10 @@ class Testcase(Model):
# Tracks if a testcase is stuck during triage.
stuck_in_triage = ndb.BooleanProperty(default=False)

# Tracks if analyze task is pending.

Review comment (Collaborator): Do we want to track if just analyze is pending, and not analyze/minimize/regression?

Reply (vitorguidi, author, Dec 26, 2024): Minimize/regression/impact are tracked together under critical tasks. The idea behind splitting analyze out is to be able to catch the infamous pubsub incident. Also, there is no reason analyze should fail: the testcase either reproduces or it does not, so if the task never completes, something is wrong, as opposed to minimize, where there are legitimate reasons for it to get stuck.

Since we already have quite a few stuck testcases under critical tasks, we do not gain much by splitting until we clean up the old stuff.

# Defaults to false, since most testcases are fuzzer produced.
analyze_pending = ndb.BooleanProperty(default=False)

def is_chromium(self):
return self.project_name in ('chromium', 'chromium-testing')

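Putting the pieces together, the intended analyze_pending lifecycle appears to be: the flag is set when a user-uploaded testcase is created, cleared when analyze_task writes its results back, and consulted by data_handler.critical_tasks_completed(), so triage keeps reporting pending_analyze until analyze has actually finished. A simplified restatement of that check (the real function also special-cases certain testcase kinds):

def critical_tasks_completed(testcase) -> bool:
  # A testcase now only counts as fully triaged once analyze has finished too.
  return bool(testcase.minimized_keys and testcase.regression and
              testcase.is_impact_set_flag and not testcase.analyze_pending)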
13 changes: 7 additions & 6 deletions src/clusterfuzz/_internal/metrics/monitoring_metrics.py
@@ -231,17 +231,17 @@
],
)

TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
'uploaded_testcase_analysis/triage_duration_secs',
TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
'testcase_analysis/triage_duration_hours',
description=('Time elapsed between testcase upload and completion'
' of relevant tasks in the testcase upload lifecycle.'
' of relevant tasks in the testcase lifecycle.'
' Origin can be either from a fuzzer, or a manual'
' upload. Measured in hours.'),
bucketer=monitor.GeometricBucketer(),
field_spec=[
monitor.StringField('step'),
monitor.StringField('job'),
monitor.StringField('origin'),
monitor.BooleanField('from_fuzzer'),
],
)

@@ -263,7 +263,7 @@
monitor.StringField('subtask'),
monitor.StringField('mode'),
monitor.StringField('platform'),
monitor.StringField('outcome'),
monitor.BooleanField('task_succeeded'),
])

TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric(
@@ -274,7 +274,7 @@
monitor.StringField('subtask'),
monitor.StringField('mode'),
monitor.StringField('platform'),
monitor.StringField('outcome'),
monitor.BooleanField('task_succeeded'),
monitor.StringField('error_condition'),
])

@@ -365,6 +365,7 @@
field_spec=[
monitor.StringField('job'),
monitor.StringField('platform'),
monitor.StringField('step'),
])

UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(