Skip to content

Commit

Permalink
[Monitoring] Adding a metric for task outcome (#4458)
Browse files Browse the repository at this point in the history
### Motivation

We currently have no metric that tracks the error rate for each task.
This PR adds one: the error rate can be obtained by summing the metric
over outcome=error and dividing by the sum over all outcomes.

This is useful for SLI alerting.

Part of #4271
  • Loading branch information
vitorguidi authored Dec 10, 2024
1 parent 591d6c5 commit 514cec0
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 0 deletions.
33 changes: 33 additions & 0 deletions src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from clusterfuzz._internal.bot.webserver import http_server
from clusterfuzz._internal.metrics import logs
from clusterfuzz._internal.metrics import monitoring_metrics
from clusterfuzz._internal.protos import uworker_msg_pb2
from clusterfuzz._internal.system import environment

# Define an alias to appease pylint.
Expand Down Expand Up @@ -74,12 +75,15 @@ class _MetricRecorder(contextlib.AbstractContextManager):
Members:
start_time_ns (int): The time at which this recorder was constructed, in
nanoseconds since the Unix epoch.
utask_main_failure: The uworker_msg_pb2.ErrorType value returned by
utask_main when it reports a failure (None otherwise); used to emit a metric.
"""

def __init__(self, subtask: _Subtask):
self.start_time_ns = time.time_ns()
self._subtask = subtask
self._labels = None
self.utask_main_failure = None

if subtask == _Subtask.PREPROCESS:
self._preprocess_start_time_ns = self.start_time_ns
Expand Down Expand Up @@ -138,6 +142,30 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
monitoring_metrics.UTASK_SUBTASK_E2E_DURATION_SECS.add(
e2e_duration_secs, self._labels)

# A task can fail without raising only in utask_main, which signals the
# failure by returning an ErrorType other than NO_ERROR. The callers of
# utask_main store that value in `self.utask_main_failure`.
outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
monitoring_metrics.TASK_OUTCOME_COUNT.increment({
    **self._labels, 'outcome': outcome
})

# Classify the failure: nothing to report on success, a fixed marker for an
# exception escaping the `with` body, otherwise the proto enum's name.
if outcome == 'success':
  error_condition = 'N/A'
elif _exc_type:
  error_condition = 'UNHANDLED_EXCEPTION'
else:
  error_condition = uworker_msg_pb2.ErrorType.Name(  # pylint: disable=no-member
      self.utask_main_failure)

# Get rid of job as a label, so we can have another metric to make
# error conditions more explicit, respecting the 30k distinct
# labels limit recommended by gcp.
# Build a new dict rather than aliasing `self._labels`: deleting/adding keys
# through an alias would silently rewrite the recorder's own labels.
trimmed_labels = {
    key: value for key, value in self._labels.items() if key != 'job'
}
trimmed_labels['outcome'] = outcome
trimmed_labels['error_condition'] = error_condition
monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(trimmed_labels)


def ensure_uworker_env_type_safety(uworker_env):
"""Converts all values in |uworker_env| to str types.
Expand Down Expand Up @@ -226,6 +254,8 @@ def uworker_main_no_io(utask_module, serialized_uworker_input):
return None

# NOTE: Keep this in sync with `uworker_main()`.
if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR: # pylint: disable=no-member
recorder.utask_main_failure = uworker_output.error_type
uworker_output.bot_name = environment.get_value('BOT_NAME', '')
uworker_output.platform_id = environment.get_platform_id()

Expand Down Expand Up @@ -306,6 +336,9 @@ def uworker_main(input_download_url) -> None:
logs.info('Starting utask_main: %s.' % utask_module)
uworker_output = utask_module.utask_main(uworker_input)

if uworker_output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR: # pylint: disable=no-member
recorder.utask_main_failure = uworker_output.error_type

# NOTE: Keep this in sync with `uworker_main_no_io()`.
uworker_output.bot_name = environment.get_value('BOT_NAME', '')
uworker_output.platform_id = environment.get_platform_id()
Expand Down
25 changes: 25 additions & 0 deletions src/clusterfuzz/_internal/metrics/monitoring_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@
monitor.StringField('job'),
],
)

TASK_RATE_LIMIT_COUNT = monitor.CounterMetric(
'task/rate_limit',
description=('Counter for rate limit events.'),
Expand All @@ -250,6 +251,30 @@
monitor.StringField('argument'),
])

# Per-task outcome counter. The 'outcome' field is emitted as either
# 'success' or 'error', so the description names those exact values.
TASK_OUTCOME_COUNT = monitor.CounterMetric(
    'task/outcome',
    description=('Counter metric for task outcome (success/error).'),
    field_spec=[
        monitor.StringField('task'),
        monitor.StringField('job'),
        monitor.StringField('subtask'),
        monitor.StringField('mode'),
        monitor.StringField('platform'),
        monitor.StringField('outcome'),
    ])

# Task outcome counter broken down by error condition. Compared with
# 'task/outcome', it declares an 'error_condition' field and no 'job' field.
TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric(
    'task/outcome_by_error_type',
    description='Counter metric for task outcome, with error type.',
    field_spec=[
        monitor.StringField(label) for label in (
            'task',
            'subtask',
            'mode',
            'platform',
            'outcome',
            'error_condition',
        )
    ])

UTASK_SUBTASK_E2E_DURATION_SECS = monitor.CumulativeDistributionMetric(
'utask/subtask_e2e_duration_secs',
description=(
Expand Down

0 comments on commit 514cec0

Please sign in to comment.