Skip to content

Commit

Permalink
feat(crons): Switch default behavior to thresholds (getsentry#61893)
Browse files Browse the repository at this point in the history
Changes the default behavior for crons issues so that one issue is created per
monitor incident. The default threshold is 1 for both issue creation and
issue recovery.
  • Loading branch information
rjo100 authored Dec 19, 2023
1 parent 78a6d86 commit 4fe1a05
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 44 deletions.
80 changes: 48 additions & 32 deletions src/sentry/monitors/logic/mark_failed.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ def mark_failed(
computed based on the tasks reference time.
"""
monitor_env = failed_checkin.monitor_environment
failure_issue_threshold = monitor_env.monitor.config.get("failure_issue_threshold", 0)
failure_issue_threshold = monitor_env.monitor.config.get("failure_issue_threshold", 1)
if not failure_issue_threshold:
failure_issue_threshold = 1

# Compute the next check-in time from our reference time
next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
Expand Down Expand Up @@ -94,7 +96,14 @@ def mark_failed(
monitor_env.refresh_from_db()

# Create incidents + issues
if failure_issue_threshold:
use_issue_platform = False
try:
organization = Organization.objects.get_from_cache(id=monitor_env.monitor.organization_id)
use_issue_platform = features.has("organizations:issue-platform", organization=organization)
except Organization.DoesNotExist:
pass

if use_issue_platform:
return mark_failed_threshold(failed_checkin, failure_issue_threshold)
else:
return mark_failed_no_threshold(failed_checkin)
Expand All @@ -110,22 +119,32 @@ def mark_failed_threshold(failed_checkin: MonitorCheckIn, failure_issue_threshol
fingerprint = None

# check to see if we need to update the status
if monitor_env.status == MonitorStatus.OK:
# reverse the list after slicing in order to start with oldest check-in
# use .values() to speed up query
previous_checkins = list(
reversed(
MonitorCheckIn.objects.filter(
monitor_environment=monitor_env, date_added__lte=failed_checkin.date_added
if monitor_env.status in [MonitorStatus.OK, MonitorStatus.ACTIVE]:
# evaluation logic for multiple check-ins
if failure_issue_threshold > 1:
# reverse the list after slicing in order to start with oldest check-in
# use .values() to speed up query
previous_checkins = list(
reversed(
MonitorCheckIn.objects.filter(
monitor_environment=monitor_env, date_added__lte=failed_checkin.date_added
)
.order_by("-date_added")
.values("id", "date_added", "status")[:failure_issue_threshold]
)
.exclude(status=CheckInStatus.IN_PROGRESS)
.order_by("-date_added")
.values("id", "date_added", "status")[:failure_issue_threshold]
)
)
# check for successive failed previous check-ins
if not all([checkin["status"] != CheckInStatus.OK for checkin in previous_checkins]):
return False
# check for any successful previous check-in
if any([checkin["status"] == CheckInStatus.OK for checkin in previous_checkins]):
return False
# if threshold is 1, just use the most recent check-in
else:
previous_checkins = [
{
"id": failed_checkin.id,
"date_added": failed_checkin.date_added,
"status": failed_checkin.status,
}
]

# change monitor status + update fingerprint timestamp
monitor_env.status = MonitorStatus.ERROR
Expand All @@ -151,13 +170,14 @@ def mark_failed_threshold(failed_checkin: MonitorCheckIn, failure_issue_threshol
MonitorStatus.MISSED_CHECKIN,
MonitorStatus.TIMEOUT,
]:
# if monitor environment has a failed status, get the most recent
# if monitor environment has a failed status, use the failed
# check-in and send occurrence
previous_checkins = [
MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
.order_by("-date_added")
.values("id", "date_added", "status")
.first()
{
"id": failed_checkin.id,
"date_added": failed_checkin.date_added,
"status": failed_checkin.status,
}
]

# get the existing grouphash from the monitor environment
Expand All @@ -184,21 +204,17 @@ def mark_failed_no_threshold(failed_checkin: MonitorCheckIn):

monitor_env = failed_checkin.monitor_environment

failed_status_map = {
CheckInStatus.MISSED: MonitorStatus.MISSED_CHECKIN,
CheckInStatus.TIMEOUT: MonitorStatus.TIMEOUT,
}
monitor_env.update(status=failed_status_map.get(failed_checkin.status, MonitorStatus.ERROR))

# Do not create event if monitor is muted
if monitor_env.monitor.is_muted:
return True

use_issue_platform = False
try:
organization = Organization.objects.get(id=monitor_env.monitor.organization_id)
use_issue_platform = features.has("organizations:issue-platform", organization=organization)
except Organization.DoesNotExist:
pass

if use_issue_platform:
create_issue_platform_occurrence(failed_checkin)
else:
create_legacy_event(failed_checkin)
create_legacy_event(failed_checkin)

monitor_environment_failed.send(monitor_environment=monitor_env, sender=type(monitor_env))

Expand Down
6 changes: 4 additions & 2 deletions src/sentry/monitors/logic/mark_ok.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):

if not monitor_env.monitor.is_muted and monitor_env.status != MonitorStatus.OK:
params["status"] = MonitorStatus.OK
recovery_threshold = monitor_env.monitor.config.get("recovery_threshold")
recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 1)
if not recovery_threshold:
recovery_threshold = 1

# Run incident logic if recovery threshold is set
if recovery_threshold:
if recovery_threshold > 1:
# Check if our incident is recovering
previous_checkins = (
MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
Expand Down
37 changes: 27 additions & 10 deletions tests/sentry/monitors/logic/test_mark_failed.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from django.utils import timezone

from sentry.grouping.utils import hash_from_values
from sentry.issues.grouptype import (
MonitorCheckInFailure,
MonitorCheckInMissed,
Expand Down Expand Up @@ -264,6 +263,12 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
)
assert mark_failed(checkin, ts=checkin.date_added)

monitor_environment.refresh_from_db()
assert monitor_environment.status == MonitorStatus.ERROR

monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment)
assert len(monitor_incidents) == 1

assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1

kwargs = mock_produce_occurrence_to_kafka.call_args.kwargs
Expand All @@ -275,7 +280,7 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
occurrence,
**{
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "error"])],
"fingerprint": [monitor_incidents[0].grouphash],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": "An error occurred during the latest check-in.",
"resource_id": None,
Expand Down Expand Up @@ -323,7 +328,7 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
},
"environment": monitor_environment.environment.name,
"event_id": occurrence["event_id"],
"fingerprint": ["monitor", str(monitor.guid), "error"],
"fingerprint": [monitor_incidents[0].grouphash],
"platform": "other",
"project_id": monitor.project_id,
"sdk": None,
Expand Down Expand Up @@ -372,6 +377,12 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur
)
assert mark_failed(failed_checkin, ts=failed_checkin.date_added)

monitor_environment.refresh_from_db()
assert monitor_environment.status == MonitorStatus.ERROR

monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment)
assert len(monitor_incidents) == 1

assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1

kwargs = mock_produce_occurrence_to_kafka.call_args.kwargs
Expand All @@ -383,7 +394,7 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur
occurrence,
**{
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "duration"])],
"fingerprint": [monitor_incidents[0].grouphash],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": "Check-in exceeded maximum duration of 10 minutes.",
"resource_id": None,
Expand Down Expand Up @@ -412,7 +423,7 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur
**{
"contexts": {
"monitor": {
"status": "timeout",
"status": "error",
"type": "cron_job",
"config": {
"schedule_type": 2,
Expand All @@ -427,7 +438,7 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur
},
"environment": monitor_environment.environment.name,
"event_id": occurrence["event_id"],
"fingerprint": ["monitor", str(monitor.guid), "duration"],
"fingerprint": [monitor_incidents[0].grouphash],
"platform": "other",
"project_id": monitor.project_id,
"sdk": None,
Expand Down Expand Up @@ -477,7 +488,10 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr

monitor.refresh_from_db()
monitor_environment.refresh_from_db()
assert monitor_environment.status == MonitorStatus.MISSED_CHECKIN
assert monitor_environment.status == MonitorStatus.ERROR

monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment)
assert len(monitor_incidents) == 1

assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1

Expand All @@ -490,7 +504,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
occurrence,
**{
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "missed_checkin"])],
"fingerprint": [monitor_incidents[0].grouphash],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": f"No check-in reported on {next_checkin.strftime(SUBTITLE_DATETIME_FORMAT)}.",
"resource_id": None,
Expand Down Expand Up @@ -519,7 +533,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
**{
"contexts": {
"monitor": {
"status": "missed_checkin",
"status": "error",
"type": "cron_job",
"config": {
"schedule_type": 2,
Expand All @@ -534,7 +548,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
},
"environment": monitor_environment.environment.name,
"event_id": occurrence["event_id"],
"fingerprint": ["monitor", str(monitor.guid), "missed_checkin"],
"fingerprint": [monitor_incidents[0].grouphash],
"platform": "other",
"project_id": monitor.project_id,
"sdk": None,
Expand Down Expand Up @@ -584,6 +598,7 @@ def test_mark_failed_muted(self, mock_produce_occurrence_to_kafka):
monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment)
assert len(monitor_incidents) == 0

@with_feature("organizations:issue-platform")
@patch("sentry.issues.producer.produce_occurrence_to_kafka")
def test_mark_failed_issue_threshold(self, mock_produce_occurrence_to_kafka):
failure_issue_threshold = 8
Expand Down Expand Up @@ -702,6 +717,7 @@ def test_mark_failed_issue_threshold(self, mock_produce_occurrence_to_kafka):

# Test to make sure that timeout mark_failed (which occur in the past)
# correctly create issues once passing the failure_issue_threshold
@with_feature("organizations:issue-platform")
@patch("sentry.issues.producer.produce_occurrence_to_kafka")
def test_mark_failed_issue_threshold_timeout(self, mock_produce_occurrence_to_kafka):
failure_issue_threshold = 8
Expand Down Expand Up @@ -784,6 +800,7 @@ def test_mark_failed_issue_threshold_timeout(self, mock_produce_occurrence_to_ka
assert occurrence["fingerprint"][0] == monitor_incident.grouphash

# we are duplicating this test as the code paths are different, for now
@with_feature("organizations:issue-platform")
@patch("sentry.issues.producer.produce_occurrence_to_kafka")
def test_mark_failed_issue_threshold_disabled(self, mock_produce_occurrence_to_kafka):
failure_issue_threshold = 8
Expand Down

0 comments on commit 4fe1a05

Please sign in to comment.