Skip to content

Commit

Permalink
feat(crons): Switch default behavior to thresholds (getsentry#61893)
Browse files Browse the repository at this point in the history
Changes the default behavior for crons issues so that one issue is created per
monitor incident. The default threshold is 1 for both issue creation and
issue recovery.
  • Loading branch information
rjo100 authored Dec 19, 2023
1 parent 78a6d86 commit 4fe1a05
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 44 deletions.
80 changes: 48 additions & 32 deletions src/sentry/monitors/logic/mark_failed.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ def mark_failed(
computed based on the tasks reference time.
"""
monitor_env = failed_checkin.monitor_environment
failure_issue_threshold = monitor_env.monitor.config.get("failure_issue_threshold", 0)
failure_issue_threshold = monitor_env.monitor.config.get("failure_issue_threshold", 1)
if not failure_issue_threshold:
failure_issue_threshold = 1

# Compute the next check-in time from our reference time
next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
Expand Down Expand Up @@ -94,7 +96,14 @@ def mark_failed(
monitor_env.refresh_from_db()

# Create incidents + issues
if failure_issue_threshold:
use_issue_platform = False
try:
organization = Organization.objects.get_from_cache(id=monitor_env.monitor.organization_id)
use_issue_platform = features.has("organizations:issue-platform", organization=organization)
except Organization.DoesNotExist:
pass

if use_issue_platform:
return mark_failed_threshold(failed_checkin, failure_issue_threshold)
else:
return mark_failed_no_threshold(failed_checkin)
Expand All @@ -110,22 +119,32 @@ def mark_failed_threshold(failed_checkin: MonitorCheckIn, failure_issue_threshol
fingerprint = None

# check to see if we need to update the status
if monitor_env.status == MonitorStatus.OK:
# reverse the list after slicing in order to start with oldest check-in
# use .values() to speed up query
previous_checkins = list(
reversed(
MonitorCheckIn.objects.filter(
monitor_environment=monitor_env, date_added__lte=failed_checkin.date_added
if monitor_env.status in [MonitorStatus.OK, MonitorStatus.ACTIVE]:
# evaluation logic for multiple check-ins
if failure_issue_threshold > 1:
# reverse the list after slicing in order to start with oldest check-in
# use .values() to speed up query
previous_checkins = list(
reversed(
MonitorCheckIn.objects.filter(
monitor_environment=monitor_env, date_added__lte=failed_checkin.date_added
)
.order_by("-date_added")
.values("id", "date_added", "status")[:failure_issue_threshold]
)
.exclude(status=CheckInStatus.IN_PROGRESS)
.order_by("-date_added")
.values("id", "date_added", "status")[:failure_issue_threshold]
)
)
# check for successive failed previous check-ins
if not all([checkin["status"] != CheckInStatus.OK for checkin in previous_checkins]):
return False
# check for any successful previous check-in
if any([checkin["status"] == CheckInStatus.OK for checkin in previous_checkins]):
return False
# if threshold is 1, just use the most recent check-in
else:
previous_checkins = [
{
"id": failed_checkin.id,
"date_added": failed_checkin.date_added,
"status": failed_checkin.status,
}
]

# change monitor status + update fingerprint timestamp
monitor_env.status = MonitorStatus.ERROR
Expand All @@ -151,13 +170,14 @@ def mark_failed_threshold(failed_checkin: MonitorCheckIn, failure_issue_threshol
MonitorStatus.MISSED_CHECKIN,
MonitorStatus.TIMEOUT,
]:
# if monitor environment has a failed status, get the most recent
# if monitor environment has a failed status, use the failed
# check-in and send occurrence
previous_checkins = [
MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
.order_by("-date_added")
.values("id", "date_added", "status")
.first()
{
"id": failed_checkin.id,
"date_added": failed_checkin.date_added,
"status": failed_checkin.status,
}
]

# get the existing grouphash from the monitor environment
Expand All @@ -184,21 +204,17 @@ def mark_failed_no_threshold(failed_checkin: MonitorCheckIn):

monitor_env = failed_checkin.monitor_environment

failed_status_map = {
CheckInStatus.MISSED: MonitorStatus.MISSED_CHECKIN,
CheckInStatus.TIMEOUT: MonitorStatus.TIMEOUT,
}
monitor_env.update(status=failed_status_map.get(failed_checkin.status, MonitorStatus.ERROR))

# Do not create event if monitor is muted
if monitor_env.monitor.is_muted:
return True

use_issue_platform = False
try:
organization = Organization.objects.get(id=monitor_env.monitor.organization_id)
use_issue_platform = features.has("organizations:issue-platform", organization=organization)
except Organization.DoesNotExist:
pass

if use_issue_platform:
create_issue_platform_occurrence(failed_checkin)
else:
create_legacy_event(failed_checkin)
create_legacy_event(failed_checkin)

monitor_environment_failed.send(monitor_environment=monitor_env, sender=type(monitor_env))

Expand Down
6 changes: 4 additions & 2 deletions src/sentry/monitors/logic/mark_ok.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):

if not monitor_env.monitor.is_muted and monitor_env.status != MonitorStatus.OK:
params["status"] = MonitorStatus.OK
recovery_threshold = monitor_env.monitor.config.get("recovery_threshold")
recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 1)
if not recovery_threshold:
recovery_threshold = 1

# Run incident logic if recovery threshold is set
if recovery_threshold:
if recovery_threshold > 1:
# Check if our incident is recovering
previous_checkins = (
MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
Expand Down
37 changes: 27 additions & 10 deletions tests/sentry/monitors/logic/test_mark_failed.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from django.utils import timezone

from sentry.grouping.utils import hash_from_values
from sentry.issues.grouptype import (
MonitorCheckInFailure,
MonitorCheckInMissed,
Expand Down Expand Up @@ -264,6 +263,12 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
)
assert mark_failed(checkin, ts=checkin.date_added)

monitor_environment.refresh_from_db()
assert monitor_environment.status == MonitorStatus.ERROR

monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment)
assert len(monitor_incidents) == 1

assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1

kwargs = mock_produce_occurrence_to_kafka.call_args.kwargs
Expand All @@ -275,7 +280,7 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
occurrence,
**{
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "error"])],
"fingerprint": [monitor_incidents[0].grouphash],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": "An error occurred during the latest check-in.",
"resource_id": None,
Expand Down Expand Up @@ -323,7 +328,7 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
},
"environment": monitor_environment.environment.name,
"event_id": occurrence["event_id"],
"fingerprint": ["monitor", str(monitor.guid), "error"],
"fingerprint": [monitor_incidents[0].grouphash],
"platform": "other",
"project_id": monitor.project_id,
"sdk": None,
Expand Down Expand Up @@ -372,6 +377,12 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur
)
assert mark_failed(failed_checkin, ts=failed_checkin.date_added)

monitor_environment.refresh_from_db()
assert monitor_environment.status == MonitorStatus.ERROR

monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment)
assert len(monitor_incidents) == 1

assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1

kwargs = mock_produce_occurrence_to_kafka.call_args.kwargs
Expand All @@ -383,7 +394,7 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur
occurrence,
**{
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "duration"])],
"fingerprint": [monitor_incidents[0].grouphash],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": "Check-in exceeded maximum duration of 10 minutes.",
"resource_id": None,
Expand Down Expand Up @@ -412,7 +423,7 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur
**{
"contexts": {
"monitor": {
"status": "timeout",
"status": "error",
"type": "cron_job",
"config": {
"schedule_type": 2,
Expand All @@ -427,7 +438,7 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur
},
"environment": monitor_environment.environment.name,
"event_id": occurrence["event_id"],
"fingerprint": ["monitor", str(monitor.guid), "duration"],
"fingerprint": [monitor_incidents[0].grouphash],
"platform": "other",
"project_id": monitor.project_id,
"sdk": None,
Expand Down Expand Up @@ -477,7 +488,10 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr

monitor.refresh_from_db()
monitor_environment.refresh_from_db()
assert monitor_environment.status == MonitorStatus.MISSED_CHECKIN
assert monitor_environment.status == MonitorStatus.ERROR

monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment)
assert len(monitor_incidents) == 1

assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1

Expand All @@ -490,7 +504,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
occurrence,
**{
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "missed_checkin"])],
"fingerprint": [monitor_incidents[0].grouphash],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": f"No check-in reported on {next_checkin.strftime(SUBTITLE_DATETIME_FORMAT)}.",
"resource_id": None,
Expand Down Expand Up @@ -519,7 +533,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
**{
"contexts": {
"monitor": {
"status": "missed_checkin",
"status": "error",
"type": "cron_job",
"config": {
"schedule_type": 2,
Expand All @@ -534,7 +548,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
},
"environment": monitor_environment.environment.name,
"event_id": occurrence["event_id"],
"fingerprint": ["monitor", str(monitor.guid), "missed_checkin"],
"fingerprint": [monitor_incidents[0].grouphash],
"platform": "other",
"project_id": monitor.project_id,
"sdk": None,
Expand Down Expand Up @@ -584,6 +598,7 @@ def test_mark_failed_muted(self, mock_produce_occurrence_to_kafka):
monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment)
assert len(monitor_incidents) == 0

@with_feature("organizations:issue-platform")
@patch("sentry.issues.producer.produce_occurrence_to_kafka")
def test_mark_failed_issue_threshold(self, mock_produce_occurrence_to_kafka):
failure_issue_threshold = 8
Expand Down Expand Up @@ -702,6 +717,7 @@ def test_mark_failed_issue_threshold(self, mock_produce_occurrence_to_kafka):

# Test to make sure that timeout mark_failed (which occur in the past)
# correctly create issues once passing the failure_issue_threshold
@with_feature("organizations:issue-platform")
@patch("sentry.issues.producer.produce_occurrence_to_kafka")
def test_mark_failed_issue_threshold_timeout(self, mock_produce_occurrence_to_kafka):
failure_issue_threshold = 8
Expand Down Expand Up @@ -784,6 +800,7 @@ def test_mark_failed_issue_threshold_timeout(self, mock_produce_occurrence_to_ka
assert occurrence["fingerprint"][0] == monitor_incident.grouphash

# we are duplicating this test as the code paths are different, for now
@with_feature("organizations:issue-platform")
@patch("sentry.issues.producer.produce_occurrence_to_kafka")
def test_mark_failed_issue_threshold_disabled(self, mock_produce_occurrence_to_kafka):
failure_issue_threshold = 8
Expand Down

0 comments on commit 4fe1a05

Please sign in to comment.