diff --git a/src/sentry/monitors/logic/mark_failed.py b/src/sentry/monitors/logic/mark_failed.py
index c0563cd7ac6008..6366167ababd2f 100644
--- a/src/sentry/monitors/logic/mark_failed.py
+++ b/src/sentry/monitors/logic/mark_failed.py
@@ -41,7 +41,9 @@ def mark_failed(
     computed based on the tasks reference time.
     """
     monitor_env = failed_checkin.monitor_environment
-    failure_issue_threshold = monitor_env.monitor.config.get("failure_issue_threshold", 0)
+    failure_issue_threshold = monitor_env.monitor.config.get("failure_issue_threshold", 1)
+    if not failure_issue_threshold:
+        failure_issue_threshold = 1
 
     # Compute the next check-in time from our reference time
     next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
@@ -94,7 +96,14 @@ def mark_failed(
     monitor_env.refresh_from_db()
 
     # Create incidents + issues
-    if failure_issue_threshold:
+    use_issue_platform = False
+    try:
+        organization = Organization.objects.get_from_cache(id=monitor_env.monitor.organization_id)
+        use_issue_platform = features.has("organizations:issue-platform", organization=organization)
+    except Organization.DoesNotExist:
+        pass
+
+    if use_issue_platform:
         return mark_failed_threshold(failed_checkin, failure_issue_threshold)
     else:
         return mark_failed_no_threshold(failed_checkin)
@@ -110,22 +119,32 @@ def mark_failed_threshold(failed_checkin: MonitorCheckIn, failure_issue_threshol
     fingerprint = None
 
     # check to see if we need to update the status
-    if monitor_env.status == MonitorStatus.OK:
-        # reverse the list after slicing in order to start with oldest check-in
-        # use .values() to speed up query
-        previous_checkins = list(
-            reversed(
-                MonitorCheckIn.objects.filter(
-                    monitor_environment=monitor_env, date_added__lte=failed_checkin.date_added
+    if monitor_env.status in [MonitorStatus.OK, MonitorStatus.ACTIVE]:
+        # evaluation logic for multiple check-ins
+        if failure_issue_threshold > 1:
+            # reverse the list after slicing in order to start with oldest check-in
+            # use .values() to speed up query
+            previous_checkins = list(
+                reversed(
+                    MonitorCheckIn.objects.filter(
+                        monitor_environment=monitor_env, date_added__lte=failed_checkin.date_added
+                    )
+                    .order_by("-date_added")
+                    .values("id", "date_added", "status")[:failure_issue_threshold]
                 )
-                .exclude(status=CheckInStatus.IN_PROGRESS)
-                .order_by("-date_added")
-                .values("id", "date_added", "status")[:failure_issue_threshold]
             )
-        )
-        # check for successive failed previous check-ins
-        if not all([checkin["status"] != CheckInStatus.OK for checkin in previous_checkins]):
-            return False
+            # check for any successful previous check-in
+            if any([checkin["status"] == CheckInStatus.OK for checkin in previous_checkins]):
+                return False
+        # if threshold is 1, just use the most recent check-in
+        else:
+            previous_checkins = [
+                {
+                    "id": failed_checkin.id,
+                    "date_added": failed_checkin.date_added,
+                    "status": failed_checkin.status,
+                }
+            ]
 
         # change monitor status + update fingerprint timestamp
         monitor_env.status = MonitorStatus.ERROR
@@ -151,13 +170,14 @@ def mark_failed_threshold(failed_checkin: MonitorCheckIn, failure_issue_threshol
         MonitorStatus.MISSED_CHECKIN,
         MonitorStatus.TIMEOUT,
     ]:
-        # if monitor environment has a failed status, get the most recent
+        # if monitor environment has a failed status, use the failed
         # check-in and send occurrence
         previous_checkins = [
-            MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
-            .order_by("-date_added")
-            .values("id", "date_added", "status")
-            .first()
+            {
+                "id": failed_checkin.id,
+                "date_added": failed_checkin.date_added,
+                "status": failed_checkin.status,
+            }
         ]
 
     # get the existing grouphash from the monitor environment
@@ -184,21 +204,17 @@ def mark_failed_no_threshold(failed_checkin: MonitorCheckIn):
 
     monitor_env = failed_checkin.monitor_environment
 
+    failed_status_map = {
+        CheckInStatus.MISSED: MonitorStatus.MISSED_CHECKIN,
+        CheckInStatus.TIMEOUT: MonitorStatus.TIMEOUT,
+    }
+    monitor_env.update(status=failed_status_map.get(failed_checkin.status, MonitorStatus.ERROR))
+
     # Do not create event if monitor is muted
     if monitor_env.monitor.is_muted:
         return True
 
-    use_issue_platform = False
-    try:
-        organization = Organization.objects.get(id=monitor_env.monitor.organization_id)
-        use_issue_platform = features.has("organizations:issue-platform", organization=organization)
-    except Organization.DoesNotExist:
-        pass
-
-    if use_issue_platform:
-        create_issue_platform_occurrence(failed_checkin)
-    else:
-        create_legacy_event(failed_checkin)
+    create_legacy_event(failed_checkin)
 
     monitor_environment_failed.send(monitor_environment=monitor_env, sender=type(monitor_env))
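The dispatch in `mark_failed` now hinges on two things: `failure_issue_threshold` is coerced to at least 1 (a missing, `None`, or `0` config value all collapse to 1), and the `organizations:issue-platform` feature flag picks between the threshold-aware incident path and the legacy event path. The sketch below isolates the threshold evaluation from `mark_failed_threshold`; `CheckIn` and the plain status strings are stand-ins for the `MonitorCheckIn` model and the `CheckInStatus` constants, so treat it as a model of the logic rather than the real implementation.

```python
from dataclasses import dataclass

# Stand-ins for CheckInStatus values; the real code uses enum constants.
OK, ERROR = "ok", "error"


@dataclass
class CheckIn:
    id: int
    status: str


def passes_failure_threshold(recent: list[CheckIn], threshold: int) -> bool:
    """Mirror of the patch's evaluation: look at the `threshold` most recent
    check-ins (modeling the `.order_by("-date_added")[:threshold]` slice) and
    bail out if any of them succeeded."""
    threshold = threshold or 1  # falsy config values collapse to 1, as in the patch
    if threshold == 1:
        # single-failure mode: the failing check-in itself is enough
        return True
    window = recent[-threshold:]
    return not any(c.status == OK for c in window)


# A threshold of 3 tolerates two failures but fires on the third consecutive one.
history = [CheckIn(1, OK), CheckIn(2, ERROR), CheckIn(3, ERROR)]
assert not passes_failure_threshold(history, 3)
history.append(CheckIn(4, ERROR))
assert passes_failure_threshold(history, 3)
```

Note that the patch only runs this evaluation when the environment is still `OK` or `ACTIVE`; once the environment is already in a failed status, the failed check-in itself is sent as the occurrence.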
diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index 80c926690be149..7657dfd3626ec4 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -23,10 +23,12 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
 
     if not monitor_env.monitor.is_muted and monitor_env.status != MonitorStatus.OK:
         params["status"] = MonitorStatus.OK
-        recovery_threshold = monitor_env.monitor.config.get("recovery_threshold")
+        recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 1)
+        if not recovery_threshold:
+            recovery_threshold = 1
 
         # Run incident logic if recovery threshold is set
-        if recovery_threshold:
+        if recovery_threshold > 1:
             # Check if our incident is recovering
             previous_checkins = (
                 MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
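`mark_ok` gets the same normalization: a falsy `recovery_threshold` collapses to 1, and the incident-recovery query now only runs when the threshold is above 1, since a single OK check-in recovers the monitor immediately. The hunk is cut off before the query itself, so the sketch below is an assumption about its intent: an incident recovers once the last `recovery_threshold` check-ins are all successful.

```python
def incident_recovered(statuses: list[str], recovery_threshold: int) -> bool:
    """Hypothetical model of the recovery gate; `statuses` stands in for the
    MonitorCheckIn rows, ordered oldest to newest."""
    recovery_threshold = recovery_threshold or 1  # falsy config -> 1, as in the patch
    if recovery_threshold == 1:
        # no incident logic needed: one OK check-in flips the monitor back
        return True
    recent = statuses[-recovery_threshold:]
    return len(recent) == recovery_threshold and all(s == "ok" for s in recent)


assert incident_recovered(["error", "ok"], 1)
assert not incident_recovered(["error", "ok"], 2)
assert incident_recovered(["error", "ok", "ok"], 2)
```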
"fingerprint": [monitor_incidents[0].grouphash], "platform": "other", "project_id": monitor.project_id, "sdk": None, @@ -372,6 +377,12 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur ) assert mark_failed(failed_checkin, ts=failed_checkin.date_added) + monitor_environment.refresh_from_db() + assert monitor_environment.status == MonitorStatus.ERROR + + monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment) + assert len(monitor_incidents) == 1 + assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1 kwargs = mock_produce_occurrence_to_kafka.call_args.kwargs @@ -383,7 +394,7 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur occurrence, **{ "project_id": self.project.id, - "fingerprint": [hash_from_values(["monitor", str(monitor.guid), "duration"])], + "fingerprint": [monitor_incidents[0].grouphash], "issue_title": f"Monitor failure: {monitor.name}", "subtitle": "Check-in exceeded maximum duration of 10 minutes.", "resource_id": None, @@ -412,7 +423,7 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur **{ "contexts": { "monitor": { - "status": "timeout", + "status": "error", "type": "cron_job", "config": { "schedule_type": 2, @@ -427,7 +438,7 @@ def test_mark_failed_with_timeout_reason_issue_platform(self, mock_produce_occur }, "environment": monitor_environment.environment.name, "event_id": occurrence["event_id"], - "fingerprint": ["monitor", str(monitor.guid), "duration"], + "fingerprint": [monitor_incidents[0].grouphash], "platform": "other", "project_id": monitor.project_id, "sdk": None, @@ -477,7 +488,10 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr monitor.refresh_from_db() monitor_environment.refresh_from_db() - assert monitor_environment.status == MonitorStatus.MISSED_CHECKIN + assert monitor_environment.status == MonitorStatus.ERROR + + monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment) + assert len(monitor_incidents) == 1 assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1 @@ -490,7 +504,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr occurrence, **{ "project_id": self.project.id, - "fingerprint": [hash_from_values(["monitor", str(monitor.guid), "missed_checkin"])], + "fingerprint": [monitor_incidents[0].grouphash], "issue_title": f"Monitor failure: {monitor.name}", "subtitle": f"No check-in reported on {next_checkin.strftime(SUBTITLE_DATETIME_FORMAT)}.", "resource_id": None, @@ -519,7 +533,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr **{ "contexts": { "monitor": { - "status": "missed_checkin", + "status": "error", "type": "cron_job", "config": { "schedule_type": 2, @@ -534,7 +548,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr }, "environment": monitor_environment.environment.name, "event_id": occurrence["event_id"], - "fingerprint": ["monitor", str(monitor.guid), "missed_checkin"], + "fingerprint": [monitor_incidents[0].grouphash], "platform": "other", "project_id": monitor.project_id, "sdk": None, @@ -584,6 +598,7 @@ def test_mark_failed_muted(self, mock_produce_occurrence_to_kafka): monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment) assert len(monitor_incidents) == 0 + @with_feature("organizations:issue-platform") @patch("sentry.issues.producer.produce_occurrence_to_kafka") def 
@@ -702,6 +717,7 @@ def test_mark_failed_issue_threshold(self, mock_produce_occurrence_to_kafka):
 
     # Test to make sure that timeout mark_failed (which occur in the past)
     # correctly create issues once passing the failure_issue_threshold
+    @with_feature("organizations:issue-platform")
    @patch("sentry.issues.producer.produce_occurrence_to_kafka")
    def test_mark_failed_issue_threshold_timeout(self, mock_produce_occurrence_to_kafka):
        failure_issue_threshold = 8
@@ -784,6 +800,7 @@ def test_mark_failed_issue_threshold_timeout(self, mock_produce_occurrence_to_ka
         assert occurrence["fingerprint"][0] == monitor_incident.grouphash
 
     # we are duplicating this test as the code paths are different, for now
+    @with_feature("organizations:issue-platform")
     @patch("sentry.issues.producer.produce_occurrence_to_kafka")
     def test_mark_failed_issue_threshold_disabled(self, mock_produce_occurrence_to_kafka):
         failure_issue_threshold = 8
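The test changes track the fingerprint move from a static `hash_from_values(["monitor", guid, reason])` to the per-incident `MonitorIncident.grouphash`. The difference matters for grouping: a static fingerprint folds every failure of a monitor into one long-lived issue, while an incident-scoped grouphash opens a fresh issue per incident. The sketch below illustrates that contrast; the local `hash_from_values` is a stand-in for `sentry.grouping.utils.hash_from_values`, not its actual implementation.

```python
import uuid
from hashlib import md5


def hash_from_values(values) -> str:
    # stand-in: a stable hash over the joined values
    return md5("".join(str(v) for v in values).encode()).hexdigest()


monitor_guid = uuid.uuid4()

# Old scheme: the fingerprint depends only on the monitor and failure reason,
# so every error occurrence for this monitor groups into the same issue.
assert hash_from_values(["monitor", str(monitor_guid), "error"]) == hash_from_values(
    ["monitor", str(monitor_guid), "error"]
)

# New scheme: each MonitorIncident carries its own grouphash, so two separate
# incidents produce two separate issues even for the same monitor.
incident_a_grouphash = uuid.uuid4().hex
incident_b_grouphash = uuid.uuid4().hex
assert incident_a_grouphash != incident_b_grouphash
```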