diff --git a/CHANGELOG.md b/CHANGELOG.md index ff7791049..05afd9eac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Remove PrometheusAvailabilityRatio alert. + ## [2.135.0] - 2023-10-02 ### Changed diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-availability.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-availability.rules.yml deleted file mode 100644 index 7d772ac4f..000000000 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-availability.rules.yml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - creationTimestamp: null - labels: - {{- include "labels.common" . | nindent 4 }} - cluster_type: "management_cluster" - name: prometheus-availability.rules - namespace: {{ .Values.namespace }} -spec: - groups: - - name: prometheus - rules: - - alert: PrometheusAvailabilityRatio - annotations: - description: '{{`Prometheus {{$labels.pod}} has availability ratio of {{ printf "%.2f" $value }} (min 0.8) over the last hour.`}}' - opsrecipe: prometheus-resource-limit-reached/ - dashboard: promavailability/prometheus-availability - expr: label_replace(avg(avg_over_time(kube_pod_status_ready{namespace=~"(.*)-prometheus", condition="true"}[1h])) by (pod), "cluster_id", "$1", "pod", "prometheus-(.+)-(.+)") < 0.8 - # At startup, availability starts at 0 for a few minutes. So ratio grows slowly from 0. - for: 30m - labels: - area: empowerment - cancel_if_any_apiserver_down: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability diff --git a/test/tests/providers/global/prometheus-availability.rules.test.yml b/test/tests/providers/global/prometheus-availability.rules.test.yml deleted file mode 100644 index d40c1de75..000000000 --- a/test/tests/providers/global/prometheus-availability.rules.test.yml +++ /dev/null @@ -1,64 +0,0 @@ ---- -rule_files: - - prometheus-availability.rules.yml - -# Setting evaluation interval to 1h -# to make it faster on long test duration. -evaluation_interval: 1h - -tests: - # Test PrometheusAvailabilityRatio - - interval: 1m - input_series: - # This prometheus is up foreve - generates no alert - - series: 'kube_pod_status_ready{app="kube-state-metrics", condition="true", container="kube-state-metrics", namespace="install-prometheus", pod="prometheus-install-0"}' - values: "1+0x120" - # This prometheus starts at h+1, and takes 5min to get ready - generates no alert - - series: 'kube_pod_status_ready{app="kube-state-metrics", condition="true", container="kube-state-metrics", namespace="wcok-prometheus", pod="prometheus-wcok-0"}' - values: "_x60 0+0x5 1+0x60" - # This prometheus is down - generates alerts - - series: 'kube_pod_status_ready{app="kube-state-metrics", condition="true", container="kube-state-metrics", namespace="wcbad-prometheus", pod="prometheus-wcbad-0"}' - values: "0+0x60 1+0x60" - alert_rule_test: - - alertname: PrometheusAvailabilityRatio - eval_time: 60m - exp_alerts: - - exp_labels: - area: empowerment - severity: page - team: atlas - topic: observability - cancel_if_any_apiserver_down: "true" - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - pod: "prometheus-wcbad-0" - cluster_id: wcbad - exp_annotations: - description: "Prometheus prometheus-wcbad-0 has availability ratio of 0.00 (min 0.8) over the last hour." - opsrecipe: "prometheus-resource-limit-reached/" - dashboard: "promavailability/prometheus-availability" - - alertname: PrometheusAvailabilityRatio - eval_time: 108m - exp_alerts: - - exp_labels: - area: empowerment - severity: page - team: atlas - topic: observability - cancel_if_any_apiserver_down: "true" - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - pod: "prometheus-wcbad-0" - cluster_id: wcbad - exp_annotations: - description: "Prometheus prometheus-wcbad-0 has availability ratio of 0.00 (min 0.8) over the last hour." - opsrecipe: "prometheus-resource-limit-reached/" - dashboard: "promavailability/prometheus-availability" - - alertname: PrometheusAvailabilityRatio - eval_time: 140m