From 96364d1f416b8f61154b1b75633ef30679e8508f Mon Sep 17 00:00:00 2001
From: QuentinBisson
Date: Tue, 29 Oct 2024 17:01:17 +0100
Subject: [PATCH 01/24] add sensible alerts for alloy

---
 .../atlas/alerting-rules/alloy.rules.yml      |  49 +++
 .../deployment.management-cluster.rules.yml   |   2 +-
 .../atlas/alerting-rules/logging.rules.yaml   | 114 +++++++
 .../atlas/alerting-rules/monitoring.rules.yml | 136 ++++++++
 .../alerting-rules/prometheus-agent.rules.yml | 291 +++++++++---------
 .../atlas/alerting-rules/prometheus.rules.yml |  17 -
 .../atlas/alerting-rules/promtail.rules.yml   |   5 +-
 7 files changed, 448 insertions(+), 166 deletions(-)
 create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
 create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml
 create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml

diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
new file mode 100644
index 000000000..148168239
--- /dev/null
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
@@ -0,0 +1,49 @@
+# This file describes common alloy alerting rules
+# For alerts regarding monitoring and logging agents, please go to the respective files (logging.rules.yml and monitoring.rules.yml).
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: alloy.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  ## TODO(quentin) add tests for the alerts
+  ## TODO(quentin) add opsrecipe for the alerts
+  ## TODO(quentin) add dashboard annotation for the alerts
+  # List of alerts on the state of the alloy components.
+  # Alerts are coming from https://github.com/grafana/alloy/blob/ed52746567d2469a6a97a592ac5aec807646b327/operations/alloy-mixin/alerts/controller.libsonnet
+  # We added the alert labels and added the missing labels from the aggregations.
+  - name: alloy.controller
+    rules:
+    - alert: SlowComponentEvaluations
+      annotations:
+        description: Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.
+        summary: Component evaluations are taking too long.
+      expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0
+      for: 15m
+      labels:
+        area: platform
+        severity: notify
+        team: atlas
+        topic: observability
+        cancel_if_outside_working_hours: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+    - alert: UnhealthyComponents
+      annotations:
+        description: Unhealthy components detected under job {{ $labels.job }}
+        summary: Unhealthy components detected.
+      expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0
+      for: 15m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        cancel_if_outside_working_hours: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"

diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml
index 6d62a35bc..54a070368 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml
@@ -17,7 +17,7 @@ spec:
       annotations:
         description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
         opsrecipe: deployment-not-satisfied/
-      expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|tempo.*|pyroscope.*|object-storage.*|logging-operator.*|silence-operator.*|sloth.*"} > 0
+      expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alloy-rules.*|alertmanager.*|grafana.*|logging-operator.*|loki.*|mimir.*|oauth2-proxy.*|object-storage.*|observability-gateway.*|observability-operator.*|prometheus.*|promxy.*|tempo.*|pyroscope.*|silence-operator.*|sloth.*"} > 0
       for: 30m
       labels:
         area: platform
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml
new file mode 100644
index 000000000..71d96e782
--- /dev/null
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml
@@ -0,0 +1,114 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: logging.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  ## TODO(quentin) add tests for the alerts
+  ## TODO(quentin) add opsrecipe for the alerts
+  ## TODO(quentin) add dashboard annotation for the alerts
+  - name: logging-agent
+    rules:
+    # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready)
+    # and joins the pods with the non-running containers
+    - alert: LoggingAgentDown
+      annotations:
+        description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}'
+        opsrecipe: logging-agent/
+      expr: |-
+        kube_pod_info{pod=~"alloy-logs.*"}
+        * on(cluster_id, pod)
+        group_left ()
+        up{job="alloy-logs", container="alloy"} == 0
+      for: 30m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        cancel_if_outside_working_hours: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_node_unschedulable: "true"
+        cancel_if_node_not_ready: "true"
+  - name: log-ingestion
+    rules:
+    # Any alloy component that uses the loki.write component can throw such errors.
+    # This includes alloy-logs and the observability-gateway
+    - alert: LogForwardingErrors
+      annotations:
+        description: '{{`More than 10% of the requests to Loki are failing.`}}'
+        opsrecipe: logging-errors/
+      expr: |-
+        (
+          100
+          *
+          (
+            (
+              sum by (cluster_id, installation, provider, pipeline, namespace, job, instance) (
+                rate (
+                  loki_write_request_duration_seconds_count{status_code!~"2.."}[5m:]
+                )
+              )
+            )
+            /
+            (
+              sum by (cluster_id, installation, provider, pipeline, namespace, job, instance) (
+                rate (
+                  loki_write_request_duration_seconds_count[5m:]
+                )
+              )
+            )
+          )
+        )
+        > 10
+      for: 15m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+    # This alert pages when the loki source api component of the observability gateway is throwing errors
+    - alert: LogReceivingErrors
+      annotations:
+        description: '{{`More than 10% of the loki requests to the observability gateway are failing.`}}'
+        opsrecipe: logging-errors/
+      expr: |-
+        (
+          100
+          *
+          (
+            (
+              sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, route) (
+                rate (
+                  loki_source_api_request_duration_seconds_count{route=~"(loki_)?api_v1_push", status_code!~"2.."}[5m:]
+                )
+              )
+            )
+            /
+            (
+              sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, route) (
+                rate (
+                  loki_source_api_request_duration_seconds_count{route=~"(loki_)?api_v1_push"}[5m:]
+                )
+              )
+            )
+          )
+        )
+        > 10
+      for: 15m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml
new file mode 100644
index 000000000..745b86f7e
--- /dev/null
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml
@@ -0,0 +1,136 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: monitoring.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  ## TODO(quentin) add tests for the monitoring agent alerts
+  ## TODO(quentin) add opsrecipe for the monitoring agent alerts
+  ## TODO(quentin) add dashboard annotation for the monitoring agent alerts
+  ## TODO(quentin) replace MonitoringAgentShardsMissing for alloy-metrics
+  ## TODO(quentin) add component specific errors to replace the ones in the prometheus.rules.yml
+  - name: monitoring-agent
+    rules:
+    ## This alert pages if the monitoring-agent fails to send samples to its remote write endpoint.
+    - alert: MonitoringAgentFailing
+      annotations:
+        description: '{{`Monitoring agent fails to send its data via remote write.`}}'
+        summary: Monitoring agent fails to send samples to its configured remote write endpoint.
+        opsrecipe: monitoring-agent/
+      expr: |-
+        (
+          label_replace(
+            capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
+            "cluster_id",
+            "$1",
+            "name",
+            "(.*)"
+          ) == 1
+        ) unless on (cluster_id) (
+          count(up{job="alloy-metrics"} > 0) by (cluster_id)
+        )
+      for: 20m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        inhibit_monitoring_agent_down: "true"
+        cancel_if_cluster_is_not_running_monitoring_agent: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_has_no_workers: "true"
+    ## Same as MonitoringAgentFailing, but triggers inhibition earlier and does not page.
+    - alert: MonitoringAgentFailingInhibition
+      annotations:
+        description: '{{`Monitoring agent fails to send its data via remote write.`}}'
+        summary: Monitoring agent fails to send samples to its configured remote write endpoint.
+        opsrecipe: monitoring-agent/
+      expr: |-
+        (
+          label_replace(
+            capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
+            "cluster_id",
+            "$1",
+            "name",
+            "(.*)"
+          ) == 1
+        ) unless on (cluster_id) (
+          count(up{job="prometheus-agent"} > 0) by (cluster_id)
+        )
+      for: 2m
+      labels:
+        area: platform
+        severity: none
+        team: atlas
+        topic: observability
+        inhibit_monitoring_agent_down: "true"
+        cancel_if_cluster_is_not_running_monitoring_agent: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+    ## This alert pages if some of the monitoring agent shards are not running.
+    - alert: MonitoringAgentShardsMissing
+      annotations:
+        description: '{{`At least one of the monitoring agent shards is missing.`}}'
+        summary: Monitoring agent is missing some shards.
+        opsrecipe: monitoring-agent/
+      expr: |-
+        max_over_time(sum by (cluster_id, installation, provider, pipeline)(
+          count(
+            ## number of remotes that are not mimir or grafana-cloud
+            prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"}
+          ) by (cluster_id, installation, provider, pipeline)
+          !=
+          sum(
+            ## number of shards defined in the Prometheus CR
+            prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}
+            # if there is only 1 shard, there is no shard metric so we use the replicas metric
+            or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"}
+          ) by (cluster_id, installation, provider, pipeline)
+        )[5m:])
+      for: 40m
+      labels:
+        area: platform
+        severity: page
+        team: atlas
+        topic: observability
+        inhibit_monitoring_agent_down: "true"
+        cancel_if_cluster_is_not_running_monitoring_agent: "true"
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_outside_working_hours: "true"
+    ## Same as MonitoringAgentShardsMissing but triggers inhibition earlier, and does not page.
+    - alert: MonitoringAgentShardsMissingInhibition
+      annotations:
+        description: '{{`At least one of the monitoring agent shards is missing.`}}'
+        summary: Monitoring agent is missing some shards.
+ opsrecipe: monitoring-agent/ + expr: |- + max_over_time(sum by (cluster_id, installation, provider, pipeline)( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} + ) by (cluster_id, installation, provider, pipeline) + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id, installation, provider, pipeline) + )[5m:]) + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index 0dfbc0c91..81163c13e 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -1,157 +1,156 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} name: prometheus-agent.rules namespace: {{ .Values.namespace }} spec: groups: - - name: prometheus-agent - rules: - ## Page Atlas if prometheus agent fails to send samples to MC prometheus. - - alert: PrometheusAgentFailing - annotations: - description: '{{`Prometheus agent remote write is failing.`}}' - summary: Prometheus agent fails to send samples to remote write endpoint. - opsrecipe: prometheus-agent/ - dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} - expr: |- - max_over_time( - sum by (cluster_type, cluster_id, installation, instance, service) + - name: prometheus-agent + rules: + ## This alert pages if prometheus-agent fails to send samples to its remote write endpoint. + - alert: PrometheusAgentFailing + annotations: + description: '{{`Prometheus agent remote write is failing.`}}' + summary: Prometheus agent fails to send samples to remote write endpoint. 
+ opsrecipe: prometheus-agent/ + dashboard: promRW001/prometheus-remote-write + {{- if not .Values.mimir.enabled }} + expr: |- + max_over_time( + sum by (cluster_type, cluster_id, installation, instance, service) + ( + up{instance="prometheus-agent"} == 0 + or + absent(up{instance="prometheus-agent"}) == 1 + )[5m:] + ) + {{- else }} + expr: |- ( - up{instance="prometheus-agent"} == 0 - or - absent(up{instance="prometheus-agent"}) == 1 - )[5m:] - ) - {{- else }} - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} - for: 20m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_prometheus_agent_down: "true" - cancel_if_cluster_is_not_running_prometheus_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - ## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page. - - alert: PrometheusAgentFailingInhibition - annotations: - description: '{{`Prometheus agent remote write is failing.`}}' - summary: Prometheus agent fails to send samples to remote write endpoint. - opsrecipe: prometheus-agent/ - dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} - expr: |- - max_over_time( - sum by (cluster_type, cluster_id, installation, instance, service) + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id) + ) + {{- end }} + for: 20m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + ## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page. + - alert: PrometheusAgentFailingInhibition + annotations: + description: '{{`Prometheus agent remote write is failing.`}}' + summary: Prometheus agent fails to send samples to remote write endpoint. + opsrecipe: prometheus-agent/ + dashboard: promRW001/prometheus-remote-write + {{- if not .Values.mimir.enabled }} + expr: |- + max_over_time( + sum by (cluster_type, cluster_id, installation, instance, service) + ( + up{instance="prometheus-agent"} == 0 + or + absent(up{instance="prometheus-agent"}) == 1 + )[5m:] + ) + {{- else }} + expr: |- ( - up{instance="prometheus-agent"} == 0 - or - absent(up{instance="prometheus-agent"}) == 1 - )[5m:] - ) - {{- else }} - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_prometheus_agent_down: "true" - cancel_if_cluster_is_not_running_prometheus_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - ## Page Atlas if prometheus agent is missing shards to send samples to MC prometheus. 
- - alert: PrometheusAgentShardsMissing - annotations: - description: '{{`Prometheus agent is missing shards.`}}' - summary: Prometheus agent is missing shards. - opsrecipe: prometheus-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 40m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_prometheus_agent_down: "true" - cancel_if_cluster_is_not_running_prometheus_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - ## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page. - - alert: PrometheusAgentShardsMissingInhibition - annotations: - description: '{{`Prometheus agent is missing shards.`}}' - summary: Prometheus agent is missing shards. - opsrecipe: prometheus-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_prometheus_agent_down: "true" - cancel_if_cluster_is_not_running_prometheus_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id) + ) + {{- end }} + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + ## This alert pages if one of the prometheus-agent shard is not running. + - alert: PrometheusAgentShardsMissing + annotations: + description: '{{`Prometheus agent is missing shards.`}}' + summary: Prometheus agent is missing shards. 
+ opsrecipe: prometheus-agent/ + expr: |- + max_over_time(sum by (cluster_id, installation, provider, pipeline)( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} + ) by (cluster_id, installation, provider, pipeline) + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id, installation, provider, pipeline) + )[5m:]) + for: 40m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + ## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page. + - alert: PrometheusAgentShardsMissingInhibition + annotations: + description: '{{`Prometheus agent is missing shards.`}}' + summary: Prometheus agent is missing shards. + opsrecipe: prometheus-agent/ + expr: |- + max_over_time(sum by (cluster_id, installation, provider, pipeline)( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} + ) by (cluster_id, installation, provider, pipeline) + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id, installation, provider, pipeline) + )[5m:]) + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index 054d4980b..e5f68c642 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -27,23 +27,6 @@ spec: severity: page team: atlas topic: observability - ## Pages Atlas when prometheus fails to send samples to cortex - - alert: PrometheusMissingGrafanaCloud - annotations: - description: 'Prometheus is not sending data to Grafana Cloud.' 
-        opsrecipe: prometheus-grafanacloud/
-      {{- if .Values.mimir.enabled }}
-      expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
-      {{- else }}
-      expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"})
-      {{- end }}
-      for: 1h
-      labels:
-        area: platform
-        cancel_if_outside_working_hours: "true"
-        severity: page
-        team: atlas
-        topic: observability
     - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI
       annotations:
         description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}'
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml
index f48d135ab..422a9c9b1 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml
@@ -9,16 +9,17 @@ spec:
   groups:
   - name: promtail
     rules:
+    # This alert lists the existing promtail pods (to extract the node label and inhibit if the node is not ready)
+    # and joins the pods with the non-running containers
     - alert: PromtailDown
       annotations:
         description: '{{`Scraping of all promtail pods to check if one failed every 30 minutes.`}}'
         opsrecipe: promtail/
       expr: |-
-        # List promtail pods to be able to get the node label and join with the node status to not alert if the node is not ready
         kube_pod_info{pod=~"promtail.*"}
         * on(cluster_id, pod)
         group_left ()
-        up{container="promtail"} == 0 # List promtail containers that are not running
+        up{container="promtail"} == 0
       for: 30m
       labels:
         area: platform

From 9c2f6553ed0e62727b29267520a4062db3e7d194 Mon Sep 17 00:00:00 2001
From: QuentinBisson
Date: Tue, 29 Oct 2024 17:19:58 +0100
Subject: [PATCH 02/24] wip - add ongoing alerts

---
 CHANGELOG.md                                  | 16 +++
 .../atlas/alerting-rules/alloy.rules.yml      | 10 +--
 .../atlas/alerting-rules/monitoring.rules.yml | 66 +------------------
 .../alerting-rules/prometheus-agent.rules.yml |  4 +-
 ...luster.rules.yml => statefulset.rules.yml} |  9 +--
 .../atlas/alerting-rules/storage.rules.yml    |  2 +-
 .../prometheus-agent.rules.test.yml           | 16 ++---
 .../prometheus-agent.rules.test.yml           | 16 ++---
 .../prometheus-agent.rules.test.yml           | 16 ++---
 .../prometheus-agent.rules.test.yml           | 16 ++---
 10 files changed, 62 insertions(+), 109 deletions(-)
 rename helm/prometheus-rules/templates/platform/atlas/alerting-rules/{statefulset.management-cluster.rules.yml => statefulset.rules.yml} (81%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 06a4153c3..4d96737f3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add a set of sensible alerts to monitor alloy.
+  - `AlloySlowComponentEvaluations` and `AlloyUnhealthyComponents` to report about alloy component state.
+  - `LoggingAgentDown` to be alerted when the logging agent is down.
+  - `LogForwardingErrors` to be alerted when the `loki.write` component is failing.
+  - `LogReceivingErrors` to be alerted when the `loki.source.api` component of the gateway is failing.
+  - `MonitoringAgentFailing` and `InhibitionMonitoringAgentFailing` to be alerted when the monitoring agent is not able to send metrics.
+
+### Changed
+
+- Update `DeploymentNotSatisfiedAtlas` to take into account the following components:
+  - `observability-operator`
+  - `alloy-rules`
+  - `observability-gateway`
+
 ## [4.22.0] - 2024-10-29
 
 ### Changed
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
index 148168239..fae2026b7 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
@@ -17,10 +17,11 @@ spec:
   # We added the alert labels and added the missing labels from the aggregations.
   - name: alloy.controller
     rules:
-    - alert: SlowComponentEvaluations
+    - alert: AlloySlowComponentEvaluations
       annotations:
-        description: Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.
+        description: '{{`Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.`}}'
         summary: Component evaluations are taking too long.
+        opsrecipe: alloy-components/
       expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0
       for: 15m
       labels:
@@ -32,10 +33,11 @@ spec:
       cancel_if_cluster_status_creating: "true"
       cancel_if_cluster_status_deleting: "true"
       cancel_if_cluster_status_updating: "true"
-    - alert: UnhealthyComponents
+    - alert: AlloyUnhealthyComponents
       annotations:
-        description: Unhealthy components detected under job {{ $labels.job }}
+        description: '{{`Unhealthy components detected under job {{ $labels.job }}`}}'
         summary: Unhealthy components detected.
+        opsrecipe: alloy-components/
       expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0
       for: 15m
       labels:
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml
index 745b86f7e..8ba7a3a51 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml
@@ -10,7 +10,7 @@ spec:
   ## TODO(quentin) add tests for the monitoring agent alerts
   ## TODO(quentin) add opsrecipe for the monitoring agent alerts
   ## TODO(quentin) add dashboard annotation for the monitoring agent alerts
-  ## TODO(quentin) replace MonitoringAgentShardsMissing for alloy-metrics
+  ## TODO(quentin) replace PrometheusAgentShardsMissing for alloy-metrics
   ## TODO(quentin) add component specific errors to replace the ones in the prometheus.rules.yml
   - name: monitoring-agent
     rules:
@@ -44,7 +44,7 @@ spec:
       cancel_if_cluster_status_deleting: "true"
       cancel_if_cluster_has_no_workers: "true"
     ## Same as MonitoringAgentFailing, but triggers inhibition earlier and does not page.
-    - alert: MonitoringAgentFailingInhibition
+    - alert: InhibitionMonitoringAgentFailing
       annotations:
         description: '{{`Monitoring agent fails to send its data via remote write.`}}'
         summary: Monitoring agent fails to send samples to its configured remote write endpoint.
@@ -71,66 +71,4 @@ spec: cancel_if_cluster_is_not_running_monitoring_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - ## This alert pages if some of the monitoring agent shards are not running. - - alert: MonitoringAgentShardsMissing - annotations: - description: '{{`At least one of the monitoring agent shard is missing.`}}' - summary: Monitoring agent is missing some shards. - opsrecipe: monitoring-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 40m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - ## Same as MonitoringAgentShardsMissing but triggers inhibition earlier, and does not page. - - alert: MonitoringAgentShardsMissingInhibition - annotations: - description: '{{`At least one of the monitoring agent shard is missing.`}}' - summary: Monitoring agent is missing some shards. - opsrecipe: monitoring-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index 81163c13e..b1813bee9 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -52,7 +52,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_has_no_workers: "true" ## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page. 
- - alert: PrometheusAgentFailingInhibition + - alert: InhibitionPrometheusAgentFailing annotations: description: '{{`Prometheus agent remote write is failing.`}}' summary: Prometheus agent fails to send samples to remote write endpoint. @@ -124,7 +124,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_outside_working_hours: "true" ## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page. - - alert: PrometheusAgentShardsMissingInhibition + - alert: InhibitionPrometheusAgentShardsMissing annotations: description: '{{`Prometheus agent is missing shards.`}}' summary: Prometheus agent is missing shards. diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml similarity index 81% rename from helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.management-cluster.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml index 473be3186..ea72b199d 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml @@ -4,10 +4,7 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} - cluster_type: "management_cluster" -{{- end }} - name: deployment.management-cluster.rules + name: statefulset.rules namespace: {{ .Values.namespace }} spec: groups: @@ -18,8 +15,8 @@ spec: description: '{{`Statefulset {{ $labels.namespace}}/{{ $labels.statefulset }} is not satisfied.`}}' opsrecipe: deployment-not-satisfied/ expr: |- - kube_statefulset_status_replicas{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*"} - - kube_statefulset_status_replicas_ready{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*"} + kube_statefulset_status_replicas{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*|pyroscope.*|tempo.*"} + - kube_statefulset_status_replicas_ready{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*|pyroscope.*|tempo.*"} > 0 for: 30m labels: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml index 7b0798d5d..a1c006233 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml @@ -17,7 +17,7 @@ spec: annotations: description: '{{`The free space on the Data Disk for instance: {{ $labels.instance }} and PVC: {{ $labels.persistentvolumeclaim}} was below 10 percent for longer than 1 hour (current value {{ $value | printf "%.2f" }}).`}}' opsrecipe: low-disk-space/#persistent-volume - expr: kubelet_volume_stats_available_bytes{cluster_type="management_cluster", persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo).*"}/kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo).*"} < 0.10 + expr: kubelet_volume_stats_available_bytes{cluster_type="management_cluster", persistentvolumeclaim=~".*(alertmanager|grafana|loki|mimir|prometheus|pyroscope|tempo).*"}/kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(alertmanager|grafana|loki|mimir|prometheus|pyroscope|tempo).*"} < 
0.10 for: 1h labels: area: platform diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index 204fe5765..10b14e97f 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -142,7 +142,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -165,7 +165,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -207,7 +207,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -230,7 +230,7 @@ tests: summary: "Prometheus agent is missing shards." - alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m @@ -246,7 +246,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -269,7 +269,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -311,7 +311,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -334,5 +334,5 @@ tests: summary: "Prometheus agent is missing shards." 
- alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index 79e4a1fc7..b5d92ecc3 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -112,7 +112,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -135,7 +135,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -177,7 +177,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -200,7 +200,7 @@ tests: summary: "Prometheus agent is missing shards." - alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m @@ -216,7 +216,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -239,7 +239,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -281,7 +281,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -304,5 +304,5 @@ tests: summary: "Prometheus agent is missing shards." 
- alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index a2e3ed4bc..7497af50e 100644 --- a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -112,7 +112,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -135,7 +135,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -177,7 +177,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -200,7 +200,7 @@ tests: summary: "Prometheus agent is missing shards." - alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m @@ -216,7 +216,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -239,7 +239,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -281,7 +281,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -304,5 +304,5 @@ tests: summary: "Prometheus agent is missing shards." 
- alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m diff --git a/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index a2e3ed4bc..7497af50e 100644 --- a/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -112,7 +112,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -135,7 +135,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -177,7 +177,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -200,7 +200,7 @@ tests: summary: "Prometheus agent is missing shards." - alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m @@ -216,7 +216,7 @@ tests: alert_rule_test: - alertname: PrometheusAgentShardsMissing eval_time: 40m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 40m - alertname: PrometheusAgentShardsMissing eval_time: 120m @@ -239,7 +239,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 100m exp_alerts: - exp_labels: @@ -281,7 +281,7 @@ tests: description: "Prometheus agent is missing shards." opsrecipe: "prometheus-agent/" summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 125m exp_alerts: - exp_labels: @@ -304,5 +304,5 @@ tests: summary: "Prometheus agent is missing shards." 
- alertname: PrometheusAgentShardsMissing eval_time: 130m - - alertname: PrometheusAgentShardsMissingInhibition + - alertname: InhibitionPrometheusAgentShardsMissing eval_time: 130m From 55078dd91090b4a5824bfb1c4eb2602fdf505660 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Wed, 30 Oct 2024 12:15:43 +0100 Subject: [PATCH 03/24] add dashboard annotation --- .../turtles/alerting-rules/systemd.rules.yml | 2 +- .../atlas/alerting-rules/alloy.rules.yml | 7 +- .../deployment.management-cluster.rules.yml | 2 +- .../deployment.workload-cluster.rules.yml | 2 +- .../atlas/alerting-rules/logging.rules.yaml | 9 +- .../atlas/alerting-rules/monitoring.rules.yml | 74 ----- .../alerting-rules/prometheus-agent.rules.yml | 290 +++++++++--------- .../prometheus-operator.rules.yml | 2 +- .../alerting-rules/chart.rules.yml | 2 +- .../honeybadger/alerting-rules/helm.rules.yml | 2 +- .../recording-rules/helm-operations.rules.yml | 2 +- .../alerting-rules/logging.rules.test.yml | 229 ++++++++++++++ .../helm-operations.rules.test.yml | 2 +- 13 files changed, 391 insertions(+), 234 deletions(-) delete mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml create mode 100644 test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml index 370c1a1f6..a58297b73 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml @@ -10,7 +10,7 @@ spec: groups: - name: systemd rules: - ## TODO(@giantswarm/team-turtles) Update those lists when all vintage clusters are gone + ## TODO(@giantswarm/team-tenet) Update those lists when all vintage clusters are gone - alert: ClusterCriticalSystemdUnitFailed annotations: description: '{{`Critical systemd unit {{ $labels.name }} is failed on {{ $labels.instance }}.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index fae2026b7..edf5e61ce 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -11,7 +11,6 @@ spec: groups: ## TODO(quentin) add tests for the alerts ## TODO(quentin) add opsrecipe for the alerts - ## TODO(quentin) add dashboard annotation for the alerts # List of alerts for on the state of the alloy components. # Alerts are coming from https://github.com/grafana/alloy/blob/ed52746567d2469a6a97a592ac5aec807646b327/operations/alloy-mixin/alerts/controller.libsonnet # We added the alert labels and added the missing labels from the aggregations. @@ -19,9 +18,10 @@ spec: rules: - alert: AlloySlowComponentEvaluations annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: '{{`Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.`}}' - summary: Component evaluations are taking too long. opsrecipe: alloy-components/ + summary: Component evaluations are taking too long. 
expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m labels: @@ -35,9 +35,10 @@ spec: cancel_if_cluster_status_updating: "true" - alert: AlloyUnhealthyComponents annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: '{{`Unhealthy components detected under job {{ $labels.job }}`}}' - summary: Unhealthy components detected. opsrecipe: alloy-components/ + summary: Unhealthy components detected. expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m labels: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml index 54a070368..1f98fe451 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml @@ -95,7 +95,7 @@ spec: team: phoenix topic: managementcluster {{- if eq .Values.managementCluster.provider.flavor "vintage" }} - ## TODO Remove when all vintage clusters are gone + ## TODO(@giantswarm/team-atlas) Remove when all vintage clusters are gone - alert: AWSManagementClusterDeploymentScaledDownToZero annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} on AWS has been scaled down to zero for prolonged period of time.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml index 599682b91..3e26744c4 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml @@ -13,7 +13,7 @@ spec: groups: - name: deployment rules: - # TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. + # TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 
- alert: WorkloadClusterDeploymentNotSatisfied annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml index 71d96e782..5e34e77fc 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml @@ -7,15 +7,14 @@ metadata: namespace: {{ .Values.namespace }} spec: groups: - ## TODO(quentin) add tests for the alerts ## TODO(quentin) add opsrecipe for the alerts - ## TODO(quentin) add dashboard annotation for the alerts - name: logging-agent rules: # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) # and join the pods with the not running containers - alert: LoggingAgentDown annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}' opsrecipe: logging-agent/ expr: |- @@ -41,8 +40,9 @@ spec: # This includes alloy-logs and the observability-gateway - alert: LogForwardingErrors annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview description: '{{`More that 10% of the requests to Loki are failing.`}}' - opsrecipe: logging-errors/ + opsrecipe: log-shipping-errors/ expr: |- ( 100 @@ -78,8 +78,9 @@ spec: # This alert pages when the loki source api component of the observability gateway is throwing errors - alert: LogReceivingErrors annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview description: '{{`More that 10% of the loki requests to the observability gateway are failing.`}}' - opsrecipe: logging-errors/ + opsrecipe: log-shipping-errors/ expr: |- ( 100 diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml deleted file mode 100644 index 8ba7a3a51..000000000 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring.rules.yml +++ /dev/null @@ -1,74 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - {{- include "labels.common" . | nindent 4 }} - name: monitoring.rules - namespace: {{ .Values.namespace }} -spec: - groups: - ## TODO(quentin) add tests for the monitoring agent alerts - ## TODO(quentin) add opsrecipe for the monitoring agent alerts - ## TODO(quentin) add dashboard annotation for the monitoring agent alerts - ## TODO(quentin) replace PrometheusAgentShardsMissing for alloy-metrics - ## TODO(quentin) add component specific errors to replace the ones in the prometheus.rules.yml - - name: monitoring-agent - rules: - ## This alert pages if the monitoring-agent fails to send samples to its remote write endpoint. - - alert: MonitoringAgentFailing - annotations: - description: '{{`Monitoring agent fails to send its data via remote write.`}}' - summary: Monitoring agent fails to send samples to its configured remote write endpoint. 
- opsrecipe: monitoring-agent/ - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="alloy-metrics"} > 0) by (cluster_id) - ) - for: 20m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - ## Same as MonitoringAgentFailing, but triggers inhibition earlier and does not page. - - alert: InhibitionMonitoringAgentFailing - annotations: - description: '{{`Monitoring agent fails to send its data via remote write.`}}' - summary: Monitoring agent fails to send samples to its configured remote write endpoint. - opsrecipe: monitoring-agent/ - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index b1813bee9..b0c8e2186 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -7,150 +7,150 @@ metadata: namespace: {{ .Values.namespace }} spec: groups: - - name: prometheus-agent - rules: - ## This alert pages if prometheus-agent fails to send samples to its remote write endpoint. - - alert: PrometheusAgentFailing - annotations: - description: '{{`Prometheus agent remote write is failing.`}}' - summary: Prometheus agent fails to send samples to remote write endpoint. - opsrecipe: prometheus-agent/ - dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} - expr: |- - max_over_time( - sum by (cluster_type, cluster_id, installation, instance, service) - ( - up{instance="prometheus-agent"} == 0 - or - absent(up{instance="prometheus-agent"}) == 1 - )[5m:] - ) - {{- else }} - expr: |- + - name: prometheus-agent + rules: + ## This alert pages if prometheus-agent fails to send samples to its remote write endpoint. + - alert: PrometheusAgentFailing + annotations: + description: '{{`Prometheus agent remote write is failing.`}}' + summary: Prometheus agent fails to send samples to remote write endpoint. 
+ opsrecipe: prometheus-agent/ + dashboard: promRW001/prometheus-remote-write + {{- if not .Values.mimir.enabled }} + expr: |- + max_over_time( + sum by (cluster_type, cluster_id, installation, instance, service) ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} - for: 20m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - ## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page. - - alert: InhibitionPrometheusAgentFailing - annotations: - description: '{{`Prometheus agent remote write is failing.`}}' - summary: Prometheus agent fails to send samples to remote write endpoint. - opsrecipe: prometheus-agent/ - dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} - expr: |- - max_over_time( - sum by (cluster_type, cluster_id, installation, instance, service) - ( - up{instance="prometheus-agent"} == 0 - or - absent(up{instance="prometheus-agent"}) == 1 - )[5m:] - ) - {{- else }} - expr: |- + up{instance="prometheus-agent"} == 0 + or + absent(up{instance="prometheus-agent"}) == 1 + )[5m:] + ) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id) + ) + {{- end }} + for: 20m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + ## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page. + - alert: InhibitionPrometheusAgentFailing + annotations: + description: '{{`Prometheus agent remote write is failing.`}}' + summary: Prometheus agent fails to send samples to remote write endpoint. + opsrecipe: prometheus-agent/ + dashboard: promRW001/prometheus-remote-write + {{- if not .Values.mimir.enabled }} + expr: |- + max_over_time( + sum by (cluster_type, cluster_id, installation, instance, service) ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - ## This alert pages if one of the prometheus-agent shard is not running. - - alert: PrometheusAgentShardsMissing - annotations: - description: '{{`Prometheus agent is missing shards.`}}' - summary: Prometheus agent is missing shards. 
- opsrecipe: prometheus-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 40m - labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - ## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page. - - alert: InhibitionPrometheusAgentShardsMissing - annotations: - description: '{{`Prometheus agent is missing shards.`}}' - summary: Prometheus agent is missing shards. - opsrecipe: prometheus-agent/ - expr: |- - max_over_time(sum by (cluster_id, installation, provider, pipeline)( - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id, installation, provider, pipeline) - != - sum( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - # if there is only 1 shard, there is no shard metric so we use the replicas metric - or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id, installation, provider, pipeline) - )[5m:]) - for: 2m - labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" + up{instance="prometheus-agent"} == 0 + or + absent(up{instance="prometheus-agent"}) == 1 + )[5m:] + ) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id) + ) + {{- end }} + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + ## This alert pages if one of the prometheus-agent shard is not running. + - alert: PrometheusAgentShardsMissing + annotations: + description: '{{`Prometheus agent is missing shards.`}}' + summary: Prometheus agent is missing shards. 
+ opsrecipe: prometheus-agent/ + expr: |- + max_over_time(sum by (cluster_id, installation, provider, pipeline)( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} + ) by (cluster_id, installation, provider, pipeline) + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id, installation, provider, pipeline) + )[5m:]) + for: 40m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + ## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page. + - alert: InhibitionPrometheusAgentShardsMissing + annotations: + description: '{{`Prometheus agent is missing shards.`}}' + summary: Prometheus agent is missing shards. + opsrecipe: prometheus-agent/ + expr: |- + max_over_time(sum by (cluster_id, installation, provider, pipeline)( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} + ) by (cluster_id, installation, provider, pipeline) + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id, installation, provider, pipeline) + )[5m:]) + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml index 6628f6601..a9d130014 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml @@ -10,7 +10,7 @@ spec: groups: - name: prometheus-operator rules: - ## TODO(@giantswarm/team-atlas) remove once all clusters are passed v20 + ## TODO(@giantswarm/team-atlas) - remove once all clusters are passed v20 - alert: DuplicatePrometheusOperatorKubeletService annotations: description: '{{`Prometheus-operator in cluster {{ $labels.cluster_id }} has duplicate kubelet service.`}}' diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml index c9bc42ce4..1a1befca4 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml @@ -1,4 +1,4 @@ -# TODO - This is 
only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml index a07271c66..6ac690b2b 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml @@ -1,4 +1,4 @@ -# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml index a703dce91..2675857f0 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml @@ -1,4 +1,4 @@ -# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml new file mode 100644 index 000000000..2b0941d70 --- /dev/null +++ b/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml @@ -0,0 +1,229 @@ +--- +rule_files: + - logging.rules.yml + +tests: + # Test LoggingAgentDown + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: none, up, down + - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-1xxxx", provider="aws", pipeline="testing"}' + values: "_x20 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + # From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. 
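+      # Illustrative reading of promtool's `values` notation used in this test file:
+      #   `_x20`   -> a run of missing samples (the target has not been discovered yet),
+      #   `1+0x20` -> start at 1 and add 0 repeatedly, i.e. a flat run of 1s (target up),
+      #   `0+0x40` -> a flat run of 0s (target down).
+      # The series below repeat this pattern with two pods to exercise the multi-pod case.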
+ - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-2xxxx", provider="aws", pipeline="testing"}' + values: "_x80 1+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-3xxxx", provider="aws", pipeline="testing"}' + values: "_x80 0+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + alert_rule_test: + - alertname: LoggingAgentDown + eval_time: 10m + - alertname: LoggingAgentDown + eval_time: 30m + - alertname: LoggingAgentDown + eval_time: 71m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-1.eu-west-1.compute.internal + pipeline: testing + pod: alloy-1xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "logging-agent/" + # Tests with 2 pods + - alertname: LoggingAgentDown + eval_time: 111m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "logging-agent/" + - alertname: LoggingAgentDown + eval_time: 121m + - alertname: LoggingAgentDown + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-2.eu-west-1.compute.internal + pipeline: testing + pod: alloy-2xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." 
+ opsrecipe: "logging-agent/" + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "logging-agent/" + # Test LogForwardingErrors + - interval: 1m + input_series: + # Tests with multiple cases: no metrics, no requests, only status_code 204 ones, 204 ones and 500 that are less than 10% of the the total, 500 request that represent more than 10% of the total, only 500 ones + - series: 'loki_write_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+0x60 0+50x60 3000+100x60 9000+600x60" + - series: 'loki_write_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+600x60 36000+600x60 72000+600x60 108000+0x60" + alert_rule_test: + - alertname: LogForwardingErrors + eval_time: 30m + - alertname: LogForwardingErrors + eval_time: 90m + - alertname: LogForwardingErrors + eval_time: 150m + - alertname: LogForwardingErrors + eval_time: 210m + - alertname: LogForwardingErrors + eval_time: 270m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the requests to Loki are failing." + opsrecipe: "log-shipping-errors/" + - alertname: LogForwardingErrors + eval_time: 330m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the requests to Loki are failing." 
+ opsrecipe: "log-shipping-errors/" + # Test LogReceivingErrors + - interval: 1m + input_series: + # Tests with multiple cases: no metrics, no requests, only status_code 204 ones, 204 ones and 500 that are less than 10% of the the total, 500 request that represent more than 10% of the total, only 500 ones + - series: 'loki_source_api_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", route="api_v1_push", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+0x60 0+50x60 3000+100x60 9000+600x60" + - series: 'loki_source_api_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", route="api_v1_push", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+600x60 36000+600x60 72000+600x60 108000+0x60" + alert_rule_test: + - alertname: LogReceivingErrors + eval_time: 30m + - alertname: LogReceivingErrors + eval_time: 90m + - alertname: LogReceivingErrors + eval_time: 150m + - alertname: LogReceivingErrors + eval_time: 210m + - alertname: LogReceivingErrors + eval_time: 270m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the loki requests to the observability gateway are failing." + opsrecipe: "log-shipping-errors/" + - alertname: LogReceivingErrors + eval_time: 330m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the loki requests to the observability gateway are failing." + opsrecipe: "log-shipping-errors/" diff --git a/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml b/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml index 54a65b1a6..d20c15e33 100644 --- a/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml +++ b/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml @@ -1,4 +1,4 @@ -# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 
--- rule_files: - helm-operations.rules.yml From 759ae7f61e70b30b38c7ad25c546de540f996208 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 30 Oct 2024 16:38:40 +0100 Subject: [PATCH 04/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml --- .../templates/platform/atlas/alerting-rules/storage.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml index a1c006233..7b0798d5d 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml @@ -17,7 +17,7 @@ spec: annotations: description: '{{`The free space on the Data Disk for instance: {{ $labels.instance }} and PVC: {{ $labels.persistentvolumeclaim}} was below 10 percent for longer than 1 hour (current value {{ $value | printf "%.2f" }}).`}}' opsrecipe: low-disk-space/#persistent-volume - expr: kubelet_volume_stats_available_bytes{cluster_type="management_cluster", persistentvolumeclaim=~".*(alertmanager|grafana|loki|mimir|prometheus|pyroscope|tempo).*"}/kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(alertmanager|grafana|loki|mimir|prometheus|pyroscope|tempo).*"} < 0.10 + expr: kubelet_volume_stats_available_bytes{cluster_type="management_cluster", persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo).*"}/kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo).*"} < 0.10 for: 1h labels: area: platform From 9cee93af3dc58a28491e037e20084f3804c00b8e Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 30 Oct 2024 16:39:38 +0100 Subject: [PATCH 05/24] Update prometheus.rules.yml --- .../atlas/alerting-rules/prometheus.rules.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index 3a6e62302..b31713f90 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -27,6 +27,23 @@ spec: severity: page team: atlas topic: observability + ## Pages Atlas when prometheus fails to send samples to cortex + - alert: PrometheusMissingGrafanaCloud + annotations: + description: 'Prometheus is not sending data to Grafana Cloud.' 
+ opsrecipe: prometheus-grafanacloud/ + {{- if .Values.mimir.enabled }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) + {{- else }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) + {{- end }} + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI annotations: description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' From 14d67c3b174e22be2feeebb62383c69d8de2cdaf Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 30 Oct 2024 16:40:07 +0100 Subject: [PATCH 06/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml --- .../templates/platform/atlas/alerting-rules/alloy.rules.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index edf5e61ce..1365fa848 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -9,7 +9,6 @@ metadata: namespace: {{ .Values.namespace }} spec: groups: - ## TODO(quentin) add tests for the alerts ## TODO(quentin) add opsrecipe for the alerts # List of alerts for on the state of the alloy components. # Alerts are coming from https://github.com/grafana/alloy/blob/ed52746567d2469a6a97a592ac5aec807646b327/operations/alloy-mixin/alerts/controller.libsonnet From b9c1deab0a289ec8e2f0876c9be3888f0f3ecb0f Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 30 Oct 2024 16:40:24 +0100 Subject: [PATCH 07/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml --- .../templates/platform/atlas/alerting-rules/alloy.rules.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index 1365fa848..7c1270285 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -9,7 +9,6 @@ metadata: namespace: {{ .Values.namespace }} spec: groups: - ## TODO(quentin) add opsrecipe for the alerts # List of alerts for on the state of the alloy components. # Alerts are coming from https://github.com/grafana/alloy/blob/ed52746567d2469a6a97a592ac5aec807646b327/operations/alloy-mixin/alerts/controller.libsonnet # We added the alert labels and added the missing labels from the aggregations. 
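# For comparison with the upstream alloy-mixin referenced above: the upstream controller alerts
# aggregate only over Alloy's own labels, along the lines of
#   sum by (namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0
# whereas the copies in alloy.rules.yml also keep cluster_id, installation, provider and pipeline
# in the `sum by (...)` clause so that routing and cancel_if_* labels survive the aggregation.
# (The exact upstream label set shown here is an approximation.)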
From 09929b0d2b509b505804db7b1b988ef4969326d5 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 30 Oct 2024 16:40:41 +0100 Subject: [PATCH 08/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml --- .../templates/platform/atlas/alerting-rules/logging.rules.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml index 5e34e77fc..155b2cda0 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml @@ -7,7 +7,6 @@ metadata: namespace: {{ .Values.namespace }} spec: groups: - ## TODO(quentin) add opsrecipe for the alerts - name: logging-agent rules: # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) From 40452a5fa4122525fb15b0df5093f9c0e742228c Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Wed, 30 Oct 2024 16:41:28 +0100 Subject: [PATCH 09/24] add missing tests --- .../atlas/alerting-rules/alloy.rules.test.yml | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml new file mode 100644 index 000000000..62ad40ae6 --- /dev/null +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -0,0 +1,74 @@ +--- +rule_files: + - alloy.rules.yml + +tests: + # Test AlloySlowComponentEvaluations + - interval: 1m + input_series: + - series: 'alloy_component_evaluation_slow_seconds{cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", namespace="default", job="alloy-controller", component_path="path1", component_id="comp1"}' + values: "0+0x10 0+1x50 0x50" + alert_rule_test: + - alertname: AlloySlowComponentEvaluations + eval_time: 10m + - alertname: AlloySlowComponentEvaluations + eval_time: 50m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + namespace: default + job: alloy-controller + component_path: path1 + component_id: comp1 + severity: notify + team: atlas + topic: observability + exp_annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: "Component evaluations are taking too long under job alloy-controller, component_path path1, component_id comp1." + opsrecipe: "alloy-components/" + summary: "Component evaluations are taking too long." 
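+    # Illustrative: these unit tests can be run locally with promtool, e.g.
+    #   promtool test rules alloy.rules.test.yml
+    # assuming the rule file listed under rule_files has first been rendered from its Helm template.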
+ - alertname: AlloySlowComponentEvaluations + eval_time: 80m + + # Test AlloyUnhealthyComponents + - interval: 1m + input_series: + - series: 'alloy_component_controller_running_components{health_type="unhealthy", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", namespace="default", job="alloy-controller"}' + values: "0+0x10 1+0x50 0x50" + alert_rule_test: + - alertname: AlloyUnhealthyComponents + eval_time: 10m + - alertname: AlloyUnhealthyComponents + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + namespace: default + job: alloy-controller + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: "Unhealthy components detected under job alloy-controller" + opsrecipe: "alloy-components/" + summary: "Unhealthy components detected." + - alertname: AlloyUnhealthyComponents + eval_time: 80m From fbc9c8d61a0a5b43c31a215a7b9703dc84d429e4 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Mon, 4 Nov 2024 15:20:30 +0100 Subject: [PATCH 10/24] change based on ops-recipes --- .../atlas/alerting-rules/alloy.rules.yml | 30 ++++++++++++++-- ...rules.yaml => logging-pipeline.rules.yaml} | 34 +++---------------- .../atlas/alerting-rules/alloy.rules.test.yml | 4 +-- ...st.yml => logging-pipeline.rules.test.yml} | 26 +++++++------- 4 files changed, 47 insertions(+), 47 deletions(-) rename helm/prometheus-rules/templates/platform/atlas/alerting-rules/{logging.rules.yaml => logging-pipeline.rules.yaml} (71%) rename test/tests/providers/global/platform/atlas/alerting-rules/{logging.rules.test.yml => logging-pipeline.rules.test.yml} (92%) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index 7c1270285..aa1959ded 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -18,7 +18,7 @@ spec: annotations: dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: '{{`Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.`}}' - opsrecipe: alloy-components/ + opsrecipe: alloy/ summary: Component evaluations are taking too long. expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m @@ -35,7 +35,7 @@ spec: annotations: dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: '{{`Unhealthy components detected under job {{ $labels.job }}`}}' - opsrecipe: alloy-components/ + opsrecipe: alloy/ summary: Unhealthy components detected. 
expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m @@ -48,3 +48,29 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + - name: logging-agent + rules: + # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) + # and join the pods with the not running containers + - alert: LoggingAgentDown + annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview + description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}' + opsrecipe: alloy/ + expr: |- + kube_pod_info{pod=~"alloy-logs.*"} + * on(cluster_id, pod) + group_left () + up{job="alloy-logs", container="alloy"} == 0 + for: 30m + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml similarity index 71% rename from helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml index 155b2cda0..c45f70f42 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml @@ -3,37 +3,11 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - name: logging.rules + name: logging-pipeline.rules namespace: {{ .Values.namespace }} spec: groups: - - name: logging-agent - rules: - # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) - # and join the pods with the not running containers - - alert: LoggingAgentDown - annotations: - dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview - description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}' - opsrecipe: logging-agent/ - expr: |- - kube_pod_info{pod=~"alloy-logs.*"} - * on(cluster_id, pod) - group_left () - up{job="alloy-logs", container="alloy"} == 0 - for: 30m - labels: - area: platform - severity: page - team: atlas - topic: observability - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - - name: log-ingestion + - name: logging-pipeline rules: # Any alloy component that uses the loki.write component can throw such errors. 
# This includes alloy-logs and the observability-gateway @@ -41,7 +15,7 @@ spec: annotations: dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview description: '{{`More that 10% of the requests to Loki are failing.`}}' - opsrecipe: log-shipping-errors/ + opsrecipe: logging-pipeline/ expr: |- ( 100 @@ -79,7 +53,7 @@ spec: annotations: dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview description: '{{`More that 10% of the loki requests to the observability gateway are failing.`}}' - opsrecipe: log-shipping-errors/ + opsrecipe: logging-pipeline/ expr: |- ( 100 diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 62ad40ae6..2effa82d5 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -34,7 +34,7 @@ tests: exp_annotations: dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: "Component evaluations are taking too long under job alloy-controller, component_path path1, component_id comp1." - opsrecipe: "alloy-components/" + opsrecipe: "alloy/" summary: "Component evaluations are taking too long." - alertname: AlloySlowComponentEvaluations eval_time: 80m @@ -68,7 +68,7 @@ tests: exp_annotations: dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller description: "Unhealthy components detected under job alloy-controller" - opsrecipe: "alloy-components/" + opsrecipe: "alloy/" summary: "Unhealthy components detected." - alertname: AlloyUnhealthyComponents eval_time: 80m diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml similarity index 92% rename from test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml rename to test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml index 2b0941d70..31217a0a7 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml @@ -1,6 +1,6 @@ --- rule_files: - - logging.rules.yml + - logging-pipeline.rules.yml tests: # Test LoggingAgentDown @@ -47,8 +47,8 @@ tests: team: atlas topic: observability exp_annotations: - description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." - opsrecipe: "logging-agent/" + description: "Scraping of all alloy pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" # Tests with 2 pods - alertname: LoggingAgentDown eval_time: 111m @@ -72,8 +72,8 @@ tests: team: atlas topic: observability exp_annotations: - description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." - opsrecipe: "logging-agent/" + description: "Scraping of all alloy pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" - alertname: LoggingAgentDown eval_time: 121m - alertname: LoggingAgentDown @@ -98,8 +98,8 @@ tests: team: atlas topic: observability exp_annotations: - description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." - opsrecipe: "logging-agent/" + description: "Scraping of all alloy pods to check if one failed every 30 minutes." 
+ opsrecipe: "alloy/" - exp_labels: area: platform cancel_if_outside_working_hours: "true" @@ -119,8 +119,8 @@ tests: team: atlas topic: observability exp_annotations: - description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." - opsrecipe: "logging-agent/" + description: "Scraping of all alloy pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" # Test LogForwardingErrors - interval: 1m input_series: @@ -155,7 +155,7 @@ tests: topic: observability exp_annotations: description: "More that 10% of the requests to Loki are failing." - opsrecipe: "log-shipping-errors/" + opsrecipe: "logging-pipeline/" - alertname: LogForwardingErrors eval_time: 330m exp_alerts: @@ -173,7 +173,7 @@ tests: topic: observability exp_annotations: description: "More that 10% of the requests to Loki are failing." - opsrecipe: "log-shipping-errors/" + opsrecipe: "logging-pipeline/" # Test LogReceivingErrors - interval: 1m input_series: @@ -208,7 +208,7 @@ tests: topic: observability exp_annotations: description: "More that 10% of the loki requests to the observability gateway are failing." - opsrecipe: "log-shipping-errors/" + opsrecipe: "logging-pipeline/" - alertname: LogReceivingErrors eval_time: 330m exp_alerts: @@ -226,4 +226,4 @@ tests: topic: observability exp_annotations: description: "More that 10% of the loki requests to the observability gateway are failing." - opsrecipe: "log-shipping-errors/" + opsrecipe: "logging-pipeline/" From 9e726643210d6c065e14351140a52ce4f16a4a4c Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Tue, 5 Nov 2024 11:53:41 +0100 Subject: [PATCH 11/24] Clean up some rules a bit --- CHANGELOG.md | 3 +- .../atlas/alerting-rules/alloy.rules.yml | 21 ++- ...rter.rules.yml => grafana-cloud.rules.yml} | 30 +++- .../atlas/alerting-rules/grafana.rules.yml | 4 +- .../kube-state-metrics.rules.yml | 1 - .../atlas/alerting-rules/mimir.rules.yml | 15 -- .../atlas/alerting-rules/prometheus.rules.yml | 18 -- ....rules.test.yml => grafana-cloud.test.yml} | 2 +- .../atlas/alerting-rules/mimir.rules.test.yml | 29 ---- .../atlas/alerting-rules/alloy.rules.test.yml | 154 ++++++++++++++++++ .../logging-pipeline.rules.test.yml | 118 -------------- 11 files changed, 204 insertions(+), 191 deletions(-) rename helm/prometheus-rules/templates/platform/atlas/alerting-rules/{mimir-to-grafana-cloud-exporter.rules.yml => grafana-cloud.rules.yml} (74%) rename test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/{mimir-to-grafana-cloud-exporter.rules.test.yml => grafana-cloud.test.yml} (99%) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9ef272bf..92d0a37ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `LoggingAgentDown` to be alerted when the logging agent is down. - `LogForwardingErrors` to be alerted when the `loki.write` component is failing. - `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway is failing. - - `MonitoringAgentFailing` and `InhibitionMonitoringAgentFailing` to be alerted when the monitoring agent is not able to send metrics. ### Changed @@ -22,6 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `observability-operator` - `alloy-rules` - `observability-gateway` +- Move all `grafana-cloud` related alerts to their own file. +- Move all alloy related alerts to the alloy alert file and fix alloy-logs tests. 
## [4.23.0] - 2024-10-30 diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index aa1959ded..8b3e6256c 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -6,7 +6,7 @@ metadata: labels: {{- include "labels.common" . | nindent 4 }} name: alloy.rules - namespace: {{ .Values.namespace }} + namespace: {{ .Values.namespace }} spec: groups: # List of alerts for on the state of the alloy components. @@ -48,7 +48,24 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - - name: logging-agent + - name: alloy.rules + rules: + - alert: AlloyForPrometheusRulesDown + annotations: + description: 'Alloy sending PrometheusRules to Loki and Mimir ruler is down.' + opsrecipe: prometheus-rules/ + expr: count(up{job="alloy-rules", namespace="monitoring"} == 0) by (cluster_id, installation, provider, pipeline) > 0 + for: 1h + labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - name: alloy.logs rules: # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) # and join the pods with the not running containers diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml similarity index 74% rename from helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml index 40d76d3d2..9560570ef 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml @@ -1,13 +1,35 @@ -{{- if .Values.mimir.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - name: mimir-to-grafana-cloud-exporter.rules - namespace: {{ .Values.namespace }} + {{- if not .Values.mimir.enabled }} + cluster_type: "management_cluster" + {{- end }} + name: grafana-cloud.rules + namespace: {{ .Values.namespace }} spec: groups: + - name: grafana-cloud + rules: + ## Pages Atlas when prometheus fails to send samples to cortex + - alert: PrometheusMissingGrafanaCloud + annotations: + description: 'Prometheus is not sending data to Grafana Cloud.' 
+ opsrecipe: prometheus-grafanacloud/ + {{- if .Values.mimir.enabled }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) + {{- else }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) + {{- end }} + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + {{- if .Values.mimir.enabled }} - name: mimir-to-grafana-cloud-exporter rules: - alert: MimirToGrafanaCloudExporterDown @@ -73,4 +95,4 @@ spec: severity: page team: atlas topic: observability -{{- end }} + {{- end }} diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml index 39fb4a0a0..97a10780b 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml @@ -3,9 +3,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if not .Values.mimir.enabled }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: grafana.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml index 6c90a4e2c..83089fc33 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml @@ -85,7 +85,6 @@ spec: severity: page team: atlas topic: observability - - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index cd47324a8..6dc137889 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -61,21 +61,6 @@ spec: severity: page team: atlas topic: observability - - alert: AlloyForPrometheusRulesDown - annotations: - description: 'Alloy sending PrometheusRules to Mimir ruler is down.' 
- opsrecipe: prometheus-rules/ - expr: count(up{job="alloy-rules", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0 - for: 1h - labels: - area: platform - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: MimirRulerEventsFailed annotations: dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index b31713f90..a0bd48fe9 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} name: prometheus.rules @@ -27,23 +26,6 @@ spec: severity: page team: atlas topic: observability - ## Pages Atlas when prometheus fails to send samples to cortex - - alert: PrometheusMissingGrafanaCloud - annotations: - description: 'Prometheus is not sending data to Grafana Cloud.' - opsrecipe: prometheus-grafanacloud/ - {{- if .Values.mimir.enabled }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) - {{- else }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) - {{- end }} - for: 1h - labels: - area: platform - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI annotations: description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml similarity index 99% rename from test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml rename to test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml index ee5645cf0..79c5aa0f1 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml @@ -1,6 +1,6 @@ --- rule_files: -- mimir-to-grafana-cloud-exporter.rules.yml +- grafana-cloud.rules.yml tests: # Tests for `MimirToGrafanaCloudExporterDown` alert diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml index 37d40af1d..6bdfeaeab 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -86,35 +86,6 @@ tests: dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview 
description: "Mimir component : mimir-ingester is down." opsrecipe: "mimir/" - - interval: 1m - input_series: - # test with 1 pod: none, up, down - - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="mimir"}' - values: "_x20 1+0x70 0+0x70" - alert_rule_test: - - alertname: AlloyForPrometheusRulesDown - eval_time: 10m - - alertname: AlloyForPrometheusRulesDown - eval_time: 80m - - alertname: AlloyForPrometheusRulesDown - eval_time: 160m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cluster_id: golem - installation: golem - provider: capa - pipeline: testing - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Alloy sending PrometheusRules to Mimir ruler is down." - opsrecipe: "prometheus-rules/" - interval: 1m input_series: # test: none, rate > 0, rate = 0 diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 2effa82d5..d8b9309a5 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -72,3 +72,157 @@ tests: summary: "Unhealthy components detected." - alertname: AlloyUnhealthyComponents eval_time: 80m + + # Test AlloyForPrometheusRulesDown + - interval: 1m + input_series: + # test with 1 pod: none, up, down + - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="monitoring"}' + values: "_x20 1+0x70 0+0x70" + alert_rule_test: + - alertname: AlloyForPrometheusRulesDown + eval_time: 10m + - alertname: AlloyForPrometheusRulesDown + eval_time: 80m + - alertname: AlloyForPrometheusRulesDown + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + provider: capa + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Alloy sending PrometheusRules to Loki and Mimir ruler is down." + opsrecipe: "prometheus-rules/" + + # Test LoggingAgentDown + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: none, up, down + - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-1xxxx", provider="aws", pipeline="testing"}' + values: "_x20 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + # From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. 
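+      # Illustrative note on the LoggingAgentDown expression exercised here: the rule is, roughly,
+      #   kube_pod_info * on(cluster_id, pod) group_left() up{job="alloy-logs", container="alloy"} == 0
+      # With group_left() the result keeps kube_pod_info's labels (notably `node`), and since
+      # kube_pod_info has value 1 the product is 0 exactly when the matching `up` target is down,
+      # so `== 0` selects the down agents while preserving the node label used for inhibitions.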
+ - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-2xxxx", provider="aws", pipeline="testing"}' + values: "_x80 1+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-logs-3xxxx", provider="aws", pipeline="testing"}' + values: "_x80 0+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + alert_rule_test: + - alertname: LoggingAgentDown + eval_time: 10m + - alertname: LoggingAgentDown + eval_time: 30m + - alertname: LoggingAgentDown + eval_time: 71m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-1.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-1xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + # Tests with 2 pods + - alertname: LoggingAgentDown + eval_time: 111m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + - alertname: LoggingAgentDown + eval_time: 121m + - alertname: LoggingAgentDown + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-2.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-2xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." 
+ opsrecipe: "alloy/" + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml index 31217a0a7..fccbfa5a1 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml @@ -3,124 +3,6 @@ rule_files: - logging-pipeline.rules.yml tests: - # Test LoggingAgentDown - - interval: 1m - input_series: - # For the first 60min: test with 1 pod: none, up, down - - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-1xxxx", provider="aws", pipeline="testing"}' - values: "_x20 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - # From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. - - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-2xxxx", provider="aws", pipeline="testing"}' - values: "_x80 1+0x40 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-3xxxx", provider="aws", pipeline="testing"}' - values: "_x80 0+0x40 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - alert_rule_test: - - alertname: LoggingAgentDown - eval_time: 10m - - alertname: LoggingAgentDown - eval_time: 30m - - alertname: LoggingAgentDown - eval_time: 71m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-1.eu-west-1.compute.internal - pipeline: testing - pod: alloy-1xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." 
- opsrecipe: "alloy/" - # Tests with 2 pods - - alertname: LoggingAgentDown - eval_time: 111m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-3.eu-west-1.compute.internal - pipeline: testing - pod: alloy-3xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" - - alertname: LoggingAgentDown - eval_time: 121m - - alertname: LoggingAgentDown - eval_time: 180m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-2.eu-west-1.compute.internal - pipeline: testing - pod: alloy-2xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-3.eu-west-1.compute.internal - pipeline: testing - pod: alloy-3xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" # Test LogForwardingErrors - interval: 1m input_series: From b7d53b3b9fa88f1c9adb78c2a1536e02c3365aec Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 11:58:44 +0100 Subject: [PATCH 12/24] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9ef272bf..8913d9edc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `LoggingAgentDown` to be alerted when the logging agent is down. - `LogForwardingErrors` to be alerted when the `loki.write` component is failing. - `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway is failing. - - `MonitoringAgentFailing` and `InhibitionMonitoringAgentFailing` to be alerted when the monitoring agent is not able to send metrics. 
### Changed From 1d49161dafc4edb7e726813909be5ffa1bde2851 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 11:59:12 +0100 Subject: [PATCH 13/24] Update helm-operations.rules.yml --- .../honeybadger/recording-rules/helm-operations.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml index 2675857f0..a703dce91 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml @@ -1,4 +1,4 @@ -# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: From 868779ee37fb7266bb3e0d6516b12076a4e965f8 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 11:59:34 +0100 Subject: [PATCH 14/24] Update systemd.rules.yml --- .../templates/kaas/turtles/alerting-rules/systemd.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml index a58297b73..370c1a1f6 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/systemd.rules.yml @@ -10,7 +10,7 @@ spec: groups: - name: systemd rules: - ## TODO(@giantswarm/team-tenet) Update those lists when all vintage clusters are gone + ## TODO(@giantswarm/team-turtles) Update those lists when all vintage clusters are gone - alert: ClusterCriticalSystemdUnitFailed annotations: description: '{{`Critical systemd unit {{ $labels.name }} is failed on {{ $labels.instance }}.`}}' From c15aab7116d73b073592d54f1097665d46ac1d00 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 12:00:00 +0100 Subject: [PATCH 15/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml --- .../atlas/alerting-rules/deployment.workload-cluster.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml index afbec4e1a..fa9087331 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml @@ -13,7 +13,7 @@ spec: groups: - name: deployment rules: - # TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. + # TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 
- alert: WorkloadClusterDeploymentNotSatisfied annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' From 393738de9af960358b6c22b97d2594d8424f1d75 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 12:00:25 +0100 Subject: [PATCH 16/24] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml --- .../platform/atlas/alerting-rules/prometheus-operator.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml index a9d130014..6628f6601 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml @@ -10,7 +10,7 @@ spec: groups: - name: prometheus-operator rules: - ## TODO(@giantswarm/team-atlas) - remove once all clusters are passed v20 + ## TODO(@giantswarm/team-atlas) remove once all clusters are passed v20 - alert: DuplicatePrometheusOperatorKubeletService annotations: description: '{{`Prometheus-operator in cluster {{ $labels.cluster_id }} has duplicate kubelet service.`}}' From 54f9f7217871f6f58f8cac8a74b87f53874a83de Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 12:00:50 +0100 Subject: [PATCH 17/24] Update helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml --- .../platform/honeybadger/alerting-rules/chart.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml index 99048fe7b..1a584734b 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml @@ -1,4 +1,4 @@ -# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: From bb9abda0caf0c9cfec17e5317739f6e4020fca6e Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 12:01:09 +0100 Subject: [PATCH 18/24] Update helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml --- .../platform/honeybadger/alerting-rules/helm.rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml index 6ac690b2b..a07271c66 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml @@ -1,4 +1,4 @@ -# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 
apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: From 068d45dfd1d6cfdef6b3793031d60618e5529697 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 5 Nov 2024 12:01:27 +0100 Subject: [PATCH 19/24] Update test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml --- .../honeybadger/alerting-rules/helm-operations.rules.test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml b/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml index d20c15e33..54a65b1a6 100644 --- a/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml +++ b/test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml @@ -1,4 +1,4 @@ -# TODO(@giantswarm/team-honeybadger) - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. +# TODO - This is only used by the chart-operator, let's get rid of it when the chart operator is gone. --- rule_files: - helm-operations.rules.yml From 2f9c07c61cb749c48219853238576e1ea271a702 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Tue, 5 Nov 2024 22:35:36 +0100 Subject: [PATCH 20/24] add alerts for alloy-metrics --- CHANGELOG.md | 6 ++ .../atlas/alerting-rules/alloy.rules.yml | 102 +++++++++++++++++- .../monitoring-pipeline.rules.yml | 80 ++++++++++++++ .../atlas/alerting-rules/prometheus.rules.yml | 59 +--------- .../atlas/alerting-rules/alloy.rules.test.yml | 35 ++++++ ...yml => monitoring-pipeline.rules.test.yml} | 60 ++++++++--- 6 files changed, 266 insertions(+), 76 deletions(-) create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml rename test/tests/providers/global/platform/atlas/alerting-rules/{prometheus.rules.test.yml => monitoring-pipeline.rules.test.yml} (58%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 92d0a37ec..b90b0378a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `LoggingAgentDown` to be alerted when the logging agent is down. - `LogForwardingErrors` to be alerted when the `loki.write` component is failing. - `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway is failing. + - `MonitoringAgentDown` to be alerted when the monitoring agent is down. + - `MonitoringAgentShardsNotSatisfied` to be alerted when the monitoring agent is missing any number of desired shards. ### Changed @@ -23,6 +25,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `observability-gateway` - Move all `grafana-cloud` related alerts to their own file. - Move all alloy related alerts to the alloy alert file and fix alloy-logs tests. 
+- Rename and move the following alerts as they are not specific to Prometheus: + - `PrometheusCriticalJobScrapingFailure` => `CriticalJobScrapingFailure` + - `PrometheusJobScrapingFailure` => `JobScrapingFailure` + - `PrometheusFailsToCommunicateWithRemoteStorageAPI` => `MetricForwardingErrors` ## [4.23.0] - 2024-10-30 diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index 8b3e6256c..7d984b2dd 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -1,5 +1,5 @@ # This files describe common alloy alerting rules -# For alerts regarding monitoring and logging agents, please go to the respective files (logging.rules.yml and monitoring.rules.yml). +# For alerts regarding the monitoring pipeline and the logging pipeline, please go to the respective files (logging-pipeline.rules.yml and monitoring-pipeline.rules.yml). apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -91,3 +91,103 @@ spec: cancel_if_cluster_status_updating: "true" cancel_if_node_unschedulable: "true" cancel_if_node_not_ready: "true" + - name: alloy.metrics + rules: + # This alert pages if monitoring-agent fails to send samples to its remote write endpoint. + - alert: MonitoringAgentDown + annotations: + description: '{{`Monitoring agent fails to send samples.`}}' + summary: Monitoring agent fails to send samples to remote write endpoint. + opsrecipe: alloy/#monitoring-agent-down + dashboard: promRW001/prometheus-remote-write + expr: |- + count( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) by (cluster_id, installation, pipeline, provider) > 0 + unless on (cluster_id) ( + count(up{job="alloy-metrics"} > 0) by (cluster_id) + ) + for: 20m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + ## Same as MonitoringAgentDown, but triggers inhibition earlier and does not page. + - alert: InhibitionMonitoringAgentDown + annotations: + description: '{{`Monitoring agent fails to send samples.`}}' + summary: Monitoring agent fails to send samples to remote write endpoint. + opsrecipe: alloy/#monitoring-agent-down + dashboard: promRW001/prometheus-remote-write + expr: |- + count( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) by (cluster_id, installation, pipeline, provider) > 0 + unless on (cluster_id) ( + count(up{job="alloy-metrics"} > 0) by (cluster_id) + ) + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + ## This alert pages if any of the monitoring-agent shard is not running. + - alert: MonitoringAgentShardsNotSatisfied + annotations: + description: '{{`At least one of the monitoring agent shard is missing.`}}' + summary: Missing agent is missing shards. 
+ opsrecipe: alloy/#monitoring-agent-down + expr: |- + kube_statefulset_status_replicas{statefulset="alloy-metrics"} + - kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"} + > 0 + for: 40m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + ## Same as MonitoringAgentShardsNotSatisfied but triggers inhibition earlier, and does not page. + - alert: InhibitionMonitoringAgentShardsNotSatisfied + annotations: + description: '{{`At least one of the monitoring agent shard is missing.`}}' + summary: Missing agent is missing shards. + opsrecipe: alloy/#monitoring-agent-down + expr: |- + kube_statefulset_status_replicas{statefulset="alloy-metrics"} + - kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"} + > 0 + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml new file mode 100644 index 000000000..e666ea277 --- /dev/null +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml @@ -0,0 +1,80 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "labels.common" . | nindent 4 }} + name: monitoring-pipeline.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: monitoring-pipeline + rules: + - alert: MetricForwardingErrors + annotations: + description: '{{`Monitoring agent can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' + opsrecipe: monitoring-pipeline/ + dashboard: promRW001/prometheus-remote-write + expr: |- + rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1 + or rate(prometheus_remote_storage_samples_total[10m]) == 0 + or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0 + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - alert: JobScrapingFailure + annotations: + dashboard: servicemonitors-details/servicemonitors-details + description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}' + summary: Monitoring agent failed to scrape all targets in a job. + opsrecipe: monitoring-job-scraping-failure/ + expr: |- + ( + count(up == 0) by (job, installation, cluster_id, provider, pipeline) + / + count(up) by (job, installation, cluster_id, provider, pipeline) + ) >= 1 + for: 1d + labels: + area: platform + severity: notify + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + - alert: CriticalJobScrapingFailure + annotations: + dashboard: servicemonitors-details/servicemonitors-details + description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}' + summary: Monitoring agent failed to scrape all targets in a job. 
+ opsrecipe: monitoring-job-scraping-failure/ + ## We ignore bastion hosts node exporters + expr: |- + ( + count( + ( + up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"} + or + up{job="kubelet", metrics_path="/metrics"} + ) == 0 + ) by (job, installation, cluster_id, provider, pipeline) + / + count( + up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"} + or + up{job="kubelet", metrics_path="/metrics"} + ) by (job, installation, cluster_id, provider, pipeline) + ) >= 1 + for: 3d + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index a0bd48fe9..7b48759a8 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -1,3 +1,4 @@ +# TODO(@giantswarm/team-atlas): revisit once vintage is gone apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -26,19 +27,6 @@ spec: severity: page team: atlas topic: observability - - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI - annotations: - description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' - opsrecipe: prometheus-cant-communicate-with-remote-storage-api/ - dashboard: promRW001/prometheus-remote-write - expr: rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1 or rate(prometheus_remote_storage_samples_total[10m]) == 0 or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0 - for: 1h - labels: - area: platform - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: PrometheusRuleFailures annotations: description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to evaluate rule(s) {{ printf "%.2f" $value }} time(s).`}} @@ -52,48 +40,3 @@ spec: team: atlas topic: observability cancel_if_outside_working_hours: "true" - - alert: PrometheusJobScrapingFailure - annotations: - description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}} - summary: Prometheus fails to scrape all targets in a job. - opsrecipe: prometheus-job-scraping-failure/ - expr: (count(up == 0) BY (job, installation, cluster_id, provider, pipeline) / count(up) BY (job, installation, cluster_id, provider, pipeline)) == 1 - for: 1d - labels: - area: platform - severity: notify - team: atlas - topic: observability - cancel_if_outside_working_hours: "true" - - alert: PrometheusCriticalJobScrapingFailure - annotations: - description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}} - summary: Prometheus fails to scrape all targets in a job. 
- opsrecipe: prometheus-job-scraping-failure/ - ## We ignore bastion hosts node exporters - expr: |- - ( - count( - ( - up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"} - or - up{job="kubelet", metrics_path="/metrics"} - ) == 0 - ) BY (job, installation, cluster_id, provider, pipeline) - / - count( - up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"} - or - up{job="kubelet", metrics_path="/metrics"} - ) BY (job, installation, cluster_id, provider, pipeline) - ) == 1 - for: 3d - labels: - area: platform - severity: page - team: atlas - topic: observability - cancel_if_outside_working_hours: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index d8b9309a5..90e75a3fe 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -86,6 +86,7 @@ tests: eval_time: 80m - alertname: AlloyForPrometheusRulesDown eval_time: 160m + exp_alerts: - exp_labels: area: platform @@ -226,3 +227,37 @@ tests: dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." opsrecipe: "alloy/" + + # Test MonitoringAgentDown + - interval: 1m + input_series: + - series: 'up{job="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "_x20 1+0x70 0+0x70" + - series: 'capi_cluster_status_condition{type="ControlPlaneReady", status="True", name="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "1x150" + alert_rule_test: + - alertname: MonitoringAgentDown + eval_time: 10m + - alertname: MonitoringAgentDown + eval_time: 80m + - alertname: MonitoringAgentDown + eval_time: 140m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml similarity index 58% rename from test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml rename to test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml index 77cdd2167..ad97acbb7 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml @@ -1,13 +1,13 @@ --- rule_files: - - prometheus.rules.yml + - monitoring-pipeline.rules.yml # Setting evaluation interval to 1h # to make it faster on long test duration. 
evaluation_interval: 1h tests: - # Test PrometheusJobScrapingFailure and PrometheusCriticalJobScrapingFailure + # Test JobScrapingFailure and CriticalJobScrapingFailure - interval: 1h input_series: - series: 'up{job="apiserver", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' @@ -30,14 +30,14 @@ tests: - series: 'up{job="app-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x120 0+0x120" alert_rule_test: - - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 30m - - alertname: PrometheusJobScrapingFailure + - alertname: JobScrapingFailure eval_time: 1d - - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 4d # This alert fires for both critical and non-critical targets - - alertname: PrometheusJobScrapingFailure + - alertname: JobScrapingFailure eval_time: 7d exp_alerts: - exp_labels: @@ -52,9 +52,10 @@ tests: pipeline: "testing" job: "kube-controller-manager" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in kube-controller-manager job." + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in kube-controller-manager job." - exp_labels: area: platform severity: notify @@ -67,12 +68,13 @@ tests: pipeline: "testing" job: "app-exporter" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in app-exporter job." - + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in app-exporter job." + # This fires only for critical target down. - - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 9d exp_alerts: - exp_labels: @@ -90,6 +92,30 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in kube-controller-manager job." + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in kube-controller-manager job." 
+ + + # Test MetricForwardingErrors + - interval: 1m + input_series: + # remote write has no failure for 1 hour and then fails for 2 hours + - series: 'prometheus_remote_storage_samples_failed_total{url="http://remote-storage_samples_failed_total"}' + values: "0+0x60 0+100x120" + alert_rule_test: + - alertname: MetricForwardingErrors + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + url: "http://remote-storage_samples_failed_total" + exp_annotations: + description: "Monitoring agent can't communicate with Remote Storage API at http://remote-storage_samples_failed_total." + opsrecipe: "monitoring-pipeline/" + dashboard: "promRW001/prometheus-remote-write" From 4f9e241321c55610620453610a2cd2baccdefb50 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Thu, 7 Nov 2024 20:52:54 +0100 Subject: [PATCH 21/24] improve monitoring agent down tests --- .../atlas/alerting-rules/alloy.rules.test.yml | 85 ++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 598b51ce2..749f9d916 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -230,14 +230,77 @@ tests: - interval: 1m input_series: - series: 'up{job="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' - values: "_x20 1+0x70 0+0x70" + values: "_x40 1+0x50 0+0x70" - series: 'capi_cluster_status_condition{type="ControlPlaneReady", status="True", name="gauss", installation="gauss", provider="aws", pipeline="testing"}' values: "1x150" alert_rule_test: - alertname: MonitoringAgentDown eval_time: 10m + - alertname: InhibitionMonitoringAgentDown + eval_time: 10m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." + - alertname: MonitoringAgentDown + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." 
+ - alertname: InhibitionMonitoringAgentDown + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." - alertname: MonitoringAgentDown eval_time: 80m + - alertname: InhibitionMonitoringAgentDown + eval_time: 80m - alertname: MonitoringAgentDown eval_time: 140m exp_alerts: @@ -259,3 +322,23 @@ tests: opsrecipe: "alloy/#monitoring-agent-down" dashboard: "promRW001/prometheus-remote-write" summary: "Monitoring agent fails to send samples to remote write endpoint." + - alertname: InhibitionMonitoringAgentDown + eval_time: 140m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." From 553a1a49dd525938c3abec0c940345f1e3813d61 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Thu, 7 Nov 2024 21:03:03 +0100 Subject: [PATCH 22/24] improve monitoring agent shards not satisfied tests --- .../atlas/alerting-rules/alloy.rules.yml | 4 +- .../atlas/alerting-rules/alloy.rules.test.yml | 80 +++++++++++++++++++ 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index 13b1a3d85..fc364f285 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -156,7 +156,7 @@ spec: - alert: MonitoringAgentShardsNotSatisfied annotations: description: '{{`At least one of the monitoring agent shard is missing.`}}' - summary: Missing agent is missing shards. + summary: Monitoring agent is missing shards. opsrecipe: alloy/#monitoring-agent-down expr: |- kube_statefulset_status_replicas{statefulset="alloy-metrics"} @@ -176,7 +176,7 @@ spec: - alert: InhibitionMonitoringAgentShardsNotSatisfied annotations: description: '{{`At least one of the monitoring agent shard is missing.`}}' - summary: Missing agent is missing shards. + summary: Monitoring agent is missing shards. 
opsrecipe: alloy/#monitoring-agent-down expr: |- kube_statefulset_status_replicas{statefulset="alloy-metrics"} diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 749f9d916..40aa3e248 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -342,3 +342,83 @@ tests: opsrecipe: "alloy/#monitoring-agent-down" dashboard: "promRW001/prometheus-remote-write" summary: "Monitoring agent fails to send samples to remote write endpoint." + + # Test MonitoringAgentShardsNotSatisfied + - interval: 1m + input_series: + - series: 'kube_statefulset_status_replicas{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "3+0x50 3+0x50 3+0x50" + - series: 'kube_statefulset_status_replicas_ready{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "3+0x10 2+0x90 3+0x50" + alert_rule_test: + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 10m + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 30m + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 30m + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." + opsrecipe: "alloy/#monitoring-agent-down" + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 60m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." + opsrecipe: "alloy/#monitoring-agent-down" + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 60m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." 
+ opsrecipe: "alloy/#monitoring-agent-down" + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 130m + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 130m From c7b460b7ae8b301566d6be15294c692f05f7e2f3 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Thu, 7 Nov 2024 23:21:58 +0100 Subject: [PATCH 23/24] Update test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hervé Nicol --- .../global/platform/atlas/alerting-rules/alloy.rules.test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 40aa3e248..36c8bc5f5 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -347,7 +347,7 @@ tests: - interval: 1m input_series: - series: 'kube_statefulset_status_replicas{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' - values: "3+0x50 3+0x50 3+0x50" + values: "3+0x10 3+0x90 3+0x50" - series: 'kube_statefulset_status_replicas_ready{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' values: "3+0x10 2+0x90 3+0x50" alert_rule_test: From b476eac9548fdda9a36aa31d168be63f1b980186 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Thu, 7 Nov 2024 23:22:13 +0100 Subject: [PATCH 24/24] Update test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hervé Nicol --- .../global/platform/atlas/alerting-rules/alloy.rules.test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 36c8bc5f5..98549b422 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -355,8 +355,6 @@ tests: eval_time: 10m - alertname: MonitoringAgentShardsNotSatisfied eval_time: 30m - - alertname: MonitoringAgentShardsNotSatisfied - eval_time: 30m - alertname: InhibitionMonitoringAgentShardsNotSatisfied eval_time: 30m exp_alerts: