From fbc9c8d61a0a5b43c31a215a7b9703dc84d429e4 Mon Sep 17 00:00:00 2001
From: QuentinBisson
Date: Mon, 4 Nov 2024 15:20:30 +0100
Subject: [PATCH] change based on ops-recipes

---
 .../atlas/alerting-rules/alloy.rules.yml      | 30 ++++++++++++++--
 ...rules.yaml => logging-pipeline.rules.yaml} | 34 +++----------------
 .../atlas/alerting-rules/alloy.rules.test.yml |  4 +--
 ...st.yml => logging-pipeline.rules.test.yml} | 26 +++++++-------
 4 files changed, 47 insertions(+), 47 deletions(-)
 rename helm/prometheus-rules/templates/platform/atlas/alerting-rules/{logging.rules.yaml => logging-pipeline.rules.yaml} (71%)
 rename test/tests/providers/global/platform/atlas/alerting-rules/{logging.rules.test.yml => logging-pipeline.rules.test.yml} (92%)

diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
index 7c127028..aa1959de 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml
@@ -18,7 +18,7 @@ spec:
           annotations:
             dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
             description: '{{`Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.`}}'
-            opsrecipe: alloy-components/
+            opsrecipe: alloy/
             summary: Component evaluations are taking too long.
           expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0
           for: 15m
@@ -35,7 +35,7 @@
           annotations:
             dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
             description: '{{`Unhealthy components detected under job {{ $labels.job }}`}}'
-            opsrecipe: alloy-components/
+            opsrecipe: alloy/
             summary: Unhealthy components detected.
           expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0
           for: 15m
@@ -48,3 +48,29 @@ spec:
             cancel_if_cluster_status_creating: "true"
             cancel_if_cluster_status_deleting: "true"
             cancel_if_cluster_status_updating: "true"
+    - name: logging-agent
+      rules:
+        # This alert lists the existing logging-agent pods (to extract the node label and inhibit the alert if the node is not ready)
+        # and joins them with the containers that are not running
+        - alert: LoggingAgentDown
+          annotations:
+            dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
+            description: '{{`Scraping of all alloy pods to check if one failed every 30 minutes.`}}'
+            opsrecipe: alloy/
+          expr: |-
+            kube_pod_info{pod=~"alloy-logs.*"}
+            * on(cluster_id, pod)
+            group_left ()
+            up{job="alloy-logs", container="alloy"} == 0
+          for: 30m
+          labels:
+            area: platform
+            severity: page
+            team: atlas
+            topic: observability
+            cancel_if_outside_working_hours: "true"
+            cancel_if_cluster_status_creating: "true"
+            cancel_if_cluster_status_deleting: "true"
+            cancel_if_cluster_status_updating: "true"
+            cancel_if_node_unschedulable: "true"
+            cancel_if_node_not_ready: "true"
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml
similarity index 71%
rename from helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml
rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml
index 155b2cda..c45f70f4 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml
@@ -3,37 +3,11 @@ kind: PrometheusRule
 metadata:
   labels:
     {{- include "labels.common" . | nindent 4 }}
-  name: logging.rules
+  name: logging-pipeline.rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
-    - name: logging-agent
-      rules:
-        # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready)
-        # and join the pods with the not running containers
-        - alert: LoggingAgentDown
-          annotations:
-            dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
-            description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}'
-            opsrecipe: logging-agent/
-          expr: |-
-            kube_pod_info{pod=~"alloy-logs.*"}
-            * on(cluster_id, pod)
-            group_left ()
-            up{job="alloy-logs", container="alloy"} == 0
-          for: 30m
-          labels:
-            area: platform
-            severity: page
-            team: atlas
-            topic: observability
-            cancel_if_outside_working_hours: "true"
-            cancel_if_cluster_status_creating: "true"
-            cancel_if_cluster_status_deleting: "true"
-            cancel_if_cluster_status_updating: "true"
-            cancel_if_node_unschedulable: "true"
-            cancel_if_node_not_ready: "true"
-    - name: log-ingestion
+    - name: logging-pipeline
       rules:
         # Any alloy component that uses the loki.write component can throw such errors.
         # This includes alloy-logs and the observability-gateway
@@ -41,7 +15,7 @@
           annotations:
             dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
             description: '{{`More that 10% of the requests to Loki are failing.`}}'
-            opsrecipe: log-shipping-errors/
+            opsrecipe: logging-pipeline/
           expr: |-
             (
               100
@@ -79,7 +53,7 @@
           annotations:
             dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview
             description: '{{`More that 10% of the loki requests to the observability gateway are failing.`}}'
-            opsrecipe: log-shipping-errors/
+            opsrecipe: logging-pipeline/
           expr: |-
             (
               100
diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml
index 62ad40ae..2effa82d 100644
--- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml
+++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml
@@ -34,7 +34,7 @@ tests:
             exp_annotations:
               dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
               description: "Component evaluations are taking too long under job alloy-controller, component_path path1, component_id comp1."
-              opsrecipe: "alloy-components/"
+              opsrecipe: "alloy/"
               summary: "Component evaluations are taking too long."
       - alertname: AlloySlowComponentEvaluations
         eval_time: 80m
@@ -68,7 +68,7 @@
            exp_annotations:
              dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller
              description: "Unhealthy components detected under job alloy-controller"
-             opsrecipe: "alloy-components/"
+             opsrecipe: "alloy/"
              summary: "Unhealthy components detected."
      - alertname: AlloyUnhealthyComponents
        eval_time: 80m
diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml
similarity index 92%
rename from test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml
rename to test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml
index 2b0941d7..31217a0a 100644
--- a/test/tests/providers/global/platform/atlas/alerting-rules/logging.rules.test.yml
+++ b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml
@@ -1,6 +1,6 @@
 ---
 rule_files:
-  - logging.rules.yml
+  - logging-pipeline.rules.yml
 
 tests:
   # Test LoggingAgentDown
@@ -47,8 +47,8 @@
               team: atlas
               topic: observability
             exp_annotations:
-              description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
-              opsrecipe: "logging-agent/"
+              description: "Scraping of all alloy pods to check if one failed every 30 minutes."
+              opsrecipe: "alloy/"
       # Tests with 2 pods
       - alertname: LoggingAgentDown
         eval_time: 111m
@@ -72,8 +72,8 @@
               team: atlas
               topic: observability
             exp_annotations:
-              description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
-              opsrecipe: "logging-agent/"
+              description: "Scraping of all alloy pods to check if one failed every 30 minutes."
+              opsrecipe: "alloy/"
       - alertname: LoggingAgentDown
         eval_time: 121m
      - alertname: LoggingAgentDown
@@ -98,8 +98,8 @@
               team: atlas
               topic: observability
             exp_annotations:
-              description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
-              opsrecipe: "logging-agent/"
+              description: "Scraping of all alloy pods to check if one failed every 30 minutes."
+              opsrecipe: "alloy/"
           - exp_labels:
               area: platform
               cancel_if_outside_working_hours: "true"
@@ -119,8 +119,8 @@
               team: atlas
               topic: observability
             exp_annotations:
-              description: "Scraping of all logging-agent pods to check if one failed every 30 minutes."
-              opsrecipe: "logging-agent/"
+              description: "Scraping of all alloy pods to check if one failed every 30 minutes."
+              opsrecipe: "alloy/"
   # Test LogForwardingErrors
   - interval: 1m
     input_series:
@@ -155,7 +155,7 @@ tests:
               topic: observability
             exp_annotations:
               description: "More that 10% of the requests to Loki are failing."
-              opsrecipe: "log-shipping-errors/"
+              opsrecipe: "logging-pipeline/"
       - alertname: LogForwardingErrors
         eval_time: 330m
         exp_alerts:
@@ -173,7 +173,7 @@ tests:
               topic: observability
             exp_annotations:
               description: "More that 10% of the requests to Loki are failing."
-              opsrecipe: "log-shipping-errors/"
+              opsrecipe: "logging-pipeline/"
   # Test LogReceivingErrors
   - interval: 1m
     input_series:
@@ -208,7 +208,7 @@ tests:
               topic: observability
             exp_annotations:
               description: "More that 10% of the loki requests to the observability gateway are failing."
-              opsrecipe: "log-shipping-errors/"
+              opsrecipe: "logging-pipeline/"
       - alertname: LogReceivingErrors
         eval_time: 330m
         exp_alerts:
@@ -226,4 +226,4 @@ tests:
               topic: observability
             exp_annotations:
               description: "More that 10% of the loki requests to the observability gateway are failing."
-              opsrecipe: "log-shipping-errors/"
+              opsrecipe: "logging-pipeline/"