From 9e726643210d6c065e14351140a52ce4f16a4a4c Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Tue, 5 Nov 2024 11:53:41 +0100 Subject: [PATCH] Clean up some rules a bit --- CHANGELOG.md | 3 +- .../atlas/alerting-rules/alloy.rules.yml | 21 ++- ...rter.rules.yml => grafana-cloud.rules.yml} | 30 +++- .../atlas/alerting-rules/grafana.rules.yml | 4 +- .../kube-state-metrics.rules.yml | 1 - .../atlas/alerting-rules/mimir.rules.yml | 15 -- .../atlas/alerting-rules/prometheus.rules.yml | 18 -- ....rules.test.yml => grafana-cloud.test.yml} | 2 +- .../atlas/alerting-rules/mimir.rules.test.yml | 29 ---- .../atlas/alerting-rules/alloy.rules.test.yml | 154 ++++++++++++++++++ .../logging-pipeline.rules.test.yml | 118 -------------- 11 files changed, 204 insertions(+), 191 deletions(-) rename helm/prometheus-rules/templates/platform/atlas/alerting-rules/{mimir-to-grafana-cloud-exporter.rules.yml => grafana-cloud.rules.yml} (74%) rename test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/{mimir-to-grafana-cloud-exporter.rules.test.yml => grafana-cloud.test.yml} (99%) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9ef272b..92d0a37e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `LoggingAgentDown` to be alerted when the logging agent is down. - `LogForwardingErrors` to be alerted when the `loki.write` component is failing. - `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway is failing. - - `MonitoringAgentFailing` and `InhibitionMonitoringAgentFailing` to be alerted when the monitoring agent is not able to send metrics. ### Changed @@ -22,6 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `observability-operator` - `alloy-rules` - `observability-gateway` +- Move all `grafana-cloud` related alerts to their own file. +- Move all alloy related alerts to the alloy alert file and fix alloy-logs tests. ## [4.23.0] - 2024-10-30 diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index aa1959de..8b3e6256 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -6,7 +6,7 @@ metadata: labels: {{- include "labels.common" . | nindent 4 }} name: alloy.rules - namespace: {{ .Values.namespace }} + namespace: {{ .Values.namespace }} spec: groups: # List of alerts for on the state of the alloy components. @@ -48,7 +48,24 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - - name: logging-agent + - name: alloy.rules + rules: + - alert: AlloyForPrometheusRulesDown + annotations: + description: 'Alloy sending PrometheusRules to Loki and Mimir ruler is down.' 
+ opsrecipe: prometheus-rules/ + expr: count(up{job="alloy-rules", namespace="monitoring"} == 0) by (cluster_id, installation, provider, pipeline) > 0 + for: 1h + labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - name: alloy.logs rules: # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) # and join the pods with the not running containers diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml similarity index 74% rename from helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml index 40d76d3d..9560570e 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml @@ -1,13 +1,35 @@ -{{- if .Values.mimir.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - name: mimir-to-grafana-cloud-exporter.rules - namespace: {{ .Values.namespace }} + {{- if not .Values.mimir.enabled }} + cluster_type: "management_cluster" + {{- end }} + name: grafana-cloud.rules + namespace: {{ .Values.namespace }} spec: groups: + - name: grafana-cloud + rules: + ## Pages Atlas when prometheus fails to send samples to cortex + - alert: PrometheusMissingGrafanaCloud + annotations: + description: 'Prometheus is not sending data to Grafana Cloud.' + opsrecipe: prometheus-grafanacloud/ + {{- if .Values.mimir.enabled }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) + {{- else }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) + {{- end }} + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + {{- if .Values.mimir.enabled }} - name: mimir-to-grafana-cloud-exporter rules: - alert: MimirToGrafanaCloudExporterDown @@ -73,4 +95,4 @@ spec: severity: page team: atlas topic: observability -{{- end }} + {{- end }} diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml index 39fb4a0a..97a10780 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml @@ -3,9 +3,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if not .Values.mimir.enabled }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: grafana.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml index 6c90a4e2..83089fc3 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml @@ -85,7 +85,6 @@ spec: severity: page team: atlas topic: observability - - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index cd47324a..6dc13788 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -61,21 +61,6 @@ spec: severity: page team: atlas topic: observability - - alert: AlloyForPrometheusRulesDown - annotations: - description: 'Alloy sending PrometheusRules to Mimir ruler is down.' - opsrecipe: prometheus-rules/ - expr: count(up{job="alloy-rules", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0 - for: 1h - labels: - area: platform - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: MimirRulerEventsFailed annotations: dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index b31713f9..a0bd48fe 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} name: prometheus.rules @@ -27,23 +26,6 @@ spec: severity: page team: atlas topic: observability - ## Pages Atlas when prometheus fails to send samples to cortex - - alert: PrometheusMissingGrafanaCloud - annotations: - description: 'Prometheus is not sending data to Grafana Cloud.' 
- opsrecipe: prometheus-grafanacloud/ - {{- if .Values.mimir.enabled }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) - {{- else }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) - {{- end }} - for: 1h - labels: - area: platform - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI annotations: description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml similarity index 99% rename from test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml rename to test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml index ee5645cf..79c5aa0f 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml @@ -1,6 +1,6 @@ --- rule_files: -- mimir-to-grafana-cloud-exporter.rules.yml +- grafana-cloud.rules.yml tests: # Tests for `MimirToGrafanaCloudExporterDown` alert diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml index 37d40af1..6bdfeaea 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -86,35 +86,6 @@ tests: dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview description: "Mimir component : mimir-ingester is down." opsrecipe: "mimir/" - - interval: 1m - input_series: - # test with 1 pod: none, up, down - - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="mimir"}' - values: "_x20 1+0x70 0+0x70" - alert_rule_test: - - alertname: AlloyForPrometheusRulesDown - eval_time: 10m - - alertname: AlloyForPrometheusRulesDown - eval_time: 80m - - alertname: AlloyForPrometheusRulesDown - eval_time: 160m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cluster_id: golem - installation: golem - provider: capa - pipeline: testing - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Alloy sending PrometheusRules to Mimir ruler is down." 
- opsrecipe: "prometheus-rules/" - interval: 1m input_series: # test: none, rate > 0, rate = 0 diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 2effa82d..d8b9309a 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -72,3 +72,157 @@ tests: summary: "Unhealthy components detected." - alertname: AlloyUnhealthyComponents eval_time: 80m + + # Test AlloyForPrometheusRulesDown + - interval: 1m + input_series: + # test with 1 pod: none, up, down + - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="monitoring"}' + values: "_x20 1+0x70 0+0x70" + alert_rule_test: + - alertname: AlloyForPrometheusRulesDown + eval_time: 10m + - alertname: AlloyForPrometheusRulesDown + eval_time: 80m + - alertname: AlloyForPrometheusRulesDown + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + provider: capa + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Alloy sending PrometheusRules to Loki and Mimir ruler is down." + opsrecipe: "prometheus-rules/" + + # Test LoggingAgentDown + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: none, up, down + - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-1xxxx", provider="aws", pipeline="testing"}' + values: "_x20 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + # From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. 
+ - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-2xxxx", provider="aws", pipeline="testing"}' + values: "_x80 1+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-logs-3xxxx", provider="aws", pipeline="testing"}' + values: "_x80 0+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + alert_rule_test: + - alertname: LoggingAgentDown + eval_time: 10m + - alertname: LoggingAgentDown + eval_time: 30m + - alertname: LoggingAgentDown + eval_time: 71m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-1.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-1xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + # Tests with 2 pods + - alertname: LoggingAgentDown + eval_time: 111m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + - alertname: LoggingAgentDown + eval_time: 121m + - alertname: LoggingAgentDown + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-2.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-2xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." 
+ opsrecipe: "alloy/" + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml index 31217a0a..fccbfa5a 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml @@ -3,124 +3,6 @@ rule_files: - logging-pipeline.rules.yml tests: - # Test LoggingAgentDown - - interval: 1m - input_series: - # For the first 60min: test with 1 pod: none, up, down - - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-1xxxx", provider="aws", pipeline="testing"}' - values: "_x20 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - # From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. - - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-2xxxx", provider="aws", pipeline="testing"}' - values: "_x80 1+0x40 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-3xxxx", provider="aws", pipeline="testing"}' - values: "_x80 0+0x40 1+0x20 0+0x40" - - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} - values: "1x180" - alert_rule_test: - - alertname: LoggingAgentDown - eval_time: 10m - - alertname: LoggingAgentDown - eval_time: 30m - - alertname: LoggingAgentDown - eval_time: 71m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-1.eu-west-1.compute.internal - pipeline: testing - pod: alloy-1xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." 
- opsrecipe: "alloy/" - # Tests with 2 pods - - alertname: LoggingAgentDown - eval_time: 111m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-3.eu-west-1.compute.internal - pipeline: testing - pod: alloy-3xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" - - alertname: LoggingAgentDown - eval_time: 121m - - alertname: LoggingAgentDown - eval_time: 180m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-2.eu-west-1.compute.internal - pipeline: testing - pod: alloy-2xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_node_unschedulable: "true" - cancel_if_node_not_ready: "true" - cluster_id: gauss - cluster_type: management_cluster - installation: gauss - node: ip-10-0-5-3.eu-west-1.compute.internal - pipeline: testing - pod: alloy-3xxxx - provider: aws - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Scraping of all alloy pods to check if one failed every 30 minutes." - opsrecipe: "alloy/" # Test LogForwardingErrors - interval: 1m input_series: