diff --git a/CHANGELOG.md b/CHANGELOG.md index 3367dbbb9..bcec58d93 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Get rid of the `app`, `role` and `node` external labels in Atlas rules. + ## [4.1.0] - 2024-05-30 ### Added diff --git a/README.md b/README.md index 187597dfa..8bb86bc24 100644 --- a/README.md +++ b/README.md @@ -265,7 +265,7 @@ In order to incorporate the SLO Framework in the Prometheus rules, several rules Those rules can be written according to this template : ``` # Amout of requests for VPA -- expr: "count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id)" +- expr: "count(up{job=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id)" labels: class: MEDIUM area: platform @@ -278,7 +278,7 @@ Those rules can be written according to this template : # and summed with 1 so the final result is 0 : no error recorded. # If up was unsuccessful, there is an error. Up returns 0, multiplied by -1 and summed # with 1 so the final result is 1 : 1 error is recorded . -- expr: "sum((up{app=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type)" +- expr: "sum((up{job=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type)" labels: class: MEDIUM area: platform diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alertmanager.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alertmanager.rules.yml index 65597e7ed..52ed23d44 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alertmanager.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alertmanager.rules.yml @@ -20,7 +20,7 @@ spec: expr: rate(alertmanager_notifications_failed_total{integration!="opsgenie", cluster_type="management_cluster"}[20m]) > 0 for: 45m labels: - area: empowerment + area: platform severity: page team: atlas topic: monitoring @@ -33,7 +33,7 @@ spec: expr: rate(alertmanager_notifications_failed_total{integration="opsgenie", cluster_type="management_cluster"}[20m]) > 0 for: 30m labels: - area: empowerment + area: platform severity: notify team: atlas topic: monitoring diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/fluentbit.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/fluentbit.rules.yml index b0090193a..23d997907 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/fluentbit.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/fluentbit.rules.yml @@ -18,7 +18,7 @@ spec: expr: rate(fluentbit_output_retries_failed_total[10m]) > 0 for: 20m labels: - area: empowerment + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -34,7 +34,7 @@ spec: expr: rate(fluentbit_output_dropped_records_total[10m]) / (rate(fluentbit_output_proc_records_total[10m]) + rate(fluentbit_output_dropped_records_total[10m])) > 0.01 for: 20m labels: - area: empowerment + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -46,10 +46,10 @@ spec: annotations: description: '{{`Fluentbit is down on node ({{ $labels.node }}).`}}' opsrecipe: fluentbit-down/ - expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, installation, provider, pipeline, job, namespace, node) == 0 + expr: sum(up{job="fluent-logshipping-app"}) by (job, cluster_id, installation, provider, pipeline, namespace, node) == 0 for: 15m labels: - area: empowerment + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -64,11 +64,11 @@ spec: expr: kube_daemonset_status_number_unavailable{daemonset="fluent-logshipping-app"} > 0 for: 1h labels: - area: kaas + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" severity: page team: atlas - topic: managementcluster + topic: observability diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml index b53ffc401..65ba2ae6c 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml @@ -20,7 +20,7 @@ spec: expr: up{service="grafana", cluster_type="management_cluster"} == 0 for: 1h labels: - area: managedservices + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -38,7 +38,7 @@ spec: expr: sum by(cluster_id, installation, provider, pipeline) (increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}) for: 6h labels: - area: managedservices + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -60,7 +60,7 @@ spec: expr: sum by (cronjob, cluster_id, installation, provider, pipeline) (label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded", cluster_type="management_cluster"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) > 0 for: 6h labels: - area: managedservices + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -77,7 +77,7 @@ spec: expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="grafana-permissions", cluster_type="management_cluster"}) > 86400 or count by (cluster_id, cronjob, installation, namespace, provider, pipeline) (label_replace(max_over_time(kube_job_status_succeeded{job_name=~"grafana-permissions-.+", cluster_type="management_cluster"}[1d]), "cronjob", "grafana-permissions", "job_name", "grafana-permissions-.+") == 1) == 0 labels: - area: empowerment + area: platform severity: page team: atlas topic: managementcluster diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.prometheus-agent.rules.yml index c42a8a334..20e1f460f 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.prometheus-agent.rules.yml @@ -33,13 +33,13 @@ spec: count( label_replace( sum_over_time( - app_operator_app_info{app="prometheus-agent"}[5m] + app_operator_app_info{name="prometheus-agent"}[5m] ), "cluster_id", "$1", "namespace", "(.*)" ) ) by (cluster_id) labels: cluster_is_not_running_prometheus_agent: "true" - area: empowerment + area: platform team: atlas topic: monitoring {{- end }} diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/keda.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/keda.rules.yml index d16666ffb..0680acdd2 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/keda.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/keda.rules.yml @@ -16,7 +16,7 @@ spec: expr: count by (cluster_id, installation, provider, pipeline) (up{container=~"keda-.*"} == 0) > 0 for: 10m labels: - area: kaas + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -31,7 +31,7 @@ spec: expr: increase(keda_scaled_object_errors[10m])> 0 for: 15m labels: - area: kaas + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -46,7 +46,7 @@ spec: expr: increase(keda_webhook_scaled_object_validation_errors[10m]) > 0 for: 15m labels: - area: kaas + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -61,7 +61,7 @@ spec: expr: increase(keda_scaler_errors[10m]) > 0 for: 15m labels: - area: kaas + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml index 952001b0e..7b8e40942 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml @@ -18,16 +18,17 @@ spec: expr: |- ( # modern clusters - label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1) + label_replace(up{job="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{job="kube-state-metrics",instance=~".*:8080"} == 1) ) and ( # vintage clusters without servicemonitor + # We need to keep the app label until all clusters are migrated to a release >= 18.2. TODO(atlas): Remove when this is the case label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1) ) {{- else }} expr: |- - count by (cluster_id, installation, provider, pipeline) (label_replace(up{app="kube-state-metrics", instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*")) == 0 + count by (cluster_id, installation, provider, pipeline) (label_replace(up{job="kube-state-metrics", instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*")) == 0 or ( label_replace( capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, @@ -37,12 +38,12 @@ spec: "(.*)" ) == 1 ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( - count(up{app="kube-state-metrics", instance=~".*:8080"} == 1) by (cluster_id, customer, installation, pipeline, provider, region) + count(up{job="kube-state-metrics", instance=~".*:8080"} == 1) by (cluster_id, customer, installation, pipeline, provider, region) ) {{- end }} for: 15m labels: - area: kaas + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -61,7 +62,7 @@ spec: expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m])) by (le, cluster_id, installation, provider, pipeline)) > 7 for: 15m labels: - area: kaas + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -78,10 +79,11 @@ spec: opsrecipe: kube-state-metrics-down/ expr: |- # When it looks up but we don't have metrics - count({app="kube-state-metrics", __name__=~"kube_.+"}) by (cluster_id, installation, provider, pipeline) <= 100 + # We need to keep the app label until all clusters are migrated to a release >= 18.2. TODO(atlas): Remove when this is the case + count({job="kube-state-metrics", __name__=~"kube_.+"} or {app="kube-state-metrics", __name__=~"kube_.+"}) by (cluster_id, installation, provider, pipeline) <= 100 for: 20m labels: - area: kaas + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -116,7 +118,7 @@ spec: {{- end }} for: 30m labels: - area: kaas + area: platform cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page @@ -144,7 +146,7 @@ spec: {{- end }} for: 30m labels: - area: kaas + area: platform cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page @@ -172,7 +174,7 @@ spec: {{- end }} for: 30m labels: - area: kaas + area: platform cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page @@ -200,7 +202,7 @@ spec: {{- end }} for: 30m labels: - area: kaas + area: platform cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page @@ -228,7 +230,7 @@ spec: {{- end }} for: 30m labels: - area: kaas + area: platform cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page @@ -256,7 +258,7 @@ spec: {{- end }} for: 30m labels: - area: kaas + area: platform cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page @@ -284,7 +286,7 @@ spec: {{- end }} for: 30m labels: - area: kaas + area: platform cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page @@ -312,7 +314,7 @@ spec: {{- end }} for: 30m labels: - area: kaas + area: platform cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page @@ -340,7 +342,7 @@ spec: {{- end }} for: 30m labels: - area: kaas + area: platform cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page @@ -368,7 +370,7 @@ spec: {{- end }} for: 30m labels: - area: kaas + area: platform cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml index 3f0d6a7e5..d23316568 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml @@ -22,7 +22,7 @@ spec: ) > 5 for: 5m labels: - area: managedservices + area: platform # This label is used to ensure the alert go through even for non-stable installations all_pipelines: "true" cancel_if_outside_working_hours: "true" @@ -41,7 +41,7 @@ spec: > 10 for: 120m labels: - area: managedservices + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -58,7 +58,7 @@ spec: expr: | sum(increase(loki_panic_total[10m])) by (cluster_id, installation, provider, pipeline, namespace, job) > 0 labels: - area: managedservices + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -73,10 +73,10 @@ spec: description: '{{`Loki pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) sees {{ $value }} unhealthy ring members`}}' opsrecipe: loki/ expr: | - sum by (app, cluster_id, container, customer, installation, provider, pipeline, name, namespace, organization, pod) (cortex_ring_members{state="Unhealthy", app="loki"}) > 0 + sum by (job, cluster_id, container, customer, installation, provider, pipeline, name, namespace, organization, pod) (cortex_ring_members{state="Unhealthy", job=~"loki/.*"}) > 0 for: 30m labels: - area: managedservices + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 5bc920026..835c4ef61 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -15,14 +15,14 @@ spec: annotations: description: This alert is used to ensure the entire alerting pipeline is functional. opsrecipe: mimir/ - expr: up{app="mimir"} > 0 + expr: up{job=~"mimir/.*", container!="prometheus"} > 0 labels: - area: "empowerment" + area: platform installation: {{ .Values.managementCluster.name }} # TODO(@team-atlas): We need this label as long as we have the old and new heartbeats. Let's remove once the legacy monitoring is gone - type: "mimir-heartbeat" - team: "atlas" - topic: "observability" + type: mimir-heartbeat + team: atlas + topic: observability # Coming from https://github.com/giantswarm/giantswarm/issues/30124 # This alert ensures Mimir containers are not restarting too often (flappiness). # If it is not the the case, this can incur high costs by cloud providers (s3 api calls are quite expensive). @@ -37,7 +37,7 @@ spec: ) > 5 for: 5m labels: - area: managedservices + area: platform # This label is used to ensure the alert go through even for non-stable installations all_pipelines: "true" cancel_if_outside_working_hours: "true" @@ -48,10 +48,10 @@ spec: annotations: description: '{{`Mimir component : {{ $labels.service }} is down.`}}' opsrecipe: mimir/ - expr: count(up{app="mimir"} == 0) by (cluster_id, installation, provider, pipeline, service) > 0 + expr: count(up{job=~"mimir/.*", container!="prometheus"} == 0) by (cluster_id, installation, provider, pipeline, service) > 0 for: 5m labels: - area: managedservices + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -63,10 +63,10 @@ spec: annotations: description: 'Grafana-agent sending PrometheusRules to Mimir ruler is down.' opsrecipe: mimir/ - expr: count(up{app="grafana-agent", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0 + expr: count(up{job="grafana-agent-rules", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0 for: 1h labels: - area: managedservices + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -81,7 +81,7 @@ spec: expr: rate(mimir_rules_events_failed_total{cluster_type="management_cluster", namespace="mimir"}[5m]) > 0 for: 1h labels: - area: managedservices + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index d8766c43b..1865de343 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -43,7 +43,7 @@ spec: {{- end }} for: 20m labels: - area: empowerment + area: platform severity: page team: atlas topic: observability @@ -84,7 +84,7 @@ spec: {{- end }} for: 2m labels: - area: empowerment + area: platform severity: none team: atlas topic: observability @@ -114,7 +114,7 @@ spec: )[5m:]) for: 20m labels: - area: empowerment + area: platform severity: page team: atlas topic: observability @@ -145,7 +145,7 @@ spec: )[5m:]) for: 2m labels: - area: empowerment + area: platform severity: none team: atlas topic: observability diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml index 1b5e67f50..e91cd66bf 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml @@ -10,13 +10,13 @@ spec: - name: observability rules: - alert: "Heartbeat" - expr: up{app="prometheus",instance!="prometheus-agent"} == 1 + expr: up{job=~".*prometheus/prometheus.*",instance!="prometheus-agent"} == 1 labels: - area: "empowerment" + area: platform installation: {{ .Values.managementCluster.name }} - team: "atlas" - topic: "observability" - type: "heartbeat" + team: atlas + topic: observability + type: heartbeat namespace: "monitoring" # Needed due to https://github.com/prometheus-operator/prometheus-operator/issues/3737 annotations: description: This alert is used to ensure the entire alerting pipeline is functional. @@ -50,28 +50,28 @@ spec: > 0 for: 10m labels: - area: "empowerment" + area: platform cancel_if_mc_kube_state_metrics_down: "true" cancel_if_cluster_status_creating: "true" cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} installation: {{ .Values.managementCluster.name }} - severity: "page" - team: "atlas" - topic: "observability" + severity: page + team: atlas + topic: observability - alert: "PrometheusMetaOperatorReconcileErrors" annotations: description: '{{`prometheus-meta-operator controller {{ $labels.controller }} too many reconcile errors.`}}' opsrecipe: "pmo-reconcile-errors/" dashboard: piJK9Vm4z/operatorkit expr: | - avg_over_time(operatorkit_controller_errors_total{app="prometheus-meta-operator"}[20m]) > 0 + avg_over_time(operatorkit_controller_errors_total{job="monitoring/prometheus-meta-operator"}[20m]) > 0 for: 1h labels: - area: "empowerment" + area: platform cancel_if_mc_kube_state_metrics_down: "false" cancel_if_cluster_status_creating: "true" cancel_if_outside_working_hours: "true" installation: {{ .Values.managementCluster.name }} - severity: "page" - team: "atlas" - topic: "observability" + severity: page + team: atlas + topic: observability diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml index f5a48b77b..dcfb63a90 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml @@ -35,10 +35,10 @@ spec: annotations: description: '{{`Prometheus-operator ({{ $labels.instance }}) is down.`}}' opsrecipe: "prometheus-operator/" - expr: up{app=~"prometheus-operator.*|kube-prometheus-.*"} == 0 + expr: up{job=~"prometheus-operator-app-operator|kube-prometheus-stack-operator"} == 0 for: 15m labels: - area: empowerment + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -53,10 +53,10 @@ spec: annotations: description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. opsrecipe: "prometheus-operator/" - expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 + expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{job=~"prometheus-operator-app-operator|kube-prometheus-stack-operator"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_list_operations_total{job=~"prometheus-operator-app-operator|kube-prometheus-stack-operator"}[10m]))) > 0.4 for: 15m labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" severity: page team: atlas @@ -65,10 +65,10 @@ spec: annotations: description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. opsrecipe: "prometheus-operator/" - expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_watch_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 + expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{job=~"prometheus-operator-app-operator|kube-prometheus-stack-operator"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_watch_operations_total{job=~"prometheus-operator-app-operator|kube-prometheus-stack-operator"}[10m]))) > 0.4 for: 15m labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" severity: page team: atlas @@ -77,10 +77,10 @@ spec: annotations: description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects. opsrecipe: "prometheus-operator/" - expr: min_over_time(prometheus_operator_syncs{status="failed",app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0 + expr: min_over_time(prometheus_operator_syncs{status="failed", job=~"prometheus-operator-app-operator|kube-prometheus-stack-operator"}[5m]) > 0 for: 10m labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" severity: page team: atlas @@ -89,10 +89,10 @@ spec: annotations: description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.' opsrecipe: "prometheus-operator/" - expr: (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) / (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) > 0.1 + expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_reconcile_errors_total{job=~"prometheus-operator-app-operator|kube-prometheus-stack-operator"}[5m]))) / (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_reconcile_operations_total{job=~"prometheus-operator-app-operator|kube-prometheus-stack-operator"}[5m]))) > 0.1 for: 10m labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" severity: page team: atlas @@ -101,10 +101,10 @@ spec: annotations: description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace. opsrecipe: "prometheus-operator/" - expr: rate(prometheus_operator_node_address_lookup_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0.1 + expr: rate(prometheus_operator_node_address_lookup_errors_total{job=~"prometheus-operator-app-operator|kube-prometheus-stack-operator"}[5m]) > 0.1 for: 10m labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" severity: page team: atlas @@ -113,10 +113,10 @@ spec: annotations: description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources. opsrecipe: "prometheus-operator/" - expr: min by (cluster_id, installation, provider, pipeline, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0) + expr: min by (cluster_id, installation, provider, pipeline, namespace, controller) (max_over_time(prometheus_operator_ready{job=~"prometheus-operator-app-operator|kube-prometheus-stack-operator"}[5m]) == 0) for: 5m labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" severity: page team: atlas @@ -125,10 +125,10 @@ spec: annotations: description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources. opsrecipe: "prometheus-operator/" - expr: min_over_time(prometheus_operator_managed_resources{state="rejected",app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) > 0 + expr: min_over_time(prometheus_operator_managed_resources{state="rejected", job=~"prometheus-operator-app-operator|kube-prometheus-stack-operator"}[5m]) > 0 for: 5m labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" severity: page team: atlas diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index 5fac63d41..054d4980b 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -14,10 +14,10 @@ spec: annotations: description: '{{`Prometheus can''t communicate with Kubernetes API.`}}' opsrecipe: prometheus-cant-communicate/ - expr: rate(prometheus_sd_kubernetes_http_request_total{app!="promxy-app", status_code=""}[15m]) > 0.25 + expr: rate(prometheus_sd_kubernetes_http_request_total{job!="promxy-app", status_code=""}[15m]) > 0.25 for: 30m labels: - area: empowerment + area: platform cancel_if_any_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_updating: "true" @@ -39,7 +39,7 @@ spec: {{- end }} for: 1h labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" severity: page team: atlas @@ -52,7 +52,7 @@ spec: expr: rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1 or rate(prometheus_remote_storage_samples_total[10m]) == 0 or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0 for: 1h labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" severity: page team: atlas @@ -65,7 +65,7 @@ spec: expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0 for: 1h labels: - area: empowerment + area: platform severity: page team: atlas topic: observability @@ -78,7 +78,7 @@ spec: expr: (count(up == 0) BY (job, installation, cluster_id, provider, pipeline) / count(up) BY (job, installation, cluster_id, provider, pipeline)) == 1 for: 1d labels: - area: empowerment + area: platform severity: notify team: atlas topic: observability @@ -92,22 +92,22 @@ spec: expr: |- ( count( - up{ - app=~"kubernetes|kube-controller-manager|kube-scheduler|kubelet|node-exporter|kube-state-metrics", - job!~".*bastions.*" - } == 0 - ) BY (app,job, installation, cluster_id, provider, pipeline) + ( + up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"} + or + up{job="kubelet", metrics_path="/metrics"} + ) == 0 + ) BY (job, installation, cluster_id, provider, pipeline) / count( - up{ - app=~"kubernetes|kube-controller-manager|kube-scheduler|kubelet|node-exporter|kube-state-metrics", - job!~".*bastions.*" - } - ) BY (app, job, installation, cluster_id, provider, pipeline) + up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"} + or + up{job="kubelet", metrics_path="/metrics"} + ) BY (job, installation, cluster_id, provider, pipeline) ) == 1 for: 3d labels: - area: empowerment + area: platform severity: page team: atlas topic: observability diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml index f81b2fc8b..9c75cb1ce 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml @@ -16,10 +16,10 @@ spec: expr: count(up{container="promtail"} == 0) by (cluster_id, installation, provider, pipeline) > 0 for: 30m labels: - area: "empowerment" + area: platform severity: page - team: "atlas" - topic: "observability" + team: atlas + topic: observability cancel_if_outside_working_hours: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -33,10 +33,10 @@ spec: 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[2m])) by (cluster_id, installation, provider, pipeline, namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[2m])) by (cluster_id, installation, provider, pipeline, namespace, job, route, instance) > 10 for: 15m labels: - area: "empowerment" + area: platform severity: page - team: "atlas" - topic: "observability" + team: atlas + topic: observability cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/silence-operator.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/silence-operator.rules.yml index d39c1579f..6b88d5c8a 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/silence-operator.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/silence-operator.rules.yml @@ -14,15 +14,15 @@ spec: description: '{{`silence-operator controller {{ $labels.controller }} too many reconcile errors.`}}' opsrecipe: "operator-not-reconciling/" expr: | - avg_over_time(operatorkit_controller_errors_total{app="silence-operator", cluster_type="management_cluster"}[20m]) > 0 + avg_over_time(operatorkit_controller_errors_total{job="monitoring/silence-operator", cluster_type="management_cluster"}[20m]) > 0 for: 1h labels: - area: "empowerment" + area: platform cancel_if_outside_working_hours: "true" installation: {{ .Values.managementCluster.name }} - severity: "page" - team: "atlas" - topic: "observability" + severity: page + team: atlas + topic: observability - alert: SilenceOperatorSyncJobHasNotBeenScheduledForTooLong annotations: description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 1 day.`}}' @@ -32,7 +32,7 @@ spec: expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync", cluster_type="management_cluster"}) > 86400 or count by (cronjob, cluster_id, installation, namespace, provider, pipeline) (label_replace(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync-.+", cluster_type="management_cluster"}[1d]), "cronjob", "silence-operator-sync", "job_name", "silence-operator-sync-.+") == 1) == 0 labels: - area: empowerment + area: platform severity: page team: atlas topic: managementcluster diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/sloth.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/sloth.rules.yml index e386c9fc6..6f4d7b431 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/sloth.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/sloth.rules.yml @@ -13,10 +13,10 @@ spec: annotations: description: 'Sloth is down.' opsrecipe: sloth-down/ - expr: count(up{app="sloth"} == 0) by (cluster_id, installation, provider, pipeline) > 0 + expr: count(up{job="monitoring/sloth"} == 0) by (cluster_id, installation, provider, pipeline) > 0 for: 5m labels: - area: managedservices + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" diff --git a/test/hack/bin/verify-rules.sh b/test/hack/bin/verify-rules.sh index c3e3074e0..09de7b828 100755 --- a/test/hack/bin/verify-rules.sh +++ b/test/hack/bin/verify-rules.sh @@ -39,15 +39,6 @@ main() { expected_failure_relative_file_global="test/conf/promtool_ignore" expected_failure_file_global="$GIT_WORKDIR/$expected_failure_relative_file_global" - # Retrieve all files we're going to check - local -a all_files - mapfile -t all_files < <( - cd "$GIT_WORKDIR" || return 1 - # filter alerting-rules files, and remove prefix `helm/prometheus-rules/templates/` - git ls-files | - sed -En 's_^helm/prometheus-rules/templates/.*\.ya?ml)$_\1_p' || echo error - ) - # Get prefixes whitelisted via the failure_file local -a expected_failure_prefixes_global=() [[ -f "$expected_failure_file_global" ]] \ @@ -60,9 +51,10 @@ main() { local -a promtool_test_errors=() local -a failing_extraction=() - # Create generated directory with all test files + # Clean and create generated directory with all test files local outputPath="$GIT_WORKDIR/test/hack/output" - [[ -d "$outputPath/generated" ]] || cp -r "$GIT_WORKDIR/test/tests/providers/." "$outputPath/generated/" + rm -rf "$outputPath/generated" + cp -r "$GIT_WORKDIR/test/tests/providers/." "$outputPath/generated/" # We remove the global directory rm -rf "$outputPath/generated/global" diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml index 6ae320f99..c0c98203e 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -6,15 +6,16 @@ tests: - interval: 1m input_series: # For the first 60min: test with 1 pod: up, none, up, down, up - - series: 'up{app="mimir"}' + - series: 'up{job="mimir/ingester", container="ingester"}' values: "1+0x60 _x30 1+0x30 0+0x30 1+0x30" alert_rule_test: - alertname: Heartbeat eval_time: 20m exp_alerts: - exp_labels: - app: mimir - area: empowerment + area: platform + job: mimir/ingester + container: ingester installation: myinstall team: atlas topic: observability @@ -28,8 +29,9 @@ tests: eval_time: 95m exp_alerts: - exp_labels: - app: mimir - area: empowerment + area: platform + job: mimir/ingester + container: ingester installation: myinstall team: atlas topic: observability @@ -43,8 +45,9 @@ tests: eval_time: 165m exp_alerts: - exp_labels: - app: mimir - area: empowerment + area: platform + job: mimir/ingester + container: ingester installation: myinstall team: atlas topic: observability @@ -55,7 +58,7 @@ tests: - interval: 1m input_series: # For the first 60min: test with 1 pod: none, up, down - - series: 'up{app="mimir",cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", service="mimir-ingester"}' + - series: 'up{job="mimir/ingester", container="ingester", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", service="mimir-ingester"}' values: "_x20 1+0x20 0+0x20" alert_rule_test: - alertname: MimirComponentDown @@ -67,7 +70,7 @@ tests: exp_alerts: - exp_labels: service: mimir-ingester - area: managedservices + area: platform severity: page team: atlas topic: observability @@ -85,7 +88,7 @@ tests: - interval: 1m input_series: # test with 1 pod: none, up, down - - series: 'up{app="grafana-agent",cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="mimir"}' + - series: 'up{job="grafana-agent-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="mimir"}' values: "_x20 1+0x70 0+0x70" alert_rule_test: - alertname: GrafanaAgentForPrometheusRulesDown @@ -96,7 +99,7 @@ tests: eval_time: 160m exp_alerts: - exp_labels: - area: managedservices + area: platform cancel_if_outside_working_hours: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -123,7 +126,7 @@ tests: eval_time: 95m exp_alerts: - exp_labels: - area: managedservices + area: platform cancel_if_outside_working_hours: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -155,7 +158,7 @@ tests: exp_alerts: - exp_labels: all_pipelines: "true" - area: managedservices + area: platform cancel_if_outside_working_hours: "true" cluster_type: management_cluster container: mimir-ingester diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index 3c81bef0b..c950765c2 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -16,7 +16,7 @@ tests: eval_time: 30m exp_alerts: - exp_labels: - area: empowerment + area: platform severity: page team: atlas topic: observability @@ -43,7 +43,7 @@ tests: eval_time: 30m exp_alerts: - exp_labels: - area: empowerment + area: platform severity: none team: atlas topic: observability @@ -70,7 +70,7 @@ tests: eval_time: 90m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: gauss cluster_type: workload_cluster severity: page @@ -97,7 +97,7 @@ tests: eval_time: 90m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: gauss cluster_type: workload_cluster severity: none @@ -127,11 +127,11 @@ tests: # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - series: 'prometheus_operator_spec_shards{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' values: '3+0x60 5+0x60 3+0x60' @@ -146,7 +146,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -167,7 +167,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -188,7 +188,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -209,7 +209,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -233,11 +233,11 @@ tests: # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m input_series: - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' values: '3+0x60 5+0x60 3+0x60' @@ -250,7 +250,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -271,7 +271,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -292,7 +292,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -313,7 +313,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index a7f332e20..406901ee6 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -16,7 +16,7 @@ tests: eval_time: 30m exp_alerts: - exp_labels: - area: empowerment + area: platform severity: page team: atlas topic: observability @@ -34,7 +34,7 @@ tests: eval_time: 30m exp_alerts: - exp_labels: - area: empowerment + area: platform severity: none team: atlas topic: observability @@ -52,7 +52,7 @@ tests: eval_time: 90m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: gauss cluster_type: workload_cluster severity: page @@ -73,7 +73,7 @@ tests: eval_time: 90m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: gauss cluster_type: workload_cluster severity: none @@ -97,11 +97,11 @@ tests: # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - series: 'prometheus_operator_spec_shards{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' values: '3+0x60 5+0x60 3+0x60' @@ -116,7 +116,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -137,7 +137,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -158,7 +158,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -179,7 +179,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -203,11 +203,11 @@ tests: # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m input_series: - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' values: '3+0x60 5+0x60 3+0x60' @@ -220,7 +220,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -241,7 +241,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -262,7 +262,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -283,7 +283,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index a7f332e20..406901ee6 100644 --- a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -16,7 +16,7 @@ tests: eval_time: 30m exp_alerts: - exp_labels: - area: empowerment + area: platform severity: page team: atlas topic: observability @@ -34,7 +34,7 @@ tests: eval_time: 30m exp_alerts: - exp_labels: - area: empowerment + area: platform severity: none team: atlas topic: observability @@ -52,7 +52,7 @@ tests: eval_time: 90m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: gauss cluster_type: workload_cluster severity: page @@ -73,7 +73,7 @@ tests: eval_time: 90m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: gauss cluster_type: workload_cluster severity: none @@ -97,11 +97,11 @@ tests: # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - series: 'prometheus_operator_spec_shards{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' values: '3+0x60 5+0x60 3+0x60' @@ -116,7 +116,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -137,7 +137,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -158,7 +158,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -179,7 +179,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -203,11 +203,11 @@ tests: # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m input_series: - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' values: '3+0x60 5+0x60 3+0x60' @@ -220,7 +220,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -241,7 +241,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -262,7 +262,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -283,7 +283,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alertmanager.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alertmanager.rules.test.yml index f9b927c1f..e529d9450 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alertmanager.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alertmanager.rules.test.yml @@ -22,7 +22,7 @@ tests: eval_time: 106m exp_alerts: - exp_labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" cluster_type: management_cluster integration: slack @@ -49,7 +49,7 @@ tests: eval_time: 91m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_type: management_cluster integration: opsgenie severity: notify diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/grafana.management-cluster.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/grafana.rules.test.yml similarity index 74% rename from test/tests/providers/global/platform/atlas/alerting-rules/grafana.management-cluster.rules.test.yml rename to test/tests/providers/global/platform/atlas/alerting-rules/grafana.rules.test.yml index 74e01dc36..c46dd9cda 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/grafana.management-cluster.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/grafana.rules.test.yml @@ -1,25 +1,24 @@ --- rule_files: - - grafana.management-cluster.rules.yml + - grafana.rules.yml tests: - interval: 1m input_series: - - series: 'up{service="grafana", cluster_id="gauss", installation="gauss", instance="grafana"}' + - series: 'up{service="grafana", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", instance="grafana"}' values: "1+0x20 0+0x100" alert_rule_test: - alertname: GrafanaDown eval_time: 90m exp_alerts: - exp_labels: - area: managedservices - cancel_if_cluster_control_plane_unhealthy: "true" + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - cancel_if_scrape_timeout: "true" cancel_if_outside_working_hours: "true" cluster_id: "gauss" + cluster_type: management_cluster installation: "gauss" instance: "grafana" service: "grafana" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/kube-state-metrics.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/kube-state-metrics.rules.test.yml index dbc5be112..dcc8fef72 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/kube-state-metrics.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/kube-state-metrics.rules.test.yml @@ -23,25 +23,25 @@ tests: # - 02:30 all is up again # - 03:00 we don't have enough metrics # - 03:30 all is up again - - series: 'up{app="kube-state-metrics", cluster_id="testinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="testinstall", instance="192.0.2.10:8080", job="kube-state-metrics", namespace="kube-system", node="ip-10-0-1-1.eu-west-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-72vzx", provider="aws", region="eu-west-1", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' + - series: 'up{job="kube-state-metrics", cluster_id="testinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="testinstall", instance="192.0.2.10:8080", namespace="kube-system", node="ip-10-0-1-1.eu-west-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-72vzx", provider="aws", region="eu-west-1", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' values: "_x30 1x30 0x30 1x30 1x30 1x30 1x30 1x30" - - series: 'up{app="kube-state-metrics", cluster_id="testinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="metrics", installation="testinstall", instance="192.0.2.10:8081", job="kube-state-metrics", namespace="kube-system", node="ip-10-0-1-1.eu-west-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-72vzx", provider="aws", region="eu-west-1", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' + - series: 'up{job="kube-state-metrics", cluster_id="testinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="metrics", installation="testinstall", instance="192.0.2.10:8081", namespace="kube-system", node="ip-10-0-1-1.eu-west-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-72vzx", provider="aws", region="eu-west-1", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' values: "_x30 1x30 1x30 1x30 0x30 1x30 1x30 1x30" - - series: 'testmetric2{app="kube-state-metrics"}' + - series: 'testmetric2{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric3{app="kube-state-metrics"}' + - series: 'testmetric3{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric4{app="kube-state-metrics"}' + - series: 'testmetric4{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric5{app="kube-state-metrics"}' + - series: 'testmetric5{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric6{app="kube-state-metrics"}' + - series: 'testmetric6{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric7{app="kube-state-metrics"}' + - series: 'testmetric7{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric8{app="kube-state-metrics"}' + - series: 'testmetric8{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric9{app="kube-state-metrics"}' + - series: 'testmetric9{job="kube-state-metrics"}' values: "_x30 1x30 1x30 1x30 1x30 1x30 _x30 1x30" alert_rule_test: # - 00:00 Start with no metrics @@ -49,7 +49,7 @@ tests: eval_time: 25m exp_alerts: - exp_labels: - area: "kaas" + area: "platform" cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_cluster_status_creating: "true" @@ -76,7 +76,7 @@ tests: eval_time: 85m exp_alerts: - exp_labels: - area: "kaas" + area: "platform" cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_cluster_status_creating: "true" @@ -109,7 +109,7 @@ tests: eval_time: 205m exp_alerts: - exp_labels: - area: "kaas" + area: "platform" cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_cluster_status_creating: "true" @@ -141,25 +141,25 @@ tests: # - 00:30 all goes up # - 01:00 up goes down # - 01:30 All is up again - - series: 'up{app="kube-state-metrics", cluster_id="testvintage", cluster_type="workload_cluster", customer="giantswarm", installation="testinstall", instance="10.0.2.4:10301", job="test-prometheus/workload-test/0", namespace="kube-system", node="ip-10-1-0-3.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-v2-3-0-67b5fdc5d4-78mhf", provider="aws", service_priority="highest"}' + - series: 'up{job="kube-state-metrics", cluster_id="testvintage", cluster_type="workload_cluster", customer="giantswarm", installation="testinstall", instance="10.0.2.4:10301", namespace="kube-system", node="ip-10-1-0-3.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-v2-3-0-67b5fdc5d4-78mhf", provider="aws", service_priority="highest"}' values: "_x30 1x30 0x30 1x30" - - series: 'testmetric2{app="kube-state-metrics"}' + - series: 'testmetric2{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric3{app="kube-state-metrics"}' + - series: 'testmetric3{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric4{app="kube-state-metrics"}' + - series: 'testmetric4{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric5{app="kube-state-metrics"}' + - series: 'testmetric5{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric6{app="kube-state-metrics"}' + - series: 'testmetric6{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric7{app="kube-state-metrics"}' + - series: 'testmetric7{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric8{app="kube-state-metrics"}' + - series: 'testmetric8{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric9{app="kube-state-metrics"}' + - series: 'testmetric9{job="kube-state-metrics"}' values: "0x1000" - - series: 'testmetric10{app="kube-state-metrics"}' + - series: 'testmetric10{job="kube-state-metrics"}' values: "0x1000" alert_rule_test: # - 00:00 Start with no metrics @@ -167,7 +167,7 @@ tests: eval_time: 25m exp_alerts: - exp_labels: - area: "kaas" + area: "platform" cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_cluster_status_creating: "true" @@ -194,7 +194,7 @@ tests: eval_time: 85m exp_alerts: - exp_labels: - area: "kaas" + area: "platform" cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_cluster_status_creating: "true" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml index 01ec1ca70..2b2e38560 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml @@ -5,15 +5,15 @@ rule_files: tests: - interval: 1m input_series: - - series: 'cortex_ring_members{app="loki", cluster_id="zj88t", cluster_type="workload_cluster", container="compactor", customer="giantswarm", installation="gorilla", instance="10.7.116.221:3100", job="zj88t-prometheus/workload-zj88t/0", name="compactor", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-compactor-676b8c897b-rq298", provider="aws", pipeline="stable", service_priority="highest", state="Unhealthy"}' + - series: 'cortex_ring_members{job="loki/loki-read", cluster_id="gaia-wc01", cluster_type="workload_cluster", container="loki", customer="giantswarm", installation="gaia", name="ingester", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-read-b7d9b85d4-xxxxx", provider="aws", pipeline="stable", service_priority="highest", state="Unhealthy"}' values: "0+0x20 1+0x160" # 1 unhealthy value after 20 minutes - - series: 'cortex_ring_members{app="loki", cluster_id="zj88t", cluster_type="workload_cluster", container="loki", customer="giantswarm", installation="gorilla", instance="10.7.116.221:3100", job="zj88t-prometheus/workload-zj88t/0", name="distributor", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-write-0", provider="aws", pipeline="stable", service_priority="highest", state="Unhealthy"}' + - series: 'cortex_ring_members{job="loki/loki-read", cluster_id="gaia-wc01", cluster_type="workload_cluster", container="loki", customer="giantswarm", installation="gaia", name="ingester", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-read-b7d9b85d4-yyyyy", provider="aws", pipeline="stable", service_priority="highest", state="Unhealthy"}' values: "_x30 1+0x10 0+0x60" # no data for 30 minutes then 1 unhealthy value for 10 minutes and back to normal for 1 hour - - series: 'loki_panic_total{app="loki-compactor", cluster_id="zj88t", cluster_type="workload_cluster", container="compactor", customer="giantswarm", installation="gorilla", instance="10.7.116.221:3100", job="zj88t-prometheus/workload-zj88t/0", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-compactor-676b8c897b-rq298", provider="aws", pipeline="stable", service_priority="highest"}' + - series: 'loki_panic_total{job="loki/loki-read", cluster_id="gaia-wc01", cluster_type="workload_cluster", container="loki", customer="giantswarm", installation="gaia", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-read-b7d9b85d4-zzzzz", provider="aws", pipeline="stable", service_priority="highest"}' values: "0+0x20 1+0x160" # 1 panic after 20 minutes - - series: 'loki_request_duration_seconds_count{app="loki-distributor", cluster_id="zj88t", cluster_type="workload_cluster", container="distributor", customer="giantswarm", installation="gorilla", instance="10.7.75.90:3100", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", pipeline="stable", route="loki_api_v1_push", service_priority="highest", status_code="204", ws="false"}' + - series: 'loki_request_duration_seconds_count{job="loki/loki-write", cluster_id="gaia-wc01", cluster_type="workload_cluster", container="loki", customer="giantswarm", installation="gaia", instance="10.7.75.90:3100", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-write-0", provider="aws", pipeline="stable", route="loki_api_v1_push", service_priority="highest", status_code="204", ws="false"}' values: "0+60x180" # 1 request per second OK for 3 hours - - series: 'loki_request_duration_seconds_count{app="loki-distributor", cluster_id="zj88t", cluster_type="workload_cluster", container="distributor", customer="giantswarm", installation="gorilla", instance="10.7.75.90:3100", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", pipeline="stable", route="loki_api_v1_push", service_priority="highest", status_code="503", ws="false"}' + - series: 'loki_request_duration_seconds_count{job="loki/loki-write", cluster_id="gaia-wc01", cluster_type="workload_cluster", container="loki", customer="giantswarm", installation="gaia", instance="10.7.75.90:3100", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-write-0", provider="aws", pipeline="stable", route="loki_api_v1_push", service_priority="highest", status_code="503", ws="false"}' values: "0+0x20 0+30x160" # After 20 minutes, we also have 0.5 rq/s failing alert_rule_test: - alertname: LokiRequestPanics @@ -23,18 +23,18 @@ tests: eval_time: 25m # After 25 minutes, should fire an alert for the t+20 error exp_alerts: - exp_labels: - area: managedservices + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" - cluster_id: zj88t - installation: gorilla + cluster_id: gaia-wc01 + installation: gaia pipeline: stable provider: aws - job: zj88t-prometheus/workload-zj88t/0 + job: loki/loki-read namespace: loki severity: page team: atlas @@ -53,18 +53,18 @@ tests: eval_time: 160m # Alert after more than 120m of incident exp_alerts: - exp_labels: - area: managedservices + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" - cluster_id: zj88t - installation: gorilla + cluster_id: gaia-wc01 + installation: gaia provider: aws pipeline: stable - job: zj88t-prometheus/workload-zj88t/0 + job: loki/loki-write namespace: loki route: loki_api_v1_push severity: page @@ -87,29 +87,29 @@ tests: eval_time: 60m # now the event has been there for 20 minutes, we should have an alert. exp_alerts: - exp_labels: - app: loki - area: managedservices + area: platform cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" cancel_if_outside_working_hours: "true" - cluster_id: zj88t - container: compactor + cluster_id: gaia-wc01 + container: loki customer: giantswarm - installation: gorilla + installation: gaia provider: aws pipeline: stable - name: compactor + job: loki/loki-read + name: ingester namespace: loki organization: giantswarm-production - pod: loki-compactor-676b8c897b-rq298 + pod: loki-read-b7d9b85d4-xxxxx severity: page team: atlas topic: observability exp_annotations: - description: "Loki pod loki-compactor-676b8c897b-rq298 (namespace loki) sees 1 unhealthy ring members" + description: "Loki pod loki-read-b7d9b85d4-xxxxx (namespace loki) sees 1 unhealthy ring members" opsrecipe: "loki/" - interval: 1m input_series: @@ -124,7 +124,7 @@ tests: exp_alerts: - exp_labels: all_pipelines: "true" - area: managedservices + area: platform cancel_if_outside_working_hours: "true" cluster_type: management_cluster namespace: loki diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml index 75280b3af..c2e8b2ce4 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml @@ -10,24 +10,24 @@ tests: # Test PrometheusJobScrapingFailure and PrometheusCriticalJobScrapingFailure - interval: 1h input_series: - - series: 'up{app="kubernetes", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/kubernetes-apiserver-gauss/0"}' + - series: 'up{job="apiserver", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x240" # critcal target up for 5d and down for 5d - - series: 'up{app="kube-controller-manager", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/kubernetes-controller-manager-gauss/0"}' + - series: 'up{job="kube-controller-manager", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x120 0+0x120" - - series: 'up{app="kube-scheduler", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/kubernetes-scheduler-gauss/0"}' + - series: 'up{job="kube-scheduler", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x240" - - series: 'up{app="kubelet", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/kubernetes-kubelet-gauss/0"}' + - series: 'up{job="kubelet", metrics_path="/metrics", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x240" - - series: 'up{app="node-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/node-exporter-gauss/0"}' + - series: 'up{job="node-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x240" - - series: 'up{app="kube-state-metrics", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/kube-state-metrics-gauss/0"}' + - series: 'up{job="kube-state-metrics", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x240" # Add bastion host test to ensure we do not page - - series: 'up{app="node-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/bastions/0"}' + - series: 'up{job="node-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x240" # non-critcal target up for 5d and down for 5d - - series: 'up{app="app-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/app-exporter-gauss/0"}' + - series: 'up{job="app-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x120 0+0x120" alert_rule_test: - alertname: PrometheusCriticalJobScrapingFailure @@ -41,7 +41,7 @@ tests: eval_time: 7d exp_alerts: - exp_labels: - area: empowerment + area: platform severity: notify team: atlas topic: observability @@ -50,13 +50,13 @@ tests: installation: "gauss" provider: "aws" pipeline: "testing" - job: "gauss-prometheus/kubernetes-controller-manager-gauss/0" + job: "kube-controller-manager" exp_annotations: opsrecipe: "prometheus-job-scraping-failure/" summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in gauss-prometheus/kubernetes-controller-manager-gauss/0 job." + description: "Prometheus gauss/gauss has failed to scrape all targets in kube-controller-manager job." - exp_labels: - area: empowerment + area: platform severity: notify team: atlas topic: observability @@ -65,27 +65,26 @@ tests: installation: "gauss" provider: "aws" pipeline: "testing" - job: "gauss-prometheus/app-exporter-gauss/0" + job: "app-exporter" exp_annotations: opsrecipe: "prometheus-job-scraping-failure/" summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in gauss-prometheus/app-exporter-gauss/0 job." + description: "Prometheus gauss/gauss has failed to scrape all targets in app-exporter job." # This fires only for critical target down. - alertname: PrometheusCriticalJobScrapingFailure eval_time: 9d exp_alerts: - exp_labels: - area: empowerment + area: platform severity: page team: atlas topic: observability - app: "kube-controller-manager" cluster_id: "gauss" installation: "gauss" provider: "aws" pipeline: "testing" - job: "gauss-prometheus/kubernetes-controller-manager-gauss/0" + job: "kube-controller-manager" cancel_if_outside_working_hours: "true" cancel_if_cluster_is_not_running_prometheus_agent: "true" cancel_if_cluster_status_creating: "true" @@ -93,4 +92,4 @@ tests: exp_annotations: opsrecipe: "prometheus-job-scraping-failure/" summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in gauss-prometheus/kubernetes-controller-manager-gauss/0 job." + description: "Prometheus gauss/gauss has failed to scrape all targets in kube-controller-manager job." diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/promtail.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/promtail.rules.test.yml index 643580a82..6d4108f3f 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/promtail.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/promtail.rules.test.yml @@ -22,7 +22,7 @@ tests: eval_time: 71m exp_alerts: - exp_labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -42,7 +42,7 @@ tests: eval_time: 111m exp_alerts: - exp_labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -63,7 +63,7 @@ tests: eval_time: 180m exp_alerts: - exp_labels: - area: empowerment + area: platform cancel_if_outside_working_hours: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -98,7 +98,7 @@ tests: eval_time: 270m exp_alerts: - exp_labels: - area: empowerment + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -116,7 +116,7 @@ tests: eval_time: 330m exp_alerts: - exp_labels: - area: empowerment + area: platform cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/silence-operator.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/silence-operator.rules.test.yml index 799f41e38..282db0569 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/silence-operator.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/silence-operator.rules.test.yml @@ -5,7 +5,7 @@ rule_files: tests: - interval: 1m input_series: - - series: 'operatorkit_controller_errors_total{app="silence-operator", controller="silence-controller", cluster_type="management_cluster"}' + - series: 'operatorkit_controller_errors_total{job="monitoring/silence-operator", controller="silence-controller", cluster_type="management_cluster"}' values: "0x30 1+0x20 20x45 20-1x20 0x100" alert_rule_test: - alertname: SilenceOperatorReconcileErrors @@ -14,8 +14,8 @@ tests: eval_time: 95m exp_alerts: - exp_labels: - app: silence-operator - area: "empowerment" + job: "monitoring/silence-operator" + area: platform cancel_if_outside_working_hours: "true" cluster_type: management_cluster controller: silence-controller diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/sloth.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/sloth.rules.test.yml index 0c791a02f..5841ae58e 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/sloth.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/sloth.rules.test.yml @@ -6,7 +6,7 @@ tests: - interval: 1m input_series: # For the first 60min: test with 1 pod: none, up, down - - series: 'up{app="sloth",cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + - series: 'up{job="monitoring/sloth", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' values: "_x20 1+0x20 0+0x20" alert_rule_test: - alertname: SlothDown @@ -17,7 +17,7 @@ tests: eval_time: 50m exp_alerts: - exp_labels: - area: managedservices + area: platform cluster_id: gauss installation: gauss provider: aws diff --git a/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/inhibit.prometheus-agent.rules.test.yml b/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/inhibit.prometheus-agent.rules.test.yml index bc83d6ef2..aede699d4 100644 --- a/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/inhibit.prometheus-agent.rules.test.yml +++ b/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/inhibit.prometheus-agent.rules.test.yml @@ -6,16 +6,16 @@ tests: - interval: 1m input_series: # - cluster 1: "clu01" - - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="clu01-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + - series: 'kube_namespace_created{cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="clu01-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' values: '1671707388+0x40' # - cluster 2: "clu02" - - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="clu02-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + - series: 'kube_namespace_created{cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="clu02-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' values: '1671707388+0x40' # - cluster 3: "myinstall", the install name - - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="myinstall-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + - series: 'kube_namespace_created{cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="myinstall-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' values: "1671707388+0x40" # prometheus-agent app info for "clu01" - - series: 'app_operator_app_info{app="prometheus-agent", app_version="2.40.5", catalog="giantswarm-playground", cluster_id="myinstall", cluster_missing="false", cluster_type="management_cluster", customer="giantswarm", deployed_version="0.1.7", endpoint="web", installation="myinstall", instance="app-exporter", job="app-exporter", name="prometheus-agent", namespace="clu01", node="ip-10-0-5-141.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="app-exporter-6865c9c648-sg5vg", service="app-exporter", status="deployed", team="atlas", upgrade_available="false", version="0.1.7", version_mismatch="false"}' + - series: 'app_operator_app_info{name="prometheus-agent", app_version="2.40.5", catalog="giantswarm-playground", cluster_id="myinstall", cluster_missing="false", cluster_type="management_cluster", customer="giantswarm", deployed_version="0.1.7", endpoint="web", installation="myinstall", instance="app-exporter", job="app-exporter", namespace="clu01", node="ip-10-0-5-141.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="app-exporter-6865c9c648-sg5vg", service="app-exporter", status="deployed", team="atlas", upgrade_available="false", version="0.1.7", version_mismatch="false"}' values: "1+0x40" alert_rule_test: #- alertname: InhibitionClusterIsNotRunningPrometheusAgent @@ -24,7 +24,7 @@ tests: eval_time: 10m exp_alerts: - exp_labels: - area: empowerment + area: platform team: atlas topic: monitoring cluster_is_not_running_prometheus_agent: "true" diff --git a/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index a7f332e20..406901ee6 100644 --- a/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/vintage/aws/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -16,7 +16,7 @@ tests: eval_time: 30m exp_alerts: - exp_labels: - area: empowerment + area: platform severity: page team: atlas topic: observability @@ -34,7 +34,7 @@ tests: eval_time: 30m exp_alerts: - exp_labels: - area: empowerment + area: platform severity: none team: atlas topic: observability @@ -52,7 +52,7 @@ tests: eval_time: 90m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: gauss cluster_type: workload_cluster severity: page @@ -73,7 +73,7 @@ tests: eval_time: 90m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: gauss cluster_type: workload_cluster severity: none @@ -97,11 +97,11 @@ tests: # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - series: 'prometheus_operator_spec_shards{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' values: '3+0x60 5+0x60 3+0x60' @@ -116,7 +116,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -137,7 +137,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -158,7 +158,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -179,7 +179,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -203,11 +203,11 @@ tests: # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m input_series: - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' values: '3+0x60 5+0x60 3+0x60' @@ -220,7 +220,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -241,7 +241,7 @@ tests: eval_time: 100m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -262,7 +262,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws @@ -283,7 +283,7 @@ tests: eval_time: 125m exp_alerts: - exp_labels: - area: empowerment + area: platform cluster_id: test01 installation: myinstall provider: aws