diff --git a/CHANGELOG.md b/CHANGELOG.md index 36fc71d93..7ccc3d1b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Make Atlas rules compatible with Mimir: + - Add the `cluster_id`, `installation`, `provider`, and `pipeline` labels to each aggregation function + - Rewrite some of the `absent()` expressions + ## [3.6.2] - 2024-04-04 ### Changed diff --git a/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml index b2df1ee3e..f334e89c7 100644 --- a/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml @@ -17,7 +17,7 @@ spec: annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: deployment-not-satisfied/ - expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*"} > 0 + expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|object-storage.*|logging-operator.*|silence-operator.*|sloth.*"} > 0 for: 30m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml b/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml index 93b9d51d8..383e9579c 100644 --- a/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml @@ -46,7 +46,7 @@ spec: annotations: description: '{{`Fluentbit is down on node ({{ $labels.node }}).`}}' opsrecipe: fluentbit-down/ - expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, job, namespace, node) == 0 + expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, installation, provider, pipeline, job, namespace, node) == 0 for: 15m labels: area: empowerment diff --git a/helm/prometheus-rules/templates/alerting-rules/grafana.rules.yml b/helm/prometheus-rules/templates/alerting-rules/grafana.rules.yml index cb8b4196c..87b515c4b 100644 --- a/helm/prometheus-rules/templates/alerting-rules/grafana.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/grafana.rules.yml @@ -31,11 +31,11 @@ spec: - alert: GrafanaFolderPermissionsDown # Monitors that folder permissions have been updated. # We have a cronjob (grafana-permissions) that runs every 20 minutes. - # When successfully run, folders permissions successful updates counter increases. + # When successfully run, folders permissions successful updates counter increases.
annotations: description: '{{`Grafana Folder not updated for ({{ $labels.instance }}).`}}' opsrecipe: grafana-perms/ - expr: sum(increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}) + expr: sum by(cluster_id, installation, provider, pipeline) (increase(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}[2h])) < 1 or absent(grafana_http_request_duration_seconds_count{handler="/api/folders/:uid/permissions/", method="POST", namespace="monitoring", service="grafana", status_code="200", cluster_type="management_cluster"}) for: 6h labels: area: managedservices @@ -57,7 +57,7 @@ spec: # - we create cronjob label from cron name (label_replace) # - we sum number of failed to have one global value # - we avg_over_time to avoid 0 value when a cron was skipped for whatever reason - expr: sum(label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded", cluster_type="management_cluster"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) by (cronjob) > 0 + expr: sum by (cronjob, cluster_id, installation, provider, pipeline) (label_replace(avg_over_time(kube_job_status_failed{job_name=~"grafana-permissions.*", reason!="BackoffLimitExceeded", cluster_type="management_cluster"}[60m]), "cronjob", "$1", "job_name", "(grafana-permissions)-.*")) > 0 for: 6h labels: area: managedservices @@ -75,7 +75,7 @@ spec: # This alert triggers when the grafana permission job did not schedule for more than 1 day # or if the job did not run successfully at least once in the last day expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="grafana-permissions", cluster_type="management_cluster"}) > 86400 - or count(max_over_time(kube_job_status_succeeded{job_name=~"grafana-permission.+", cluster_type="management_cluster"}[1d]) == 1) == 0 + or count by (cluster_id, installation, provider, pipeline) (max_over_time(kube_job_status_succeeded{job_name=~"grafana-permission.+", cluster_type="management_cluster"}[1d]) == 1) == 0 labels: area: empowerment severity: page diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml index 9e3b65e8c..c42a8a334 100644 --- a/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml @@ -15,7 +15,7 @@ spec: # we retrieve the list of existing cluster IDs from `kube_namespace_created` # excluding the MC's one, because it's always using prometheus-agent and namespace is not named after cluster name # then compare it with the list of deployed prometheus-agents from `app_operator_app_info` - # + # # Will only produce data (and inhibitions) on MC because it's where app_operator is running # but that's enough to have the inhibitions on the installation-global alertmanager - alert: InhibitionClusterIsNotRunningPrometheusAgent diff --git 
a/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml b/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml index abd0b8880..1e30ceca8 100644 --- a/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/keda.rules.yml @@ -13,7 +13,7 @@ spec: - alert: KedaDown annotations: description: 'Keda is down.' - expr: count (up{container=~"keda-.*"} == 0) > 0 + expr: count by (cluster_id, installation, provider, pipeline) (up{container=~"keda-.*"} == 0) > 0 for: 10m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml index 2a41fef68..7a259ad6f 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml @@ -14,16 +14,23 @@ spec: annotations: description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} + expr: label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1) + {{- else }} expr: |- - ( - # modern clusters - label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1) - ) - and - ( - # vintage clusters without servicemonitor - label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1) - ) + count by (cluster_id, installation, provider, pipeline) (label_replace(up{app="kube-state-metrics", instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*")) == 0 + or ( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(up{app="kube-state-metrics", instance=~".*:8080"} == 1) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 15m labels: area: kaas @@ -42,7 +49,7 @@ spec: annotations: description: '{{`KubeStateMetrics ({{ $labels.instance }}) is too slow.`}}' opsrecipe: kube-state-metrics-down/ - expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m])) by (le, cluster_id)) > 7 + expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m])) by (le, cluster_id, installation, provider, pipeline)) > 7 for: 15m labels: area: kaas @@ -62,7 +69,7 @@ spec: opsrecipe: kube-state-metrics-down/ expr: |- # When it looks up but we don't have metrics - count({app="kube-state-metrics"}) by (cluster_id) < 10 + count({app="kube-state-metrics"}) by (cluster_id, installation, provider, pipeline) < 10 for: 20m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/loki.rules.yml b/helm/prometheus-rules/templates/alerting-rules/loki.rules.yml index 9f87870bf..568fa9c7e 100644 --- a/helm/prometheus-rules/templates/alerting-rules/loki.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/loki.rules.yml @@ -35,9 +35,9 @@ spec: 
description: This alert checks that we have less than 10% errors on Loki requests. opsrecipe: loki/ expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (cluster_id, namespace, job, route) + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster_id, installation, provider, pipeline, namespace, job, route) / - sum(rate(loki_request_duration_seconds_count[1m])) by (cluster_id, namespace, job, route) + sum(rate(loki_request_duration_seconds_count[2m])) by (cluster_id, installation, provider, pipeline, namespace, job, route) > 10 for: 120m labels: @@ -56,7 +56,7 @@ spec: description: This alert checks that we have no panic errors on Loki. opsrecipe: loki/ expr: | - sum(increase(loki_panic_total[10m])) by (cluster_id, namespace, job) > 0 + sum(increase(loki_panic_total[10m])) by (cluster_id, installation, provider, pipeline, namespace, job) > 0 labels: area: managedservices cancel_if_apiserver_down: "true" @@ -73,7 +73,7 @@ spec: description: '{{`Loki pod {{ $labels.pod }} (namespace {{ $labels.namespace }}) sees {{ $value }} unhealthy ring members`}}' opsrecipe: loki/ expr: | - sum (min_over_time(cortex_ring_members{state="Unhealthy"}[30m])) by (app, cluster_id, container, customer, installation, name, namespace, organization, pod) > 0 + sum (min_over_time(cortex_ring_members{state="Unhealthy"}[30m])) by (app, cluster_id, container, customer, installation, provider, pipeline, name, namespace, organization, pod) > 0 labels: area: managedservices cancel_if_apiserver_down: "true" diff --git a/helm/prometheus-rules/templates/alerting-rules/microendpoint.rules.yml b/helm/prometheus-rules/templates/alerting-rules/microendpoint.rules.yml index dcf6b0f74..45d2e9fdf 100644 --- a/helm/prometheus-rules/templates/alerting-rules/microendpoint.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/microendpoint.rules.yml @@ -17,7 +17,7 @@ spec: annotations: description: '{{`CR version {{ $labels.version }} in cluster {{ $labels.cluster_id }} is reconciled by multiple apps including {{ $labels.app }}.`}}' opsrecipe: multiple-operators-running-same-version/ - expr: sum(label_replace(giantswarm_build_info{app=~"prometheus-meta-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, version) > 1 + expr: sum(label_replace(giantswarm_build_info{app=~"prometheus-meta-operator.*"}, "version", "$1", "reconciled_version", "(.+)")) by (app, cluster_id, installation, provider, pipeline, version) > 1 for: 5m labels: area: empowerment diff --git a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml index 15d5b7198..66cad5210 100644 --- a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml @@ -45,7 +45,7 @@ spec: - alert: MimirComponentDown annotations: description: '{{`Mimir component : {{ $labels.service }} is down.`}}' - expr: count(up{app="mimir"} == 0) by (cluster_id, service) > 0 + expr: count(up{app="mimir"} == 0) by (cluster_id, installation, provider, pipeline, service) > 0 for: 5m labels: area: managedservices @@ -59,7 +59,7 @@ spec: - alert: GrafanaAgentForPrometheusRulesDown annotations: description: 'Grafana-agent sending PrometheusRules to Mimir ruler is down.' 
- expr: count(up{app="grafana-agent", namespace="mimir"} == 0) by (cluster_id) > 0 + expr: count(up{app="grafana-agent", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0 for: 1h labels: area: managedservices diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml index b4a9e2e32..d8766c43b 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml @@ -82,7 +82,7 @@ spec: count(up{job="prometheus-agent"} > 0) by (cluster_id, customer, installation, pipeline, provider, region) ) {{- end }} - for: 1m + for: 2m labels: area: empowerment severity: none @@ -99,18 +99,18 @@ spec: summary: Prometheus agent is missing shards. opsrecipe: prometheus-agent/ expr: |- - max_over_time(sum by (cluster_id)( + max_over_time(sum by (cluster_id, installation, provider, pipeline)( count( ## number of remotes that are not mimir or grafana-cloud prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id) + ) by (cluster_id, installation, provider, pipeline) != sum( ## number of shards defined in the Prometheus CR prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} # if there is only 1 shard, there is no shard metric so we use the replicas metric or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id) + ) by (cluster_id, installation, provider, pipeline) )[5m:]) for: 20m labels: @@ -130,20 +130,20 @@ spec: summary: Prometheus agent is missing shards. opsrecipe: prometheus-agent/ expr: |- - max_over_time(sum by (cluster_id)( + max_over_time(sum by (cluster_id, installation, provider, pipeline)( count( ## number of remotes that are not mimir or grafana-cloud prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) by (cluster_id) + ) by (cluster_id, installation, provider, pipeline) != sum( ## number of shards defined in the Prometheus CR prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} # if there is only 1 shard, there is no shard metric so we use the replicas metric or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) by (cluster_id) + ) by (cluster_id, installation, provider, pipeline) )[5m:]) - for: 1m + for: 2m labels: area: empowerment severity: none diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml index 446a397c3..8c5c5072c 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml @@ -28,20 +28,22 @@ spec: # If a prometheus is missing, this alert will fire. This alert will not check if a prometheus is running when it should not (e.g. 
deleted cluster) expr: | ( - sum by(cluster_id) ( + sum by(cluster_id, installation, provider, pipeline) ( {__name__=~"cluster_service_cluster_info|cluster_operator_cluster_status", status!="Deleting"} - ) unless sum by(cluster_id) ( + ) unless sum by(cluster_id, installation, provider, pipeline) ( label_replace( kube_pod_container_status_running{container="prometheus", namespace!="{{ .Values.managementCluster.name }}-prometheus", namespace=~".*-prometheus"}, "cluster_id", "$2", "pod", "(prometheus-)(.+)(-.+)" ) ) - ) + ( - sum by (cluster_name) ( - capi_cluster_status_phase{phase!="Deleting"} - ) unless sum by (cluster_name) ( - label_replace(kube_pod_container_status_running{container="prometheus",namespace=~".*-prometheus"}, - "cluster_name", "$2", "pod", "(prometheus-)(.+)(-.+)" + ) or ( + sum by (cluster_id, installation, provider, pipeline) ( + label_replace(capi_cluster_status_phase{phase!="Deleting"}, + "cluster_id", "$1", "name", "(.+)" + ) + ) unless sum by (cluster_id, installation, provider, pipeline) ( + label_replace(kube_pod_container_status_running{container="prometheus",namespace=~".*-prometheus"}, + "cluster_id", "$2", "pod", "(prometheus-)(.+)(-.+)" ) ) ) diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml index 842d5aac0..f3eb598f4 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml @@ -33,7 +33,7 @@ spec: - alert: PrometheusOperatorListErrors annotations: description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. - expr: (sum by (cluster_id, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 + expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 for: 15m labels: area: empowerment @@ -44,7 +44,7 @@ spec: - alert: PrometheusOperatorWatchErrors annotations: description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. 
- expr: (sum by (cluster_id, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_watch_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 + expr: (sum by (cluster_id, installation, provider, pipeline, controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_watch_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 for: 15m labels: area: empowerment @@ -66,7 +66,7 @@ spec: - alert: PrometheusOperatorReconcileErrors annotations: description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.' - expr: (sum by (cluster_id, controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) / (sum by (cluster_id, controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) > 0.1 + expr: (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_errors_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) / (sum by (cluster_id, installation, provider, pipeline, controller,namespace) (rate(prometheus_operator_reconcile_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]))) > 0.1 for: 10m labels: area: empowerment @@ -88,7 +88,7 @@ spec: - alert: PrometheusOperatorNotReady annotations: description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources. - expr: min by (cluster_id, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0) + expr: min by (cluster_id, installation, provider, pipeline, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0) for: 5m labels: area: empowerment diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus.rules.yml index a9741da5e..7edfe1036 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus.rules.yml @@ -32,7 +32,7 @@ spec: annotations: description: 'Prometheus is not sending data to Grafana Cloud.' opsrecipe: tbd/ - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) for: 1h labels: area: empowerment @@ -71,7 +71,7 @@ spec: description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}} summary: Prometheus fails to scrape all targets in a job. 
opsrecipe: prometheus-job-scraping-failure/ - expr: (count(up == 0) BY (job, installation, cluster_id) / count(up) BY (job, installation, cluster_id)) == 1 + expr: (count(up == 0) BY (job, installation, cluster_id, provider, pipeline) / count(up) BY (job, installation, cluster_id, provider, pipeline)) == 1 for: 1d labels: area: empowerment @@ -92,14 +92,14 @@ spec: app=~"kubernetes|kube-controller-manager|kube-scheduler|kubelet|node-exporter|kube-state-metrics", job!~".*bastions.*" } == 0 - ) BY (app,job, installation, cluster_id) + ) BY (app,job, installation, cluster_id, provider, pipeline) / count( up{ app=~"kubernetes|kube-controller-manager|kube-scheduler|kubelet|node-exporter|kube-state-metrics", job!~".*bastions.*" } - ) BY (app, job, installation, cluster_id) + ) BY (app, job, installation, cluster_id, provider, pipeline) ) == 1 for: 3d labels: diff --git a/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml b/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml index a4261a4ec..6bf489845 100644 --- a/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml @@ -13,7 +13,7 @@ spec: annotations: description: '{{`Scraping of all promtail pods to check if one failed every 30 minutes.`}}' opsrecipe: promtail-is-not-running/ - expr: count(up{container="promtail"} == 0) by (cluster_id) > 0 + expr: count(up{container="promtail"} == 0) by (cluster_id, installation, provider, pipeline) > 0 for: 30m labels: area: "empowerment" @@ -26,11 +26,11 @@ spec: cancel_if_cluster_status_updating: "true" # Not tested - alert: PromtailRequestsErrors - annotations: + annotations: description: This alert checks if that the amount of failed requests is below 10% for promtail opsrecipe: promtail-requests-are-failing/ expr: | - 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (cluster_id, namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (cluster_id, namespace, job, route, instance) > 10 + 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[2m])) by (cluster_id, installation, provider, pipeline, namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[2m])) by (cluster_id, installation, provider, pipeline, namespace, job, route, instance) > 10 for: 15m labels: area: "empowerment" diff --git a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml index 6756a633f..0f1b7c68e 100644 --- a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml @@ -30,7 +30,7 @@ spec: # This alert triggers when the silence operator sync job did not schedule for more than 1 day # or if the job did not run successfully at least once in the last day expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync", cluster_type="management_cluster"}) > 86400 - or count(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync.+", cluster_type="management_cluster"}[1d]) == 1) by (cluster_id) == 0 + or count(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync.+", cluster_type="management_cluster"}[1d]) == 1) by (cluster_id, installation, provider, pipeline) == 0 labels: area: empowerment severity: page diff --git 
a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml index 93a5a1257..1aaab0de0 100644 --- a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml @@ -12,7 +12,7 @@ spec: - alert: SlothDown annotations: description: 'Sloth is down.' - expr: count(up{app="sloth"} == 0) by (cluster_id) > 0 + expr: count(up{app="sloth"} == 0) by (cluster_id, installation, provider, pipeline) > 0 for: 5m labels: area: managedservices diff --git a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml index 331e53ba6..cb88667c4 100644 --- a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml @@ -20,12 +20,12 @@ spec: record: raw_slo_requests # The first statement ensures that an api-server error is counted if the kubernetes api is not up for a specific cluster. # The next statement returns 1 for a cluster with "updated", "created" or unknown (absent) status. - # It returns 0 for clusters in "updating", "creating" and "deleting" status. + # It returns 0 for clusters in "updating", "creating" and "deleting" status. # By multiplying with this statement we ensure that errors for transitioning clusters are not counted. - - expr: sum((up{app='kubernetes'} * -1) + 1) by (cluster_id, cluster_type) * - ignoring (cluster_type) group_left (cluster_id) + - expr: sum((up{app='kubernetes'} * -1) + 1) by (cluster_id, cluster_type) * + ignoring (cluster_type) group_left (cluster_id) ( - max(cluster_operator_cluster_status{status=~"Updated|Created"}) by (cluster_id, cluster_type) + max(cluster_operator_cluster_status{status=~"Updated|Created"}) by (cluster_id, cluster_type) or absent(cluster_operator_cluster_status) ) labels: @@ -51,8 +51,8 @@ spec: area: kaas label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: raw_slo_requests - # -- the errors are counted as follows: - # -- pods in a daemonset that are UNAVAILABLE NOW and have been UNAVAILABLE 10 MINUTES AGO + # -- the errors are counted as follows: + # -- pods in a daemonset that are UNAVAILABLE NOW and have been UNAVAILABLE 10 MINUTES AGO # -- which are on a SCHEDULABLE node that was CREATED AT LEAST 10 MINUTES AGO - expr: | ( @@ -370,7 +370,7 @@ spec: # -- Managed Prometheus # Set SLO request to always be 1 when a managed prometheus target is present. - - expr: (up{app="prometheus-operator-app-prometheus",container="prometheus"}*0)+1 + - expr: (up{app=~"kube-prometheus-stack-prometheus-operator|prometheus-operator-app-prometheus",container=~"kube-prometheus-stack|prometheus"}*0)+1 labels: area: managed-apps class: MEDIUM @@ -378,7 +378,7 @@ spec: label_application_giantswarm_io_team: atlas record: raw_slo_requests # Set SLO error to be 1 when a managed prometheus is down. - - expr: (up{app="prometheus-operator-app-prometheus",container="prometheus"}*-1)+1 == 1 + - expr: (up{app=~"kube-prometheus-stack-prometheus-operator|prometheus-operator-app-prometheus",container=~"kube-prometheus-stack|prometheus"}*-1)+1 == 1 labels: area: managed-apps class: MEDIUM @@ -388,7 +388,7 @@ spec: # -- Managed Alertmanager # Set SLO request to always be 1 when a managed alertmanager target is present. 
- - expr: (up{app="prometheus-operator-app-alertmanager", container="alertmanager"}*0)+1 + - expr: (up{app=~"alertmanager|prometheus-operator-app-alertmanager",container="alertmanager"}*0)+1 labels: area: managed-apps class: MEDIUM @@ -396,7 +396,7 @@ spec: label_application_giantswarm_io_team: atlas record: raw_slo_requests # Set SLO error to be 1 when a managed alertmanager is down. - - expr: (up{app="prometheus-operator-app-alertmanager",container="alertmanager"}*-1)+1 == 1 + - expr: (up{app=~"alertmanager|prometheus-operator-app-alertmanager",container="alertmanager"}*-1)+1 == 1 labels: area: managed-apps class: MEDIUM diff --git a/test/tests/providers/global/kube-state-metrics.rules.test.yml b/test/tests/providers/global/kube-state-metrics.rules.test.yml index 8f5891193..5b5577cf0 100644 --- a/test/tests/providers/global/kube-state-metrics.rules.test.yml +++ b/test/tests/providers/global/kube-state-metrics.rules.test.yml @@ -61,6 +61,10 @@ tests: severity: "page" team: "atlas" topic: "observability" + cluster_id: "testinstall" + installation: "testinstall" + provider: "aws" + pipeline: "testing" exp_annotations: description: "KubeStateMetrics () is down." opsrecipe: "kube-state-metrics-down/" @@ -84,6 +88,10 @@ tests: severity: "page" team: "atlas" topic: "observability" + cluster_id: "testinstall" + installation: "testinstall" + provider: "aws" + pipeline: "testing" exp_annotations: description: "KubeStateMetrics () is down." opsrecipe: "kube-state-metrics-down/" @@ -113,6 +121,10 @@ tests: severity: "page" team: "atlas" topic: "observability" + cluster_id: "testinstall" + installation: "testinstall" + provider: "aws" + pipeline: "testing" exp_annotations: description: "KubeStateMetrics () is down." opsrecipe: "kube-state-metrics-down/" @@ -167,6 +179,10 @@ tests: severity: "page" team: "atlas" topic: "observability" + cluster_id: "testvintage" + installation: "testinstall" + provider: "aws" + pipeline: "testing" exp_annotations: description: "KubeStateMetrics () is down." opsrecipe: "kube-state-metrics-down/" @@ -190,6 +206,10 @@ tests: severity: "page" team: "atlas" topic: "observability" + cluster_id: "testvintage" + installation: "testinstall" + provider: "aws" + pipeline: "testing" exp_annotations: description: "KubeStateMetrics () is down." 
opsrecipe: "kube-state-metrics-down/" diff --git a/test/tests/providers/global/loki.rules.test.yml b/test/tests/providers/global/loki.rules.test.yml index d3cfab8b9..7fce86cc1 100644 --- a/test/tests/providers/global/loki.rules.test.yml +++ b/test/tests/providers/global/loki.rules.test.yml @@ -5,14 +5,14 @@ rule_files: tests: - interval: 1m input_series: - - series: 'cortex_ring_members{app="loki-compactor", cluster_id="zj88t", cluster_type="workload_cluster", container="compactor", customer="giantswarm", installation="gorilla", instance="10.7.116.221:3100", job="zj88t-prometheus/workload-zj88t/0", name="compactor", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-compactor-676b8c897b-rq298", provider="aws", service_priority="highest", state="Unhealthy"}' + - series: 'cortex_ring_members{app="loki-compactor", cluster_id="zj88t", cluster_type="workload_cluster", container="compactor", customer="giantswarm", installation="gorilla", instance="10.7.116.221:3100", job="zj88t-prometheus/workload-zj88t/0", name="compactor", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-compactor-676b8c897b-rq298", provider="aws", pipeline="stable", service_priority="highest", state="Unhealthy"}' values: "0+0x20 1+0x160" # 1 unhealthy value after 20 minutes - - series: 'loki_panic_total{app="loki-compactor", cluster_id="zj88t", cluster_type="workload_cluster", container="compactor", customer="giantswarm", installation="gorilla", instance="10.7.116.221:3100", job="zj88t-prometheus/workload-zj88t/0", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-compactor-676b8c897b-rq298", provider="aws", service_priority="highest"}' + - series: 'loki_panic_total{app="loki-compactor", cluster_id="zj88t", cluster_type="workload_cluster", container="compactor", customer="giantswarm", installation="gorilla", instance="10.7.116.221:3100", job="zj88t-prometheus/workload-zj88t/0", namespace="loki", node="ip-10-6-2-178.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-compactor-676b8c897b-rq298", provider="aws", pipeline="stable", service_priority="highest"}' values: "0+0x20 1+0x160" # 1 panic after 20 minutes - - series: 'loki_request_duration_seconds_count{app="loki-distributor", cluster_id="zj88t", cluster_type="workload_cluster", container="distributor", customer="giantswarm", installation="gorilla", instance="10.7.75.90:3100", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", route="loki_api_v1_push", service_priority="highest", status_code="204", ws="false"}' + - series: 'loki_request_duration_seconds_count{app="loki-distributor", cluster_id="zj88t", cluster_type="workload_cluster", container="distributor", customer="giantswarm", installation="gorilla", instance="10.7.75.90:3100", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", pipeline="stable", route="loki_api_v1_push", service_priority="highest", status_code="204", ws="false"}' values: "0+60x180" # 1 request per second OK for 3 hours - - series: 
'loki_request_duration_seconds_count{app="loki-distributor", cluster_id="zj88t", cluster_type="workload_cluster", container="distributor", customer="giantswarm", installation="gorilla", instance="10.7.75.90:3100", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", route="loki_api_v1_push", service_priority="highest", status_code="503", ws="false"}' - values: "0+0x20 0+30x160" # After 20 minutes, we also have 0.5 rq/s failing + - series: 'loki_request_duration_seconds_count{app="loki-distributor", cluster_id="zj88t", cluster_type="workload_cluster", container="distributor", customer="giantswarm", installation="gorilla", instance="10.7.75.90:3100", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", pipeline="stable", route="loki_api_v1_push", service_priority="highest", status_code="503", ws="false"}' + values: "0+0x20 0+30x160" # After 20 minutes, we also have 0.5 rq/s failing alert_rule_test: - alertname: LokiRequestPanics eval_time: 15m # should be OK after 15 minutes @@ -29,6 +29,9 @@ tests: cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" cluster_id: zj88t + installation: gorilla + pipeline: stable + provider: aws job: zj88t-prometheus/workload-zj88t/0 namespace: loki severity: page @@ -56,6 +59,9 @@ tests: cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" cluster_id: zj88t + installation: gorilla + provider: aws + pipeline: stable job: zj88t-prometheus/workload-zj88t/0 namespace: loki route: loki_api_v1_push @@ -88,6 +94,8 @@ tests: container: compactor customer: giantswarm installation: gorilla + provider: aws + pipeline: stable name: compactor namespace: loki organization: giantswarm-production diff --git a/test/tests/providers/global/mimir.rules.test.yml b/test/tests/providers/global/mimir.rules.test.yml index d67e708c4..41e338180 100644 --- a/test/tests/providers/global/mimir.rules.test.yml +++ b/test/tests/providers/global/mimir.rules.test.yml @@ -55,7 +55,7 @@ tests: - interval: 1m input_series: # For the first 60min: test with 1 pod: none, up, down - - series: 'up{app="mimir",cluster_type="management_cluster", cluster_id="gauss", installation="gauss", service="mimir-ingester"}' + - series: 'up{app="mimir",cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", service="mimir-ingester"}' values: "_x20 1+0x20 0+0x20" alert_rule_test: - alertname: MimirComponentDown @@ -76,12 +76,15 @@ tests: cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing exp_annotations: description: "Mimir component : mimir-ingester is down." 
- interval: 1m input_series: # test with 1 pod: none, up, down - - series: 'up{app="grafana-agent",cluster_type="management_cluster", cluster_id="golem", installation="golem", namespace="mimir"}' + - series: 'up{app="grafana-agent",cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="mimir"}' values: "_x20 1+0x70 0+0x70" alert_rule_test: - alertname: GrafanaAgentForPrometheusRulesDown @@ -98,6 +101,9 @@ tests: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cluster_id: golem + installation: golem + provider: capa + pipeline: testing severity: page team: atlas topic: observability diff --git a/test/tests/providers/global/prometheus-agent.rules.test.yml b/test/tests/providers/global/prometheus-agent.rules.test.yml index c4602f313..ac09a027c 100644 --- a/test/tests/providers/global/prometheus-agent.rules.test.yml +++ b/test/tests/providers/global/prometheus-agent.rules.test.yml @@ -94,15 +94,15 @@ tests: # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_operator_spec_shards{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", 
pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' + - series: 'prometheus_operator_spec_shards{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' values: '3+0x60 5+0x60 3+0x60' - - series: 'prometheus_operator_spec_replicas{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' + - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' values: '1+0x180' alert_rule_test: - alertname: PrometheusAgentShardsMissing @@ -115,6 +115,9 @@ tests: - exp_labels: area: empowerment cluster_id: test01 + installation: myinstall + provider: aws + pipeline: testing severity: page team: atlas topic: observability @@ -133,6 +136,9 @@ tests: - exp_labels: area: empowerment cluster_id: test01 + installation: myinstall + provider: aws + pipeline: testing severity: none team: atlas topic: observability @@ -151,6 +157,9 @@ tests: - exp_labels: area: empowerment cluster_id: test01 + installation: myinstall + provider: aws + pipeline: testing severity: page team: atlas topic: observability @@ -169,6 +178,9 @@ tests: - exp_labels: area: empowerment cluster_id: test01 + installation: myinstall + provider: aws + pipeline: testing severity: none team: atlas topic: observability @@ -188,13 +200,13 @@ tests: # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - interval: 1m input_series: - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", 
instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' values: "10000+0x180" - - series: 'prometheus_operator_spec_replicas{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' + - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' values: '3+0x60 5+0x60 3+0x60' alert_rule_test: - alertname: PrometheusAgentShardsMissing @@ -207,6 +219,9 @@ tests: - exp_labels: area: empowerment cluster_id: test01 + installation: myinstall + provider: aws + pipeline: testing severity: page team: atlas topic: observability @@ -225,6 +240,9 @@ tests: - exp_labels: area: empowerment cluster_id: test01 + installation: myinstall + provider: aws + pipeline: testing severity: none team: atlas topic: observability @@ -243,6 +261,9 @@ tests: - exp_labels: area: empowerment cluster_id: test01 + installation: myinstall + provider: aws + pipeline: testing severity: page team: atlas topic: observability @@ -261,6 +282,9 @@ tests: - exp_labels: area: empowerment cluster_id: test01 + installation: myinstall + provider: aws + pipeline: testing severity: none team: atlas topic: observability diff --git a/test/tests/providers/global/prometheus.rules.test.yml b/test/tests/providers/global/prometheus.rules.test.yml index 0a89ff09b..1ac8adbf9 100644 --- a/test/tests/providers/global/prometheus.rules.test.yml +++ b/test/tests/providers/global/prometheus.rules.test.yml @@ -10,24 +10,24 @@ tests: # Test PrometheusJobScrapingFailure and PrometheusCriticalJobScrapingFailure - interval: 1h input_series: - - series: 'up{app="kubernetes",installation="gauss",cluster_id="gauss",job="gauss-prometheus/kubernetes-apiserver-gauss/0"}' + - series: 'up{app="kubernetes", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/kubernetes-apiserver-gauss/0"}' values: "1+0x240" # critcal target up for 5d and down for 5d - - series: 'up{app="kube-controller-manager",installation="gauss",cluster_id="gauss",job="gauss-prometheus/kubernetes-controller-manager-gauss/0"}' + - series: 'up{app="kube-controller-manager", installation="gauss", 
cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/kubernetes-controller-manager-gauss/0"}' values: "1+0x120 0+0x120" - - series: 'up{app="kube-scheduler",installation="gauss",cluster_id="gauss",job="gauss-prometheus/kubernetes-scheduler-gauss/0"}' + - series: 'up{app="kube-scheduler", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/kubernetes-scheduler-gauss/0"}' values: "1+0x240" - - series: 'up{app="kubelet",installation="gauss",cluster_id="gauss",job="gauss-prometheus/kubernetes-kubelet-gauss/0"}' + - series: 'up{app="kubelet", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/kubernetes-kubelet-gauss/0"}' values: "1+0x240" - - series: 'up{app="node-exporter",installation="gauss",cluster_id="gauss",job="gauss-prometheus/node-exporter-gauss/0"}' + - series: 'up{app="node-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/node-exporter-gauss/0"}' values: "1+0x240" - - series: 'up{app="kube-state-metrics",installation="gauss",cluster_id="gauss",job="gauss-prometheus/kube-state-metrics-gauss/0"}' + - series: 'up{app="kube-state-metrics", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/kube-state-metrics-gauss/0"}' values: "1+0x240" # Add bastion host test to ensure we do not page - - series: 'up{app="node-exporter",installation="gauss",cluster_id="gauss",job="gauss-prometheus/bastions/0"}' + - series: 'up{app="node-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/bastions/0"}' values: "1+0x240" # non-critcal target up for 5d and down for 5d - - series: 'up{app="app-exporter",installation="gauss",cluster_id="gauss",job="gauss-prometheus/app-exporter-gauss/0"}' + - series: 'up{app="app-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing", job="gauss-prometheus/app-exporter-gauss/0"}' values: "1+0x120 0+0x120" alert_rule_test: - alertname: PrometheusCriticalJobScrapingFailure @@ -48,6 +48,8 @@ tests: cancel_if_outside_working_hours: "true" cluster_id: "gauss" installation: "gauss" + provider: "aws" + pipeline: "testing" job: "gauss-prometheus/kubernetes-controller-manager-gauss/0" exp_annotations: opsrecipe: "prometheus-job-scraping-failure/" @@ -61,6 +63,8 @@ tests: cancel_if_outside_working_hours: "true" cluster_id: "gauss" installation: "gauss" + provider: "aws" + pipeline: "testing" job: "gauss-prometheus/app-exporter-gauss/0" exp_annotations: opsrecipe: "prometheus-job-scraping-failure/" @@ -79,6 +83,8 @@ tests: app: "kube-controller-manager" cluster_id: "gauss" installation: "gauss" + provider: "aws" + pipeline: "testing" job: "gauss-prometheus/kubernetes-controller-manager-gauss/0" cancel_if_outside_working_hours: "true" cancel_if_cluster_is_not_running_prometheus_agent: "true" diff --git a/test/tests/providers/global/promtail.rules.test.yml b/test/tests/providers/global/promtail.rules.test.yml index b2739c330..f83f846ce 100644 --- a/test/tests/providers/global/promtail.rules.test.yml +++ b/test/tests/providers/global/promtail.rules.test.yml @@ -6,12 +6,12 @@ tests: - interval: 1m input_series: # For the first 60min: test with 1 pod: none, up, down - - series: 'up{container="promtail",cluster_type="management_cluster", cluster_id="gauss", installation="gauss", node="ip-10-0-5-35.eu-west-1.compute.internal"}' - values: "_x20 1+0x20 0+0x40" + - series: 
'up{container="promtail", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-35.eu-west-1.compute.internal"}' + values: "_x20 1+0x20 0+0x40" # From 60min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. - - series: 'up{container="promtail",cluster_type="management_cluster", cluster_id="gauss", installation="gauss", node="ip-10-0-5-145.eu-west-1.compute.internal"}' + - series: 'up{container="promtail", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal"}' values: "_x80 1+0x40 1+0x20 0+0x40" - - series: 'up{container="promtail",cluster_type="management_cluster", cluster_id="gauss", installation="gauss", node="ip-10-0-5-76.eu-west-1.compute.internal"}' + - series: 'up{container="promtail", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-76.eu-west-1.compute.internal"}' values: "_x80 0+0x40 1+0x20 0+0x40" alert_rule_test: - alertname: PromtailDown @@ -28,6 +28,9 @@ tests: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing severity: page team: atlas topic: observability @@ -45,6 +48,9 @@ tests: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing severity: page team: atlas topic: observability @@ -63,6 +69,9 @@ tests: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing severity: page team: atlas topic: observability @@ -72,9 +81,9 @@ tests: - interval: 1m input_series: # Tests with multiple cases: no metrics, no requests, only status_code 204 ones, 204 ones and 500 that are less than 10% of the the total, 500 request that represent more than 10% of the total, only 500 ones - - series: 'promtail_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="promtail-2j7z7"}' + - series: 'promtail_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="promtail-2j7z7"}' values: "_x60 0+0x60 0+0x60 0+50x60 3000+100x60 9000+600x60" - - series: 'promtail_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="promtail-2j7z7"}' + - series: 'promtail_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="promtail-2j7z7"}' values: "_x60 0+0x60 0+600x60 36000+600x60 72000+600x60 108000+0x60" alert_rule_test: - alertname: PromtailRequestsErrors @@ -94,6 +103,9 @@ tests: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing severity: page team: atlas topic: observability @@ -109,6 +121,9 @@ tests: cancel_if_cluster_status_deleting: "true" 
cancel_if_cluster_status_updating: "true" cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing severity: page team: atlas topic: observability diff --git a/test/tests/providers/global/sloth.rules.test.yml b/test/tests/providers/global/sloth.rules.test.yml index 05915b9fb..76a14bd1a 100644 --- a/test/tests/providers/global/sloth.rules.test.yml +++ b/test/tests/providers/global/sloth.rules.test.yml @@ -6,7 +6,7 @@ tests: - interval: 1m input_series: # For the first 60min: test with 1 pod: none, up, down - - series: 'up{app="sloth",cluster_type="management_cluster", cluster_id="gauss", installation="gauss"}' + - series: 'up{app="sloth",cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' values: "_x20 1+0x20 0+0x20" alert_rule_test: - alertname: SlothDown @@ -19,6 +19,9 @@ tests: - exp_labels: area: managedservices cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing severity: page team: atlas topic: observability
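
Reviewer note on the pattern used throughout this change: every aggregation (`sum`, `count`, `min`, the `sum by (...)` inside `histogram_quantile`, and so on) now lists `cluster_id`, `installation`, `provider` and `pipeline` in its `by (...)` clause, so that alerts evaluated by the Mimir ruler keep the labels that routing and inhibition depend on, and the `absent()` calls that would otherwise return a label-less series are pinned to the management cluster's values from the Helm chart. As a quick, self-contained way to sanity-check that behaviour with `promtool test rules`, here is a minimal sketch; the file names (`label-propagation.rules.yml`, `label-propagation.test.yml`) and the `example-app` target are made up for illustration and are not part of this PR.

```yaml
# label-propagation.rules.yml (hypothetical rule file, for this sketch only)
groups:
  - name: label-propagation-sketch
    rules:
      - alert: ExampleAppDown
        # Same pattern as in this PR: keep the Mimir external labels in the aggregation.
        expr: count by (cluster_id, installation, provider, pipeline) (up{app="example-app"} == 0) > 0
        for: 5m
---
# label-propagation.test.yml (separate file; run with: promtool test rules label-propagation.test.yml)
rule_files:
  - label-propagation.rules.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # One scrape target that is down for the whole test window.
      - series: 'up{app="example-app", cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", instance="10.0.0.1:8080"}'
        values: "0+0x10"
    alert_rule_test:
      - alertname: ExampleAppDown
        eval_time: 6m
        exp_alerts:
          # The four labels survive the aggregation and end up on the fired alert;
          # a plain count(...) without "by" would have dropped them.
          - exp_labels:
              cluster_id: test01
              installation: myinstall
              provider: aws
              pipeline: testing
```

This is the same property the updated unit tests under `test/tests/providers/global/` assert for the real rules, with `installation`, `provider` and `pipeline` added to both the input series and the expected alert labels.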