diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 831f4986..06de5bc1 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -94,14 +94,14 @@ spec: description: 'Mimir ingester is consuming too much resources and needs to be scaled up.' opsrecipe: mimir/ expr: |- - sum by (namespace) (container_memory_working_set_bytes{container="ingester", namespace="mimir"}) + sum by (cluster_id, installation, namespace, pipeline, provider) (container_memory_working_set_bytes{container="ingester", namespace="mimir"}) / - sum by(namespace) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="byte"}) + sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="byte"}) >= 0.90 or - sum(rate(container_cpu_usage_seconds_total{container="ingester", namespace="mimir"}[5m])) by (namespace) + sum(rate(container_cpu_usage_seconds_total{container="ingester", namespace="mimir"}[5m])) by (cluster_id, installation, namespace, pipeline, provider) / - sum by(namespace) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="core"}) + sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="core"}) >= 0.90 for: 30m labels: @@ -118,14 +118,14 @@ spec: description: 'Mimir ingester is consuming very few resources and needs to be scaled down.' 
opsrecipe: mimir/ expr: |- - sum by (namespace) (container_memory_working_set_bytes{container="ingester", namespace="mimir"}) + sum by (cluster_id, installation, namespace, pipeline, provider) (container_memory_working_set_bytes{container="ingester", namespace="mimir"}) / - sum by(namespace) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="byte"}) + sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="byte"}) <= 0.30 or - sum(rate(container_cpu_usage_seconds_total{container="ingester", namespace="mimir"}[5m])) by (namespace) + sum(rate(container_cpu_usage_seconds_total{container="ingester", namespace="mimir"}[5m])) by (cluster_id, installation, namespace, pipeline, provider) / - sum by(namespace) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="core"}) + sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="core"}) <= 0.30 for: 30m labels: diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml index 895f1092..d8518d1c 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -173,13 +173,13 @@ tests: # Test for MimirIngesterNeedsToBeScaledUp alert - interval: 1m input_series: - - series: 'sum by (namespace) (container_memory_working_set_bytes{container="ingester", namespace="mimir", cluster_type="management_cluster"}' + - series: 'sum by (cluster_id, installation, namespace, pipeline, provider) (container_memory_working_set_bytes{container="ingester", namespace="mimir", cluster_type="management_cluster"}' values: "15+0x20 23+0x40 
16+0x140 23+0x40 15+0x60" # mimir-ingester real memory usage gradually increases until it goes beyond 90% of the memory requests. - - series: 'sum by(namespace) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster"}' + - series: 'sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster"}' values: "24+0x300" # mimir-ingester memory requests stay the same for the entire duration of the test. - - series: 'sum(rate(container_cpu_usage_seconds_total{container="ingester", namespace="mimir", cluster_type="management_cluster"}[5m])) by (namespace)' + - series: 'sum(rate(container_cpu_usage_seconds_total{container="ingester", namespace="mimir", cluster_type="management_cluster"}[5m])) by (cluster_id, installation, namespace, pipeline, provider)' values: "1+0x100 2.8+0x40 1+0x60 2.8+0x40 1+0x60" # mimir-ingester real cpu usage gradually increases until it goes beyond 90% of the cpu requests. 
- - series: 'sum by(namespace) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster"})' + - series: 'sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster"})' values: "3+0x300" # mimir-ingester cpu requests stay the same for the entire duration of the test alert_rule_test: - alertname: MimirIngesterNeedsToBeScaledUp @@ -250,13 +250,13 @@ tests: # Test for MimirIngesterNeedsToBeScaledDown alert - interval: 1m input_series: - - series: 'sum by (namespace) (container_memory_working_set_bytes{container="ingester", namespace="mimir", cluster_type="management_cluster"}' + - series: 'sum by (cluster_id, installation, namespace, pipeline, provider) (container_memory_working_set_bytes{container="ingester", namespace="mimir", cluster_type="management_cluster"}' values: "15+0x20 5+0x40 16+0x140 5+0x40 15+0x60" # mimir-ingester real memory usage twice drops below 30% of the memory requests. - - series: 'sum by(namespace) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster"}' + - series: 'sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster"}' values: "24+0x300" # mimir-ingester memory requests stay the same for the entire duration of the test. 
- - series: 'sum(rate(container_cpu_usage_seconds_total{container="ingester", namespace="mimir", cluster_type="management_cluster"}[5m])) by (namespace)' + - series: 'sum(rate(container_cpu_usage_seconds_total{container="ingester", namespace="mimir", cluster_type="management_cluster"}[5m])) by (cluster_id, installation, namespace, pipeline, provider)' values: "1+0x100 0.6+0x40 1+0x60 0.6+0x40 1+0x60" # mimir-ingester real cpu usage twice drops below 30% of the cpu requests. - - series: 'sum by(namespace) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster"})' + - series: 'sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster"})' values: "3+0x300" # mimir-ingester cpu requests stay the same for the entire duration of the test alert_rule_test: - alertname: MimirIngesterNeedsToBeScaledDown