Add Mimir ingester scaling alerts with unit tests.

MimirIngesterNeedsToBeScaledUp fires when, for 30m, the ingester memory working
set OR the ingester CPU usage (5m rate) is at or above 90% of the matching
resource requests. MimirIngesterNeedsToBeScaledDown fires when, for 30m, BOTH
ratios are at or below 30%. All sums are aggregated by cluster_id,
installation, namespace, pipeline and provider.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3566cff7d..b2b6506b1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added a new alerting rule to `falco.rules.yml` to fire an alert for XZ-backdoor.
 - Added `CiliumAPITooSlow`.
 - Added `CODEOWNERS` files.
+- Added `MimirIngesterNeedsToBeScaledUp` and `MimirIngesterNeedsToBeScaledDown` alerting rules to `mimir.rules.yml`.
 
 ### Changed
 
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
index 835c4ef61..66b7a8532 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
@@ -89,4 +89,52 @@ spec:
         severity: page
         team: atlas
         topic: observability
+    - alert: MimirIngesterNeedsToBeScaledUp
+      annotations:
+        description: 'Mimir ingester is consuming too much resources and needs to be scaled up.'
+        opsrecipe: mimir-ingester/
+      expr: |-
+        sum by (cluster_id, installation, namespace, pipeline, provider) (container_memory_working_set_bytes{container="ingester", namespace="mimir"})
+        /
+        sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="byte"})
+        >= 0.90
+        or
+        sum(rate(container_cpu_usage_seconds_total{container="ingester", namespace="mimir"}[5m])) by (cluster_id, installation, namespace, pipeline, provider)
+        /
+        sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="core"})
+        >= 0.90
+      for: 30m
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
+    - alert: MimirIngesterNeedsToBeScaledDown
+      annotations:
+        description: 'Mimir ingester is consuming very few resources and needs to be scaled down.'
+        opsrecipe: mimir-ingester/
+      expr: |-
+        sum by (cluster_id, installation, namespace, pipeline, provider) (container_memory_working_set_bytes{container="ingester", namespace="mimir"})
+        /
+        sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="byte"})
+        <= 0.30
+        and
+        sum(rate(container_cpu_usage_seconds_total{container="ingester", namespace="mimir"}[5m])) by (cluster_id, installation, namespace, pipeline, provider)
+        /
+        sum by(cluster_id, installation, namespace, pipeline, provider) (kube_pod_container_resource_requests{container="ingester", namespace="mimir", unit="core"})
+        <= 0.30
+      for: 30m
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
 {{- end }}
diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml
index c0c98203e..80d0aba54 100644
--- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml
+++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml
@@ -152,7 +152,6 @@ tests:
     alert_rule_test:
       - alertname: MimirRestartingTooOften
         eval_time: 15m # should be OK after 15 minutes
-        exp_alerts:
       - alertname: MimirRestartingTooOften
         eval_time: 85m # After 85 minutes, should fire an alert for the t+85 error
         exp_alerts:
@@ -171,4 +170,151 @@ tests:
               opsrecipe: "mimir/"
       - alertname: MimirRestartingTooOften
         eval_time: 140m # After 140m minutes, all should be back to normal
+  # Test for MimirIngesterNeedsToBeScaledUp alert
+  - interval: 1m
+    input_series:
+      # mimir-ingester real memory usage gradually increases until it goes beyond 90% of the memory requests.
+      - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "8+0x20 11+0x40 8+0x140 11+0x40 8+0x60"
+      - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "8+0x20 11+0x40 8+0x140 11+0x40 8+0x60"
+      # mimir-ingester memory requests stay the same for the entire duration of the test.
+      - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "12+0x300"
+      - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "12+0x300"
+      # mimir-ingester real cpu usage gradually increases until it goes beyond 90% of the cpu requests.
+      - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "0+60x100 6000+110x40 10400+60x60 14000+110x40 18400+60x60"
+      - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "0+60x300"
+      # mimir-ingester cpu requests stay the same for the entire duration of the test.
+      - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "1.5+0x300"
+      - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "1.5+0x300"
+    alert_rule_test:
+      - alertname: MimirIngesterNeedsToBeScaledUp
+        eval_time: 15m
+      - alertname: MimirIngesterNeedsToBeScaledUp
+        eval_time: 55m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "true"
+              cluster_id: golem
+              installation: "golem"
+              pipeline: "testing"
+              provider: "capa"
+              namespace: mimir
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              description: Mimir ingester is consuming too much resources and needs to be scaled up.
+              opsrecipe: "mimir-ingester/"
+      - alertname: MimirIngesterNeedsToBeScaledUp
+        eval_time: 100m
+      - alertname: MimirIngesterNeedsToBeScaledUp
+        eval_time: 140m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "true"
+              cluster_id: golem
+              installation: "golem"
+              pipeline: "testing"
+              provider: "capa"
+              namespace: mimir
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              description: Mimir ingester is consuming too much resources and needs to be scaled up.
+              opsrecipe: "mimir-ingester/"
+      - alertname: MimirIngesterNeedsToBeScaledUp
+        eval_time: 180m
+      - alertname: MimirIngesterNeedsToBeScaledUp
+        eval_time: 235m
         exp_alerts:
+          - exp_labels:
+              area: platform
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "true"
+              cluster_id: golem
+              installation: "golem"
+              pipeline: "testing"
+              provider: "capa"
+              namespace: mimir
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              description: Mimir ingester is consuming too much resources and needs to be scaled up.
+              opsrecipe: "mimir-ingester/"
+      - alertname: MimirIngesterNeedsToBeScaledUp
+        eval_time: 280m
+  # Test for MimirIngesterNeedsToBeScaledDown alert
+  - interval: 1m
+    input_series:
+      # mimir-ingester real memory usage gradually decreases until it goes below 30% of the memory requests.
+      - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "8+0x20 2+0x40 8+0x140 2+0x40 8+0x60"
+      - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "8+0x20 2+0x40 8+0x140 2+0x40 8+0x60"
+      # mimir-ingester memory requests stay the same for the entire duration of the test.
+      - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "12+0x300"
+      - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "12+0x300"
+      # mimir-ingester real cpu usage gradually decreases until it goes below 30% of the cpu requests.
+      - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "0+60x100 6000+10x40 6400+60x60 10000+10x40 10400+60x60"
+      - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "0+30x300"
+      # mimir-ingester cpu requests stay the same for the entire duration of the test.
+      - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "1.5+0x300"
+      - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "1.5+0x300"
+    alert_rule_test:
+      - alertname: MimirIngesterNeedsToBeScaledDown
+        eval_time: 15m
+      - alertname: MimirIngesterNeedsToBeScaledDown
+        eval_time: 55m
+      - alertname: MimirIngesterNeedsToBeScaledDown
+        eval_time: 100m
+      - alertname: MimirIngesterNeedsToBeScaledDown
+        eval_time: 135m
+      - alertname: MimirIngesterNeedsToBeScaledDown
+        eval_time: 180m
+      - alertname: MimirIngesterNeedsToBeScaledDown
+        eval_time: 240m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "true"
+              cluster_id: golem
+              installation: "golem"
+              pipeline: "testing"
+              provider: "capa"
+              namespace: mimir
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              description: Mimir ingester is consuming very few resources and needs to be scaled down.
+              opsrecipe: "mimir-ingester/"
+      - alertname: MimirIngesterNeedsToBeScaledDown
+        eval_time: 280m