
Commit

Add guard to CPU and memory HPA queries against missing samples
jhalterman committed Mar 22, 2024
1 parent f573a03 commit 617fd16
Showing 4 changed files with 244 additions and 3 deletions.
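
Each generated KEDA trigger below gains two related changes: the Prometheus scaler metadata sets `ignoreNullValues: "false"`, so an empty query result is surfaced as a scaler error rather than being treated as a metric value of 0, and the CPU and memory queries gain an `and count(...)` guard that only lets the scaling expression through when the underlying cadvisor metric has actually been reporting samples. A simplified sketch of the guarded CPU query pattern (the selector mirrors the alertmanager example below; the 5m rate window on the left-hand side is illustrative, and the real generated queries also gate on `up`, with the memory variants adding an OOM-kill term):

```promql
# Scaling value: peak CPU usage of the container over the last 15 minutes, in millicores.
max_over_time(
  sum(rate(container_cpu_usage_seconds_total{container="alertmanager",namespace="default"}[5m]))[15m:]
) * 1000
and
# Guard: present_over_time(...[1m]) is 1 for every series that had a sample in the
# last minute; the [15m:1m] subquery re-evaluates that each minute over 15 minutes,
# so `count_over_time(...) >= 15` keeps only series present for the whole window.
# If no series qualifies, count() is empty, the `and` yields no data, and with
# ignoreNullValues: "false" KEDA reports an error instead of scaling on a bogus zero.
count (
  count_over_time(
    present_over_time(
      container_cpu_usage_seconds_total{container="alertmanager",namespace="default"}[1m]
    )[15m:1m]
  ) >= 15
)
```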
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -27,6 +27,8 @@

### Jsonnet

* [BUGFIX] Guard against missing samples in KEDA queries. #7691

### Mimirtool

* [BUGFIX] Fix panic in `loadgen` subcommand. #7629
@@ -1957,6 +1957,7 @@ spec:
name: alertmanager
triggers:
- metadata:
ignoreNullValues: "false"
metricName: cortex_alertmanager_cpu_hpa_default
query: |
max_over_time(
@@ -1966,11 +1967,20 @@
max by (pod) (up{container="alertmanager",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="alertmanager",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "1780"
name: cortex_alertmanager_cpu_hpa_default
type: prometheus
- metadata:
ignoreNullValues: "false"
metricName: cortex_alertmanager_memory_hpa_default
query: |
max_over_time(
@@ -1991,6 +2001,14 @@
max by (pod) (kube_pod_container_status_last_terminated_reason{container="alertmanager", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="alertmanager",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "9556302233"
name: cortex_alertmanager_memory_hpa_default
@@ -2017,6 +2035,7 @@
name: distributor
triggers:
- metadata:
ignoreNullValues: "false"
metricName: cortex_distributor_cpu_hpa_default
query: |
max_over_time(
@@ -2026,11 +2045,20 @@
max by (pod) (up{container="distributor",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="distributor",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "1780"
name: cortex_distributor_cpu_hpa_default
type: prometheus
- metadata:
ignoreNullValues: "false"
metricName: cortex_distributor_memory_hpa_default
query: |
max_over_time(
@@ -2051,6 +2079,14 @@
max by (pod) (kube_pod_container_status_last_terminated_reason{container="distributor", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="distributor",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "3058016714"
name: cortex_distributor_memory_hpa_default
@@ -2122,6 +2158,7 @@
name: query-frontend
triggers:
- metadata:
ignoreNullValues: "false"
metricName: query_frontend_cpu_hpa_default
query: |
max_over_time(
@@ -2131,11 +2168,20 @@
max by (pod) (up{container="query-frontend",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "2225"
name: query_frontend_cpu_hpa_default
type: prometheus
- metadata:
ignoreNullValues: "false"
metricName: query_frontend_memory_hpa_default
query: |
max_over_time(
@@ -2156,6 +2202,14 @@
max by (pod) (kube_pod_container_status_last_terminated_reason{container="query-frontend", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "559939584"
name: query_frontend_memory_hpa_default
@@ -2182,6 +2236,7 @@
name: ruler
triggers:
- metadata:
ignoreNullValues: "false"
metricName: ruler_cpu_hpa_default
query: |
max_over_time(
@@ -2191,11 +2246,20 @@
max by (pod) (up{container="ruler",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="ruler",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "890"
name: ruler_cpu_hpa_default
type: prometheus
- metadata:
ignoreNullValues: "false"
metricName: ruler_memory_hpa_default
query: |
max_over_time(
@@ -2216,6 +2280,14 @@
max by (pod) (kube_pod_container_status_last_terminated_reason{container="ruler", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="ruler",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "5733781340"
name: ruler_memory_hpa_default
@@ -2242,6 +2314,7 @@
name: ruler-querier
triggers:
- metadata:
ignoreNullValues: "false"
metricName: ruler_querier_cpu_hpa_default
query: |
max_over_time(
@@ -2251,11 +2324,20 @@
max by (pod) (up{container="ruler-querier",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="ruler-querier",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "178"
name: ruler_querier_cpu_hpa_default
type: prometheus
- metadata:
ignoreNullValues: "false"
metricName: ruler_querier_memory_hpa_default
query: |
max_over_time(
@@ -2276,6 +2358,14 @@
max by (pod) (kube_pod_container_status_last_terminated_reason{container="ruler-querier", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="ruler-querier",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "955630223"
name: ruler_querier_memory_hpa_default
@@ -2302,6 +2392,7 @@
name: ruler-query-frontend
triggers:
- metadata:
ignoreNullValues: "false"
metricName: ruler_query_frontend_cpu_hpa_default
query: |
max_over_time(
Expand All @@ -2311,11 +2402,20 @@ spec:
max by (pod) (up{container="ruler-query-frontend",namespace="default"}) > 0
)[15m:]
) * 1000
and
count (
count_over_time(
present_over_time(
container_cpu_usage_seconds_total{container="ruler-query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "1780"
name: ruler_query_frontend_cpu_hpa_default
type: prometheus
- metadata:
ignoreNullValues: "false"
metricName: ruler_query_frontend_memory_hpa_default
query: |
max_over_time(
@@ -2336,6 +2436,14 @@
max by (pod) (kube_pod_container_status_last_terminated_reason{container="ruler-query-frontend", namespace="default", reason="OOMKilled"})
or vector(0)
)
and
count (
count_over_time(
present_over_time(
container_memory_working_set_bytes{container="ruler-query-frontend",namespace="default"}[1m]
)[15m:1m]
) >= 15
)
serverAddress: http://prometheus.default:9090/prometheus
threshold: "559939584"
name: ruler_query_frontend_memory_hpa_default