Skip to content

Commit

Permalink
Fix some rule labels for Mimir (#1179)
Browse files (browse the repository at this point in the history)
Signed-off-by: QuentinBisson <[email protected]>
  • Loading branch information
QuentinBisson authored May 14, 2024
1 parent c998734 commit efa800a
Show file tree
Hide file tree
Showing 8 changed files with 14 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ spec:
{{`Machine {{ $labels.exported_namespace}}/{{ $labels.name }} stuck in phase {{ $labels.phase }} for more than 30 minutes.`}}
opsrecipe: capi-machine/
dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers
expr: capi_machine_status_phase{phase!~"Running", name!~".*bastion.*"} > 0
expr: capi_machine_status_phase{phase!="Running", name!~".*bastion.*"} > 0
for: 30m
labels:
area: kaas
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`Cannot renew Certificate for Secret {{ $labels.namespace }}/{{ $labels.certificatename }} because it is missing.`}}'
opsrecipe: managed-app-cert-manager/missing-certificate-for-secret/
expr: count(cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename!=""}) by (cluster_id, certificatename, namespace) unless count(label_replace(cert_exporter_certificate_cr_not_after{cluster_type="management_cluster"}, "certificatename", "$1", "name", "(.*)")) by (cluster_id, certificatename,namespace)
expr: count(cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename!=""}) by (cluster_id, installation, pipeline, provider, certificatename, namespace) unless count(label_replace(cert_exporter_certificate_cr_not_after{cluster_type="management_cluster"}, "certificatename", "$1", "name", "(.*)")) by (cluster_id, installation, pipeline, provider, certificatename, namespace)
for: 5m
labels:
area: kaas
Expand Down
4 changes: 2 additions & 2 deletions helm/prometheus-rules/templates/alerting-rules/dex.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`Dex running on {{ $labels.cluster_id }} is reporting an increased error rate.`}}'
opsrecipe: dex-error-rate-high/
expr: sum(increase(http_requests_total{app="dex", handler!="/token", code=~"^[4]..$|[5]..$", cluster_type="management_cluster"}[5m])) by (cluster_id) > 10
expr: sum(increase(http_requests_total{app="dex", handler!="/token", code=~"^[4]..$|[5]..$", cluster_type="management_cluster"}[5m])) by (cluster_id, installation, pipeline, provider) > 10
for: 30m
labels:
area: managedapps
Expand All @@ -29,7 +29,7 @@ spec:
annotations:
description: '{{`dex-operator failed to renew secret of {{ $labels.app_registration_name }} for {{ $labels.app_owner }} on provider {{ $labels.provider_type }}.`}}'
opsrecipe: dex-operator/
expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation, cluster_id) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster"}) - time() < 60*60*12
expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation, cluster_id, pipeline, provider) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster"}) - time() < 60*60*12
for: 30m
labels:
area: kaas
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ spec:
- alert: KubeletPLEGLatencyTooHigh
annotations:
description: '{{`Kubelet ({{ $labels.instance }}) PLEG latency is too high.`}}'
expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster_id, instance, le)) > 100
expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (cluster_id, installation, pipeline, provider, instance, le)) > 100
for: 10m
labels:
area: kaas
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ spec:
for: 1d
labels:
area: empowerment
severity: none
severity: notify
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ spec:
label_join(policy_report_result{
policy!="check-deprecated-apis-1-25",
cluster_type="management_cluster",
kind=~"Deployment"
kind="Deployment"
}, "deployment", ",", "name")
) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, deployment, category, policy, status)
* on(deployment) group_left(team, app) sum(
Expand All @@ -442,7 +442,7 @@ spec:
label_join(policy_report_result{
policy!="check-deprecated-apis-1-25",
cluster_type="management_cluster",
kind=~"DaemonSet"
kind="DaemonSet"
}, "daemonset", ",", "name")
) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, daemonset, category, policy, status)
* on(daemonset) group_left(team, app) sum(
Expand All @@ -460,7 +460,7 @@ spec:
label_join(policy_report_result{
policy!="check-deprecated-apis-1-25",
cluster_type="management_cluster",
kind=~"StatefulSet"
kind="StatefulSet"
}, "statefulset", ",", "name")
) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, statefulset, category, policy, status)
* on(statefulset) group_left(team, app) sum(
Expand All @@ -478,7 +478,7 @@ spec:
label_join(policy_report_result{
policy!="check-deprecated-apis-1-25",
cluster_type="management_cluster",
kind=~"Job"
kind="Job"
}, "job", ",", "name")
) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, job, category, policy, status)
* on(job) group_left(team, app) sum(
Expand All @@ -496,7 +496,7 @@ spec:
label_join(policy_report_result{
policy!="check-deprecated-apis-1-25",
cluster_type="management_cluster",
kind=~"CronJob"
kind="CronJob"
}, "cronjob", ",", "name")
) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, cronjob, category, policy, status)
* on(cronjob) group_left(team, app) sum(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
groups:
- name: helm-operations.recording
rules:
- expr: "sum by (cluster_id, release, event) (helmclient_library_event_total{release!=''})"
- expr: "sum by (cluster_id, installation, pipeline, provider, release, event) (helmclient_library_event_total{release!=''})"
record: monitoring:helm:number_of_operations_on_release
4 changes: 2 additions & 2 deletions test/tests/providers/global/prometheus.rules.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ tests:
exp_alerts:
- exp_labels:
area: empowerment
severity: none
severity: notify
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
Expand All @@ -57,7 +57,7 @@ tests:
description: "Prometheus gauss/gauss has failed to scrape all targets in gauss-prometheus/kubernetes-controller-manager-gauss/0 job."
- exp_labels:
area: empowerment
severity: none
severity: notify
team: atlas
topic: observability
cancel_if_outside_working_hours: "true"
Expand Down

0 comments on commit efa800a

Please sign in to comment.