Skip to content

Commit

Permalink
bigmac label update
Browse files: browse the repository at this point in the history
  • Loading branch information
ssyno committed Jun 11, 2024
1 parent a1161ea commit aca2acd
Show file tree
Hide file tree
Showing 5 changed files with 10 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ spec:
annotations:
description: '{{`cert-manager in namespace {{ $labels.namespace }} is down.`}}'
opsrecipe: cert-manager-down/
expr: label_replace(up{app=~"cert-manager-(app|controller)"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0
expr: label_replace(up{job=~"cert-manager-(app|controller)"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0
for: 15m
labels:
area: kaas
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`Dex running on {{ $labels.cluster_id }} is reporting an increased error rate.`}}'
opsrecipe: dex-error-rate-high/
expr: sum(increase(http_requests_total{app="dex", handler!="/token", code=~"^[4]..$|[5]..$", cluster_type="management_cluster"}[5m])) by (cluster_id, installation, pipeline, provider) > 10
expr: sum(increase(http_requests_total{job="dex", handler!="/token", code=~"^[4]..$|[5]..$", cluster_type="management_cluster"}[5m])) by (cluster_id, installation, pipeline, provider) > 10
for: 30m
labels:
area: kaas
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ spec:
"secret",
"(.*)-teleport-join-token"
)
) by (cluster_id, installation, provider, pipeline)
) by (cluster_id, installation, provider)
!= sum (
capi_cluster_status_phase{phase="Provisioned"}
) by (cluster_id, installation, provider, pipeline)
) by (cluster_id, installation, provider)
for: 60m
labels:
area: kaas
Expand All @@ -48,10 +48,10 @@ spec:
"configmap",
"(.*)-teleport-kube-agent-config"
)
) by (cluster_id, installation, provider, pipeline)
) by (cluster_id, installation, provider)
!= sum (
capi_cluster_status_phase{phase="Provisioned"}
) by (cluster_id, installation, provider, pipeline)
) by (cluster_id, installation, provider)
for: 60m
labels:
area: kaas
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@ rule_files:
tests:
- interval: 1m
input_series:
- series: 'up{app="cert-manager-app", cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", node="ip-10-0-0-0.eu-central-1.compute.internal", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}'
- series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}'
values: "0+0x60"
alert_rule_test:
- alertname: CertManagerDown
eval_time: 15m
exp_alerts:
- exp_labels:
alertname: CertManagerDown
app: cert-manager-app
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
Expand All @@ -27,7 +26,6 @@ tests:
ip: 10.0.0.0
job: 12345-prometheus/workload-12345/0
namespace: kube-system
node: ip-10-0-0-0.eu-central-1.compute.internal
organization: giantswarm
pod: cert-manager-controller-7fcc585578-gnprd
provider: capa
Expand All @@ -41,7 +39,7 @@ tests:
opsrecipe: "cert-manager-down/"
- interval: 1m
input_series:
- series: 'up{app="cert-manager-app", cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", node="ip-10-0-0-0.eu-central-1.compute.internal", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}'
- series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}'
values: "1+0x60"
alert_rule_test:
- alertname: CertManagerDown
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@ rule_files:
tests:
- interval: 1m
input_series:
- series: 'up{app="cert-manager-app", cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", node="ip-10-0-0-0.eu-central-1.compute.internal", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="aws", service_priority="highest"}'
- series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="aws", service_priority="highest"}'
values: "0+0x60"
alert_rule_test:
- alertname: CertManagerDown
eval_time: 15m
exp_alerts:
- exp_labels:
alertname: CertManagerDown
app: cert-manager-app
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
Expand All @@ -27,7 +26,6 @@ tests:
ip: 10.0.0.0
job: 12345-prometheus/workload-12345/0
namespace: kube-system
node: ip-10-0-0-0.eu-central-1.compute.internal
organization: giantswarm
pod: cert-manager-controller-7fcc585578-gnprd
provider: aws
Expand All @@ -41,7 +39,7 @@ tests:
opsrecipe: "cert-manager-down/"
- interval: 1m
input_series:
- series: 'up{app="cert-manager-app", cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", node="ip-10-0-0-0.eu-central-1.compute.internal", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="aws", service_priority="highest"}'
- series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="aws", service_priority="highest"}'
values: "1+0x60"
alert_rule_test:
- alertname: CertManagerDown
Expand Down

0 comments on commit aca2acd

Please sign in to comment.