Skip to content

Commit

Permalink
Merge branch 'master' into fix-ksm-alerts-when-multiple-pods
Browse files Browse the repository at this point in the history
  • Loading branch information
hervenicol authored Nov 15, 2023
2 parents 2478679 + 87d9570 commit 5b31e3a
Show file tree
Hide file tree
Showing 15 changed files with 188 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
version: 2.1
orbs:
architect: giantswarm/architect@4.33.0
architect: giantswarm/architect@4.34.1

workflows:
package-and-push-chart-on-tag:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/zz_generated.check_values_schema.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# DO NOT EDIT. Generated with:
#
# devctl@6.14.0
# devctl@6.17.0
#
name: 'Values and schema'
on:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/zz_generated.create_release.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# DO NOT EDIT. Generated with:
#
# devctl@6.14.0
# devctl@6.17.0
#
name: Create Release
on:
Expand Down Expand Up @@ -93,7 +93,7 @@ jobs:
uses: giantswarm/[email protected]
with:
binary: "architect"
version: "6.11.0"
version: "6.13.0"
- name: Install semver
uses: giantswarm/[email protected]
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/zz_generated.create_release_pr.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# DO NOT EDIT. Generated with:
#
# devctl@6.14.0
# devctl@6.17.0
#
name: Create Release PR
on:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/zz_generated.gitleaks.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# DO NOT EDIT. Generated with:
#
# devctl@6.14.0
# devctl@6.17.0
#
name: gitleaks

Expand Down
19 changes: 18 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed

- Support multiple KSM pods in our alerts.
- Split prometheus-agent alerts (`PrometheusAgentFailing` and `PrometheusAgentShardsMissing`) in 2:
- existing alerts will fire later
- new inhibition alerts will fire earlier

## [2.140.2] - 2023-11-13

### Fixed

- Use `exported_namespace` for certificate expiration alerts.

## [2.140.1] - 2023-11-13

### Fixed

- Fix `raw_slo_requests` recording rule expression for kubelet status.

## [2.140.0] - 2023-11-13

Expand Down Expand Up @@ -2264,7 +2279,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2

[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.140.0...HEAD
[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.140.2...HEAD
[2.140.2]: https://github.com/giantswarm/prometheus-rules/compare/v2.140.1...v2.140.2
[2.140.1]: https://github.com/giantswarm/prometheus-rules/compare/v2.140.0...v2.140.1
[2.140.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.139.0...v2.140.0
[2.139.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.138.3...v2.139.0
[2.138.3]: https://github.com/giantswarm/prometheus-rules/compare/v2.138.2...v2.138.3
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# DO NOT EDIT. Generated with:
#
# devctl@6.14.0
# devctl@6.17.0
#

include Makefile.*.mk
Expand Down
2 changes: 1 addition & 1 deletion Makefile.gen.app.mk
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# DO NOT EDIT. Generated with:
#
# devctl@6.14.0
# devctl@6.17.0
#

##@ App
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ spec:
annotations:
description: '{{`Certificate CR {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two weeks.`}}'
opsrecipe: managed-app-cert-manager/certificate-secret-will-expire-in-less-than-two-weeks/
expr: (cert_exporter_certificate_cr_not_after{managed_issuer="true",namespace=~"kube-system|giantswarm|monitoring"} - time()) < 13 * 24 * 60 * 60
expr: (cert_exporter_certificate_cr_not_after{managed_issuer="true",exported_namespace=~"kube-system|giantswarm|monitoring"} - time()) < 13 * 24 * 60 * 60
labels:
area: kaas
cancel_if_outside_working_hours: "true"
Expand All @@ -71,7 +71,7 @@ spec:
annotations:
description: '{{`Certificate CR {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two weeks.`}}'
opsrecipe: managed-app-cert-manager/certificate-secret-will-expire-in-less-than-two-weeks/
expr: (cert_exporter_certificate_cr_not_after{managed_issuer="true",cluster_type="workload_cluster",namespace!~"kube-system|giantswarm|monitoring"} - time()) < 13 * 24 * 60 * 60
expr: (cert_exporter_certificate_cr_not_after{managed_issuer="true",cluster_type="workload_cluster",exported_namespace!~"kube-system|giantswarm|monitoring"} - time()) < 13 * 24 * 60 * 60
labels:
area: kaas
cancel_if_outside_working_hours: "true"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ spec:
absent(up{instance="prometheus-agent"}) == 1
)[5m:]
)
for: 10m
for: 20m
labels:
area: empowerment
severity: page
Expand All @@ -38,6 +38,33 @@ spec:
cancel_if_cluster_is_not_running_prometheus_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
## Same as PrometheusAgentFailing, but triggers inhibition earlier and does not page.
## Non-paging companion rule: it carries `severity: none` plus the `inhibit_prometheus_agent_down` label.
- alert: PrometheusAgentFailingInhibition
annotations:
description: '{{`Prometheus agent remote write is failing.`}}'
summary: Prometheus agent fails to send samples to remote write endpoint.
opsrecipe: prometheus-agent-remote-write-failed/
dashboard: promRW001/prometheus-remote-write
# NOTE(review): alternative expression left commented out below; it is not evaluated.
# expr: count(absent_over_time(up{instance="prometheus-agent"}[10m]))
# The inner expression yields a series when `up{instance="prometheus-agent"}` is 0
# or when no such series exists at all. Wrapping it in a `[5m:]` subquery with
# max_over_time keeps a result present as long as any evaluation in the last
# 5 minutes matched.
expr: |-
max_over_time(
sum by (cluster_type, cluster_id, installation, instance, service)
(
up{instance="prometheus-agent"} == 0
or
absent(up{instance="prometheus-agent"}) == 1
)[5m:]
)
# Short `for` so the inhibition triggers early (see header comment).
for: 1m
labels:
area: empowerment
# `none`: this rule is not meant to page (see header comment).
severity: none
team: atlas
topic: observability
# Presumably matched as a source label by an Alertmanager inhibition rule defined elsewhere — TODO confirm.
inhibit_prometheus_agent_down: "true"
cancel_if_cluster_is_not_running_prometheus_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
## Page Atlas if prometheus agent is missing shards to send samples to MC prometheus.
- alert: PrometheusAgentShardsMissing
annotations:
Expand All @@ -63,7 +90,7 @@ spec:
)
)
)[5m:])
for: 10m
for: 20m
labels:
area: empowerment
severity: page
Expand All @@ -74,4 +101,40 @@ spec:
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
## Same as PrometheusAgentShardsMissing but triggers inhibition earlier, and does not page.
## Non-paging companion rule: it carries `severity: none` plus the `inhibit_prometheus_agent_down` label.
- alert: PrometheusAgentShardsMissingInhibition
annotations:
description: '{{`Prometheus agent is missing shards.`}}'
summary: Prometheus agent is missing shards.
opsrecipe: prometheus-agent-missing-shards/
# Compares the count of `prometheus_remote_storage_metadata_total` series (excluding
# remotes named grafana-cloud or mimir) with the shard count reported for the
# `prometheus-agent` Prometheus CR, falling back to the replicas metric when the
# shards metric is absent. A result exists only when the two numbers differ; the
# `[5m:]` subquery with max_over_time keeps that result present for 5 minutes
# after the last mismatch.
expr: |-
max_over_time(sum(
count(
## number of remotes that are not mimir or grafana-cloud
prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir"}
)
!=
sum(
## number of shards defined in the Prometheus CR
prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}
or
(
# if there is only 1 shard, there is no shard metric so we use the replicas metric
absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"})
and on(controller, name)
prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"}
)
)
)[5m:])
# Short `for` so the inhibition triggers early (see header comment).
for: 1m
labels:
area: empowerment
# `none`: this rule is not meant to page (see header comment).
severity: none
team: atlas
topic: observability
# Presumably matched as a source label by an Alertmanager inhibition rule defined elsewhere — TODO confirm.
inhibit_prometheus_agent_down: "true"
cancel_if_cluster_is_not_running_prometheus_agent: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
# NOTE(review): same cancel label as the paging PrometheusAgentShardsMissing rule; confirm it is wanted on a non-paging inhibition rule.
cancel_if_outside_working_hours: "true"
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ spec:
record: slo_target

# -- kubelet whole cluster
- expr: "kube_node_status_condition"
- expr: kube_node_status_condition{condition="Ready"}
labels:
class: MEDIUM
area: kaas
Expand Down
10 changes: 6 additions & 4 deletions test/tests/providers/capi/capz/certificate.all.rules.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ tests:
# CertificateSecretWillExpireInLessThanTwoWeeks within 2 weeks of expiration
- interval: 1d
input_series:
- series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="kvm", secretkey="tls.crt", service_priority="highest"}'
- series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="kvm", secretkey="tls.crt", service_priority="highest"}'
values: "2678400x60"
alert_rule_test:
- alertname: CertificateSecretWillExpireInLessThanTwoWeeks
Expand All @@ -21,6 +21,7 @@ tests:
cluster_type: management_cluster
container: cert-exporter
customer: giantswarm
exported_namespace: giantswarm
instance: 10.0.0.0:1234
job: gollem-prometheus/workload-gollem/0
namespace: giantswarm
Expand All @@ -41,15 +42,15 @@ tests:
# CertificateSecretWillExpireInLessThanTwoWeeks not within 2 weeks of expiration
- interval: 1d
input_series:
- series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="kvm", secretkey="tls.crt", service_priority="highest"}'
- series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="kvm", secretkey="tls.crt", service_priority="highest"}'
values: "2678400x60"
alert_rule_test:
- alertname: CertificateSecretWillExpireInLessThanTwoWeeks
eval_time: 10d
# GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks within 2 weeks of expiration
- interval: 1d
input_series:
- series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="kvm", service_priority="highest"}'
- series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="kvm", service_priority="highest"}'
values: "2678400x60"
alert_rule_test:
- alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks
Expand All @@ -64,6 +65,7 @@ tests:
cluster_type: workload_cluster
container: cert-exporter
customer: giantswarm
exported_namespace: kube-system
instance: 10.0.0.0:1234
job: 12345-prometheus/workload-12345/0
namespace: kube-system
Expand All @@ -85,7 +87,7 @@ tests:
# GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks not within 2 weeks of expiration
- interval: 1d
input_series:
- series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="kvm", service_priority="highest"}'
- series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="kvm", service_priority="highest"}'
values: "2678400x60"
alert_rule_test:
- alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ tests:
# CertificateSecretWillExpireInLessThanTwoWeeks within 2 weeks of expiration
- interval: 1d
input_series:
- series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="kvm", secretkey="tls.crt", service_priority="highest"}'
- series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="kvm", secretkey="tls.crt", service_priority="highest"}'
values: "2678400x60"
alert_rule_test:
- alertname: CertificateSecretWillExpireInLessThanTwoWeeks
Expand All @@ -21,6 +21,7 @@ tests:
cluster_type: management_cluster
container: cert-exporter
customer: giantswarm
exported_namespace: giantswarm
instance: 10.0.0.0:1234
job: gollem-prometheus/workload-gollem/0
namespace: giantswarm
Expand All @@ -41,15 +42,15 @@ tests:
# CertificateSecretWillExpireInLessThanTwoWeeks not within 2 weeks of expiration
- interval: 1d
input_series:
- series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="kvm", secretkey="tls.crt", service_priority="highest"}'
- series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="kvm", secretkey="tls.crt", service_priority="highest"}'
values: "2678400x60"
alert_rule_test:
- alertname: CertificateSecretWillExpireInLessThanTwoWeeks
eval_time: 10d
# GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks within 2 weeks of expiration
- interval: 1d
input_series:
- series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="kvm", service_priority="highest"}'
- series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="kvm", service_priority="highest"}'
values: "2678400x60"
alert_rule_test:
- alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks
Expand All @@ -64,6 +65,7 @@ tests:
cluster_type: workload_cluster
container: cert-exporter
customer: giantswarm
exported_namespace: giantswarm
instance: 10.0.0.0:1234
job: 12345-prometheus/workload-12345/0
namespace: kube-system
Expand All @@ -85,7 +87,7 @@ tests:
# GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks not within 2 weeks of expiration
- interval: 1d
input_series:
- series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="kvm", service_priority="highest"}'
- series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="kvm", service_priority="highest"}'
values: "2678400x60"
alert_rule_test:
- alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks
Expand Down
Loading

0 comments on commit 5b31e3a

Please sign in to comment.