diff --git a/.circleci/config.yml b/.circleci/config.yml index 581fa5fdc..5765a995d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,6 +1,6 @@ version: 2.1 orbs: - architect: giantswarm/architect@4.29.0 + architect: giantswarm/architect@4.31.0 workflows: package-and-push-chart-on-tag: diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 216a017b3..5223c1157 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,3 +1,10 @@ +Before adding a new alerting rule to this repository, you should consider creating an SLO rule instead. +SLOs help you both increase the quality of your monitoring and reduce alert noise. + +* How to create an SLO rule: https://github.com/giantswarm/sloth-rules#how-to-create-a-slo +* Documentation: https://intranet.giantswarm.io/docs/monitoring/slo-alerting/ + +--- Towards: https://github.com/giantswarm/... This PR ... diff --git a/.github/workflows/alert_tests.yaml b/.github/workflows/alert_tests.yaml index ce5eb77b6..ecd515b38 100644 --- a/.github/workflows/alert_tests.yaml +++ b/.github/workflows/alert_tests.yaml @@ -7,7 +7,7 @@ jobs: promtool-unit-tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: "0" - name: run promtool unit tests @@ -15,7 +15,7 @@ jobs: inhibition-tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: "0" - name: run inhibition tests diff --git a/.github/workflows/zz_generated.add-team-labels.yaml b/.github/workflows/zz_generated.add-team-labels.yaml index 9e96e1d5d..2d258807c 100644 --- a/.github/workflows/zz_generated.add-team-labels.yaml +++ b/.github/workflows/zz_generated.add-team-labels.yaml @@ -14,7 +14,7 @@ jobs: mkdir -p artifacts wget --header "Authorization: token ${{ secrets.ISSUE_AUTOMATION }}" \ -O artifacts/users.yaml \ - https://raw.githubusercontent.com/giantswarm/github/master/tools/issue-automation/user-mapping.yaml + https://raw.githubusercontent.com/giantswarm/github/main/tools/issue-automation/user-mapping.yaml - name: Upload Artifact uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/zz_generated.add-to-project-board.yaml b/.github/workflows/zz_generated.add-to-project-board.yaml index 0392ed51f..a8569831b 100644 --- a/.github/workflows/zz_generated.add-to-project-board.yaml +++ b/.github/workflows/zz_generated.add-to-project-board.yaml @@ -16,7 +16,7 @@ jobs: mkdir -p artifacts wget --header "Authorization: token ${{ secrets.ISSUE_AUTOMATION }}" \ -O artifacts/users.yaml \ - https://raw.githubusercontent.com/giantswarm/github/master/tools/issue-automation/user-mapping.yaml + https://raw.githubusercontent.com/giantswarm/github/main/tools/issue-automation/user-mapping.yaml - name: Upload Artifact uses: actions/upload-artifact@v3 with: @@ -28,7 +28,7 @@ jobs: mkdir -p artifacts wget --header "Authorization: token ${{ secrets.ISSUE_AUTOMATION }}" \ -O artifacts/labels.yaml \ - https://raw.githubusercontent.com/giantswarm/github/master/tools/issue-automation/label-mapping.yaml + https://raw.githubusercontent.com/giantswarm/github/main/tools/issue-automation/label-mapping.yaml - name: Upload Artifact uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/zz_generated.check_values_schema.yaml b/.github/workflows/zz_generated.check_values_schema.yaml index d87509a24..c450aeeaa 100644 --- a/.github/workflows/zz_generated.check_values_schema.yaml +++
b/.github/workflows/zz_generated.check_values_schema.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.3.1 +# devctl@6.9.0 # name: 'Values and schema' on: @@ -34,6 +34,12 @@ jobs: run: | for chart_yaml in helm/*/Chart.yaml; do helm_dir="${chart_yaml%/Chart.yaml}" + + if [ ! -f ${helm_dir}/values.schema.json ]; then + echo "Skipping validation for '${helm_dir}' folder, because 'values.schema.json' does not exist..." + continue + fi + values=${helm_dir}/values.yaml if [ -f ${helm_dir}/ci/ci-values.yaml ]; then # merge ci-values.yaml into values.yaml (providing required values) diff --git a/.github/workflows/zz_generated.create_release.yaml b/.github/workflows/zz_generated.create_release.yaml index 6446af66d..57c5dd5ca 100644 --- a/.github/workflows/zz_generated.create_release.yaml +++ b/.github/workflows/zz_generated.create_release.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.3.1 +# devctl@6.9.0 # name: Create Release on: @@ -15,7 +15,7 @@ on: jobs: debug_info: name: Debug info - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Print github context JSON run: | @@ -24,7 +24,7 @@ jobs: EOF gather_facts: name: Gather facts - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 outputs: project_go_path: ${{ steps.get_project_go_path.outputs.path }} ref_version: ${{ steps.ref_version.outputs.refversion }} @@ -84,7 +84,7 @@ jobs: echo "refversion=${refversion}" >> $GITHUB_OUTPUT update_project_go: name: Update project.go - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 if: ${{ needs.gather_facts.outputs.version != '' && needs.gather_facts.outputs.project_go_path != '' && needs.gather_facts.outputs.ref_version != 'true' }} needs: - gather_facts @@ -146,7 +146,7 @@ jobs: hub pull-request -f -m "${{ env.title }}" -b ${{ env.base }} -h ${{ env.branch }} -r ${{ github.actor }} create_release: name: Create release - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - gather_facts if: ${{ needs.gather_facts.outputs.version }} @@ -194,7 +194,7 @@ jobs: create-release-branch: name: Create release branch - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - gather_facts if: ${{ needs.gather_facts.outputs.version }} diff --git a/.github/workflows/zz_generated.create_release_pr.yaml b/.github/workflows/zz_generated.create_release_pr.yaml index 8b714ef01..6f07166ea 100644 --- a/.github/workflows/zz_generated.create_release_pr.yaml +++ b/.github/workflows/zz_generated.create_release_pr.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.3.1 +# devctl@6.9.0 # name: Create Release PR on: @@ -30,7 +30,7 @@ on: jobs: debug_info: name: Debug info - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Print github context JSON run: | @@ -39,7 +39,7 @@ jobs: EOF gather_facts: name: Gather facts - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 outputs: repo_name: ${{ steps.gather_facts.outputs.repo_name }} branch: ${{ steps.gather_facts.outputs.branch }} @@ -136,7 +136,7 @@ jobs: fi create_release_pr: name: Create release PR - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - gather_facts if: ${{ needs.gather_facts.outputs.skip != 'true' }} diff --git a/.github/workflows/zz_generated.gitleaks.yaml b/.github/workflows/zz_generated.gitleaks.yaml index e1ca75a20..2c70a482b 100644 --- a/.github/workflows/zz_generated.gitleaks.yaml +++ b/.github/workflows/zz_generated.gitleaks.yaml @@ -1,6 +1,6 @@ # DO NOT EDIT. 
Generated with: # -# devctl@6.3.1 +# devctl@6.9.0 # name: gitleaks diff --git a/CHANGELOG.md b/CHANGELOG.md index afad1cbea..4053d1299 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,286 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- Add missing team label to SLO alerts. + +### Changed + +- Change ownership from Atlas to Turtles/Phoenix for all vertical pod autoscaler alerts + +## [2.137.0] - 2023-10-04 + +### Removed + +- Remove `role` label usage; rely on the `kube_node_role` metric instead. + +## [2.136.0] - 2023-10-04 + +### Changed + +- Remove PrometheusAvailabilityRatio alert. + +## [2.135.0] - 2023-10-02 + +### Changed + +- Hand over cert-manager alerts to BigMac +- Ignore ETCD alerts on EKS clusters. + +## [2.134.1] - 2023-09-26 + +### Fixed + +- Improve InhibitionClusterIsNotRunningPrometheusAgent to keep paging if the kube-state-metrics metric is missing for 5 minutes (avoid flapping of inhibitions). + +## [2.134.0] - 2023-09-21 + +### Changed + +- Split `KubeStateMetricsDown` alert into 2 alerts: `KubeStateMetricsDown` and `KubeStateMetricsNotRetrievingMetrics` + +## [2.133.0] - 2023-09-19 + +### Changed + +- Add missing prometheus-agent inhibition to `KubeStateMetricsDown` alert +- Change time duration before `ManagementClusterDeploymentMissingAWS` pages because it is dependent on the `PrometheusAgentFailing` alert. + +### Fixed + +- Remove `cancel_if_outside_working_hours` from PrometheusAgent alerts. + +## [2.132.0] - 2023-09-15 + +### Changed + +- `PrometheusAgentFailing` and `PrometheusAgentShardsMissing`: keep alerts for 5min after they are resolved + +## [2.131.0] - 2023-09-12 + +### Changed + +- Remove `DNSRequestDurationTooSlow` in favor of SLO alerting. + +## [2.130.0] - 2023-09-12 + +### Changed + +- Refactor the Kyverno policy reports recording rule to include missing apps from Team Overview dashboard. +- Change `ClusterUnhealthyPhase` severity to page, so that we get paged when a cluster is not working properly. + +## [2.129.0] - 2023-09-11 + +### Changed + +- Unit tests for `PrometheusAgentShardsMissing` +- Fixes for `PrometheusAgentShardsMissing` + +## [2.128.0] - 2023-09-05 + +### Added + +- Unit tests for KubeStateMetricsDown + +### Changed + +- Loki alerts only during working hours +- `PrometheusAgentFailing` does not rely on KSM metrics anymore +- Prometheus-agent inhibition rework, run on the MC +- `ManagementClusterApp` alerts now check for default catalog as well + +## [2.127.0] - 2023-08-21 + +### Changed + +- WorkloadClusterApp alerts now also monitor default catalog + +## [2.126.1] - 2023-08-14 + +### Changed + +- Changed master memory limits to 80% + +### Fixed + +- Revert change concerning port 8081 in `KubeStateMetricsDown` alert. + +## [2.126.0] - 2023-08-10 + +### Changed + +- `ManagementClusterWebhookDurationExceedsTimeout`, `WorkloadClusterWebhookDurationExceedsTimeoutSolutionEngineers`, `WorkloadClusterWebhookDurationExceedsTimeoutHoneybadger`, `WorkloadClusterWebhookDurationExceedsTimeoutCabbage`, and `WorkloadClusterWebhookDurationExceedsTimeoutAtlas` are changed to use the 95th percentile latency of the webhook, instead of the average rate of change. + +## [2.125.0] - 2023-08-09 + +### Changed + +- `KubeStateMetricsDown` also triggers when KSM does not show enough data (less than 10 metrics) + +## [2.124.0] - 2023-08-08 + +### Added + +- Add `WorkloadClusterDeploymentScaledDownToZeroShield` for Shield deployments on WCs.
+ +### Fixed + +- Add port 8081 for the `instance` label in `KubeStateMetricsDown` alert. + +### Changed + +- Move CoreDNS alerts from phoenix to cabbage. + +## [2.123.0] - 2023-08-03 + +### Changed + +- Ignore `prometheus` PVCs in `PersistentVolumeSpaceTooLow` alert (they have a dedicated alert). + +## [2.122.0] - 2023-08-02 + +### Changed + +- Allow 1 error/5 minutes for `ManagementClusterAPIServerAdmissionWebhookErrors`. + +### Fixed + +- Add webhook name in `ManagementClusterAPIServerAdmissionWebhookErrors` alert title. + +## [2.121.0] - 2023-08-02 + +### Changed + +- Move Cert-manager alerts to Cabbage + +### Fixed + +- Make `ManagementClusterContainerIsRestartingTooFrequentlyAWS` alert title include the involved pod. +- Make `DeploymentNotSatisfiedKaas` alert title include the involved deployment. +- Make `WorkloadClusterNonCriticalDeploymentNotSatisfiedKaas` alert title include the involved deployment. +- Make `WorkloadClusterDeploymentNotSatisfiedKaas` alert title include the involved deployment. +- Make `WorkloadClusterContainerIsRestartingTooFrequentlyAWS` alert title include the involved pod. +- Make `WorkloadClusterManagedDeploymentNotSatisfiedPhoenix` alert title include the involved deployment. + +## [2.120.0] - 2023-08-01 + +### Changed + +- Move Kyverno certificate expiry alert from KaaS to Managed Services. +- Decrease sensitivity for alerting on KVM WC critical pods from 10m to 15m. + +## [2.119.0] - 2023-07-31 + +### Changed + +- Assign `clippy` rules to `phoenix`. + +## [2.118.1] - 2023-07-31 + +### Fixed + +- Check division by zero in `ManagementClusterWebhookDurationExceedsTimeout` alert's query. + +## [2.118.0] - 2023-07-28 + +### Changed + +- Increase alert threshold for KVM WC critical pods from 5m to 10m. + +## [2.117.0] - 2023-07-27 + +### Changed + +- Increase time window of `ManagementClusterAPIServerAdmissionWebhookErrors` from 5m to 15m. + +## [2.116.0] - 2023-07-20 + +### Fixed + +- Fix `KubeStateMetricsDown` on pre-servicemonitor clusters + +### Changed + +- Switch `HighNumberOfAllocatedSockets` and `HighNumberOfOrphanedSockets` from Rocket to provider teams. + +## [2.115.1] - 2023-07-20 + +### Fixed + +- Fix `KubeStateMetricsDown` + +## [2.115.0] - 2023-07-20 + +### Added + +- New alert `KubeStateMetricsSlow` that inhibits KSM related alerts. + +### Fixed + +- Fix `KubeStateMetricsDown` inhibition. + +## [2.114.0] - 2023-07-20 + +### Added + +- Add `DNSRequestDurationTooSlow` to catch slow DNS. + +### Removed + +- Remove `CoreDNSLoadUnbalanced` alert. +- Remove `CoreDNSCPUUsageTooHigh` alert. + +## [2.113.0] - 2023-07-18 + +### Added + +- Add Cilium BPF map monitoring. +- Add `VpaComponentTooManyRestarts` alerting rule. + +### Changed + +- Make `VaultIsDown` page after 40m. + +## [2.112.0] - 2023-07-13 + +### Fixed + +- Use all nodes instead of just the Ready ones as raw_slo_requests + +### Removed + +- Remove kiam-agent and kiam-server from the ServiceLevelBurnRateTooHigh alert + +## [2.111.0] - 2023-07-11 + +### Removed + +- Remove `CoreDNSLatencyTooHigh` alert as it's flaky and superseded by an SLO alert. + +## [2.110.0] - 2023-07-10 + +### Changed + +- Change `ManagementClusterAPIServerAdmissionWebhookErrors` severity to page. +- CAPA alerts only during business hours. +- Fix Kyverno recording rule to ignore WorkloadCluster Apps. +- Make `CoreDNSLatencyTooHigh` alert page rather than notify. + +## [2.109.0] - 2023-06-30 + +### Added + +- Add two new alerts for ALB role errors. +- Add dashboard link to `ServiceLevelBurnRateTooHigh` alert.
+- Ship Kyverno policy enforcement status to Grafana Cloud. + +## [2.108.0] - 2023-06-28 + ### Changed - Change `for` setting of `WorkloadClusterCriticalPodNotRunningAWS` to 20 minutes. +- Remove duplicate workload_name label in favor of existing daemonset|statefulset|deployment labels. ### Removed @@ -21,11 +298,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Split Grafana Cloud recording rules into smaller groups. +### Added + +- Add rule for AWS load balancer controller deployment satisfied. + ## [2.106.0] - 2023-06-22 ### Added -- Add alerts for legacy vault's etcd backups. +- Add alerts for legacy vault's etcd backups. ## [2.105.0] - 2023-06-22 @@ -1942,7 +2223,41 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.107.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v2.137.0...HEAD +[2.137.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.136.0...v2.137.0 +[2.136.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.135.0...v2.136.0 +[2.135.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.134.1...v2.135.0 +[2.134.1]: https://github.com/giantswarm/prometheus-rules/compare/v2.134.0...v2.134.1 +[2.134.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.133.0...v2.134.0 +[2.133.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.132.0...v2.133.0 +[2.132.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.131.0...v2.132.0 +[2.131.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.130.0...v2.131.0 +[2.130.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.129.0...v2.130.0 +[2.129.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.128.0...v2.129.0 +[2.128.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.127.0...v2.128.0 +[2.127.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.126.1...v2.127.0 +[2.126.1]: https://github.com/giantswarm/prometheus-rules/compare/v2.126.0...v2.126.1 +[2.126.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.125.0...v2.126.0 +[2.125.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.124.0...v2.125.0 +[2.124.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.123.0...v2.124.0 +[2.123.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.122.0...v2.123.0 +[2.122.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.121.0...v2.122.0 +[2.121.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.120.0...v2.121.0 +[2.120.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.119.0...v2.120.0 +[2.119.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.118.1...v2.119.0 +[2.118.1]: https://github.com/giantswarm/prometheus-rules/compare/v2.118.0...v2.118.1 +[2.118.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.117.0...v2.118.0 +[2.117.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.116.0...v2.117.0 +[2.116.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.115.1...v2.116.0 +[2.115.1]: https://github.com/giantswarm/prometheus-rules/compare/v2.115.0...v2.115.1 +[2.115.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.114.0...v2.115.0 +[2.114.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.113.0...v2.114.0 
+[2.113.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.112.0...v2.113.0 +[2.112.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.111.0...v2.112.0 +[2.111.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.110.0...v2.111.0 +[2.110.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.109.0...v2.110.0 +[2.109.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.108.0...v2.109.0 +[2.108.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.107.0...v2.108.0 [2.107.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.106.0...v2.107.0 [2.106.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.105.0...v2.106.0 [2.105.0]: https://github.com/giantswarm/prometheus-rules/compare/v2.104.0...v2.105.0 diff --git a/Makefile b/Makefile index 4ae00496c..6b6025aaa 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.3.1 +# devctl@6.9.0 # include Makefile.*.mk diff --git a/Makefile.gen.app.mk b/Makefile.gen.app.mk index ab6210d46..0929f9089 100644 --- a/Makefile.gen.app.mk +++ b/Makefile.gen.app.mk @@ -1,6 +1,6 @@ # DO NOT EDIT. Generated with: # -# devctl@6.3.1 +# devctl@6.9.0 # ##@ App diff --git a/helm/prometheus-rules/templates/_helpers.tpl b/helm/prometheus-rules/templates/_helpers.tpl index 69514611c..b735b6664 100644 --- a/helm/prometheus-rules/templates/_helpers.tpl +++ b/helm/prometheus-rules/templates/_helpers.tpl @@ -31,18 +31,18 @@ giantswarm.io/service-type: {{ .Values.serviceType }} {{- define "providerTeam" -}} {{- if has .Values.managementCluster.provider.kind (list "kvm" "openstack" "cloud-director" "vsphere") -}} rocket -{{- else if has .Values.managementCluster.provider.kind (list "gcp" "capa") -}} +{{- else if has .Values.managementCluster.provider.kind (list "gcp" "capa" "capz") -}} {{- /* hydra alerts merged into phoenix business hours on-call */ -}} phoenix -{{- else if eq .Values.managementCluster.provider.kind "capz" -}} -clippy {{- else -}} phoenix {{- end -}} {{- end -}} {{- define "workingHoursOnly" -}} -{{- if has .Values.managementCluster.provider.kind (list "openstack" "capz") -}} +{{- if has .Values.managementCluster.provider.kind (list "openstack" "capz" "capa") -}} +"true" +{{- else if eq .Values.managementCluster.pipeline "stable-testing" -}} "true" {{- else -}} "false" diff --git a/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml index fca1441d8..a634df541 100644 --- a/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml @@ -34,19 +34,19 @@ spec: annotations: description: '{{`Kubernetes API Server {{ $labels.cluster_id }} having admission webhook errors.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: rate(apiserver_admission_webhook_rejection_count{cluster_type="management_cluster", error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]) > 0 - for: 5m + expr: label_replace(rate(apiserver_admission_webhook_rejection_count{cluster_type="management_cluster", error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]), "service", "$1", "name", "(.*)") > 1 + for: 15m labels: area: kaas cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} - severity: notify + severity: page team: {{ include "providerTeam" . 
}} topic: managementcluster - alert: ManagementClusterWebhookDurationExceedsTimeout annotations: - description: '{{`Kubernetes API Server admission webhook for {{ $labels.cluster_id }} is timing out.`}}' + description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: rate(apiserver_admission_webhook_admission_duration_seconds_sum[5m]) / rate(apiserver_admission_webhook_admission_duration_seconds_count[5m]) > 8 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket[5m])) by (cluster_id, name, app, le)) > 5 for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml index 0ef9c51b1..8f1ab06a9 100644 --- a/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml @@ -28,6 +28,7 @@ spec: severity: notify team: {{ include "providerTeam" . }} topic: kubernetes + - alert: WorkloadClusterAPIServerAdmissionWebhookErrors annotations: description: '{{`Kubernetes API Server {{ $labels.cluster_id }} having admission webhook errors.`}}' @@ -44,9 +45,9 @@ spec: # Webhooks that are not explicitely owner by any team (customer owned ones). - alert: WorkloadClusterWebhookDurationExceedsTimeoutSolutionEngineers annotations: - description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} for {{ $labels.cluster_id }} is timing out.`}}' + description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: rate(apiserver_admission_webhook_admission_duration_seconds_count{cluster_type="workload_cluster",name!~".*(prometheus|vpa.k8s.io|linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io|kyverno|app-admission-controller).*"}[5m]) > 0 AND rate(apiserver_admission_webhook_admission_duration_seconds_sum[5m]) / rate(apiserver_admission_webhook_admission_duration_seconds_count[5m]) > 8 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name!~".*(prometheus|vpa.k8s.io|linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io|kyverno|app-admission-controller).*"}[5m])) by (cluster_id, name, app, le)) > 5 for: 15m labels: area: kaas @@ -58,9 +59,9 @@ spec: # Webhooks owned by Honeybadger - alert: WorkloadClusterWebhookDurationExceedsTimeoutHoneybadger annotations: - description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} for {{ $labels.cluster_id }} is timing out.`}}' + description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: rate(apiserver_admission_webhook_admission_duration_seconds_count{cluster_type="workload_cluster",name=~".*(kyverno|app-admission-controller).*"}[5m]) > 0 AND rate(apiserver_admission_webhook_admission_duration_seconds_sum[5m]) / rate(apiserver_admission_webhook_admission_duration_seconds_count[5m]) > 8 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(kyverno|app-admission-controller).*"}[5m])) by (cluster_id, name, app, le)) > 5 for: 15m labels: area: kaas @@ -72,9 +73,9 @@ 
spec: # Webhooks owned by Cabbage - alert: WorkloadClusterWebhookDurationExceedsTimeoutCabbage annotations: - description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} for {{ $labels.cluster_id }} is timing out.`}}' + description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: rate(apiserver_admission_webhook_admission_duration_seconds_count{cluster_type="workload_cluster",name=~".*(linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io).*"}[5m]) > 0 AND rate(apiserver_admission_webhook_admission_duration_seconds_sum[5m]) / rate(apiserver_admission_webhook_admission_duration_seconds_count[5m]) > 8 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(linkerd|validate.nginx.ingress.kubernetes.io|kong.konghq.com|cert-manager.io).*"}[5m])) by (cluster_id, name, app, le)) > 5 for: 15m labels: area: kaas @@ -83,12 +84,26 @@ spec: team: cabbage topic: kubernetes + # Webhooks owned by Phoenix + - alert: WorkloadClusterWebhookDurationExceedsTimeoutPhoenix + annotations: + description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' + opsrecipe: apiserver-admission-webhook-errors/ + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(vpa.k8s.io).*"}[5m])) by (cluster_id, name, app, le)) > 5 + for: 15m + labels: + area: kaas + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} + severity: page + team: phoenix + topic: kubernetes + # Webhooks owned by Atlas - alert: WorkloadClusterWebhookDurationExceedsTimeoutAtlas annotations: - description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} for {{ $labels.cluster_id }} is timing out.`}}' + description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: rate(apiserver_admission_webhook_admission_duration_seconds_count{cluster_type="workload_cluster",name=~".*(prometheus|vpa.k8s.io).*"}[5m]) > 0 AND rate(apiserver_admission_webhook_admission_duration_seconds_sum[5m]) / rate(apiserver_admission_webhook_admission_duration_seconds_count[5m]) > 8 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(prometheus).*"}[5m])) by (cluster_id, name, app, le)) > 5 for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/app.rules.yml b/helm/prometheus-rules/templates/alerting-rules/app.rules.yml index 692b0c943..842050904 100644 --- a/helm/prometheus-rules/templates/alerting-rules/app.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/app.rules.yml @@ -15,7 +15,7 @@ spec: annotations: description: '{{`Management Cluster App {{ $labels.name }}, version {{ $labels.version }} is {{if $labels.status }} in {{ $labels.status }} state. {{else}} not installed. 
{{end}}`}}' opsrecipe: app-failed/ - expr: app_operator_app_info{status!~"(?i:(deployed|cordoned))", catalog=~"control-plane-.*",team!~"^$|noteam"} + expr: app_operator_app_info{status!~"(?i:(deployed|cordoned))", catalog=~"(control-plane-.*|default)",team!~"^$|noteam", namespace=~".*giantswarm"} for: 30m labels: area: managedservices @@ -30,7 +30,7 @@ spec: annotations: description: 'Current version of {{`App {{ $labels.name }} is {{ $labels.deployed_version }} but it should be {{ $labels.version }}.`}}' opsrecipe: app-pending-update/ - expr: app_operator_app_info{catalog=~"control-plane-.*", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam"} + expr: app_operator_app_info{catalog=~"(control-plane-.*|default)", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam", namespace=~".*giantswarm"} for: 40m labels: area: managedservices @@ -46,9 +46,9 @@ spec: description: '{{`Workload Cluster App {{ if $labels.exported_namespace }}{{ $labels.exported_namespace }}{{ else }}{{ $labels.namespace }}{{ end }}/{{ $labels.name }}, version {{ $labels.version }} is {{if $labels.status }} in {{ $labels.status }} state. {{else}} not installed. {{end}}`}}' opsrecipe: app-failed/ {{- if eq .Values.managementCluster.provider.flavor "capi" }} - expr: label_replace(app_operator_app_info{status!~"(?i:(deployed|cordoned|not-installed))", catalog=~"giantswarm|cluster", team!~"^$|noteam"}, "cluster_id", "$1", "name", "([a-zA-Z0-9]+)-.*") == 1 + expr: label_replace(app_operator_app_info{status!~"(?i:(deployed|cordoned|not-installed))", catalog=~"giantswarm|cluster|default", team!~"^$|noteam"}, "cluster_id", "$1", "name", "([a-zA-Z0-9]+)-.*") == 1 {{- else }} - expr: label_replace(app_operator_app_info{status!~"(?i:(deployed|cordoned|not-installed))", catalog="giantswarm", team!~"^$|noteam"}, "cluster_id", "$1", "namespace", {{ include "namespaceNotGiantswarm" . }}) == 1 + expr: label_replace(app_operator_app_info{status!~"(?i:(deployed|cordoned|not-installed))", catalog=~"giantswarm|default", team!~"^$|noteam"}, "cluster_id", "$1", "namespace", {{ include "namespaceNotGiantswarm" . }}) == 1 {{- end }} for: 30m labels: @@ -66,9 +66,9 @@ spec: description: '{{`Workload Cluster App {{ if $labels.exported_namespace }}{{ $labels.exported_namespace }}{{ else }}{{ $labels.namespace }}{{ end }}/{{ $labels.name }}, version {{ $labels.version }} is {{if $labels.status }} in {{ $labels.status }} state. {{else}} not installed. {{end}}`}}' opsrecipe: app-failed/ {{- if eq .Values.managementCluster.provider.flavor "capi" }} - expr: label_replace(app_operator_app_info{status="not-installed", catalog=~"giantswarm|cluster", team!~"^$|noteam"}, "cluster_id", "$1", "name", "([a-zA-Z0-9]+)-.*") == 1 + expr: label_replace(app_operator_app_info{status="not-installed", catalog=~"giantswarm|cluster|default", team!~"^$|noteam"}, "cluster_id", "$1", "name", "([a-zA-Z0-9]+)-.*") == 1 {{- else }} - expr: label_replace(app_operator_app_info{status="not-installed", catalog="giantswarm", team!~"^$|noteam"}, "cluster_id", "$1", "namespace", {{ include "namespaceNotGiantswarm" . }}) == 1 + expr: label_replace(app_operator_app_info{status="not-installed", catalog=~"giantswarm|default", team!~"^$|noteam"}, "cluster_id", "$1", "namespace", {{ include "namespaceNotGiantswarm" .
}}) == 1 {{- end }} for: 30m labels: @@ -86,9 +86,9 @@ spec: description: 'Current version of {{`App {{ $labels.name }} is {{ $labels.deployed_version }} but it should be {{ $labels.version }}.`}}' opsrecipe: app-pending-update/ {{- if eq .Values.managementCluster.provider.flavor "capi" }} - expr: label_replace(app_operator_app_info{catalog=~"giantswarm|cluster", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam"}, "cluster_id", "$1", "name", "([a-zA-Z0-9]+)-.*") == 1 + expr: label_replace(app_operator_app_info{catalog=~"giantswarm|cluster|default", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam"}, "cluster_id", "$1", "name", "([a-zA-Z0-9]+)-.*") == 1 {{- else }} - expr: label_replace(app_operator_app_info{catalog="giantswarm", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam"}, "cluster_id", "$1", "namespace", {{ include "namespaceNotGiantswarm" . }}) == 1 + expr: label_replace(app_operator_app_info{catalog=~"giantswarm|default", deployed_version!="", status="deployed", version_mismatch="true" ,team!~"^$|noteam"}, "cluster_id", "$1", "namespace", {{ include "namespaceNotGiantswarm" . }}) == 1 {{- end }} for: 40m labels: diff --git a/helm/prometheus-rules/templates/alerting-rules/aws-load-balancer-controller.rules.yml b/helm/prometheus-rules/templates/alerting-rules/aws-load-balancer-controller.rules.yml new file mode 100644 index 000000000..8eef7b0e5 --- /dev/null +++ b/helm/prometheus-rules/templates/alerting-rules/aws-load-balancer-controller.rules.yml @@ -0,0 +1,45 @@ +{{- if eq .Values.managementCluster.provider.kind "aws" }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + creationTimestamp: null + labels: + {{- include "labels.common" . 
| nindent 4 }} + cluster_type: "workload_cluster" + name: aws-load-balancer-controller.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: aws-load-balancer-controller + rules: + - alert: AWSLoadBalancerAssumeRoleErrors + annotations: + description: '{{`AWS load balancer pod {{ $labels.namespace}}/{{ $labels.pod_name }} on {{ $labels.cluster_id}}/{{ $labels.cluster }} can not assume the role.`}}' + opsrecipe: alb-role-errors#assume-role-errors + expr: increase(aws_api_calls_total{error_code="WebIdentityErr"}[20m]) > 0 + for: 40m + labels: + area: managedservices + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "false" + severity: page + team: phoenix + topic: alb + - alert: AWSLoadBalancerRolePolicyErrors + annotations: + description: '{{`AWS load balancer pod {{ $labels.namespace}}/{{ $labels.pod_name }} on {{ $labels.cluster_id}}/{{ $labels.cluster }} has a wrong role policy.`}}' + opsrecipe: alb-role-errors#role-policy-errors + expr: increase(aws_api_calls_total{error_code="UnauthorizedOperation"}[20m]) > 0 + for: 40m + labels: + area: managedservices + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "false" + severity: page + team: phoenix + topic: alb +{{- end }} diff --git a/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml index f1b0d4558..956f72321 100644 --- a/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/aws.management-cluster.rules.yml @@ -133,7 +133,7 @@ spec: description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}' opsrecipe: container-is-restarting-too-often/ ## route53-manager is only used in China as route53 did not used to exist there - expr: increase(kube_pod_container_status_restarts_total{container=~"aws-admission-controller.*|aws-node.*|aws-operator.*|cluster-operator.*|route53-manager.*"}[1h]) > 6 + expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-admission-controller.*|aws-node.*|aws-operator.*|cluster-operator.*|route53-manager.*"}[1h]), "service", "/", "namespace", "pod") > 6 for: 5m labels: area: kaas @@ -146,10 +146,10 @@ spec: topic: kubernetes - alert: ManagementClusterDeploymentMissingAWS annotations: - description: '{{`Deployment {{ $labels.workload_name }} is missing.`}}' + description: '{{`Deployment {{ $labels.deployment }} is missing.`}}' opsrecipe: management-cluster-deployment-is-missing/ expr: absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-admission-controller"}) - for: 5m + for: 15m labels: area: kaas cancel_if_prometheus_agent_down: "true" diff --git a/helm/prometheus-rules/templates/alerting-rules/aws.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/aws.workload-cluster.rules.yml index 91ed1e82a..0720535e6 100644 --- a/helm/prometheus-rules/templates/alerting-rules/aws.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/aws.workload-cluster.rules.yml @@ -16,7 +16,7 @@ spec: annotations: description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is 
restarting too often.`}}' opsrecipe: container-is-restarting-too-often/ - expr: increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-csi-.*|aws-pod-identity-webhook.*|etcd-kubernetes-resources-count-exporter.*"}[1h]) > 10 + expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-csi-.*|aws-pod-identity-webhook.*|etcd-kubernetes-resources-count-exporter.*"}[1h]),"service","/","namespace","pod") > 10 for: 10m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml index 227767b6d..982844ab1 100644 --- a/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/capi-cluster.rules.yml @@ -15,7 +15,7 @@ spec: labels: area: kaas cancel_if_outside_working_hours: {{include "workingHoursOnly" .}} - severity: notify + severity: page team: {{include "providerTeam" .}} topic: managementcluster annotations: diff --git a/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml b/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml index cc553f4d1..f7ab66b53 100644 --- a/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml @@ -23,7 +23,7 @@ spec: area: kaas cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: notify - team: {{ include "providerTeam" . }} + team: bigmac topic: observability - alert: CertManagerDown annotations: @@ -38,7 +38,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_kubelet_down: "true" severity: page - team: {{ include "providerTeam" . }} + team: bigmac topic: cert-manager - alert: CertManagerTooManyCertificateRequests annotations: @@ -50,5 +50,5 @@ spec: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: {{ include "providerTeam" . }} + team: bigmac topic: cert-manager diff --git a/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml index cec9132b6..b78ac094a 100644 --- a/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml @@ -40,7 +40,7 @@ spec: opsrecipe: kyverno-certificate-secret-will-expire-in-less-than-two-days/ expr: (cert_exporter_secret_not_after{name=~".*kyverno.*"} - time()) < 2 * 24 * 60 * 60 labels: - area: kaas + area: managedservices cancel_if_outside_working_hours: "true" severity: notify team: shield diff --git a/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml b/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml new file mode 100644 index 000000000..55fa67cc6 --- /dev/null +++ b/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml @@ -0,0 +1,35 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + creationTimestamp: null + labels: + {{- include "labels.common" . 
| nindent 4 }} + name: cilium.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: cilium + rules: + - alert: CiliumBPFMapAlmostFull + annotations: + description: '{{`Cilium BPF map is about to fill up.`}}' + opsrecipe: cilium-bpf-map/ + expr: avg(cilium_bpf_map_pressure) by (map_name) * 100 > 80 + for: 15m + labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: page + team: phoenix + topic: cilium + - alert: CiliumBPFMapFull + annotations: + description: '{{`Cilium BPF map is almost full.`}}' + opsrecipe: cilium-bpf-map/ + expr: avg(cilium_bpf_map_pressure) by (map_name) * 100 > 95 + for: 15m + labels: + area: kaas + severity: page + team: phoenix + topic: cilium \ No newline at end of file diff --git a/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml b/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml index b05c21edb..af5454208 100644 --- a/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml @@ -10,52 +10,6 @@ spec: groups: - name: coredns rules: - - alert: CoreDNSCPUUsageTooHigh - annotations: - description: '{{`CoreDNS CPU usage is too high.`}}' - expr: rate(container_cpu_user_seconds_total{pod=~"coredns-.*"}[5m]) > 0.15 - for: 5m - labels: - area: kaas - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} - severity: notify - team: {{ include "providerTeam" . }} - topic: observability - - alert: CoreDNSLatencyTooHigh - # There are two sub-queries here that need to be true for the alert to fire. - # - # The first part calculates the rate of DNS requests per second, - # comparing it with a threshold. - # As a low rate of DNS queries can lead to a misleading mean average, - # we ignore clusters that only have a low rate of DNS requests. - # - # The second part takes the rate of latency for requests (per cluster), - # dividing it by the rate of number of requests (per cluster), - # giving a mean average of DNS request latency, - # and then comparing it with the threshold. - # - # If both are true - that is, there are a high number of DNS requests, - # and they are on average taking longer than we'd like, - # then the alert fires. - annotations: - description: '{{`CoreDNS mean latency is too high.`}}' - opsrecipe: dns-issue-mitigation/ - expr: sum( irate( coredns_dns_request_duration_seconds_count{zone!="dropped"}[15m] ) ) by (cluster_id) > 500 and sum( irate( coredns_dns_request_duration_seconds_sum[15m] ) ) by (cluster_id) / sum( irate( coredns_dns_request_duration_seconds_count[15m] ) ) by (cluster_id) > 0.003 - # This is intentionally low. - # - # DNS latency tends to spike for a short period of time (< 2 minutes), - # but this can still impact larger customer workloads. - # - # In practice, because we ignore clusters that have a low number of - # DNS requests (see the first subquery above), even a short spike - # implies a problem that should be taken care of. - for: 1m - labels: - area: kaas - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} - severity: notify - team: {{ include "providerTeam" .
}} - topic: dns - alert: CoreDNSDeploymentNotSatisfied annotations: description: '{{`CoreDNS Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' @@ -63,36 +17,21 @@ spec: expr: sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) / (sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) + sum(kube_deployment_status_replicas_unavailable{deployment=~"coredns.*"}))* 100 < 51 for: 10m labels: - area: managedservices + area: empowerment cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page - team: {{ include "providerTeam" . }} + team: cabbage topic: dns - alert: CoreDNSMaxHPAReplicasReached expr: kube_hpa_status_current_replicas{hpa="coredns"} == kube_hpa_spec_max_replicas{hpa="coredns"} AND kube_hpa_spec_min_replicas{hpa="coredns"} != kube_hpa_spec_max_replicas{hpa="coredns"} for: 120m labels: - area: kaas + area: empowerment cancel_if_outside_working_hours: "true" severity: page - team: {{ include "providerTeam" . }} + team: cabbage topic: dns annotations: description: '{{`CoreDNS Deployment {{ $labels.namespace}}/{{ $labels.deployment }} has been scaled to its maximum replica count for too long.`}}' - # This alert checks the percentage of the dns requests that are handled by a single pod. The result of the query should always be 0 (that means load is spread evenly between all coredns pods). - # If it's > 20 for 10 minutes there is something weird happening in the cluster. - # This is only relevant if there is a meaningful number of DNS requests happening - - alert: CoreDNSLoadUnbalanced - expr: sum by (cluster_id) (rate(coredns_dns_requests_total[10m])) > 10 AND (sum by(cluster_id,pod) (rate(coredns_dns_requests_total[10m])) / ignoring(pod) group_left sum by (cluster_id) (rate(coredns_dns_requests_total[10m])) * 100) - ignoring(pod) group_left 100 / sum by (cluster_id) (kube_deployment_status_replicas{deployment=~"coredns|coredns-cp"}) > 20 - for: 10m - labels: - area: kaas - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} - severity: page - team: {{ include "providerTeam" . 
}} - topic: dns - annotations: - description: '{{`CoreDNS Load has been unbalanced for more than 10m.`}}' - opsrecipe: core-dns-unbalanced/ diff --git a/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml index 5d5ae43b6..03469f8dd 100644 --- a/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml @@ -101,7 +101,7 @@ spec: annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: deployment-not-satisfied/ - expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"aws-admission-controller.*|aws-operator-.+|azure-admission-controller-.+|azure-operator.*|azure-collector.*|cluster-operator-.+|cluster-api-core-webhook.*|coredns-.+|event-exporter-.*|etcd-kubernetes-resources-count-exporter-.*|upgrade-schedule-operator.*|worker-.+|master-.+", cluster_id!~"argali|giraffe"} > 0 + expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"aws-admission-controller.*|aws-operator-.+|azure-admission-controller-.+|azure-operator.*|azure-collector.*|cluster-operator-.+|cluster-api-core-webhook.*|event-exporter-.*|etcd-kubernetes-resources-count-exporter-.*|upgrade-schedule-operator.*|worker-.+|master-.+", cluster_id!~"argali|giraffe"}, "service", "/", "namespace", "deployment") > 0 for: 30m labels: area: kaas @@ -204,7 +204,7 @@ spec: annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: deployment-not-satisfied-china/ - expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"(ingress-nginx|nginx-ingress-controller)-.+", cluster_id=~"argali|giraffe"} > 0 + expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"(ingress-nginx|nginx-ingress-controller|coredns)-.+", cluster_id=~"argali|giraffe"} > 0 for: 3h labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml index 139701976..b38ea04f2 100644 --- a/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/deployment.workload-cluster.rules.yml @@ -27,7 +27,7 @@ spec: annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: workload-cluster-deployment-not-satisfied/ - expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"metrics-server|vertical-pod-autoscaler(-app)?-admission-controller|vertical-pod-autoscaler(-app)?-recommender|vertical-pod-autoscaler(-app)?-updater|aws-pod-identity-webhook.*|cluster-autoscaler"} > 0 + expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"metrics-server|vertical-pod-autoscaler(-app)?-admission-controller|vertical-pod-autoscaler(-app)?-recommender|vertical-pod-autoscaler(-app)?-updater|aws-pod-identity-webhook.*|cluster-autoscaler|aws-load-balancer-controller"}, "service", "/", "namespace", "deployment") > 0 for: 30m labels: area: kaas @@ -39,7 +39,7 @@ spec: annotations: description: 
'{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: workload-cluster-deployment-not-satisfied/ - expr: kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"etcd-kubernetes-resources-count-exporter"} > 0 + expr: label_join(kube_deployment_status_replicas_unavailable{cluster_type="workload_cluster", deployment=~"etcd-kubernetes-resources-count-exporter"}, "service", "/", "namespace", "deployment") > 0 for: 30m labels: area: kaas @@ -52,7 +52,7 @@ spec: annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: workload-cluster-managed-deployment-not-satisfied/ - expr: managed_app_deployment_status_replicas_unavailable{cluster_type="workload_cluster", managed_app=~"cert-manager.*"} > 0 + expr: label_join(managed_app_deployment_status_replicas_unavailable{cluster_type="workload_cluster", managed_app=~"cert-manager.*"}, "service", "/", "namespace", "deployment") > 0 for: 30m labels: area: managedservices @@ -71,3 +71,14 @@ spec: severity: notify team: honeybadger topic: releng + - alert: WorkloadClusterDeploymentScaledDownToZeroShield + annotations: + description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} has been scaled down to zero for a prolonged period of time.`}}' + expr: kube_deployment_spec_replicas{cluster_type="workload_cluster", deployment=~"trivy-operator|starboard-exporter|jiralert"} == 0 + for: 4h + labels: + area: managedservices + cancel_if_outside_working_hours: "true" + severity: notify + team: shield + topic: releng diff --git a/helm/prometheus-rules/templates/alerting-rules/disk.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/disk.management-cluster.rules.yml index d2fe80d2a..9a5d14d40 100644 --- a/helm/prometheus-rules/templates/alerting-rules/disk.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/disk.management-cluster.rules.yml @@ -39,7 +39,7 @@ spec: annotations: description: '{{`Etcd volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}' opsrecipe: low-disk-space/#etcd-volume - expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} < 10 + expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} < 10 for: 10m labels: area: kaas @@ -75,7 +75,7 @@ spec: annotations: description: '{{`Persistent volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}' opsrecipe: low-disk-space/#persistent-volume - expr: 100 * ((node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/kubelet.*"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/kubelet.*"}) * on (pod) group_left kube_pod_info{priority_class!="prometheus"}) < 10 + expr: 100 * ((node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/kubelet.*",mountpoint!~".*prometheus/[0-9]+"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/kubelet.*",mountpoint!~".*prometheus/[0-9]+"}) * on (pod) group_left kube_pod_info{priority_class!="prometheus"}) < 10 for: 10m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/disk.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/disk.workload-cluster.rules.yml index b45a7e0a7..68eff7115 100644 ---
a/helm/prometheus-rules/templates/alerting-rules/disk.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/disk.workload-cluster.rules.yml @@ -26,7 +26,7 @@ spec: annotations: description: '{{`Etcd volume {{ $labels.mountpoint}} on {{ $labels.instance }} does not have enough free space.`}}' opsrecipe: low-disk-space/#etcd-volume - expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd"} < 10 + expr: 100 * node_filesystem_free_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} / node_filesystem_size_bytes{mountpoint=~"(/rootfs)?/var/lib/etcd", provider!~"eks"} < 10 for: 10m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml index 04902f685..4d54a993c 100644 --- a/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/etcd.management-cluster.rules.yml @@ -15,7 +15,7 @@ spec: annotations: description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}' opsrecipe: etcd-high-commit-duration/ - expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster"}[5m])) > 1.0 + expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="management_cluster", provider!~"eks"}[5m])) > 1.0 for: 15m labels: area: kaas @@ -27,7 +27,7 @@ spec: annotations: description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}' opsrecipe: etcd-db-size-too-large/ - expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster"}) * 100 > 80 + expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="management_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="management_cluster", provider!~"eks"}) * 100 > 80 for: 90m labels: area: kaas @@ -38,7 +38,7 @@ spec: - alert: ManagementClusterEtcdNumberOfLeaderChangesTooHigh annotations: description: '{{`Etcd has too many leader changes.`}}' - expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster"}[1h]) > 8 + expr: increase(etcd_server_leader_changes_seen_total{cluster_type="management_cluster", provider!~"eks"}[1h]) > 8 labels: area: kaas cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} @@ -49,7 +49,7 @@ spec: annotations: description: '{{`Etcd has no leader.`}}' opsrecipe: etcd-has-no-leader/ - expr: etcd_server_has_leader{role=~"master|control-plane", cluster_type="management_cluster"} == 0 + expr: etcd_server_has_leader{cluster_type="management_cluster", provider!~"eks"} == 0 for: 5m labels: area: kaas @@ -61,7 +61,7 @@ spec: annotations: description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}' opsrecipe: etcd-metrics-missing/ - expr: count(up{cluster_type="management_cluster"}) by (cluster_id) unless count(etcd_server_id) by (cluster_id) + expr: count(up{cluster_type="management_cluster", provider!~"eks"}) by (cluster_id) unless count(etcd_server_id{provider!~"eks"}) by (cluster_id) for: 1h labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml index 127cd03d0..882221e39 100644 --- a/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/etcd.workload-cluster.rules.yml @@ -15,7 +15,7 @@ spec: annotations: description: '{{`Etcd ({{ $labels.instance }}) on workload cluster {{ $labels.cluster_id }} is down.`}}' opsrecipe: etcd-down/ - expr: up{cluster_type="workload_cluster", app="etcd"} == 0 + expr: up{cluster_type="workload_cluster", app="etcd", provider!~"eks"} == 0 for: 20m labels: area: kaas @@ -31,7 +31,7 @@ spec: annotations: description: '{{`Etcd ({{ $labels.instance }}) has a too high commit duration.`}}' opsrecipe: etcd-high-commit-duration/ - expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster"}[5m])) > 1.0 + expr: histogram_quantile(0.95, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster_type="workload_cluster", provider!~"eks"}[5m])) > 1.0 for: 15m labels: area: kaas @@ -43,7 +43,7 @@ spec: annotations: description: '{{`Etcd ({{ $labels.instance }}) has a too large database.`}}' opsrecipe: etcd-db-size-too-large/ - expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster"}) * 100 > 80 + expr: (etcd_mvcc_db_total_size_in_bytes{cluster_type="workload_cluster", provider!~"eks"} / etcd_server_quota_backend_bytes{cluster_type="workload_cluster", provider!~"eks"}) * 100 > 80 for: 15m labels: area: kaas @@ -54,7 +54,7 @@ spec: - alert: WorkloadClusterEtcdNumberOfLeaderChangesTooHigh annotations: description: '{{`Etcd has too many leader changes.`}}' - expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster"}[1h]) > 8 + expr: increase(etcd_server_leader_changes_seen_total{cluster_type="workload_cluster", provider!~"eks"}[1h]) > 8 labels: area: kaas severity: notify @@ -64,7 +64,7 @@ spec: annotations: description: '{{`Etcd has no leader.`}}' opsrecipe: etcd-has-no-leader/ - expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail"} == 0 + expr: etcd_server_has_leader{cluster_type="workload_cluster", container!~"loki|promtail", provider!~"eks"} == 0 for: 35m labels: area: kaas @@ -76,7 +76,7 @@ spec: annotations: description: '{{`Etcd metrics missing for {{ $labels.cluster_id }}.`}}' opsrecipe: etcd-metrics-missing/ - expr: count(up{cluster_type="workload_cluster"}) by (cluster_id) unless count(etcd_server_id) by (cluster_id) + expr: count(up{cluster_type="workload_cluster", provider!~"eks"}) by (cluster_id) unless 
count(etcd_server_id{provider!~"eks"}) by (cluster_id) for: 1h labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/gcp.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/gcp.management-cluster.rules.yml index 5343b879e..a187f8c27 100644 --- a/helm/prometheus-rules/templates/alerting-rules/gcp.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/gcp.management-cluster.rules.yml @@ -45,7 +45,7 @@ spec: topic: kubernetes - alert: ManagementClusterDeploymentMissingGCP annotations: - description: '{{`Deployment {{ $labels.workload_name }} is missing.`}}' + description: '{{`Deployment {{ $labels.deployment }} is missing.`}}' opsrecipe: management-cluster-deployment-is-missing/ expr: absent(kube_deployment_status_condition{namespace=~"giantswarm|kube-system", condition="Available", deployment=~"capi-controller-manager.*|capg-controller-manager.*|capi-kubeadm-bootstrap-controller-manager.*|capi-kubeadm-control-plane-controller-manager.*|dns-operator-gcp.*|.*workload-identity-operator-gcp.*"}) for: 5m diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml index 437c09f36..2fe54e1af 100644 --- a/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.prometheus-agent.rules.yml @@ -12,14 +12,31 @@ spec: - name: inhibit.prometheus-agent rules: # this inhibition fires when a cluster is not running prometheus-agent. - # If we have prometheus-agent statefulset, it means prometheus-agent is installed on this cluster - # so, raise an inhibition unless prometheus-agent runs on the cluster + # we retrieve the list of existing cluster IDs from `kube_namespace_created` + # excluding the MC's one, because it's always using prometheus-agent and namespace is not named after cluster name + # then compare it with the list of deployed prometheus-agents from `app_operator_app_info` # - # Will produce data (and inhibitions) on MC/WC. 
+ # Will only produce data (and inhibitions) on MC because it's where app_operator is running + # but that's enough to have the inhibitions on the installation-global alertmanager - alert: InhibitionClusterIsNotRunningPrometheusAgent annotations: description: '{{`Cluster ({{ $labels.cluster_id }}) is not running Prometheus Agent.`}}' - expr: (count by (cluster_id) (prometheus_build_info{app="prometheus"}) unless count by (cluster_id) (kube_statefulset_created{namespace="kube-system",statefulset=~"prometheus-prometheus-agent.*"} > 0)) + expr: |- + count( + label_replace( + sum_over_time( + kube_namespace_created{namespace!="{{ .Values.managementCluster.name }}-prometheus", namespace=~".+-prometheus"}[5m] + ), "cluster_id", "$1", "namespace", "(.+)-prometheus" + ) + ) by (cluster_id) + unless + count( + label_replace( + sum_over_time( + app_operator_app_info{app="prometheus-agent"}[5m] + ), "cluster_id", "$1", "namespace", "(.*)" + ) + ) by (cluster_id) labels: cluster_is_not_running_prometheus_agent: "true" area: empowerment diff --git a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml index 97c934854..e635ae988 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml @@ -10,10 +10,77 @@ spec: groups: - name: kube-state-metrics rules: + - alert: KubeStateMetricsDown + annotations: + description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}' + opsrecipe: kube-state-metrics-down/ + expr: |- + ( + # modern clusters + label_replace(up{app="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",instance=~".*:8080"} == 1) + ) + and + ( + # vintage clusters without servicemonitor + label_replace(up{app="kube-state-metrics",container=""}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics",container=""} == 1) + ) + for: 15m + labels: + area: kaas + cancel_if_apiserver_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + severity: page + team: atlas + topic: observability + - alert: KubeStateMetricsSlow + annotations: + description: '{{`KubeStateMetrics ({{ $labels.instance }}) is too slow.`}}' + opsrecipe: kube-state-metrics-down/ + expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{handler="metrics", job="kube-state-metrics"}[5m])) by (le, cluster_id)) > 7 + for: 15m + labels: + area: kaas + cancel_if_apiserver_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + inhibit_kube_state_metrics_down: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - alert: KubeStateMetricsNotRetrievingMetrics + annotations: + description: '{{`KubeStateMetrics ({{ $labels.instance }}) is not retrieving metrics.`}}' + opsrecipe: kube-state-metrics-down/ + expr: |- + # When it looks up but we don't have metrics + count({app="kube-state-metrics"}) < 10 + for: 20m + labels: + area: kaas + 
cancel_if_apiserver_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + inhibit_kube_state_metrics_down: "true" + cancel_if_kubelet_down: "true" + cancel_if_kube_state_metrics_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' - opsrecipe: kube-steate-metrics-metrics-missing/ + opsrecipe: kube-state-metrics-down/ expr: absent(kube_configmap_created{}) for: 30m labels: @@ -26,7 +93,7 @@ spec: - alert: KubeDaemonSetCreatedMetricMissing annotations: description: '{{`kube_daemonset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' - opsrecipe: kube-steate-metrics-metrics-missing/ + opsrecipe: kube-state-metrics-down/ expr: absent(kube_daemonset_created{}) for: 30m labels: @@ -39,7 +106,7 @@ spec: - alert: KubeDeploymentCreatedMetricMissing annotations: description: '{{`kube_deployment_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' - opsrecipe: kube-steate-metrics-metrics-missing/ + opsrecipe: kube-state-metrics-down/ expr: absent(kube_deployment_created{}) for: 30m labels: @@ -52,7 +119,7 @@ spec: - alert: KubeEndpointCreatedMetricMissing annotations: description: '{{`kube_endpoint_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' - opsrecipe: kube-steate-metrics-metrics-missing/ + opsrecipe: kube-state-metrics-down/ expr: absent(kube_endpoint_created{}) for: 30m labels: @@ -65,7 +132,7 @@ spec: - alert: KubeNamespaceCreatedMetricMissing annotations: description: '{{`kube_namespace_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' - opsrecipe: kube-steate-metrics-metrics-missing/ + opsrecipe: kube-state-metrics-down/ expr: absent(kube_namespace_created{}) for: 30m labels: @@ -78,7 +145,7 @@ spec: - alert: KubeNodeCreatedMetricMissing annotations: description: '{{`kube_node_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' - opsrecipe: kube-steate-metrics-metrics-missing/ + opsrecipe: kube-state-metrics-down/ expr: absent(kube_node_created{}) for: 30m labels: @@ -91,7 +158,7 @@ spec: - alert: KubePodCreatedMetricMissing annotations: description: '{{`kube_pod_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' - opsrecipe: kube-steate-metrics-metrics-missing/ + opsrecipe: kube-state-metrics-down/ expr: absent(kube_pod_created{}) for: 30m labels: @@ -104,7 +171,7 @@ spec: - alert: KubeReplicaSetCreatedMetricMissing annotations: description: '{{`kube_replicaset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' - opsrecipe: kube-steate-metrics-metrics-missing/ + opsrecipe: kube-state-metrics-down/ expr: absent(kube_replicaset_created{}) for: 30m labels: @@ -117,7 +184,7 @@ spec: - alert: KubeSecretCreatedMetricMissing annotations: description: '{{`kube_secret_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' - opsrecipe: kube-steate-metrics-metrics-missing/ + opsrecipe: kube-state-metrics-down/ expr: absent(kube_secret_created{}) for: 30m labels: @@ -130,7 +197,7 @@ spec: - alert: KubeServiceCreatedMetricMissing annotations: description: '{{`kube_service_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' - opsrecipe: kube-steate-metrics-metrics-missing/ + opsrecipe: kube-state-metrics-down/ expr: absent(kube_service_created{}) for: 30m 
labels: diff --git a/helm/prometheus-rules/templates/alerting-rules/kvm.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kvm.workload-cluster.rules.yml index 0473b62aa..43a21f921 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kvm.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kvm.workload-cluster.rules.yml @@ -17,7 +17,7 @@ spec: description: '{{`Critical pod {{ $labels.namespace }}/{{ $labels.pod }} is not running.`}}' opsrecipe: critical-pod-is-not-running/ expr: kube_pod_container_status_running{container=~"(k8s-api-server|k8s-controller-manager|k8s-scheduler)"} != 1 - for: 5m + for: 15m labels: area: kaas severity: page diff --git a/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml new file mode 100644 index 000000000..ece840e2d --- /dev/null +++ b/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml @@ -0,0 +1,76 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + creationTimestamp: null + labels: + {{- include "labels.common" . | nindent 4 }} + name: kyverno.all.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: webhooks + rules: + - alert: KyvernoWebhookHasNoAvailableReplicas + annotations: + description: 'Kyverno has no available replicas but webhooks are present.' + opsrecipe: kyverno-webhooks/ + expr: sum(kube_validatingwebhookconfiguration_info{validatingwebhookconfiguration=~"kyverno-.*"}) > 0 and sum(kube_deployment_status_replicas{deployment=~"kyverno|kyverno-admission-controller"}) == 0 + for: 15m + labels: + area: managedservices + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "false" + severity: page + team: shield + topic: kyverno + - name: reports + rules: + - alert: KyvernoAdmissionReportCountTooHigh + annotations: + description: "{{`Kyverno {{ $labels.kind }} count is too high. This is an indicator that Kyverno's report processing may not be keeping up with cluster demand.`}}" + opsrecipe: kyverno-reports/ + expr: aggregation:kyverno_resource_counts{kind=~"(clusteradmissionreports|admissionreports).kyverno.io"} > 50000 + for: 15m + labels: + area: managedservices + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "false" + severity: page + team: shield + topic: kyverno + - name: replicas + rules: + - alert: KyvernoScaledDownTooLong + annotations: + description: 'Kyverno has been scaled down for too long.' + opsrecipe: kyverno-scaled-down/ + expr: sum(kube_deployment_spec_replicas{deployment=~"kyverno|kyverno-kyverno-plugin|kyverno-policy-reporter"}) == 0 + for: 4h + labels: + area: managedservices + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: shield + topic: kyverno + - alert: KyvernoUnsafeReplicaCount + annotations: + description: "Kyverno's admission controller deployment must use at least 3 replicas, or be scaled to 0."
+ opsrecipe: KyvernoWronglyScaled/ + expr: sum(kube_deployment_spec_replicas{deployment="kyverno"}) != 0 and sum(kube_deployment_spec_replicas{deployment="kyverno"}) < 3 + for: 1h + labels: + area: managedservices + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: shield + topic: kyverno diff --git a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml index e3a198d18..54399dc3f 100644 --- a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml @@ -27,7 +27,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: "true" severity: page team: atlas topic: observability @@ -44,7 +44,7 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: "true" severity: page team: atlas topic: observability diff --git a/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml index deea00f0b..4a7cc08e8 100644 --- a/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml @@ -16,7 +16,7 @@ spec: annotations: description: '{{`Management cluster {{ $labels.cluster_id }} has less than 3 nodes.`}}' opsrecipe: management-cluster-less-than-three-workers/ - expr: sum(kubelet_node_name{cluster_type="management_cluster", role="worker"}) < 3 + expr: sum(kubelet_node_name{cluster_type="management_cluster"} * on (node) kube_node_role{role="worker"}) < 3 for: 1h labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/network.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/network.all.rules.yml index cdbde150c..7c41ed2e9 100644 --- a/helm/prometheus-rules/templates/alerting-rules/network.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/network.all.rules.yml @@ -74,7 +74,7 @@ spec: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: rocket + team: {{ include "providerTeam" . }} topic: network - alert: HighNumberOfOrphanedSockets annotations: @@ -87,5 +87,5 @@ spec: area: kaas cancel_if_outside_working_hours: "true" severity: page - team: rocket + team: {{ include "providerTeam" . 
}} topic: network diff --git a/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml index debf9b5f2..34ea70022 100644 --- a/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml @@ -137,7 +137,7 @@ spec: / (node_memory_MemTotal_bytes{cluster_type="workload_cluster"} ) * 100 ) - ) > 90 + ) > 80 and ( node_memory_MemFree_bytes{cluster_type="workload_cluster"} + node_memory_Cached_bytes{cluster_type="workload_cluster"} diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml index 3946e893b..042e2175a 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml @@ -18,7 +18,16 @@ spec: summary: Prometheus agent fails to send samples to remote write endpoint. opsrecipe: prometheus-agent-remote-write-failed/ dashboard: promRW001/prometheus-remote-write - expr: count(absent_over_time(up{instance="prometheus-agent"}[10m])) and count((present_over_time(kube_statefulset_created{namespace="kube-system",statefulset=~"prometheus-prometheus-agent.*"}[10m]))) + # expr: count(absent_over_time(up{instance="prometheus-agent"}[10m])) + expr: |- + max_over_time( + sum by (cluster_type, cluster_id, installation, instance, service) + ( + up{instance="prometheus-agent"} == 0 + or + absent(up{instance="prometheus-agent"}) == 1 + )[5m:] + ) for: 10m labels: area: empowerment @@ -29,7 +38,6 @@ spec: cancel_if_cluster_is_not_running_prometheus_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" ## Page Atlas if prometheus agent is missing shards to send samples to MC prometheus. - alert: PrometheusAgentShardsMissing annotations: @@ -37,20 +45,25 @@ spec: summary: Prometheus agent is missing shards. 
opsrecipe: prometheus-agent-missing-shards/ expr: |- - count( - ## number of remotes that are not mimir or grafana-cloud - prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir"} - ) != ( - ## number of shards defined in the Prometheus CR - prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - or ( - # if there is only 1 shard, there is no shard metric so we use the replicas metric - absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}) - and on(controller, name) - prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + max_over_time(sum( + count( + ## number of remotes that are not mimir or grafana-cloud + prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir"} ) - ) - for: 30m + != + sum( + ## number of shards defined in the Prometheus CR + prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} + or + ( + # if there is only 1 shard, there is no shard metric so we use the replicas metric + absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}) + and on(controller, name) + prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) + ) + )[5m:]) + for: 10m labels: area: empowerment severity: page diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-availability.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-availability.rules.yml deleted file mode 100644 index 7d772ac4f..000000000 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-availability.rules.yml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - creationTimestamp: null - labels: - {{- include "labels.common" . | nindent 4 }} - cluster_type: "management_cluster" - name: prometheus-availability.rules - namespace: {{ .Values.namespace }} -spec: - groups: - - name: prometheus - rules: - - alert: PrometheusAvailabilityRatio - annotations: - description: '{{`Prometheus {{$labels.pod}} has availability ratio of {{ printf "%.2f" $value }} (min 0.8) over the last hour.`}}' - opsrecipe: prometheus-resource-limit-reached/ - dashboard: promavailability/prometheus-availability - expr: label_replace(avg(avg_over_time(kube_pod_status_ready{namespace=~"(.*)-prometheus", condition="true"}[1h])) by (pod), "cluster_id", "$1", "pod", "prometheus-(.+)-(.+)") < 0.8 - # At startup, availability starts at 0 for a few minutes. So ratio grows slowly from 0. 
- for: 30m - labels: - area: empowerment - cancel_if_any_apiserver_down: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml index 16415a861..446a397c3 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-meta-operator.rules.yml @@ -60,6 +60,7 @@ spec: annotations: description: '{{`prometheus-meta-operator controller {{ $labels.controller }} too many reconcile errors.`}}' opsrecipe: "pmo-reconcile-errors/" + dashboard: piJK9Vm4z/operatorkit expr: | avg_over_time(operatorkit_controller_errors_total{app="prometheus-meta-operator"}[20m]) > 0 for: 1h @@ -67,7 +68,7 @@ spec: area: "empowerment" cancel_if_mc_kube_state_metrics_down: "false" cancel_if_cluster_status_creating: "true" - cancel_if_outside_working_hours: "true" + cancel_if_outside_working_hours: "true" installation: {{ .Values.managementCluster.name }} severity: "page" team: "atlas" diff --git a/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml b/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml index d7d395662..53bc5e938 100644 --- a/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml @@ -14,6 +14,7 @@ spec: annotations: description: '{{`Service level burn rate is too high for {{ $labels.service }} service.`}}' opsrecipe: service-level-burn-rate-too-high/ + dashboard: https://giantswarm.grafana.net/d/service-level/service-level?orgId=1 expr: | ( slo_errors_per_request:ratio_rate1h{service!~"efk-.*|.*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*"} > on (service) group_left slo_threshold_high diff --git a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml index 76046e152..edc29786e 100644 --- a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml @@ -25,6 +25,7 @@ spec: cancel_if_kubelet_down: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} + cancel_if_prometheus_agent_down: "true" severity: notify team: honeybadger topic: releng @@ -41,24 +42,7 @@ spec: cancel_if_kubelet_down: "true" cancel_if_cluster_has_no_workers: "true" cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - - alert: KubeStateMetricsDown - annotations: - description: '{{`KubeStateMetrics ({{ $labels.instance }}) is down.`}}' - opsrecipe: kube-state-metrics-down/ - expr: label_replace(up{app="kube-state-metrics"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{app="kube-state-metrics"} == 1) - for: 15m - labels: - area: kaas - cancel_if_apiserver_down: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_has_no_workers: "true" - inhibit_kube_state_metrics_down: "true" - cancel_if_kubelet_down: "true" - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} + cancel_if_prometheus_agent_down: "true" severity: page team: atlas topic: observability diff --git a/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml b/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml index 7bf9d148e..1707c4360 100644 --- a/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/vault.rules.yml @@ -17,7 +17,7 @@ spec: description: '{{`Vault is down.`}}' opsrecipe: vault-is-down/ expr: vault_up == 0 - for: 20m + for: 40m labels: area: kaas cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} @@ -57,6 +57,7 @@ spec: labels: area: kaas cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} + cancel_if_prometheus_agent_down: "true" severity: page team: {{ include "providerTeam" . }} topic: vault diff --git a/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml new file mode 100644 index 000000000..8f02dcf68 --- /dev/null +++ b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml @@ -0,0 +1,31 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "labels.common" . | nindent 4 }} + name: vpa.all.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: vpa + rules: + - alert: VpaComponentTooManyRestarts + annotations: + description: One of the VPA components has restarted too frequently during the last 10 minutes.
+ opsrecipe: vpa-component-too-many-restarts/ + expr: | + 1 - sum(increase(kube_pod_container_status_restarts_total{container=~"recommender|updater|admission-controller"}[10m])) by (container, cluster_id, cluster_type, customer, installation, pipeline, provider, region)/100 < 0.98 + or + 1 - sum(increase(kube_pod_container_status_restarts_total{container="vertical-pod-autoscaler-app"}[10m])) by (container, cluster_id, cluster_type, customer, installation, pipeline, provider, region)/100 < 0.98 + for: 10m + labels: + area: kaas + cancel_if_apiserver_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_scrape_timeout: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: phoenix + topic: observability diff --git a/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml index fb46e75be..e1bdfc241 100644 --- a/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml @@ -275,13 +275,6 @@ spec: record: aggregation:prometheus:memory_percentage - expr: sum(label_replace(container_memory_working_set_bytes{container='prometheus', namespace=~'.*-prometheus'}, "cluster_id", "$2", "pod", "(prometheus-)(.+)(-.+)")) by (cluster_type , cluster_id) record: aggregation:prometheus:memory_usage - - name: managed-apps.grafana-cloud.recording - rules: - # Managed apps basic SLI metrics - - expr: sum(monitoring:managed_apps:service_level:primary:error_budget_used) by (cluster_type, cluster_id,workload_name,workload_type) >= 1 - record: aggregation:managed_apps:service_level:basic:error_budget_depleted - - expr: sum(monitoring:managed_apps:service_level:primary:error_budget_used) by (cluster_type, cluster_id,workload_name,workload_type) >= 0.75 - record: aggregation:managed_apps:service_level:basic:error_budget_low - name: dex.grafana-cloud.recording rules: # Dex activity and status based on ingress controller data @@ -314,7 +307,10 @@ spec: # Kyverno-related resource counts by kind - expr: sum(etcd_kubernetes_resources_count{kind=~".*.kyverno.io|clusterpolicyreports.wgpolicyk8s.io|policyreports.wgpolicyk8s.io"}) by (cluster_id, kind) record: aggregation:kyverno_resource_counts - # Kyverno policy status by team - Deployments + # Kyverno policy enforcement status + - expr: sum(kyverno_policy) by (background, category, kind, policy, rule, type, validationFailureAction) + record: aggregation:kyverno_policy_status + # Kyverno policy workload status by team - Deployments - expr: |- label_join( sum( @@ -322,21 +318,17 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"Deployment" - }, "deployment", ",", "name") - ) by (deployment, category, policy, status) - * on(deployment) group_left(team, app) - sum( - sum( - label_join(kube_deployment_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (deployment, app) - * on(app) group_left(team) - sum( - app_operator_app_info{team!="noteam"} - ) by (app, team) - ) by (team, deployment, app), + }, "deployment", ",", "name") + ) by (deployment, category, policy, status) + * on(deployment) group_left(team, app) sum( + label_join(label_join(kube_deployment_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", 
"label_application_giantswarm_io_team") + ) by (team, app, deployment), "name", ",", "deployment") record: aggregation:kyverno_policy_deployment_status_team - # Kyverno policy status by team - DaemonSets + # Kyverno policy workload status by team - DaemonSets - expr: |- label_join( sum( @@ -344,21 +336,17 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"DaemonSet" - }, "daemonset", ",", "name") - ) by (daemonset, category, policy, status) - * on(daemonset) group_left(team, app) - sum( - sum( - label_join(kube_daemonset_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (daemonset, app) - * on(app) group_left(team) - sum( - app_operator_app_info{team!="noteam"} - ) by (app, team) - ) by (team, daemonset, app), + }, "daemonset", ",", "name") + ) by (daemonset, category, policy, status) + * on(daemonset) group_left(team, app) sum( + label_join(label_join(kube_daemonset_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, daemonset), "name", ",", "daemonset") record: aggregation:kyverno_policy_daemonset_status_team - # Kyverno policy status by team - StatefulSets + # Kyverno policy workload status by team - StatefulSets - expr: |- label_join( sum( @@ -366,21 +354,17 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"StatefulSet" - }, "statefulset", ",", "name") - ) by (statefulset, category, policy, status) - * on(statefulset) group_left(team, app) - sum( - sum( - label_join(kube_statefulset_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (statefulset, app) - * on(app) group_left(team) - sum( - app_operator_app_info{team!="noteam"} - ) by (app, team) - ) by (team, statefulset, app), + }, "statefulset", ",", "name") + ) by (statefulset, category, policy, status) + * on(statefulset) group_left(team, app) sum( + label_join(label_join(kube_statefulset_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, statefulset), "name", ",", "statefulset") record: aggregation:kyverno_policy_statefulset_status_team - # Kyverno policy status by team - Job + # Kyverno policy workload status by team - Job - expr: |- label_join( sum( @@ -388,21 +372,17 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", kind=~"Job" - }, "job", ",", "name") - ) by (job, category, policy, status) - * on(job) group_left(team, app) - sum( - sum( - label_join(kube_job_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (job, app) - * on(app) group_left(team) - sum( - app_operator_app_info{team!="noteam"} - ) by (app, team) - ) by (team, job, app), + }, "job", ",", "name") + ) by (job, category, policy, status) + * on(job) group_left(team, app) sum( + label_join(label_join(kube_job_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, job), "name", ",", "job") record: aggregation:kyverno_policy_job_status_team - # Kyverno policy status by team - CronJob + # Kyverno policy workload status by team - CronJob - expr: |- label_join( sum( @@ -410,18 +390,14 @@ spec: policy!="check-deprecated-apis-1-25", cluster_type="management_cluster", 
kind=~"CronJob" - }, "cronjob", ",", "name") - ) by (cronjob, category, policy, status) - * on(cronjob) group_left(team, app) - sum( - sum( - label_join(kube_cronjob_labels{}, "app", ",", "label_app_kubernetes_io_name") - ) by (cronjob, app) - * on(app) group_left(team) - sum( - app_operator_app_info{team!="noteam"} - ) by (app, team) - ) by (team, cronjob, app), + }, "cronjob", ",", "name") + ) by (cronjob, category, policy, status) + * on(cronjob) group_left(team, app) sum( + label_join(label_join(kube_cronjob_labels{ + cluster_type="management_cluster", + label_application_giantswarm_io_team!="" + }, "app", ",", "label_app_kubernetes_io_name"), "team", ",", "label_application_giantswarm_io_team") + ) by (team, app, cronjob), "name", ",", "cronjob") record: aggregation:kyverno_policy_cronjob_status_team - name: starboard.grafana-cloud.recording diff --git a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml index 5d3f88ae9..6c1b732cf 100644 --- a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml @@ -15,6 +15,7 @@ spec: class: HIGH area: kaas service: api-server + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: raw_slo_requests # The first statement ensures that an api-server error is counted if the kubernetes api is not up for a specific cluster. # The next statement returns 1 for a cluster with "updated", "created" or unknown (absent) status. @@ -36,16 +37,18 @@ spec: labels: area: kaas service: api-server + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: slo_target # -- daemonset - expr: | label_replace( - kube_daemonset_status_desired_number_scheduled{namespace=~"giantswarm|kube-system", daemonset=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|kiam-server|kiam-agent|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"}, - "service", "$1", "workload_name", "(.*)" ) + kube_daemonset_status_desired_number_scheduled{namespace=~"giantswarm|kube-system", daemonset=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"}, + "service", "$1", "daemonset", "(.*)" ) labels: class: MEDIUM area: kaas + label_application_giantswarm_io_team: {{ include "providerTeam" . 
}} record: raw_slo_requests # -- the errors are counted as follows: # -- pods in a daemonset that are UNAVAILABLE NOW and have been UNAVAILABLE 10 MINUTES AGO @@ -54,12 +57,12 @@ spec: ( ( label_replace( - kube_daemonset_status_number_unavailable{namespace=~"giantswarm|kube-system", daemonset=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|kiam-server|kiam-agent|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"}, - "service", "$1", "workload_name", "(.*)" ) > 0 + kube_daemonset_status_number_unavailable{namespace=~"giantswarm|kube-system", daemonset=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"}, + "service", "$1", "daemonset", "(.*)" ) > 0 and on (daemonset, node) label_replace( - kube_daemonset_status_number_unavailable{namespace=~"giantswarm|kube-system", daemonset=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|kiam-server|kiam-agent|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"} offset 10m, - "service", "$1", "workload_name", "(.*)" ) > 0 + kube_daemonset_status_number_unavailable{namespace=~"giantswarm|kube-system", daemonset=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"} offset 10m, + "service", "$1", "daemonset", "(.*)" ) > 0 ) and on (node) kube_node_spec_unschedulable == 0 @@ -72,17 +75,19 @@ spec: record: raw_slo_errors # -- 99% availability # -- this expression collects all the daemonsets and assigns the same slo target to all of them - - expr: sum by (service, area) (raw_slo_errors{area="kaas", service=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|kiam-server|kiam-agent|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"} - raw_slo_errors{area="kaas", service=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|kiam-server|kiam-agent|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"}) + 1-0.99 + - expr: sum by (service, area) (raw_slo_errors{area="kaas", service=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"} - raw_slo_errors{area="kaas", service=~"aws-node|aws-cloud-controller-manager|ebs-csi-node|calico-node|cert-exporter|kube-proxy|net-exporter|node-exporter|azure-cloud-controller-manager|azure-cloud-node-manager|azure-scheduled-events|csi-azuredisk-node|cilium"}) + 1-0.99 labels: area: kaas + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: slo_target # -- kubelet whole cluster - - expr: "kube_node_status_condition{condition='Ready',status='true'}" + - expr: "kube_node_status_condition" labels: class: MEDIUM area: kaas service: kubelet + label_application_giantswarm_io_team: {{ include "providerTeam" . 
}} record: raw_slo_requests - expr: | ( @@ -96,12 +101,14 @@ spec: area: kaas class: MEDIUM service: kubelet + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: raw_slo_errors # -- 99% availability - expr: "vector((1 - 0.99))" labels: area: kaas service: kubelet + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: slo_target # kubelet - single nodepool @@ -109,6 +116,7 @@ spec: labels: area: kaas class: MEDIUM + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: raw_slo_requests - expr: | @@ -122,12 +130,14 @@ spec: labels: area: kaas class: MEDIUM + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: raw_slo_errors - expr: | label_replace(max by (nodepool) (kube_node_labels{nodepool=~".+"}), "service", "kubelet nodepool $1", "nodepool", "(.+)") * (1 - 0.99) labels: area: kaas + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: slo_target # -- node-exporter @@ -137,6 +147,7 @@ spec: class: MEDIUM area: kaas service: node-exporter + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: raw_slo_requests # record of number of node-exporters that are down. # up == 1 when node-exporters are up, and up == 0 when node-exporters are down - @@ -149,12 +160,14 @@ spec: area: kaas class: MEDIUM service: node-exporter + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: raw_slo_errors # -- 99% availability - expr: "vector((1 - 0.99))" labels: area: kaas service: node-exporter + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: slo_target # -- managed-apps @@ -164,9 +177,9 @@ spec: kube_daemonset_labels * on (daemonset, namespace) group_right(label_application_giantswarm_io_team) ( label_replace( kube_daemonset_status_number_unavailable - and on(daemonset,cluster_id,cluster_type,namespace,workload_name) + and on(daemonset,cluster_id,cluster_type,namespace) kube_daemonset_labels{label_giantswarm_io_monitoring_basic_sli='true'}, - "service", "$1", "workload_name", "(.*)" ) + "service", "$1", "daemonset", "(.*)" ) ) labels: class: MEDIUM @@ -177,9 +190,9 @@ spec: kube_deployment_labels * on (deployment, namespace) group_right(label_application_giantswarm_io_team) ( label_replace( kube_deployment_status_replicas_unavailable - and on(deployment,cluster_id,cluster_type,namespace, workload_name) + and on(deployment,cluster_id,cluster_type,namespace) kube_deployment_labels{label_giantswarm_io_monitoring_basic_sli='true'}, - "service", "$1", "workload_name", "(.*)" ) + "service", "$1", "deployment", "(.*)" ) ) labels: class: MEDIUM @@ -190,9 +203,9 @@ spec: kube_statefulset_labels * on (statefulset, namespace) group_right(label_application_giantswarm_io_team) ( label_replace( kube_statefulset_status_replicas - kube_statefulset_status_replicas_current - and on(statefulset,cluster_id,cluster_type,namespace, workload_name) + and on(statefulset,cluster_id,cluster_type,namespace) kube_statefulset_labels{label_giantswarm_io_monitoring_basic_sli='true'}, - "service", "$1", "workload_name", "(.*)" ) + "service", "$1", "statefulset", "(.*)" ) ) labels: class: MEDIUM @@ -203,9 +216,9 @@ spec: kube_daemonset_labels * on (daemonset, namespace) group_right(label_application_giantswarm_io_team) ( label_replace( kube_daemonset_status_desired_number_scheduled - and on(daemonset,cluster_id,cluster_type,namespace, workload_name) + and on(daemonset,cluster_id,cluster_type,namespace) 
kube_daemonset_labels{label_giantswarm_io_monitoring_basic_sli='true'}, - "service", "$1", "workload_name", "(.*)" ) + "service", "$1", "daemonset", "(.*)" ) ) labels: class: MEDIUM @@ -215,9 +228,9 @@ spec: kube_deployment_labels * on (deployment, namespace) group_right(label_application_giantswarm_io_team) ( label_replace( kube_deployment_status_replicas - and on(deployment,cluster_id,cluster_type,namespace, workload_name) + and on(deployment,cluster_id,cluster_type,namespace) kube_deployment_labels{label_giantswarm_io_monitoring_basic_sli='true'}, - "service", "$1", "workload_name", "(.*)" ) + "service", "$1", "deployment", "(.*)" ) ) labels: class: MEDIUM @@ -227,9 +240,9 @@ spec: kube_statefulset_labels * on (statefulset, namespace) group_right(label_application_giantswarm_io_team) ( label_replace( kube_statefulset_status_replicas - and on(statefulset,cluster_id,cluster_type,namespace, workload_name) + and on(statefulset,cluster_id,cluster_type,namespace) kube_statefulset_labels{label_giantswarm_io_monitoring_basic_sli='true'}, - "service", "$1", "workload_name", "(.*)" ) + "service", "$1", "statefulset", "(.*)" ) ) labels: class: MEDIUM @@ -246,17 +259,20 @@ spec: labels: class: MEDIUM area: kaas + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: raw_slo_requests # record number of errors. - expr: label_replace(sum(rest_client_requests_total{app=~"kube-controller-manager|kube-scheduler", code=~"5..|"}) by (cluster_id,app), "service", "$1", "app", "(.*)") labels: area: kaas class: MEDIUM + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: raw_slo_errors # -- 99% availability - expr: label_replace(group(rest_client_requests_total{app=~"kube-controller-manager|kube-scheduler"}) by (app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99 labels: area: kaas + label_application_giantswarm_io_team: {{ include "providerTeam" . }} record: slo_target # core k8s components azure API requests @@ -265,17 +281,20 @@ spec: labels: class: MEDIUM area: kaas + label_application_giantswarm_io_team: phoenix record: raw_slo_requests # record number of errors. - expr: label_replace(sum(cloudprovider_azure_api_request_errors{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id,app), "service", "$1", "app", "(.*)") labels: area: kaas class: MEDIUM + label_application_giantswarm_io_team: phoenix record: raw_slo_errors # -- 99% availability - expr: label_replace(group(cloudprovider_azure_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99 labels: area: kaas + label_application_giantswarm_io_team: phoenix record: slo_target # core k8s components aws API requests @@ -284,17 +303,20 @@ spec: labels: class: MEDIUM area: kaas + label_application_giantswarm_io_team: phoenix record: raw_slo_requests # record number of errors. 
- expr: label_replace(sum(cloudprovider_aws_api_request_errors{app=~"kube-controller-manager|kube-scheduler"}) by (cluster_id,app), "service", "$1", "app", "(.*)") labels: area: kaas class: MEDIUM + label_application_giantswarm_io_team: phoenix record: raw_slo_errors # -- 99% availability - expr: label_replace(group(cloudprovider_aws_api_request_duration_seconds_count{app=~"kube-controller-manager|kube-scheduler"}) by (app), "service", "$1", "app", "(.*)") * 0 + 1 - 0.99 labels: area: kaas + label_application_giantswarm_io_team: phoenix record: slo_target # -- Managed Prometheus @@ -304,6 +326,7 @@ spec: area: managed-apps class: MEDIUM service: managed-prometheus + label_application_giantswarm_io_team: atlas record: raw_slo_requests # Set SLO error to be 1 when a managed prometheus is down. - expr: (up{app="prometheus-operator-app-prometheus",container="prometheus"}*-1)+1 == 1 @@ -311,6 +334,7 @@ spec: area: managed-apps class: MEDIUM service: managed-prometheus + label_application_giantswarm_io_team: atlas record: raw_slo_errors # -- Managed Alertmanager @@ -320,6 +344,7 @@ spec: area: managed-apps class: MEDIUM service: managed-alertmanager + label_application_giantswarm_io_team: atlas record: raw_slo_requests # Set SLO error to be 1 when a managed alertmanager is down. - expr: (up{app="prometheus-operator-app-alertmanager",container="alertmanager"}*-1)+1 == 1 @@ -327,6 +352,7 @@ spec: area: managed-apps class: MEDIUM service: managed-alertmanager + label_application_giantswarm_io_team: atlas record: raw_slo_errors # -- generic stuff @@ -335,9 +361,9 @@ spec: record: slo_burnrate_high - expr: "vector(12)" record: slo_burnrate_low - - expr: sum(raw_slo_requests) by (service, cluster_type, cluster_id, area) + - expr: sum(raw_slo_requests) by (service, cluster_type, cluster_id, area, label_application_giantswarm_io_team) record: slo_requests - - expr: sum(raw_slo_errors) by (service, cluster_type, cluster_id, area) + - expr: sum(raw_slo_errors) by (service, cluster_type, cluster_id, area, label_application_giantswarm_io_team) record: slo_errors - expr: sum(sum_over_time(raw_slo_errors[5m])) by (cluster_type, cluster_id, service, class, area, label_application_giantswarm_io_team) / sum(sum_over_time(raw_slo_requests[5m])) by (cluster_type, cluster_id, service, class, area, label_application_giantswarm_io_team) record: slo_errors_per_request:ratio_rate5m diff --git a/renovate.json b/renovate.json index f4415e61e..111e73b0a 100644 --- a/renovate.json +++ b/renovate.json @@ -6,12 +6,14 @@ "dependencyDashboard": true, "ignorePaths": [ ".github/workflows/zz_generated.*", - ".github/workflows/codeql-analysis.yml" + ".github/workflows/codeql-analysis.yml", + ".github/workflows/pre_commit_*.yaml" ], "ignoreDeps": [ + "actions/setup-go", "architect", - "zricethezav/gitleaks-action", - "actions/setup-go" + "github.com/imdario/mergo", + "zricethezav/gitleaks-action" ], "regexManagers": [ { diff --git a/test/conf/promtool_ignore b/test/conf/promtool_ignore index 5393cce32..3112fea9d 100644 --- a/test/conf/promtool_ignore +++ b/test/conf/promtool_ignore @@ -71,7 +71,6 @@ templates/alerting-rules/systemd.workload-cluster.rules.yml templates/alerting-rules/tiller.all.rules.yml templates/alerting-rules/tiller.workload-cluster.rules.yml templates/alerting-rules/timesync.rules.yml -templates/alerting-rules/up.all.rules.yml templates/alerting-rules/up.management-cluster.rules.yml templates/alerting-rules/vault.rules.yml templates/recording-rules/grafana-cloud.rules.yml diff --git 
a/test/hack/bin/template-chart.sh b/test/hack/bin/template-chart.sh index a9ab7dda6..ce82ecfd9 100755 --- a/test/hack/bin/template-chart.sh +++ b/test/hack/bin/template-chart.sh @@ -17,6 +17,7 @@ main() { "$GIT_WORKDIR"/helm/prometheus-rules \ --set="managementCluster.provider.flavor=${BASH_REMATCH[1]}" \ --set="managementCluster.provider.kind=${BASH_REMATCH[2]}" \ + --set="managementCluster.name=myinstall" \ --output-dir "$GIT_WORKDIR"/test/hack/output/"$provider" done } diff --git a/test/hack/checkLabels/go.mod b/test/hack/checkLabels/go.mod index 7c387dbd1..5bf987e66 100644 --- a/test/hack/checkLabels/go.mod +++ b/test/hack/checkLabels/go.mod @@ -5,14 +5,14 @@ go 1.19 require ( // Try to keep version in sync with our prometheus rule CRD version. // see https://github.com/giantswarm/prometheus-operator-crd/blob/master/helm/prometheus-operator-crd/Chart.yaml#L11 - github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0 + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0 sigs.k8s.io/yaml v1.3.0 ) -require github.com/prometheus/alertmanager v0.25.0 +require github.com/prometheus/alertmanager v0.26.0 require ( - github.com/aws/aws-sdk-go v1.44.156 // indirect + github.com/aws/aws-sdk-go v1.44.317 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/go-kit/kit v0.10.0 // indirect @@ -26,30 +26,30 @@ require ( github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/kr/pretty v0.3.0 // indirect + github.com/kr/pretty v0.3.1 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/prometheus/client_golang v1.14.0 // indirect - github.com/prometheus/client_model v0.3.0 // indirect - github.com/prometheus/common v0.38.0 // indirect + github.com/prometheus/client_golang v1.15.1 // indirect + github.com/prometheus/client_model v0.4.0 // indirect + github.com/prometheus/common v0.44.0 // indirect github.com/prometheus/common/sigv4 v0.1.0 // indirect - github.com/prometheus/procfs v0.8.0 // indirect + github.com/prometheus/procfs v0.9.0 // indirect github.com/rogpeppe/go-internal v1.10.0 // indirect - golang.org/x/net v0.10.0 // indirect - golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1 // indirect - golang.org/x/sys v0.8.0 // indirect - golang.org/x/text v0.10.0 // indirect + golang.org/x/net v0.15.0 // indirect + golang.org/x/oauth2 v0.8.0 // indirect + golang.org/x/sys v0.12.0 // indirect + golang.org/x/text v0.13.0 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/protobuf v1.28.1 // indirect + google.golang.org/protobuf v1.30.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect - k8s.io/api v0.27.2 // indirect - k8s.io/apimachinery v0.27.2 // indirect + k8s.io/api v0.28.1 // indirect + k8s.io/apimachinery v0.28.1 // indirect k8s.io/klog/v2 v2.100.1 // indirect - k8s.io/utils v0.0.0-20230505201702-9f6742963106 // indirect + k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect + sigs.k8s.io/structured-merge-diff/v4 
v4.3.0 // indirect ) diff --git a/test/hack/checkLabels/go.sum b/test/hack/checkLabels/go.sum index d85316fde..4dd053dd3 100644 --- a/test/hack/checkLabels/go.sum +++ b/test/hack/checkLabels/go.sum @@ -67,6 +67,8 @@ github.com/aws/aws-sdk-go v1.34.28/go.mod h1:H7NKnBqNVzoTJpGfLrQkkD+ytBA93eiDYi/ github.com/aws/aws-sdk-go v1.38.35/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro= github.com/aws/aws-sdk-go v1.44.156 h1:3RhbBTZ87HoI5OP2JjcKdd5qOnyo9YOAW8+Bb/h0vSE= github.com/aws/aws-sdk-go v1.44.156/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= +github.com/aws/aws-sdk-go v1.44.317 h1:+8XWrLmGMwPPXSRSLPzhgcGnzJ2mYkgkrcB9C/GnSOU= +github.com/aws/aws-sdk-go v1.44.317/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= github.com/aws/aws-sdk-go-v2 v0.18.0/go.mod h1:JWVYvqSMppoMJC0x5wdwiImzgXTI9FuZwxzkQq9wy+g= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= @@ -391,6 +393,7 @@ github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFB github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= @@ -499,10 +502,16 @@ github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.65.2 h github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.65.2/go.mod h1:xcfWyzl4BpEe5jnVkw7D1yCHU7GHjfjCERJsEfGbpSU= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0 h1:PPW01FLVjJHMNcbAL1DDD9EZceSQKMOU/VpK0irrxrI= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.66.0/go.mod h1:KZHvrby65G+rA4V/vMTUXDV22TI+GgLIrCigYClpjzk= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.67.1 h1:u1Mw9irznvsBPxQxjUmCel1ufP3UgzA1CILj7/2tpNw= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.67.1/go.mod h1:KZHvrby65G+rA4V/vMTUXDV22TI+GgLIrCigYClpjzk= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0 h1:yl9ceUSUBo9woQIO+8eoWpcxZkdZgm89g+rVvu37TUw= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0/go.mod h1:9Uuu3pEU2jB8PwuqkHvegQ0HV/BlZRJUyfTYAqfdVF8= github.com/prometheus/alertmanager v0.22.2 h1:JrDZalSEMb2/2bqGAhls6ZnvOxbC5jMIu29JV+uWTC0= github.com/prometheus/alertmanager v0.22.2/go.mod h1:rYinOWxFuCnNssc3iOjn2oMTlhLaPcUuqV5yk5JKUAE= github.com/prometheus/alertmanager v0.25.0 h1:vbXKUR6PYRiZPRIKfmXaG+dmCKG52RtPL4Btl8hQGvg= github.com/prometheus/alertmanager v0.25.0/go.mod h1:MEZ3rFVHqKZsw7IcNS/m4AWZeXThmJhumpiWR4eHU/w= +github.com/prometheus/alertmanager v0.26.0 h1:uOMJWfIwJguc3NaM3appWNbbrh6G/OjvaHMk22aBBYc= +github.com/prometheus/alertmanager v0.26.0/go.mod h1:rVcnARltVjavgVaNnmevxK7kOn7IZavyf0KNgHkbEpU= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829/go.mod h1:p2iRAGwDERtqlqzRXnrOVns+ignqQo//hLXqYxZYVNs= github.com/prometheus/client_golang 
v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= @@ -512,6 +521,8 @@ github.com/prometheus/client_golang v1.10.0/go.mod h1:WJM3cc3yu7XKBKa/I8WeZm+V3e github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.14.0 h1:nJdhIvne2eSX/XRAFV9PcvFFRbrjbcTUj0VP62TMhnw= github.com/prometheus/client_golang v1.14.0/go.mod h1:8vpkKitgIVNcqrRBWh1C4TIUQgYNtG/XQE4E/Zae36Y= +github.com/prometheus/client_golang v1.15.1 h1:8tXpTmJbyH5lydzFPoxSIJ0J46jdh3tylbvM1xCv0LI= +github.com/prometheus/client_golang v1.15.1/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -520,6 +531,8 @@ github.com/prometheus/client_model v0.1.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6T github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= +github.com/prometheus/client_model v0.4.0 h1:5lQXD3cAg1OXBf4Wq03gTrXHeaV0TQvGfUooCfx1yqY= +github.com/prometheus/client_model v0.4.0/go.mod h1:oMQmHW1/JoDwqLtg57MGgP/Fb1CJEYF2imWWhWtMkYU= github.com/prometheus/common v0.2.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.7.0/go.mod h1:DjGbpBbp5NYNiECxcL/VnbXCCaQpKd3tt26CguLLsqA= @@ -531,6 +544,8 @@ github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9 github.com/prometheus/common v0.29.0/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= github.com/prometheus/common v0.38.0 h1:VTQitp6mXTdUoCmDMugDVOJ1opi6ADftKfp/yeqTR/E= github.com/prometheus/common v0.38.0/go.mod h1:MBXfmBQZrK5XpbCkjofnXs96LD2QQ7fEq4C0xjC/yec= +github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdOOfY= +github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY= github.com/prometheus/common/sigv4 v0.1.0 h1:qoVebwtwwEhS85Czm2dSROY5fTo2PAPEVdDeppTwGX4= github.com/prometheus/common/sigv4 v0.1.0/go.mod h1:2Jkxxk9yYvCkE5G1sQT7GuEXm57JrvHu9k5YwTjsNtI= github.com/prometheus/exporter-toolkit v0.5.1/go.mod h1:OCkM4805mmisBhLmVFw858QYi3v0wKdY6/UxrT0pZVg= @@ -542,6 +557,8 @@ github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4O github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo= github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= +github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= +github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/go-internal v1.1.0/go.mod 
h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= @@ -550,6 +567,7 @@ github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFR github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/rogpeppe/go-internal v1.8.0 h1:FCbCCtXNOY3UtUuHUYaghJg4y7Fd14rXifAYUAtL9R8= github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -736,6 +754,8 @@ golang.org/x/net v0.9.0 h1:aWJ/m6xSmxWBx+V0XRHTlrYrPG56jKsLdTFmsSsCzOM= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -744,6 +764,8 @@ golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4Iltr golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1 h1:lxqLZaMad/dJHMFZH0NiNpiEZI/nhgWhe4wgzpE+MuA= golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= +golang.org/x/oauth2 v0.8.0 h1:6dkIjl3j3LtZ/O3sTgZTMsLKSftL/B8Zgq4huOIIUu8= +golang.org/x/oauth2 v0.8.0/go.mod h1:yr7u4HXZRm1R1kBWqr/xKNqewf0plRYoB7sla+BCIXE= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -825,6 +847,8 @@ golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -849,6 +873,8 @@ golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.10.0 
h1:UpjohKhiEgNc0CSauXmwYftY1+LlaC75SJwh0SgCX58= golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1003,6 +1029,8 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0 google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.1 h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= +google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -1048,6 +1076,8 @@ k8s.io/api v0.27.1 h1:Z6zUGQ1Vd10tJ+gHcNNNgkV5emCyW+v2XTmn+CLjSd0= k8s.io/api v0.27.1/go.mod h1:z5g/BpAiD+f6AArpqNjkY+cji8ueZDU/WV1jcj5Jk4E= k8s.io/api v0.27.2 h1:+H17AJpUMvl+clT+BPnKf0E3ksMAzoBBg7CntpSuADo= k8s.io/api v0.27.2/go.mod h1:ENmbocXfBT2ADujUXcBhHV55RIT31IIEvkntP6vZKS4= +k8s.io/api v0.28.1 h1:i+0O8k2NPBCPYaMB+uCkseEbawEt/eFaiRqUx8aB108= +k8s.io/api v0.28.1/go.mod h1:uBYwID+66wiL28Kn2tBjBYQdEU0Xk0z5qF8bIBqk/Dg= k8s.io/apimachinery v0.25.4 h1:CtXsuaitMESSu339tfhVXhQrPET+EiWnIY1rcurKnAc= k8s.io/apimachinery v0.25.4/go.mod h1:jaF9C/iPNM1FuLl7Zuy5b9v+n35HGSh6AQ4HYRkCqwo= k8s.io/apimachinery v0.26.1 h1:8EZ/eGJL+hY/MYCNwhmDzVqq2lPl3N3Bo8rvweJwXUQ= @@ -1056,6 +1086,8 @@ k8s.io/apimachinery v0.27.1 h1:EGuZiLI95UQQcClhanryclaQE6xjg1Bts6/L3cD7zyc= k8s.io/apimachinery v0.27.1/go.mod h1:5ikh59fK3AJ287GUvpUsryoMFtH9zj/ARfWCo3AyXTM= k8s.io/apimachinery v0.27.2 h1:vBjGaKKieaIreI+oQwELalVG4d8f3YAMNpWLzDXkxeg= k8s.io/apimachinery v0.27.2/go.mod h1:XNfZ6xklnMCOGGFNqXG7bUrQCoR04dh/E7FprV6pb+E= +k8s.io/apimachinery v0.28.1 h1:EJD40og3GizBSV3mkIoXQBsws32okPOy+MkRyzh6nPY= +k8s.io/apimachinery v0.28.1/go.mod h1:X0xh/chESs2hP9koe+SdIAcXWcQ+RM5hy0ZynB+yEvw= k8s.io/klog/v2 v2.80.1 h1:atnLQ121W371wYYFawwYx1aEY2eUfs4l3J72wtgAwV4= k8s.io/klog/v2 v2.80.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= k8s.io/klog/v2 v2.90.0 h1:VkTxIV/FjRXn1fgNNcKGM8cfmL1Z33ZjXRTVxKCoF5M= @@ -1070,6 +1102,8 @@ k8s.io/utils v0.0.0-20230406110748-d93618cff8a2 h1:qY1Ad8PODbnymg2pRbkyMT/ylpTrC k8s.io/utils v0.0.0-20230406110748-d93618cff8a2/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= k8s.io/utils v0.0.0-20230505201702-9f6742963106 h1:EObNQ3TW2D+WptiYXlApGNLVy0zm/JIBVY9i+M4wpAU= k8s.io/utils v0.0.0-20230505201702-9f6742963106/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= 
rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= @@ -1079,6 +1113,8 @@ sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMm sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= +sigs.k8s.io/structured-merge-diff/v4 v4.3.0 h1:UZbZAZfX0wV2zr7YZorDz6GXROfDFj6LvqCRm4VUVKk= +sigs.k8s.io/structured-merge-diff/v4 v4.3.0/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/test/tests/providers/capi/capz/capi-cluster.rules.test.yml b/test/tests/providers/capi/capz/capi-cluster.rules.test.yml index eddd0371a..ff2c8c1a6 100644 --- a/test/tests/providers/capi/capz/capi-cluster.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-cluster.rules.test.yml @@ -21,8 +21,8 @@ tests: - exp_labels: area: kaas cancel_if_outside_working_hours: "true" - severity: notify - team: clippy + severity: page + team: phoenix topic: managementcluster name: clippaxy exported_namespace: giantswarm @@ -37,7 +37,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster name: grumpy exported_namespace: giantswarm @@ -53,7 +53,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster name: grumpy exported_namespace: giantswarm diff --git a/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml b/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml index 458e344ba..fd03e4afb 100644 --- a/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-kubeadmcontrolplane.rules.test.yml @@ -22,7 +22,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: clippaxy name: clippaxy-72jzy @@ -37,7 +37,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: grumpy name: grumpy-72r5c diff --git a/test/tests/providers/capi/capz/capi-machine.rules.test.yml b/test/tests/providers/capi/capz/capi-machine.rules.test.yml index 354a50ad3..25d2694bb 100644 --- a/test/tests/providers/capi/capz/capi-machine.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machine.rules.test.yml @@ -18,7 +18,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: clippaxy name: clippaxy-72jq5 @@ -34,7 +34,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: grumpy name: grumpy-72r5c diff --git a/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml b/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml index b3e79d2e9..c2df07936 100644 --- 
a/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machinedeployment.rules.test.yml @@ -22,7 +22,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: clippaxy name: clippaxy-def99 @@ -37,7 +37,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: grumpy name: grumpy-def99 diff --git a/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml b/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml index 2550090ab..5d3fba71c 100644 --- a/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machinepool.rules.test.yml @@ -22,7 +22,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: clippaxy name: clippaxy-def99 @@ -37,7 +37,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: grumpy name: grumpy-72r5c diff --git a/test/tests/providers/capi/capz/capi-machineset.rules.test.yml b/test/tests/providers/capi/capz/capi-machineset.rules.test.yml index 5f83b14c5..8d6119275 100644 --- a/test/tests/providers/capi/capz/capi-machineset.rules.test.yml +++ b/test/tests/providers/capi/capz/capi-machineset.rules.test.yml @@ -14,7 +14,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: grumpy name: grumpy-def99 diff --git a/test/tests/providers/capi/capz/capi.rules.test.yml b/test/tests/providers/capi/capz/capi.rules.test.yml index 8ed9bf1e6..2efb0a1d9 100644 --- a/test/tests/providers/capi/capz/capi.rules.test.yml +++ b/test/tests/providers/capi/capz/capi.rules.test.yml @@ -16,7 +16,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: clippaxy name: clippaxy-72jq5 @@ -42,7 +42,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: clippaxy name: clippaxy-def99 @@ -67,7 +67,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: clippaxy name: clippaxy-def99 @@ -92,7 +92,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster cluster_name: clippaxy name: clippaxy-72jzy @@ -113,7 +113,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster name: clippaxy exported_namespace: giantswarm diff --git a/test/tests/providers/capi/capz/cert-manager.rules.test.yml b/test/tests/providers/capi/capz/cert-manager.rules.test.yml index b0bbf7e32..99972a896 100644 --- a/test/tests/providers/capi/capz/cert-manager.rules.test.yml +++ b/test/tests/providers/capi/capz/cert-manager.rules.test.yml @@ -34,7 +34,7 @@ tests: installation: gollem service_priority: highest severity: page - team: clippy + team: bigmac topic: cert-manager exp_annotations: description: "cert-manager in namespace kube-system is down." 
diff --git a/test/tests/providers/capi/capz/certificate.all.rules.test.yml b/test/tests/providers/capi/capz/certificate.all.rules.test.yml index bc666c157..9442359db 100644 --- a/test/tests/providers/capi/capz/certificate.all.rules.test.yml +++ b/test/tests/providers/capi/capz/certificate.all.rules.test.yml @@ -33,7 +33,7 @@ tests: service_priority: highest severity: page secretkey: tls.crt - team: clippy + team: phoenix topic: cert-manager exp_annotations: description: "Certificate stored in Secret giantswarm/athena-certs-secret on gollem will expire in less than two weeks." @@ -75,7 +75,7 @@ tests: installation: gollem service_priority: highest severity: page - team: clippy + team: phoenix topic: cert-manager issuer_ref: kiam-ca-issuer managed_issuer: true diff --git a/test/tests/providers/capi/capz/dns-operator-azure.rules.test.yml b/test/tests/providers/capi/capz/dns-operator-azure.rules.test.yml index 6a45dfb31..d31e3b317 100644 --- a/test/tests/providers/capi/capz/dns-operator-azure.rules.test.yml +++ b/test/tests/providers/capi/capz/dns-operator-azure.rules.test.yml @@ -22,7 +22,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster phase: Provisioned exported_namespace: org-31f75bf9 @@ -38,7 +38,7 @@ tests: area: kaas cancel_if_outside_working_hours: "true" severity: notify - team: clippy + team: phoenix topic: managementcluster installation: puppy method: recordSets.CreateOrUpdate diff --git a/test/tests/providers/capi/capz/node-exporter.all.rules.test.yml b/test/tests/providers/capi/capz/node-exporter.all.rules.test.yml index dcac4d862..33d296117 100644 --- a/test/tests/providers/capi/capz/node-exporter.all.rules.test.yml +++ b/test/tests/providers/capi/capz/node-exporter.all.rules.test.yml @@ -27,7 +27,7 @@ tests: collector: "cpu" instance: "10.0.5.111:10300" severity: "page" - team: "clippy" + team: "phoenix" topic: "observability" exp_annotations: description: "NodeExporter Collector cpu on 10.0.5.111:10300 is failed." @@ -60,7 +60,7 @@ tests: instance: "10.0.5.111:10300" mountpoint: "/var/lib/kubelet" severity: "page" - team: "clippy" + team: "phoenix" topic: "observability" exp_annotations: description: "NodeExporter Mountpoint /var/lib/kubelet on device /dev/mapper/usr on 10.0.5.111:10300 is erroring." diff --git a/test/tests/providers/capi/openstack/cert-manager.rules.test.yml b/test/tests/providers/capi/openstack/cert-manager.rules.test.yml index 5ab5da226..58b88a18a 100644 --- a/test/tests/providers/capi/openstack/cert-manager.rules.test.yml +++ b/test/tests/providers/capi/openstack/cert-manager.rules.test.yml @@ -34,7 +34,7 @@ tests: installation: gollem service_priority: highest severity: page - team: rocket + team: bigmac topic: cert-manager exp_annotations: description: "cert-manager in namespace kube-system is down." 
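Note on the promtool unit tests added below: the `values` strings use Prometheus' expanding series notation, where `a+bxn` expands to `a, a+b, a+2b, …, a+nb` (one sample per test `interval`) and `_` marks a missing sample. A minimal sketch, using a hypothetical metric name, of the pattern the new cilium and kube-state-metrics tests rely on:

  - interval: 1m
    input_series:
      # no data for roughly the first 20 minutes, then ~20 minutes at 20, then ~20 minutes at 90
      - series: 'example_metric{cluster_id="example"}'
        values: "_x20 20+0x20 90+0x20"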
diff --git a/test/tests/providers/global/cilium.rules.test.yml b/test/tests/providers/global/cilium.rules.test.yml new file mode 100644 index 000000000..1858a1ac0 --- /dev/null +++ b/test/tests/providers/global/cilium.rules.test.yml @@ -0,0 +1,49 @@ +--- +rule_files: + - cilium.rules.yml + +tests: + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: none, up, down + - series: 'cilium_bpf_map_pressure{map_name="policy_00001"}' + values: "_x20 20+0x20 90+0x20" + alert_rule_test: + - alertname: CiliumBPFMapAlmostFull + eval_time: 10m + - alertname: CiliumBPFMapAlmostFull + eval_time: 30m + - alertname: CiliumBPFMapAlmostFull + eval_time: 50m + exp_alerts: + - exp_labels: + area: kaas + severity: page + team: phoenix + topic: cilium + cancel_if_outside_working_hours: "true" + map_name: "policy_00001" + exp_annotations: + description: "Cilium BPF map is about to fill up." + opsrecipe: "cilium-bpf-map/" + - interval: 1m + input_series: + - series: 'cilium_bpf_map_pressure{map_name="policy_00001"}' + values: "_x20 20+0x20 90+0x20 98+0x20" + alert_rule_test: + - alertname: CiliumBPFMapFull + eval_time: 10m + - alertname: CiliumBPFMapFull + eval_time: 30m + - alertname: CiliumBPFMapFull + eval_time: 70m + exp_alerts: + - exp_labels: + area: kaas + severity: page + team: phoenix + topic: cilium + map_name: "policy_00001" + exp_annotations: + description: "Cilium BPF map is about filled up." + opsrecipe: "cilium-bpf-map/" diff --git a/test/tests/providers/global/crossplane.rules.test.yml b/test/tests/providers/global/crossplane.rules.test.yml index 8cf0bfe0d..2b8ad6a50 100644 --- a/test/tests/providers/global/crossplane.rules.test.yml +++ b/test/tests/providers/global/crossplane.rules.test.yml @@ -5,7 +5,7 @@ rule_files: tests: - interval: 1m input_series: - - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="crossplane", installation="gauss", instance="100.64.5.122:8080", job="gauss-prometheus/workload-gauss/0", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest", workload_name="crossplane", workload_type="deployment"}' + - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="crossplane", installation="gauss", instance="100.64.5.122:8080", job="gauss-prometheus/workload-gauss/0", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest"}' values: "0+0x20 1+0x100" alert_rule_test: - alertname: DeploymentNotSatisfiedCrossplane @@ -36,14 +36,12 @@ tests: severity: page team: honeybadger topic: managementcluster - workload_name: crossplane - workload_type: deployment exp_annotations: description: "Crossplane related deployment crossplane/crossplane is not satisfied." 
opsrecipe: "deployment-not-satisfied/" - interval: 1m input_series: - - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="caicloud-event-exporter", installation="gauss", instance="100.64.5.122:8080", job="gauss-prometheus/workload-gauss/0", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest", workload_name="caicloud-event-exporter", workload_type="deployment"}' + - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="caicloud-event-exporter", installation="gauss", instance="100.64.5.122:8080", job="gauss-prometheus/workload-gauss/0", namespace="crossplane", node="ip-10-0-5-119.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-95bbb4bd7-v6hvh", provider="aws", service_priority="highest"}' values: "0+0x20 1+0x100" alert_rule_test: - alertname: DeploymentNotSatisfiedCrossplane @@ -74,8 +72,6 @@ tests: severity: page team: honeybadger topic: managementcluster - workload_name: caicloud-event-exporter - workload_type: deployment exp_annotations: description: "Crossplane related deployment crossplane/caicloud-event-exporter is not satisfied." opsrecipe: "deployment-not-satisfied/" diff --git a/test/tests/providers/global/external-secrets.rules.test.yml b/test/tests/providers/global/external-secrets.rules.test.yml index 93fb6198c..06d0bd4ca 100644 --- a/test/tests/providers/global/external-secrets.rules.test.yml +++ b/test/tests/providers/global/external-secrets.rules.test.yml @@ -5,7 +5,7 @@ rule_files: tests: - interval: 1m input_series: - - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="external-secrets", installation="gauss", instance="100.64.6.226:8080", job="gauss-prometheus/workload-gauss/0", namespace="external-secrets", node="ip-10-0-5-161.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-fd99568b6-fnhdv", provider="aws", service_priority="highest", workload_name="external-secrets", workload_type="deployment"}' + - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="external-secrets", installation="gauss", instance="100.64.6.226:8080", job="gauss-prometheus/workload-gauss/0", namespace="external-secrets", node="ip-10-0-5-161.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-fd99568b6-fnhdv", provider="aws", service_priority="highest"}' values: "0+0x20 1+0x100" alert_rule_test: - alertname: DeploymentNotSatisfiedExternalSecrets @@ -36,14 +36,12 @@ tests: severity: page team: honeybadger topic: managementcluster - workload_name: external-secrets - workload_type: deployment exp_annotations: description: "ExternalSecrets related deployment external-secrets/external-secrets is not satisfied." 
opsrecipe: "deployment-not-satisfied/" - interval: 1m input_series: - - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="external-secrets-cert-controller", installation="gauss", instance="100.64.6.226:8080", job="gauss-prometheus/workload-gauss/0", namespace="external-secrets", node="ip-10-0-5-161.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-fd99568b6-fnhdv", provider="aws", service_priority="highest", workload_name="external-secrets-cert-controller", workload_type="deployment"}' + - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="external-secrets-cert-controller", installation="gauss", instance="100.64.6.226:8080", job="gauss-prometheus/workload-gauss/0", namespace="external-secrets", node="ip-10-0-5-161.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-fd99568b6-fnhdv", provider="aws", service_priority="highest"}' values: "0+0x20 1+0x100" alert_rule_test: - alertname: DeploymentNotSatisfiedExternalSecrets @@ -74,14 +72,12 @@ tests: severity: page team: honeybadger topic: managementcluster - workload_name: external-secrets-cert-controller - workload_type: deployment exp_annotations: description: "ExternalSecrets related deployment external-secrets/external-secrets-cert-controller is not satisfied." opsrecipe: "deployment-not-satisfied/" - interval: 1m input_series: - - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="external-secrets-webhook", installation="gauss", instance="100.64.6.226:8080", job="gauss-prometheus/workload-gauss/0", namespace="external-secrets", node="ip-10-0-5-161.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-fd99568b6-fnhdv", provider="aws", service_priority="highest", workload_name="external-secrets-webhook", workload_type="deployment"}' + - series: 'kube_deployment_status_replicas_unavailable{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="external-secrets-webhook", installation="gauss", instance="100.64.6.226:8080", job="gauss-prometheus/workload-gauss/0", namespace="external-secrets", node="ip-10-0-5-161.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-fd99568b6-fnhdv", provider="aws", service_priority="highest"}' values: "0+0x20 1+0x100" alert_rule_test: - alertname: DeploymentNotSatisfiedExternalSecrets @@ -112,8 +108,6 @@ tests: severity: page team: honeybadger topic: managementcluster - workload_name: external-secrets-webhook - workload_type: deployment exp_annotations: description: "ExternalSecrets related deployment external-secrets/external-secrets-webhook is not satisfied." 
opsrecipe: "deployment-not-satisfied/" \ No newline at end of file diff --git a/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml b/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml index a3c738166..bc83d6ef2 100644 --- a/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml +++ b/test/tests/providers/global/inhibit.prometheus-agent.rules.test.yml @@ -5,24 +5,29 @@ rule_files: tests: - interval: 1m input_series: - - series: 'prometheus_build_info{app="prometheus",cluster_id="gauss",instance="localhost:9090"}' + # - cluster 1: "clu01" + - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="clu01-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + values: '1671707388+0x40' + # - cluster 2: "clu02" + - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="clu02-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + values: '1671707388+0x40' + # - cluster 3: "myinstall", the install name + - series: 'kube_namespace_created{app="kube-state-metrics", cluster_id="myinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="myinstall", instance="100.64.25.34:8080", job="kube-state-metrics", namespace="myinstall-prometheus", node="ip-10-0-5-14.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="prometheus-operator-app-kube-state-metrics-f7b868f49-ngvr8", service="prometheus-operator-app-kube-state-metrics"}' + values: "1671707388+0x40" + # prometheus-agent app info for "clu01" + - series: 'app_operator_app_info{app="prometheus-agent", app_version="2.40.5", catalog="giantswarm-playground", cluster_id="myinstall", cluster_missing="false", cluster_type="management_cluster", customer="giantswarm", deployed_version="0.1.7", endpoint="web", installation="myinstall", instance="app-exporter", job="app-exporter", name="prometheus-agent", namespace="clu01", node="ip-10-0-5-141.eu-central-1.compute.internal", organization="giantswarm", pipeline="stable", pod="app-exporter-6865c9c648-sg5vg", service="app-exporter", status="deployed", team="atlas", upgrade_available="false", version="0.1.7", version_mismatch="false"}' values: "1+0x40" - - series: 'kube_statefulset_created{namespace="kube-system",cluster_id="gauss",statefulset="prometheus-prometheus-agent"}' - values: "1+0x20 0+0x20" - - series: 'kube_statefulset_created{namespace="kube-system",cluster_id="gauss",statefulset="prometheus-prometheus-agent-shard-1"}' - values: "1+0x20 0+0x20" alert_rule_test: + #- alertname: InhibitionClusterIsNotRunningPrometheusAgent + # eval_time: 1m - alertname: InhibitionClusterIsNotRunningPrometheusAgent - eval_time: 1m - - alertname: 
InhibitionClusterIsNotRunningPrometheusAgent - eval_time: 22m + eval_time: 10m exp_alerts: - exp_labels: area: empowerment team: atlas topic: monitoring cluster_is_not_running_prometheus_agent: "true" - cluster_id: "gauss" + cluster_id: "clu02" exp_annotations: - description: "Cluster (gauss) is not running Prometheus Agent." - + description: "Cluster (clu02) is not running Prometheus Agent." diff --git a/test/tests/providers/global/kube-state-metrics.rules.test.yml b/test/tests/providers/global/kube-state-metrics.rules.test.yml new file mode 100644 index 000000000..8f5891193 --- /dev/null +++ b/test/tests/providers/global/kube-state-metrics.rules.test.yml @@ -0,0 +1,198 @@ +--- +rule_files: +- kube-state-metrics.rules.yml + +tests: + # KubeStateMetricsDown tests + # Tests to be run: + # - no "up" metrics + # - "up" metrics with servicemonitor discovery (ports 8080 and 8081) + # - "up" metric for port 8080 is OK, but port 8081 is set to 0 + # - "up" metric for port 8080 is set to 0, but port 8080 is OK + # - "up" metrics with label discovery (random port) + # - "up" is ok, but we don't have enough metrics + - name: "KSMDown with servicemonitor discovery" + interval: 1m + input_series: + # Tests for servicemonitor discovery + # - 00:00 Start with no metrics + # - 00:30 Both ports up and enough metrics + # - 01:00 Port 8080 goes down + # - 01:30 All is up again + # - 02:00 Port 8081 goes down + # - 02:30 all is up again + # - 03:00 we don't have enough metrics + # - 03:30 all is up again + - series: 'up{app="kube-state-metrics", cluster_id="testinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="testinstall", instance="192.0.2.10:8080", job="kube-state-metrics", namespace="kube-system", node="ip-10-0-1-1.eu-west-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-72vzx", provider="aws", region="eu-west-1", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' + values: "_x30 1x30 0x30 1x30 1x30 1x30 1x30 1x30" + - series: 'up{app="kube-state-metrics", cluster_id="testinstall", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="metrics", installation="testinstall", instance="192.0.2.10:8081", job="kube-state-metrics", namespace="kube-system", node="ip-10-0-1-1.eu-west-1.compute.internal", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-72vzx", provider="aws", region="eu-west-1", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' + values: "_x30 1x30 1x30 1x30 0x30 1x30 1x30 1x30" + - series: 'testmetric2{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric3{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric4{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric5{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric6{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric7{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric8{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric9{app="kube-state-metrics"}' + values: "_x30 1x30 1x30 1x30 1x30 1x30 _x30 1x30" + alert_rule_test: + # - 00:00 Start with no metrics + - alertname: KubeStateMetricsDown + eval_time: 25m + exp_alerts: + - exp_labels: + area: "kaas" + cancel_if_apiserver_down: "true" + 
cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" + severity: "page" + team: "atlas" + topic: "observability" + exp_annotations: + description: "KubeStateMetrics () is down." + opsrecipe: "kube-state-metrics-down/" + # - 00:30 Both ports up and enough metrics + - alertname: KubeStateMetricsDown + eval_time: 55m + # - 01:00 Port 8080 goes down + - alertname: KubeStateMetricsDown + eval_time: 85m + exp_alerts: + - exp_labels: + area: "kaas" + cancel_if_apiserver_down: "true" + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" + severity: "page" + team: "atlas" + topic: "observability" + exp_annotations: + description: "KubeStateMetrics () is down." + opsrecipe: "kube-state-metrics-down/" + # - 01:30 All is up again + - alertname: KubeStateMetricsDown + eval_time: 115m + # - 02:00 Port 8081 goes down + - alertname: KubeStateMetricsDown + eval_time: 145m + # - 02:30 all is up again + - alertname: KubeStateMetricsDown + eval_time: 175m + # - 03:00 we don't have enough metrics + - alertname: KubeStateMetricsDown + eval_time: 205m + exp_alerts: + - exp_labels: + area: "kaas" + cancel_if_apiserver_down: "true" + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" + severity: "page" + team: "atlas" + topic: "observability" + exp_annotations: + description: "KubeStateMetrics () is down." 
+ opsrecipe: "kube-state-metrics-down/" + # - 03:30 all is up again + - alertname: KubeStateMetricsDown + eval_time: 235m + + + # Tests for label-discovery targets + - name: "KSMDown with label discovery" + interval: 1m + input_series: + # - 00:00 Start with no metrics + # - 00:30 all goes up + # - 01:00 up goes down + # - 01:30 All is up again + - series: 'up{app="kube-state-metrics", cluster_id="testvintage", cluster_type="workload_cluster", customer="giantswarm", installation="testinstall", instance="10.0.2.4:10301", job="test-prometheus/workload-test/0", namespace="kube-system", node="ip-10-1-0-3.eu-west-1.compute.internal", organization="giantswarm", pod="kube-state-metrics-v2-3-0-67b5fdc5d4-78mhf", provider="aws", service_priority="highest"}' + values: "_x30 1x30 0x30 1x30" + - series: 'testmetric2{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric3{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric4{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric5{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric6{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric7{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric8{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric9{app="kube-state-metrics"}' + values: "0x1000" + - series: 'testmetric10{app="kube-state-metrics"}' + values: "0x1000" + alert_rule_test: + # - 00:00 Start with no metrics + - alertname: KubeStateMetricsDown + eval_time: 25m + exp_alerts: + - exp_labels: + area: "kaas" + cancel_if_apiserver_down: "true" + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" + severity: "page" + team: "atlas" + topic: "observability" + exp_annotations: + description: "KubeStateMetrics () is down." + opsrecipe: "kube-state-metrics-down/" + # - 00:30 all goes up + - alertname: KubeStateMetricsDown + eval_time: 55m + # - 01:00 up goes down + - alertname: KubeStateMetricsDown + eval_time: 85m + exp_alerts: + - exp_labels: + area: "kaas" + cancel_if_apiserver_down: "true" + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "false" + inhibit_kube_state_metrics_down: "true" + cancel_if_prometheus_agent_down: "true" + severity: "page" + team: "atlas" + topic: "observability" + exp_annotations: + description: "KubeStateMetrics () is down." 
+ opsrecipe: "kube-state-metrics-down/" + # - 01:30 All is up again + - alertname: KubeStateMetricsDown + eval_time: 115m diff --git a/test/tests/providers/global/kyverno.all.rules.test.yml b/test/tests/providers/global/kyverno.all.rules.test.yml new file mode 100644 index 000000000..b855bb76b --- /dev/null +++ b/test/tests/providers/global/kyverno.all.rules.test.yml @@ -0,0 +1,88 @@ +--- +rule_files: + - kyverno.all.rules.yml +tests: + - interval: 1m + input_series: + # Kyverno validating webhooks + - series: 'kube_validatingwebhookconfiguration_info{app="kube-state-metrics", cluster_id="gremlin", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="gremlin", instance="10.0.135.241:8080", job="kube-state-metrics", node="master-00000y", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-qn6sb", prometheus="kube-system/prometheus-agent", prometheus_replica="prometheus-prometheus-agent-0", provider="azure", region="germanywestcentral", service="prometheus-operator-app-kube-state-metrics", service_priority="highest", validatingwebhookconfiguration="kyverno-exception-validating-webhook-cfg"}' + values: "1+0x20" + - series: 'kube_validatingwebhookconfiguration_info{app="kube-state-metrics", cluster_id="gremlin", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="gremlin", instance="10.0.135.241:8080", job="kube-state-metrics", node="master-00000y", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-qn6sb", prometheus="kube-system/prometheus-agent", prometheus_replica="prometheus-prometheus-agent-0", provider="azure", region="germanywestcentral", service="prometheus-operator-app-kube-state-metrics", service_priority="highest", validatingwebhookconfiguration="kyverno-policy-validating-webhook-cfg"}' + values: "1+0x20" + - series: 'kube_validatingwebhookconfiguration_info{app="kube-state-metrics", cluster_id="gremlin", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", endpoint="http", installation="gremlin", instance="10.0.135.241:8080", job="kube-state-metrics", node="master-00000y", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-qn6sb", prometheus="kube-system/prometheus-agent", prometheus_replica="prometheus-prometheus-agent-0", provider="azure", region="germanywestcentral", service="prometheus-operator-app-kube-state-metrics", service_priority="highest", validatingwebhookconfiguration="kyverno-resource-validating-webhook-cfg"}' + values: "1+0x20" + # Kyverno deployment status replicas + - series: 'kube_deployment_status_replicas{app="kube-state-metrics", cluster_id="gremlin", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="kyverno", endpoint="http", installation="gremlin", instance="10.0.135.241:8080", job="kube-state-metrics", namespace="kyverno", node="master-00000y", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-qn6sb", prometheus="kube-system/prometheus-agent", prometheus_replica="prometheus-prometheus-agent-0", provider="azure", region="germanywestcentral", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' + values: "0+0x20" + # Kyverno deployment spec replicas + - series: 
'kube_deployment_spec_replicas{app="kube-state-metrics", cluster_id="gremlin", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="kyverno", endpoint="http", installation="gremlin", instance="10.0.135.241:8080", job="kube-state-metrics", namespace="kyverno", node="master-00000y", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-qn6sb", prometheus="kube-system/prometheus-agent", prometheus_replica="prometheus-prometheus-agent-0", provider="azure", region="germanywestcentral", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' + values: "0+0x240 1+0x70" + # Kyverno admission reports + - series: 'aggregation:kyverno_resource_counts{kind="admissionreports.kyverno.io"}' + values: "0+1000x30 30000+1500x30" + alert_rule_test: + # Webhooks alert + - alertname: KyvernoWebhookHasNoAvailableReplicas + eval_time: 15m + exp_alerts: + - exp_labels: + area: managedservices + severity: page + team: shield + topic: kyverno + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "false" + exp_annotations: + description: "Kyverno has no available replicas but webhooks are present." + opsrecipe: "kyverno-webhooks/" + # Kyverno reports too high alert + - alertname: KyvernoAdmissionReportCountTooHigh + eval_time: 60m + exp_alerts: + - exp_labels: + area: managedservices + severity: page + team: shield + topic: kyverno + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "false" + kind: "admissionreports.kyverno.io" + exp_annotations: + description: "Kyverno admissionreports.kyverno.io are too high. This is an indicator that Kyverno's report processing may not be keeping up with cluster demand." + opsrecipe: "kyverno-reports/" + # Kyverno scaled down alert + - alertname: KyvernoScaledDownTooLong + eval_time: 240m + exp_alerts: + - exp_labels: + area: managedservices + severity: notify + team: shield + topic: kyverno + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Kyverno has been scaled down for too long." + opsrecipe: "kyverno-scaled-down/" + # Kyverno unsafe replica count alert + - alertname: KyvernoUnsafeReplicaCount + eval_time: 310m + exp_alerts: + - exp_labels: + area: managedservices + severity: notify + team: shield + topic: kyverno + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Kyverno's admission controller deployment must use at least 3 replicas, or be scaled to 0." 
+ opsrecipe: "KyvernoWronglyScaled/" diff --git a/test/tests/providers/global/loki.all.rules.test.yml b/test/tests/providers/global/loki.all.rules.test.yml index 9d4762354..07c55c458 100644 --- a/test/tests/providers/global/loki.all.rules.test.yml +++ b/test/tests/providers/global/loki.all.rules.test.yml @@ -27,7 +27,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" job: zj88t-prometheus/workload-zj88t/0 namespace: loki @@ -53,7 +53,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "false" + cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" job: zj88t-prometheus/workload-zj88t/0 namespace: loki diff --git a/test/tests/providers/global/prometheus-agent.rules.test.yml b/test/tests/providers/global/prometheus-agent.rules.test.yml index d23164296..7ed7ff3a0 100644 --- a/test/tests/providers/global/prometheus-agent.rules.test.yml +++ b/test/tests/providers/global/prometheus-agent.rules.test.yml @@ -3,17 +3,14 @@ rule_files: - prometheus-agent.rules.yml tests: + # Tests for `PrometheusAgentFailing` alert - interval: 1m input_series: - - series: 'up{instance="prometheus-agent",cluster_type="workload_cluster",cluster_id="gauss",installation="gauss"}' - values: "_x10 _x20 0+0x100 1+0x100" - - series: 'kube_statefulset_created{namespace="kube-system",statefulset="prometheus-prometheus-agent",cluster_id="gauss",installation="gauss"}' - values: "_x10 0+0x20 1+0x100 1+0x100" + - series: 'up{instance="prometheus-agent",cluster_type="workload_cluster",cluster_id="gauss",installation="myinstall"}' + values: "_x60 0+0x60 1+0x60" alert_rule_test: - alertname: PrometheusAgentFailing - eval_time: 10m - - alertname: PrometheusAgentFailing - eval_time: 25m + eval_time: 30m exp_alerts: - exp_labels: area: empowerment @@ -21,16 +18,87 @@ tests: team: atlas topic: observability inhibit_prometheus_agent_down: "true" + instance: prometheus-agent cancel_if_cluster_is_not_running_prometheus_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" exp_annotations: dashboard: "promRW001/prometheus-remote-write" description: "Prometheus agent remote write is failing." opsrecipe: "prometheus-agent-remote-write-failed/" summary: "Prometheus agent fails to send samples to remote write endpoint." - alertname: PrometheusAgentFailing - eval_time: 65m + eval_time: 90m + exp_alerts: + - exp_labels: + area: empowerment + cluster_id: gauss + cluster_type: workload_cluster + severity: page + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + installation: myinstall + instance: prometheus-agent + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + exp_annotations: + dashboard: "promRW001/prometheus-remote-write" + description: "Prometheus agent remote write is failing." + opsrecipe: "prometheus-agent-remote-write-failed/" + summary: "Prometheus agent fails to send samples to remote write endpoint." 
- alertname: PrometheusAgentFailing - eval_time: 165m + eval_time: 150m + # Tests for `PrometheusAgentShardsMissing` alert + - interval: 1m + input_series: + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x180" + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x180" + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x180" + - series: 'prometheus_operator_spec_shards{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' + values: '3+0x60 5+0x60 3+0x60' + - series: 'prometheus_operator_spec_replicas{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' + values: '1+0x180' + alert_rule_test: + - alertname: PrometheusAgentShardsMissing + eval_time: 40m + - alertname: PrometheusAgentShardsMissing + eval_time: 100m + exp_alerts: + - exp_labels: + area: empowerment + severity: page + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." + opsrecipe: "prometheus-agent-missing-shards/" + summary: "Prometheus agent is missing shards." + - alertname: PrometheusAgentShardsMissing + eval_time: 125m + exp_alerts: + - exp_labels: + area: empowerment + severity: page + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." + opsrecipe: "prometheus-agent-missing-shards/" + summary: "Prometheus agent is missing shards." 
+ - alertname: PrometheusAgentShardsMissing + eval_time: 130m diff --git a/test/tests/providers/global/prometheus-availability.rules.test.yml b/test/tests/providers/global/prometheus-availability.rules.test.yml deleted file mode 100644 index d40c1de75..000000000 --- a/test/tests/providers/global/prometheus-availability.rules.test.yml +++ /dev/null @@ -1,64 +0,0 @@ ---- -rule_files: - - prometheus-availability.rules.yml - -# Setting evaluation interval to 1h -# to make it faster on long test duration. -evaluation_interval: 1h - -tests: - # Test PrometheusAvailabilityRatio - - interval: 1m - input_series: - # This prometheus is up foreve - generates no alert - - series: 'kube_pod_status_ready{app="kube-state-metrics", condition="true", container="kube-state-metrics", namespace="install-prometheus", pod="prometheus-install-0"}' - values: "1+0x120" - # This prometheus starts at h+1, and takes 5min to get ready - generates no alert - - series: 'kube_pod_status_ready{app="kube-state-metrics", condition="true", container="kube-state-metrics", namespace="wcok-prometheus", pod="prometheus-wcok-0"}' - values: "_x60 0+0x5 1+0x60" - # This prometheus is down - generates alerts - - series: 'kube_pod_status_ready{app="kube-state-metrics", condition="true", container="kube-state-metrics", namespace="wcbad-prometheus", pod="prometheus-wcbad-0"}' - values: "0+0x60 1+0x60" - alert_rule_test: - - alertname: PrometheusAvailabilityRatio - eval_time: 60m - exp_alerts: - - exp_labels: - area: empowerment - severity: page - team: atlas - topic: observability - cancel_if_any_apiserver_down: "true" - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - pod: "prometheus-wcbad-0" - cluster_id: wcbad - exp_annotations: - description: "Prometheus prometheus-wcbad-0 has availability ratio of 0.00 (min 0.8) over the last hour." - opsrecipe: "prometheus-resource-limit-reached/" - dashboard: "promavailability/prometheus-availability" - - alertname: PrometheusAvailabilityRatio - eval_time: 108m - exp_alerts: - - exp_labels: - area: empowerment - severity: page - team: atlas - topic: observability - cancel_if_any_apiserver_down: "true" - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - pod: "prometheus-wcbad-0" - cluster_id: wcbad - exp_annotations: - description: "Prometheus prometheus-wcbad-0 has availability ratio of 0.00 (min 0.8) over the last hour." 
- opsrecipe: "prometheus-resource-limit-reached/" - dashboard: "promavailability/prometheus-availability" - - alertname: PrometheusAvailabilityRatio - eval_time: 140m diff --git a/test/tests/providers/global/silence-operator.rules.test.yml b/test/tests/providers/global/silence-operator.rules.test.yml index f66c2c6f3..f6556027d 100644 --- a/test/tests/providers/global/silence-operator.rules.test.yml +++ b/test/tests/providers/global/silence-operator.rules.test.yml @@ -18,6 +18,7 @@ tests: area: "empowerment" cancel_if_outside_working_hours: "true" controller: silence-controller + installation: "myinstall" severity: "page" team: "atlas" topic: "observability" diff --git a/test/tests/providers/vintage/aws/cert-manager.rules.test.yml b/test/tests/providers/vintage/aws/cert-manager.rules.test.yml index 6fc51bbde..c39c1e12f 100644 --- a/test/tests/providers/vintage/aws/cert-manager.rules.test.yml +++ b/test/tests/providers/vintage/aws/cert-manager.rules.test.yml @@ -34,7 +34,7 @@ tests: installation: gollem service_priority: highest severity: page - team: phoenix + team: bigmac topic: cert-manager exp_annotations: description: "cert-manager in namespace kube-system is down."