Skip to content

Commit

Permalink
feat(kube-prom-stack): added some extra rules (#749)
Browse files Browse the repository at this point in the history
  • Loading branch information
thiagoalmeidasa authored May 13, 2024
1 parent 8288dbc commit 7e110ac
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,7 @@ spec:
- receiver: "pagerduty"
matchers:
- severity=~"critical|page"
- alertname!="KubeAPIErrorBudgetBurn"
continue: true
inhibit_rules:
- source_match:
severity: "critical"
target_match:
severity: "warning"
equal: ["alertname", "namespace"]
ingress:
enabled: true
pathType: Prefix
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrelease.yaml
- prometheusrule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/prometheusrule_v1.json
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: miscellaneous-rules
namespace: monitoring
spec:
groups:
- name: k8s.rules
rules:
- alert: KubernetesContainerOomKiller
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: KubernetesPodCrashLooping
expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: KubernetesNodeMemoryPressure
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: KubernetesNodeDiskPressure
expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes Node disk pressure (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: KubernetesNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 10m
labels:
severity: critical
annotations:
summary: Kubernetes Node not ready (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: KubernetesVolumeFullInFourDays
expr: predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

0 comments on commit 7e110ac

Please sign in to comment.