diff --git a/assets/alertmanager/alertmanager.yaml b/assets/alertmanager/alertmanager.yaml
index 5de4ca6..cefcafa 100644
--- a/assets/alertmanager/alertmanager.yaml
+++ b/assets/alertmanager/alertmanager.yaml
@@ -15,7 +15,7 @@ route:
   receiver: default-receiver
   routes:
   - match:
-      alertname: DeadMansSwitch
+      alertname: Watchdog
     receiver: 'null'
 inhibit_rules:
 - source_match:
@@ -28,16 +28,33 @@ receivers:
 - name: 'default-receiver'
   slack_configs:
   - channel: '#your_slack_channel'
-    title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] Prometheus Event Notification'
+    title: |-
+      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
+      {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
+      {{" "}}(
+      {{- with .CommonLabels.Remove .GroupLabels.Names }}
+      {{- range $index, $label := .SortedPairs -}}
+      {{ if $index }}, {{ end }}
+      {{- $label.Name }}="{{ $label.Value -}}"
+      {{- end }}
+      {{- end -}}
+      )
+      {{- end }}
     text: >-
-      {{ range .Alerts }}
-      *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
-      *Description:* {{ .Annotations.description }}
-      *Graph:* <{{ .GeneratorURL }}|:chart_with_upwards_trend:> *Runbook:* <{{ .Annotations.runbook }}|:spiral_note_pad:>
-      *Details:*
-      {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
-      {{ end }}
+      {{ with index .Alerts 0 -}}
+      :chart_with_upwards_trend: *<{{ .GeneratorURL }}|Graph>*
+      {{- if .Labels.runbook }} :notebook: *<{{ .Labels.runbook }}|Runbook>*{{ end }}
+      {{ end }}
+
+      *Alert details*:
+
+      {{ range .Alerts -}}
+      *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
+      *Description:* {{ .Annotations.description }}
+      *Details:*
+      {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` {{ end }}
+      {{ end }}
     send_resolved: true
   email_configs:
   - to: 'your_alert_email_address'
diff --git a/deploy b/deploy
index c8422f1..2bdd1a5 100755
--- a/deploy
+++ b/deploy
@@ -3,15 +3,15 @@
 #########################################################################################
 #components default version
 #########################################################################################
-GRAFANA_DEFAULT_VERSION=5.2.2
-PROMETHEUS_DEFAULT_VERSION=v2.3.2
-PROMETHEUS_OPERATOR_DEFAULT_VERSION=v0.23.1
+GRAFANA_DEFAULT_VERSION=6.0.2
+PROMETHEUS_DEFAULT_VERSION=v2.8.1
+PROMETHEUS_OPERATOR_DEFAULT_VERSION=v0.29.0
 PROMETHEUS_STORAGE_DEFAULT_RETENTION=168h
 PROMETHEUS_STORAGE_DEFAULT_VOLUME_SIZE=40Gi
 PROMETHEUS_DEFAULT_MEMORY_REQUEST=1Gi
-ALERTMANAGER_DEFAULT_VERSION=v0.15.1
-NODE_EXPORTER_DEFAULT_VERSION=v0.16.0
-KUBE_STATE_METRICS_DEFAULT_VERSION=v1.3.1
+ALERTMANAGER_DEFAULT_VERSION=v0.16.1
+NODE_EXPORTER_DEFAULT_VERSION=v0.17.0
+KUBE_STATE_METRICS_DEFAULT_VERSION=v1.5.0
 NODE_LABEL_DEFAULT_KEY=beta.kubernetes.io/monit
 NODE_LABEL_DEFAULT_VALUE=prometheus
 DEFAULT_NAMESPACE=monitoring
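Note: the DeadMansSwitch match is renamed to Watchdog to track the upstream
kubernetes-mixin rename; the always-firing rule itself is (re)defined under
general.rules later in this patch. A condensed sketch of the rule this 'null'
route is meant to swallow (the full annotated version appears below):

    - alert: Watchdog
      expr: vector(1)
      labels:
        severity: none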
diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml
index 3a41e81..cf20a80 100644
--- a/manifests/prometheus-operator/prometheus-operator.yaml
+++ b/manifests/prometheus-operator/prometheus-operator.yaml
@@ -18,7 +18,7 @@ spec:
       - args:
         - --kubelet-service=kube-system/kubelet
         - --config-reloader-image=quay.io/coreos/configmap-reload:v0.0.1
-        - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.23.0
+        - --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.29.0
         image: quay.io/coreos/prometheus-operator:PROMETHEUS_OPERATOR_VERSION
         name: prometheus-operator
         ports:
@@ -32,6 +32,7 @@ spec:
             cpu: 100m
             memory: 50Mi
       securityContext:
+        fsGroup: 2000
         runAsNonRoot: true
         runAsUser: 65534
       serviceAccountName: prometheus-operator
diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml
index 0c68ae4..d764941 100644
--- a/manifests/prometheus/prometheus-k8s-rules.yaml
+++ b/manifests/prometheus/prometheus-k8s-rules.yaml
@@ -10,35 +10,40 @@ spec:
   - name: k8s.rules
     rules:
     - expr: |
-        sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace)
+        sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
       record: namespace:container_cpu_usage_seconds_total:sum_rate
     - expr: |
-        sum(container_memory_usage_bytes{job="kubelet", image!=""}) by (namespace)
+        sum by (namespace, pod_name, container_name) (
+          rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
+        )
+      record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
+    - expr: |
+        sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
       record: namespace:container_memory_usage_bytes:sum
     - expr: |
         sum by (namespace, label_name) (
-          sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!=""}[5m])) by (namespace, pod_name)
+          sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name)
          * on (namespace, pod_name) group_left(label_name)
            label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
         )
       record: namespace_name:container_cpu_usage_seconds_total:sum_rate
     - expr: |
         sum by (namespace, label_name) (
-          sum(container_memory_usage_bytes{job="kubelet",image!=""}) by (pod_name, namespace)
+          sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod_name, namespace)
          * on (namespace, pod_name) group_left(label_name)
            label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
         )
       record: namespace_name:container_memory_usage_bytes:sum
     - expr: |
         sum by (namespace, label_name) (
-          sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod)
+          sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
            label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
         )
       record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
     - expr: |
         sum by (namespace, label_name) (
-          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
+          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
            label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
         )
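Note: the new per-container record added above makes ad-hoc capacity queries
cheap. An illustrative PromQL query (not part of the patch; runnable in the
Prometheus expression browser once this rule group is loaded):

    topk(10, namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate)

This lists the ten busiest containers by CPU usage.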
@@ -130,6 +135,13 @@
           * on (namespace, pod) group_left(node)
             node_namespace_pod:kube_pod_info:)
       record: node:node_cpu_utilisation:avg1m
+    - expr: |
+        node:node_cpu_utilisation:avg1m
+          *
+        node:node_num_cpu:sum
+          /
+        scalar(sum(node:node_num_cpu:sum))
+      record: node:cluster_cpu_utilisation:ratio
     - expr: |
         sum(node_load1{job="node-exporter"})
         /
@@ -150,6 +162,12 @@
         /
         sum(node_memory_MemTotal_bytes{job="node-exporter"})
       record: ':node_memory_utilisation:'
+    - expr: |
+        sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+      record: :node_memory_MemFreeCachedBuffers_bytes:sum
+    - expr: |
+        sum(node_memory_MemTotal_bytes{job="node-exporter"})
+      record: :node_memory_MemTotal_bytes:sum
     - expr: |
         sum by (node) (
           (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
@@ -167,8 +185,13 @@
     - expr: |
         (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
         /
-        scalar(sum(node:node_memory_bytes_total:sum))
+        node:node_memory_bytes_total:sum
       record: node:node_memory_utilisation:ratio
+    - expr: |
+        (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
+        /
+        scalar(sum(node:node_memory_bytes_total:sum))
+      record: node:cluster_memory_utilisation:ratio
     - expr: |
         1e3 * sum(
           (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
@@ -201,66 +224,99 @@
         )
       record: node:node_memory_swap_io_bytes:sum_rate
     - expr: |
-        avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
+        avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
       record: :node_disk_utilisation:avg_irate
     - expr: |
         avg by (node) (
-          irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
+          irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
         * on (namespace, pod) group_left(node)
           node_namespace_pod:kube_pod_info:
         )
       record: node:node_disk_utilisation:avg_irate
     - expr: |
-        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
+        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
       record: :node_disk_saturation:avg_irate
     - expr: |
         avg by (node) (
-          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
+          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
         * on (namespace, pod) group_left(node)
           node_namespace_pod:kube_pod_info:
         )
       record: node:node_disk_saturation:avg_irate
     - expr: |
-        sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) +
-        sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
+        max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
+        - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+        / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+      record: 'node:node_filesystem_usage:'
+    - expr: |
+        max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+      record: 'node:node_filesystem_avail:'
+    - expr: |
+        sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) +
+        sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
       record: :node_net_utilisation:sum_irate
     - expr: |
         sum by (node) (
-          (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) +
-          irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
+          (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) +
+          irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
         * on (namespace, pod) group_left(node)
           node_namespace_pod:kube_pod_info:
         )
       record: node:node_net_utilisation:sum_irate
     - expr: |
-        sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) +
-        sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
+        sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) +
+        sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
       record: :node_net_saturation:sum_irate
     - expr: |
         sum by (node) (
-          (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) +
-          irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
+          (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) +
+          irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
         * on (namespace, pod) group_left(node)
           node_namespace_pod:kube_pod_info:
         )
       record: node:node_net_saturation:sum_irate
+    - expr: |
+        max(
+          max(
+            kube_pod_info{job="kube-state-metrics", host_ip!=""}
+          ) by (node, host_ip)
+          * on (host_ip) group_right (node)
+          label_replace(
+            (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
+          )
+        ) by (node)
+      record: 'node:node_inodes_total:'
+    - expr: |
+        max(
+          max(
+            kube_pod_info{job="kube-state-metrics", host_ip!=""}
+          ) by (node, host_ip)
+          * on (host_ip) group_right (node)
+          label_replace(
+            (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
+          )
+        ) by (node)
+      record: 'node:node_inodes_free:'
   - name: kube-prometheus-node-recording.rules
     rules:
-    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
+        (instance)
       record: instance:node_cpu:rate:sum
-    - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
+    - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
         BY (instance)
       record: instance:node_filesystem_usage:sum
     - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
-      record: instance:node_network_receive_bytes_total:rate:sum
-    - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
+      record: instance:node_network_receive_bytes:rate:sum
+    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
       record: instance:node_network_transmit_bytes:rate:sum
-    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode)
-        / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT
+        (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
+        BY (instance, cpu)) BY (instance)
       record: instance:node_cpu:ratio
     - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
       record: cluster:node_cpu:sum_rate5m
-    - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
+    - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
+        BY (instance, cpu))
       record: cluster:node_cpu:ratio
   - name: kubernetes-absent
     rules:
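Note: node:node_filesystem_avail: (added above) records the free-space fraction
per device. An illustrative spot check for devices below 10% free (the 10%
threshold is an arbitrary example, not part of the patch):

    node:node_filesystem_avail: < 0.10

The rewritten NodeDiskRunningFull alerts later in this patch build on the same
pair of filesystem records.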
@@ -269,7 +325,16 @@
         message: Alertmanager has disappeared from Prometheus target discovery.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
       expr: |
-        absent(up{job="alertmanager-main"} == 1)
+        absent(up{job="alertmanager-main",namespace="monitoring"} == 1)
       for: 15m
       labels:
         severity: critical
+    - alert: CoreDNSDown
+      annotations:
+        message: CoreDNS has disappeared from Prometheus target discovery.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown
+      expr: |
+        absent(up{job="kube-dns"} == 1)
+      for: 15m
+      labels:
+        severity: critical
@@ -332,7 +397,7 @@
         message: Prometheus has disappeared from Prometheus target discovery.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
       expr: |
-        absent(up{job="prometheus-k8s"} == 1)
+        absent(up{job="prometheus-k8s",namespace="monitoring"} == 1)
       for: 15m
       labels:
         severity: critical
@@ -341,7 +406,7 @@
         message: PrometheusOperator has disappeared from Prometheus target discovery.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
       expr: |
-        absent(up{job="prometheus-operator"} == 1)
+        absent(up{job="prometheus-operator",namespace="monitoring"} == 1)
       for: 15m
       labels:
         severity: critical
@@ -349,27 +414,29 @@
     rules:
     - alert: KubePodCrashLooping
       annotations:
-        message: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
-          }}) is restarting {{ printf "%.2f" $value }} / second'
+        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
+          }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
       expr: |
-        rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 0
+        rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
       for: 1h
       labels:
         severity: critical
     - alert: KubePodNotReady
       annotations:
-        message: '{{ $labels.namespace }}/{{ $labels.pod }} is not ready.'
+        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
+          state for longer than an hour.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
       expr: |
-        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase!~"Running|Succeeded"}) > 0
+        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
       for: 1h
       labels:
         severity: critical
     - alert: KubeDeploymentGenerationMismatch
       annotations:
-        message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} generation
-          mismatch
+        message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
+          }} does not match, this indicates that the Deployment has failed but has
+          not been rolled back.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
       expr: |
         kube_deployment_status_observed_generation{job="kube-state-metrics"}
@@ -380,8 +447,8 @@ spec:
         severity: critical
     - alert: KubeDeploymentReplicasMismatch
       annotations:
-        message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replica
-          mismatch
+        message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
+          matched the expected number of replicas for longer than an hour.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
       expr: |
         kube_deployment_spec_replicas{job="kube-state-metrics"}
@@ -392,8 +459,8 @@ spec:
         severity: critical
     - alert: KubeStatefulSetReplicasMismatch
       annotations:
-        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replica
-          mismatch
+        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
+          not matched the expected number of replicas for longer than 15 minutes.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
       expr: |
         kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
@@ -404,8 +471,9 @@ spec:
         severity: critical
     - alert: KubeStatefulSetGenerationMismatch
       annotations:
-        message: StatefulSet {{ $labels.namespace }}/{{ labels.statefulset }} generation
-          mismatch
+        message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
+          }} does not match, this indicates that the StatefulSet has failed but has
+          not been rolled back.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
       expr: |
         kube_statefulset_status_observed_generation{job="kube-state-metrics"}
@@ -414,10 +482,30 @@ spec:
       for: 15m
       labels:
         severity: critical
+    - alert: KubeStatefulSetUpdateNotRolledOut
+      annotations:
+        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
+          has not been rolled out.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
+      expr: |
+        max without (revision) (
+          kube_statefulset_status_current_revision{job="kube-state-metrics"}
+            unless
+          kube_statefulset_status_update_revision{job="kube-state-metrics"}
+        )
+          *
+        (
+          kube_statefulset_replicas{job="kube-state-metrics"}
+            !=
+          kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
+        )
+      for: 15m
+      labels:
+        severity: critical
     - alert: KubeDaemonSetRolloutStuck
       annotations:
-        message: Only {{$value}}% of desired pods scheduled and ready for daemon set
-          {{$labels.namespace}}/{{$labels.daemonset}}
+        message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
+          }}/{{ $labels.daemonset }} are scheduled and ready.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
       expr: |
         kube_daemonset_status_number_ready{job="kube-state-metrics"}
@@ -428,8 +516,8 @@ spec:
         severity: critical
     - alert: KubeDaemonSetNotScheduled
       annotations:
-        message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}}
-          are not scheduled.
+        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
+          }} are not scheduled.'
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
       expr: |
         kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
@@ -440,8 +528,8 @@ spec:
         severity: warning
     - alert: KubeDaemonSetMisScheduled
       annotations:
-        message: A number of pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}}
-          are running where they are not supposed to run.
+        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
+          }} are running where they are not supposed to run.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
       expr: |
         kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
@@ -450,8 +538,8 @@ spec:
         severity: warning
     - alert: KubeCronJobRunning
       annotations:
-        message: CronJob {{ $labels.namespaces }}/{{ $labels.cronjob }} is taking
-          more than 1h to complete.
+        message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more
+          than 1h to complete.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
       expr: |
         time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
@@ -460,8 +548,8 @@ spec:
         severity: warning
     - alert: KubeJobCompletion
       annotations:
-        message: Job {{ $labels.namespaces }}/{{ $labels.job }} is taking more than
-          1h to complete.
+        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
+          than one hour to complete.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
       expr: |
         kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
@@ -470,7 +558,7 @@ spec:
         severity: warning
     - alert: KubeJobFailed
       annotations:
-        message: Job {{ $labels.namespaces }}/{{ $labels.job }} failed to complete.
+        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
       expr: |
         kube_job_status_failed{job="kube-state-metrics"} > 0
@@ -481,8 +569,8 @@
     rules:
     - alert: KubeCPUOvercommit
       annotations:
-        message: Overcommited CPU resource requests on Pods, cannot tolerate node
-          failure.
+        message: Cluster has overcommitted CPU resource requests for Pods and cannot
+          tolerate node failure.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
       expr: |
         sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
@@ -495,8 +583,8 @@
         severity: warning
     - alert: KubeMemOvercommit
       annotations:
-        message: Overcommited Memory resource requests on Pods, cannot tolerate node
-          failure.
+        message: Cluster has overcommitted memory resource requests for Pods and cannot
+          tolerate node failure.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
       expr: |
         sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
@@ -511,10 +599,10 @@
         severity: warning
     - alert: KubeCPUOvercommit
       annotations:
-        message: Overcommited CPU resource request quota on Namespaces.
+        message: Cluster has overcommitted CPU resource requests for Namespaces.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
       expr: |
-        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
+        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
         /
         sum(node:node_num_cpu:sum)
           > 1.5
@@ -523,10 +611,10 @@
         severity: warning
     - alert: KubeMemOvercommit
       annotations:
-        message: Overcommited Memory resource request quota on Namespaces.
+        message: Cluster has overcommitted memory resource requests for Namespaces.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
       expr: |
-        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
+        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
         /
         sum(node_memory_MemTotal_bytes{job="node-exporter"})
           > 1.5
@@ -535,24 +623,36 @@
         severity: warning
     - alert: KubeQuotaExceeded
       annotations:
-        message: '{{ printf "%0.0f" $value }}% usage of {{ $labels.resource }} in
-          namespace {{ $labels.namespace }}.'
+        message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
+          }}% of its {{ $labels.resource }} quota.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
       expr: |
         100 * kube_resourcequota{job="kube-state-metrics", type="used"}
           / ignoring(instance, job, type)
-        kube_resourcequota{job="kube-state-metrics", type="hard"}
+        (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
           > 90
       for: 15m
       labels:
         severity: warning
+    - alert: CPUThrottlingHigh
+      annotations:
+        message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace
+          }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name
+          }}.'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
+      expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=\"\",
+        }[5m])) by (container_name, pod_name, namespace)\n  /\nsum(increase(container_cpu_cfs_periods_total{}[5m]))
+        by (container_name, pod_name, namespace)\n  > 25 \n"
+      for: 15m
+      labels:
+        severity: warning
   - name: kubernetes-storage
     rules:
     - alert: KubePersistentVolumeUsageCritical
       annotations:
-        message: The persistent volume claimed by {{ $labels.persistentvolumeclaim
-          }} in namespace {{ $labels.namespace }} has {{ printf "%0.0f" $value }}%
-          free.
+        message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
+          }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value
+          }}% free.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
       expr: |
         100 * kubelet_volume_stats_available_bytes{job="kubelet"}
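Note: the CPUThrottlingHigh expression above is stored as an escaped YAML
string; unescaped, it is equivalent to this plain PromQL:

    100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!="", }[5m])) by (container_name, pod_name, namespace)
      /
    sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)
      > 25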
@@ -564,12 +664,28 @@
         severity: critical
     - alert: KubePersistentVolumeFullInFourDays
       annotations:
-        message: Based on recent sampling, the persistent volume claimed by {{ $labels.persistentvolumeclaim
-          }} in namespace {{ $labels.namespace }} is expected to fill up within four
-          days.
+        message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
+          }} in Namespace {{ $labels.namespace }} is expected to fill up within four
+          days. Currently {{ printf "%0.2f" $value }}% is available.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
       expr: |
-        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[1h], 4 * 24 * 3600) < 0
+        100 * (
+          kubelet_volume_stats_available_bytes{job="kubelet"}
+            /
+          kubelet_volume_stats_capacity_bytes{job="kubelet"}
+        ) < 15
+        and
+        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
+      for: 5m
+      labels:
+        severity: critical
+    - alert: KubePersistentVolumeErrors
+      annotations:
+        message: The persistent volume {{ $labels.persistentvolume }} has status {{
+          $labels.phase }}.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
+      expr: |
+        kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
       for: 5m
       labels:
         severity: critical
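Note: to see the current headroom per claim before these volume alerts fire,
the same metrics can be queried directly (illustrative query, not part of the
patch):

    100 * kubelet_volume_stats_available_bytes{job="kubelet"}
      / kubelet_volume_stats_capacity_bytes{job="kubelet"}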
@@ -577,7 +693,7 @@
     rules:
     - alert: KubeNodeNotReady
       annotations:
-        message: '{{ $labels.node }} has been unready for more than an hour'
+        message: '{{ $labels.node }} has been unready for more than an hour.'
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
       expr: |
         kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
@@ -586,11 +702,11 @@
         severity: warning
     - alert: KubeVersionMismatch
       annotations:
-        message: There are {{ $value }} different versions of Kubernetes components
-          running.
+        message: There are {{ $value }} different semantic versions of Kubernetes
+          components running.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
       expr: |
-        count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
+        count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
       for: 1h
       labels:
         severity: warning
@@ -600,17 +716,17 @@
          }}' is experiencing {{ printf "%0.0f" $value }}% errors.'
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
       expr: |
-        sum(rate(rest_client_requests_total{code!~"2.."}[5m])) by (instance, job) * 100
+        (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
           /
-        sum(rate(rest_client_requests_total[5m])) by (instance, job)
-          > 1
+        sum(rate(rest_client_requests_total[5m])) by (instance, job))
+        * 100 > 1
       for: 15m
       labels:
         severity: warning
     - alert: KubeClientErrors
       annotations:
         message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
-          }}' is experiencing {{ printf "%0.0f" $value }} errors / sec.'
+          }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
       expr: |
         sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
@@ -619,93 +735,285 @@
         severity: warning
     - alert: KubeletTooManyPods
       annotations:
-        message: Kubelet {{$labels.instance}} is running {{$value}} pods, close to
-          the limit of 110.
+        message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
+          to the limit of 110.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       expr: |
-        kubelet_running_pod_count{job="kubelet"} > 100
+        kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
       for: 15m
       labels:
         severity: warning
     - alert: KubeAPILatencyHigh
       annotations:
         message: The API server has a 99th percentile latency of {{ $value }} seconds
-          for {{$labels.verb}} {{$labels.resource}}.
+          for {{ $labels.verb }} {{ $labels.resource }}.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
       expr: |
-        cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
+        cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
       for: 10m
       labels:
         severity: warning
     - alert: KubeAPILatencyHigh
       annotations:
         message: The API server has a 99th percentile latency of {{ $value }} seconds
-          for {{$labels.verb}} {{$labels.resource}}.
+          for {{ $labels.verb }} {{ $labels.resource }}.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
       expr: |
-        cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
+        cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
       for: 10m
       labels:
         severity: critical
     - alert: KubeAPIErrorsHigh
       annotations:
-        message: API server is erroring for {{ $value }}% of requests.
+        message: API server is returning errors for {{ $value }}% of requests.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
-        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
+        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
           /
-        sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
+        sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
       for: 10m
       labels:
         severity: critical
     - alert: KubeAPIErrorsHigh
       annotations:
-        message: API server is erroring for {{ $value }}% of requests.
+        message: API server is returning errors for {{ $value }}% of requests.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
-        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
+        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
           /
-        sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
+        sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: KubeAPIErrorsHigh
+      annotations:
+        message: API server is returning errors for {{ $value }}% of requests for
+          {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
+      expr: |
+        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
+          /
+        sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
+      for: 10m
+      labels:
+        severity: critical
+    - alert: KubeAPIErrorsHigh
+      annotations:
+        message: API server is returning errors for {{ $value }}% of requests for
+          {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
+      expr: |
+        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
+          /
+        sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
       for: 10m
       labels:
         severity: warning
     - alert: KubeClientCertificateExpiration
       annotations:
-        message: Kubernetes API certificate is expiring in less than 7 days.
+        message: A client certificate used to authenticate to the apiserver is expiring
+          in less than 7.0 days.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
-        histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
+        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
       labels:
         severity: warning
     - alert: KubeClientCertificateExpiration
       annotations:
-        message: Kubernetes API certificate is expiring in less than 1 day.
+        message: A client certificate used to authenticate to the apiserver is expiring
+          in less than 24.0 hours.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
-        histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
+        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
+      labels:
+        severity: critical
+  - name: alertmanager.rules
+    rules:
+    - alert: AlertmanagerConfigInconsistent
+      annotations:
+        message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
+          are out of sync.
+      expr: |
+        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
+      for: 5m
+      labels:
+        severity: critical
+    - alert: AlertmanagerFailedReload
+      annotations:
+        message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
+          }}/{{ $labels.pod}}.
+      expr: |
+        alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: AlertmanagerMembersInconsistent
+      annotations:
+        message: Alertmanager has not found all other members of the cluster.
+      expr: |
+        alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
+          != on (service) GROUP_LEFT()
+        count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
+      for: 5m
       labels:
         severity: critical
-    -
+  - name: general.rules
+    rules:
+    - alert: TargetDown
+      annotations:
+        message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
+      expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
+      for: 10m
+      labels:
+        severity: warning
+    - alert: Watchdog
+      annotations:
+        message: |
+          This is an alert meant to ensure that the entire alerting pipeline is functional.
+          This alert is always firing, therefore it should always be firing in Alertmanager
+          and always fire against a receiver. There are integrations with various notification
+          mechanisms that send a notification when this alert is not firing. For example the
+          "DeadMansSnitch" integration in PagerDuty.
+      expr: vector(1)
+      labels:
+        severity: none
   - name: kube-prometheus-node-alerting.rules
     rules:
     - alert: NodeDiskRunningFull
       annotations:
-        description: device {{$labels.device}} on node {{$labels.instance}} is running
-          full within the next 24 hours (mounted at {{$labels.mountpoint}})
-        summary: Node disk is running full within 24 hours
+        message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
+          }}/{{ $labels.pod }} will be full within the next 24 hours.
       expr: |
-        predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[6h], 3600 * 24) < 0 and on(instance) up{job="node-exporter"}
+        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
       for: 30m
       labels:
         severity: warning
     - alert: NodeDiskRunningFull
       annotations:
-        description: device {{$labels.device}} on node {{$labels.instance}} is running
-          full within the next 2 hours (mounted at {{$labels.mountpoint}})
-        summary: Node disk is running full within 2 hours
+        message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
+          }}/{{ $labels.pod }} will be full within the next 2 hours.
+      expr: |
+        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
+      for: 10m
+      labels:
+        severity: critical
+  - name: prometheus.rules
+    rules:
+    - alert: PrometheusConfigReloadFailed
+      annotations:
+        description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+        summary: Reloading Prometheus' configuration failed
+      expr: |
+        prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"} == 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusNotificationQueueRunningFull
+      annotations:
+        description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
+          $labels.pod}}
+        summary: Prometheus' alert notification queue is running full
       expr: |
-        predict_linear(node_filesystem_free{job="node-exporter",mountpoint!~"^/etc/(?:resolv.conf|hosts|hostname)$"}[30m], 3600 * 2) < 0 and on(instance) up{job="node-exporter"}
+        predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusErrorSendingAlerts
+      annotations:
+        description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+          $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+        summary: Errors while sending alert from Prometheus
+      expr: |
+        rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.01
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusErrorSendingAlerts
+      annotations:
+        description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+          $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+        summary: Errors while sending alerts from Prometheus
+      expr: |
+        rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.03
       for: 10m
       labels:
         severity: critical
+    - alert: PrometheusNotConnectedToAlertmanagers
+      annotations:
+        description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
+          to any Alertmanagers
+        summary: Prometheus is not connected to any Alertmanagers
+      expr: |
+        prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"} < 1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusTSDBReloadsFailing
+      annotations:
+        description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
+          reload failures over the last four hours.'
+        summary: Prometheus has issues reloading data blocks from disk
+      expr: |
+        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
+      for: 12h
+      labels:
+        severity: warning
+    - alert: PrometheusTSDBCompactionsFailing
+      annotations:
+        description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
+          compaction failures over the last four hours.'
+        summary: Prometheus has issues compacting sample blocks
+      expr: |
+        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
+      for: 12h
+      labels:
+        severity: warning
+    - alert: PrometheusTSDBWALCorruptions
+      annotations:
+        description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
+          log (WAL).'
+        summary: Prometheus write-ahead log is corrupted
+      expr: |
+        prometheus_tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
+      for: 4h
+      labels:
+        severity: warning
+    - alert: PrometheusNotIngestingSamples
+      annotations:
+        description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting
+          samples.
+        summary: Prometheus isn't ingesting samples
+      expr: |
+        rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusTargetScrapesDuplicate
+      annotations:
+        description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected
+          due to duplicate timestamps but different values'
+        summary: Prometheus has many samples rejected
+      expr: |
+        increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
+      for: 10m
+      labels:
+        severity: warning
+  - name: prometheus-operator
+    rules:
+    - alert: PrometheusOperatorReconcileErrors
+      annotations:
+        message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
+          }} Namespace.
+      expr: |
+        rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusOperatorNodeLookupErrors
+      annotations:
+        message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
+      expr: |
+        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
+      for: 10m
+      labels:
+        severity: warning
\ No newline at end of file
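Note: both PrometheusErrorSendingAlerts severities above watch the same error
ratio with different thresholds (1% warning, 3% critical); it can be inspected
directly with this illustrative query (not part of the patch):

    rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
      /
    rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])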
diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml
index 3150c40..99f9fc7 100644
--- a/manifests/prometheus/prometheus-k8s.yaml
+++ b/manifests/prometheus/prometheus-k8s.yaml
@@ -30,6 +30,10 @@ spec:
     requests:
       memory: PROMETHEUS_MEMORY_REQUEST
   retention: PROMETHEUS_STORAGE_RETENTION
+  securityContext:
+    fsGroup: 2000
+    runAsNonRoot: true
+    runAsUser: 1000
   storage:
     class: STORAGE_CLASS_TYPE
     selector:
diff --git a/release/download b/release/download
index a7af68d..29ffccd 100755
--- a/release/download
+++ b/release/download
@@ -1,6 +1,6 @@
 #! /bin/sh

-PROMKUBE_VERSION=${PROMKUBE_VERSION:-2.1.0}
+PROMKUBE_VERSION=${PROMKUBE_VERSION:-2.8.0}

 NAME="prometheus-kubernetes"