pressurecooker to multicooker #1

Open · wants to merge 26 commits into base: master

Changes from all commits (26 commits)

98af565  added helm chart (first-sportsbook, Dec 7, 2022)
1d1ba58  fix docker to run (first-sportsbook, Dec 7, 2022)
d9cb2a1  extend deployment (first-sportsbook, Dec 7, 2022)
346a78f  extracted prometheus metrics (first-sportsbook, Dec 8, 2022)
78ea4e4  clear some not needed param (first-sportsbook, Dec 8, 2022)
893a0e0  fix docker (first-sportsbook, Dec 8, 2022)
6fbd9d7  fixed issue with event.go to access /tmp for error (first-sportsbook, Dec 9, 2022)
d74be6c  init working state (first-sportsbook, Dec 10, 2022)
f2514d6  update k8s api packages (first-sportsbook, Dec 11, 2022)
e016317  lowering k8s api (first-sportsbook, Dec 11, 2022)
4353b8e  downgrade polycy only cos GKE doesnt work with it (first-sportsbook, Dec 12, 2022)
82b4348  pre rename (first-sportsbook, Dec 13, 2022)
e98bda8  rename (first-sportsbook, Dec 13, 2022)
20bd5fa  init squashed commit (first-sportsbook, Dec 13, 2022)
09b6672  Merge branch 'init' (first-sportsbook, Dec 13, 2022)
b399714  updateded Readme (first-sportsbook, Dec 14, 2022)
6d757f3  Added install section (first-sportsbook, Dec 14, 2022)
550f53f  fixed install command (first-sportsbook, Dec 14, 2022)
ef9ee62  added public img and fixed typo (first-sportsbook, Dec 14, 2022)
c057d8c  copy/paste is the root of all evil (first-sportsbook, Dec 14, 2022)
0169f77  Added NodeName to logs" (first-sportsbook, Dec 14, 2022)
908210e  Added Node name to metrics (first-sportsbook, Dec 15, 2022)
026c1ba  added podmonitor for metrics scraping (first-sportsbook, Dec 19, 2022)
21d97ee  Added a release version for metrics (first-sportsbook, Dec 19, 2022)
776b868  small cleanup (first-sportsbook, Dec 23, 2022)
01ed0ac  fixed fhe fork link (first-sportsbook, Dec 23, 2022)
17 changes: 8 additions & 9 deletions Dockerfile
@@ -1,17 +1,16 @@
-FROM golang:1.14 AS builder
+FROM golang:1.18-bullseye AS builder
 
 COPY . /work
 WORKDIR /work
-RUN useradd pressurecooker
-RUN cd /work ; go build -o kubernetes-pressurecooker cmd/main.go
+RUN useradd multicooker
+RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o app/multicooker cmd/main.go
 
-FROM scratch
+FROM alpine
 
-LABEL MAINTAINER="Rene Treffer <[email protected]>"
-COPY --from=builder /work/kubernetes-pressurecooker /usr/bin/kubernetes-pressurecooker
+COPY --from=builder /work/app/multicooker /usr/sbin/multicooker
 COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
 COPY --from=builder /etc/passwd /etc/
+RUN chmod 777 /tmp/
+USER multicooker
 
-USER pressurecooker
-
-ENTRYPOINT ["/usr/bin/kubernetes-pressurecooker", "-logtostderr"]
+ENTRYPOINT ["/usr/sbin/multicooker", "-logtostderr"]
32 changes: 26 additions & 6 deletions README.md
@@ -1,10 +1,14 @@
-# Kubernetes Pressure Cooker
+# Kubernetes Multi Cooker
+Automatically taint and evict nodes with high CPU overload, based on the chosen metric: PSI or load average. Derived from [kubernetes-loadwatcher](https://github.com/mittwald/kubernetes-loadwatcher).
 
-Automatically taint and evict nodes with high CPU overload. Derived from [kubernetes-loadwatcher](https://github.com/mittwald/kubernetes-loadwatcher).
+This actually started as a small extension of [kubernetes-pressurecooker](https://github.com/rtreffer/kubernetes-pressurecooker) just to do the job, but more and more things kept popping up that we needed. It became multicooker once we tried to move to GKE with it and hit a wall: Google has quite a few different kernels on its machines, and some of them have PSI while others don't.
 
 The load average describes the average length of the run queue whenever a scheduling decision is made. But it does not tell us how often processes were waiting for CPU time.
 The [kernel pressure metrics (psi by facebook)](https://facebookmicrosites.github.io/psi/docs/overview.html#pressure-metric-definitions) describe how often there was not enough CPU available.
 
+Some big cloud providers don't support PSI metrics out of the box (I'm looking at you, Google).
+That is why there is a flag `-use-avarage` to choose load-average metrics instead.
+
 ## Synopsis
 
 A kubernetes node can be overcommitted on CPU: there might be more processes that want more CPU than requested. This can easily happen due to variable resource usage per pod, variance in hardware or variance in pod distributions.
@@ -18,11 +22,16 @@ Pressure is more sensitive for small overloads, e.g. with pressure information i
 ## How it works
 
 This controller can be started with two threshold flags: `-taint-threshold` and `-evict-threshold`. There are also safeguard flags `-min-pod-age` and `-eviction-backoff`.
+There are also a few configuration flags:
+Use `-use-avarage` to choose load-average metrics instead of PSI.
+Use `-target-metric` to choose which metric is compared against the thresholds.
+Possible values are 1, 2 and 3, depending on the metric type: for load average this selects [Load1, Load5, Load15]; for PSI it selects [Avg10, Avg60, Avg300].
 The controller will continuously monitor a node's CPU pressure.
 
-- If the CPU pressure (5min average) exceeds the _taint threshold_, the node will be tainted with a `pressurecooker/load-exceeded` taint with the `PreferNoSchedule` effect. This will instruct Kubernetes to not schedule any additional workloads on this node if at all possible.
-- If the CPU load (both 5min and 15min average) falls back below the _taint threshold_, the taint will be removed again.
-- If the CPU load (15 min average) exceeds the _eviction threshold_, the controller will pick a suitable Pod running on the node and evict it. However, the following types of Pods will _not_ be evicted:
+- If the target metric exceeds the _taint threshold_, the node will be tainted with a `multicooker/load-exceeded` taint with the `PreferNoSchedule` effect. This will instruct Kubernetes to not schedule any additional workloads on this node if at all possible.
+- Once the node is tainted, the target metric switches to the first one (Load1 or Avg10) so the controller reacts faster.
+- If ALL the metrics fall back below the _taint threshold_, the taint will be removed again.
+- If the FIRST metric (Load1 or Avg10) exceeds the _eviction threshold_, the controller will pick a suitable Pod running on the node and evict it. However, the following types of Pods will _not_ be evicted:
 
 - Pods with the `Guaranteed` QoS class
 - Pods belonging to Stateful Sets
@@ -31,9 +40,20 @@ The controller will continuously monitor a node's CPU pressure.
 - Pods running in the `kube-system` namespace or with a critical `priorityClassName`
 - Pods newer than _min-pod-age_
 
-After a Pod was evicted, the next Pod will be evicted after a configurable _eviction backoff_ (controllable using the `evict-backoff` argument) if the load15 is still above the _eviction threshold_.
+After a Pod was evicted, the next Pod will be evicted after a configurable _eviction backoff_ (controllable using the `evict-backoff` argument) if the FIRST metric (Load1 or Avg10) is still above the _eviction threshold_.
 
 Older pods will be evicted first.
+The rationale for removing old pods first is that it is usually better to move well-behaved pods away from bad neighbors
+than to move bad neighbors through the cluster. And as a node will always stay in a healthy state, it can be assumed
+that the older pods are less likely to be the cause of an overload.
+
+## Installation
+
+There is a helm chart in the repo.
+To install it from the repo folder:
+
+`helm upgrade --install --namespace kube-system kubernetes-multicooker chart/`
+
+## TODO
+
+- Create tests
+- Fix prometheus metrics to be per node (release 1.0.2)
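For context on the two metric families the README chooses between, both are exposed directly by the kernel on every node. A hedged sketch of how to inspect them by hand — not necessarily how multicooker reads them internally:

```sh
# PSI (kernels built with CONFIG_PSI, roughly 4.20+): avg10/avg60/avg300
# correspond to -target-metric values 1/2/3.
cat /proc/pressure/cpu

# Load average: load1/load5/load15 correspond to -target-metric values 1/2/3.
cat /proc/loadavg

# Check whether the controller has tainted a given node:
kubectl get node <node-name> -o jsonpath='{.spec.taints}'
```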
21 changes: 21 additions & 0 deletions chart/.helmignore
@@ -0,0 +1,21 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj
5 changes: 5 additions & 0 deletions chart/Chart.yaml
@@ -0,0 +1,5 @@
apiVersion: v1
appVersion: "1.0"
description: A Helm chart for Kubernetes
name: multicooker
version: 0.1.0
32 changes: 32 additions & 0 deletions chart/templates/_helpers.tpl
@@ -0,0 +1,32 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "chart.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "chart.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}

{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "chart.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
9 changes: 9 additions & 0 deletions chart/templates/account.yaml
@@ -0,0 +1,9 @@
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "chart.fullname" . }}
  labels:
    app.kubernetes.io/name: {{ include "chart.name" . }}
    helm.sh/chart: {{ include "chart.chart" . }}
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
72 changes: 72 additions & 0 deletions chart/templates/daemonset.yaml
@@ -0,0 +1,72 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ include "chart.fullname" . }}
  labels:
    app.kubernetes.io/name: {{ include "chart.name" . }}
    helm.sh/chart: {{ include "chart.chart" . }}
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ include "chart.name" . }}
      app.kubernetes.io/instance: {{ .Release.Name }}
  template:
    metadata:
      labels:
        app: {{ include "chart.name" . }}
        chart: {{ .Chart.Name }}-{{ .Chart.Version }}
        release: {{ .Release.Name }}
        app.kubernetes.io/name: {{ include "chart.name" . }}
        app.kubernetes.io/instance: {{ .Release.Name }}
    spec:
      serviceAccountName: {{ include "chart.fullname" . }}
      priorityClassName: system-node-critical
      containers:
        - name: {{ .Chart.Name }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          ports:
            - containerPort: 8080
              protocol: TCP
              name: metrics
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          args:
            - -node-name=$(NODE_NAME)
            {{- if .Values.taintThreshold }}
            - -taint-threshold={{ .Values.taintThreshold }}
            {{- end }}
            {{- if .Values.evictThreshold }}
            - -evict-threshold={{ .Values.evictThreshold }}
            {{- end }}
            - -evict-backoff={{ .Values.evictBackoff }}
            - -v=8
            {{- if .Values.metricsPort }}
            - -metrics-port={{ .Values.metricsPort }}
            {{- end }}
            - -use-avarage={{ .Values.useAverage }}
            {{- if .Values.targetMetric }}
            - -target-metric={{ .Values.targetMetric }}
            {{- end }}
          resources:
{{ toYaml .Values.resources | indent 12 }}
      {{- with .Values.nodeSelector }}
      nodeSelector:
{{ toYaml . | indent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
{{ toYaml . | indent 8 }}
      {{- end }}
      tolerations:
        - key: multicooker/load-exceeded
          operator: Exists
          effect: NoSchedule
      {{- with .Values.tolerations }}
{{ toYaml . | indent 8 }}
      {{- end }}
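Because the thresholds and metric selection are all plumbed through `values.yaml`, they can be overridden at install time instead of by editing the chart. A sketch with purely illustrative values:

```sh
# Use PSI instead of load average and tune the thresholds without editing the chart.
helm upgrade --install --namespace kube-system kubernetes-multicooker chart/ \
  --set useAverage=false \
  --set targetMetric=1 \
  --set taintThreshold=30 \
  --set evictThreshold=70
```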
29 changes: 29 additions & 0 deletions chart/templates/pormonitor.yaml
@@ -0,0 +1,29 @@
{{- if .Values.podMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: kube-prometheus-stack-multicooker-metrics
  namespace: monitoring
  labels:
    # some of these labels should match the "spec.selector.matchLabels" of prometheus, otherwise metrics won't be available in targets
    app: kube-prometheus-stack-operator
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    release: kube-prometheus-stack
    {{- range $key, $value := .Values.podMonitor.additionalLabels }}
    {{ $key }}: {{ $value | quote }}
    {{- end }}
spec:
  selector:
    matchLabels:
      app: {{ include "chart.name" . }}
  podMetricsEndpoints:
    - port: metrics
      path: {{ .Values.podMonitor.endpoint }}
      interval: {{ .Values.podMonitor.interval }}
      {{- if .Values.podMonitor.scrapeTimeout }}
      scrapeTimeout: {{ .Values.podMonitor.scrapeTimeout }}
      {{- end }}
      {{- with .Values.podMonitor.relabelings }}
      relabelings: {{ toYaml . | nindent 8 }}
      {{- end }}
{{- end }}
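Before relying on the PodMonitor, it is worth confirming that the metrics endpoint answers at all. A sketch assuming the chart was installed as `kubernetes-multicooker` into `kube-system`, matching the install command in the README:

```sh
# Forward the metrics port of one of the DaemonSet's pods and fetch a sample.
kubectl -n kube-system port-forward ds/kubernetes-multicooker 8080:8080 &
curl -s http://localhost:8080/metrics | head -n 20
```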
22 changes: 22 additions & 0 deletions chart/templates/role.yaml
@@ -0,0 +1,22 @@
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ include "chart.fullname" . }}
  labels:
    app.kubernetes.io/name: {{ include "chart.name" . }}
    helm.sh/chart: {{ include "chart.chart" . }}
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
rules:
  - apiGroups: [""] # "" indicates the core API group
    resources: ["nodes"]
    verbs: ["get", "watch", "list", "update", "patch"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["create", "patch", "list", "get"]
  - apiGroups: [""]
    resources: ["pods/eviction"]
    verbs: ["create"]
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["list", "get", "watch"]
17 changes: 17 additions & 0 deletions chart/templates/rolebinding.yaml
@@ -0,0 +1,17 @@
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ include "chart.fullname" . }}
  labels:
    app.kubernetes.io/name: {{ include "chart.name" . }}
    helm.sh/chart: {{ include "chart.chart" . }}
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
subjects:
  - kind: ServiceAccount
    name: {{ include "chart.fullname" . }}
    namespace: {{ .Release.Namespace }}
roleRef:
  kind: ClusterRole
  name: {{ include "chart.fullname" . }}
  apiGroup: rbac.authorization.k8s.io
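Once the ClusterRole and its binding are applied, the service account's effective permissions can be spot-checked. A sketch assuming the default fullname `kubernetes-multicooker` in `kube-system`:

```sh
# Both checks should print "yes" if the RBAC objects rendered correctly.
kubectl auth can-i create pods/eviction \
  --as=system:serviceaccount:kube-system:kubernetes-multicooker
kubectl auth can-i patch nodes \
  --as=system:serviceaccount:kube-system:kubernetes-multicooker
```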
47 changes: 47 additions & 0 deletions chart/values.yaml
@@ -0,0 +1,47 @@
image:
  repository: docmarr/kubernetes-multicooker
  tag: "1.0.3"
  pullPolicy: Always

nameOverride: ""
fullnameOverride: ""

taintThreshold: 25
evictThreshold: 60
evictBackoff: 2.12m
metricsPort: 8080
useAverage: true
targetMetric: 2

resources: {}
  # We usually recommend not to specify default resources and to leave this as a conscious
  # choice for the user. This also increases chances charts run on environments with little
  # resources, such as Minikube. If you do want to specify resources, uncomment the following
  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
  # limits:
  #   cpu: 100m
  #   memory: 128Mi
  # requests:
  #   cpu: 100m
  #   memory: 128Mi

nodeSelector: {}

tolerations: []

affinity: {}

podMonitor:
  enabled: true
  endpoint: "/metrics"
  interval: 30s
  scrapeTimeout: 10s
  namespace: monitoring
  additionalLabels:
    release: "kube-prometheus-stack"
  relabelings:
    - sourceLabels: [__meta_kubernetes_pod_node_name]
      separator: ;
      targetLabel: instance
      replacement: $1
      action: replace
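Rendering the chart locally catches template problems (indentation, the PodMonitor toggle) before anything reaches the cluster. A sketch:

```sh
# Render all manifests and validate them client-side without installing.
helm template kubernetes-multicooker chart/ | kubectl apply --dry-run=client -f -

# Render without the PodMonitor, e.g. when the prometheus-operator CRDs are absent.
helm template kubernetes-multicooker chart/ --set podMonitor.enabled=false
```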