From 3e49814d65c774941648d0e6782760dd271b53a0 Mon Sep 17 00:00:00 2001 From: Feruzjon Muyassarov Date: Thu, 28 Sep 2023 13:30:21 +0300 Subject: [PATCH] deployment: refactor config manager to support NRI enabling in CRI-O This commit extends the config manager code and the plugins' Helm charts so that CRI-O users can also enable NRI via our charts if they wish to. The same parameter is used to opt in to the feature in the Helm charts, and users are not required to indicate which container runtime is being used. Instead, the config manager auto-detects the runtime and makes the necessary changes to its configuration file. In scenarios with multiple active runtimes (e.g., CRI-O and containerd), the manager logs an error and exits. Signed-off-by: Feruzjon Muyassarov --- cmd/config-manager/main.go | 98 +++++++++++++++---- .../helm/balloons/templates/daemonset.yaml | 19 ++-- deployment/helm/balloons/values.yaml | 3 +- .../topology-aware/templates/daemonset.yaml | 19 ++-- deployment/helm/topology-aware/values.yaml | 2 +- docs/resource-policy/installation.md | 23 +++-- 6 files changed, 124 insertions(+), 40 deletions(-) diff --git a/cmd/config-manager/main.go b/cmd/config-manager/main.go index d81e37d97..ee3c7969b 100644 --- a/cmd/config-manager/main.go +++ b/cmd/config-manager/main.go @@ -29,33 +29,71 @@ import ( ) const ( - tomlFilePath = "/etc/containerd/config.toml" - nriPluginKey = "io.containerd.nri.v1.nri" - disableKey = "disable" - replaceMode = "replace" - resultDone = "done" - unit = "containerd.service" + containerdConfigFile = "/etc/containerd/config.toml" + crioConfigFile = "/etc/crio/crio.conf.d/10-enable-nri.conf" + nriPluginKey = "io.containerd.nri.v1.nri" + replaceMode = "replace" + resultDone = "done" + containerdUnit = "containerd.service" + crioUnit = "crio.service" ) func main() { - tomlMap, err := readConfig(tomlFilePath) + unit, err := detectRuntime() if err != nil { - log.Fatalf("Error reading TOML file: %v", err) + log.Fatalf("failed to
autodetect container runtime: %v", err) } - updatedTomlMap := updateNRIPlugin(tomlMap) + switch unit { + case containerdUnit: + err = enableNriForContainerd() + case crioUnit: + err = enableNriForCrio() + default: + log.Fatalf("unknown container runtime %q", unit) + } + + if err != nil { + log.Fatalf("error enabling NRI: %v", err) + } + + if err = restartSystemdUnit(unit); err != nil { + log.Fatalf("failed to restart %q unit: %v", unit, err) + } + + log.Println("enabled NRI for", unit) +} + +func enableNriForContainerd() error { + tomlMap, err := readConfig(containerdConfigFile) + if err != nil { + return fmt.Errorf("error reading TOML file: %w", err) + } + + updatedTomlMap := updateContainerdConfig(tomlMap) + + err = writeToContainerdConfig(containerdConfigFile, updatedTomlMap) + if err != nil { + return fmt.Errorf("failed to write updated config into a file %q: %w", containerdConfigFile, err) + } + return nil +} - err = writeConfig(tomlFilePath, updatedTomlMap) +func enableNriForCrio() error { + f, err := os.Create(crioConfigFile) if err != nil { - log.Fatalf("failed to write updated config into a file %q:, %v", tomlFilePath, err) + return fmt.Errorf("error creating a drop-in file for CRI-O: %w", err) } + defer f.Close() - err = restartSystemdUnit(unit) + _, err = f.WriteString("[crio.nri]\nenable_nri = true\n") if err != nil { - log.Fatalf("failed to restart containerd: %v", err) + return fmt.Errorf("error writing a drop-in file for CRI-O: %w", err) } + return nil } -func writeConfig(file string, config map[string]interface{}) error { + +func writeToContainerdConfig(file string, config map[string]interface{}) error { var buf bytes.Buffer enc := tomlv2.NewEncoder(&buf) enc.SetIndentTables(true) @@ -90,10 +128,10 @@ func readConfig(file string) (map[string]interface{}, error) { return tomlMap, nil } -func updateNRIPlugin(config map[string]interface{}) map[string]interface{} { +func updateContainerdConfig(config map[string]interface{}) map[string]interface{} { 
plugins, exists := config["plugins"].(map[string]interface{}) if !exists { - log.Println("Top level plugins section not found, adding it to enable NRI...") + log.Println("top level plugins section not found, adding it to enable NRI...") plugins = make(map[string]interface{}) config["plugins"] = plugins } @@ -105,15 +143,37 @@ func updateNRIPlugin(config map[string]interface{}) map[string]interface{} { plugins[nriPluginKey] = nri } - nri[disableKey] = false - log.Println("Enabled NRI...") + nri["disable"] = false return config } +func detectRuntime() (string, error) { + conn, err := dbus.NewSystemConnectionContext(context.Background()) + if err != nil { + return "", fmt.Errorf("failed to create DBus connection: %w", err) + } + defer conn.Close() + + // Look up the active container runtime (CRI-O or containerd) systemd units on the node. + // It is expected that only one container runtime systemd unit should be active at a time + // (either containerd or CRI-O). If more than one container runtime systemd unit is found + // to be in an active state, the process fails.
+ units, err := conn.ListUnitsByPatternsContext(context.Background(), []string{"active"}, []string{containerdUnit, crioUnit}) + if err != nil { + return "", fmt.Errorf("failed to detect container runtime in use: %w", err) + } + + if len(units) > 1 { + return "", fmt.Errorf("detected more than one container runtime on the host, expected one") + } + + return units[0].Name, nil +} + func restartSystemdUnit(unit string) error { conn, err := dbus.NewSystemConnectionContext(context.Background()) if err != nil { - return fmt.Errorf("failed to create DBus connection for unit %q: %w", unit, err) + return fmt.Errorf("failed to create DBus connection: %w", err) } defer conn.Close() diff --git a/deployment/helm/balloons/templates/daemonset.yaml b/deployment/helm/balloons/templates/daemonset.yaml index e0410387d..a6463c272 100644 --- a/deployment/helm/balloons/templates/daemonset.yaml +++ b/deployment/helm/balloons/templates/daemonset.yaml @@ -17,14 +17,17 @@ spec: serviceAccount: nri-resource-policy-balloons nodeSelector: kubernetes.io/os: "linux" - {{- if .Values.nri.patchContainerdConfig }} + {{- if .Values.nri.patchRuntimeConfig }} initContainers: - - name: patch-containerd + - name: patch-runtime image: {{ .Values.initContainerImage.name }}:{{ .Values.initContainerImage.tag | default .Chart.AppVersion }} imagePullPolicy: {{ .Values.initContainerImage.pullPolicy }} + restartPolicy: Never volumeMounts: - name: containerd-config - mountPath: /etc/containerd/config.toml + mountPath: /etc/containerd + - name: crio-config + mountPath: /etc/crio/crio.conf.d - name: dbus-socket mountPath: /var/run/dbus/system_bus_socket securityContext: @@ -91,11 +94,15 @@ spec: hostPath: path: /var/run/nri type: DirectoryOrCreate - {{- if .Values.nri.patchContainerdConfig }} + {{- if .Values.nri.patchRuntimeConfig }} - name: containerd-config hostPath: - path: /etc/containerd/config.toml - type: File + path: /etc/containerd/ + type: DirectoryOrCreate + - name: crio-config + hostPath: + path: 
/etc/crio/crio.conf.d/ + type: DirectoryOrCreate - name: dbus-socket hostPath: path: /var/run/dbus/system_bus_socket diff --git a/deployment/helm/balloons/values.yaml b/deployment/helm/balloons/values.yaml index 50d953ca2..fdc562fc0 100644 --- a/deployment/helm/balloons/values.yaml +++ b/deployment/helm/balloons/values.yaml @@ -20,7 +20,8 @@ resources: memory: 512Mi nri: - patchContainerdConfig: false + patchRuntimeConfig: false + initContainerImage: name: ghcr.io/containers/nri-plugins/nri-config-manager diff --git a/deployment/helm/topology-aware/templates/daemonset.yaml b/deployment/helm/topology-aware/templates/daemonset.yaml index db1abb531..c5e3f70fb 100644 --- a/deployment/helm/topology-aware/templates/daemonset.yaml +++ b/deployment/helm/topology-aware/templates/daemonset.yaml @@ -17,14 +17,17 @@ spec: serviceAccount: nri-resource-policy-topology-aware nodeSelector: kubernetes.io/os: "linux" - {{- if .Values.nri.patchContainerdConfig }} + {{- if .Values.nri.patchRuntimeConfig }} initContainers: - - name: patch-containerd + - name: patch-runtime image: {{ .Values.initContainerImage.name }}:{{ .Values.initContainerImage.tag | default .Chart.AppVersion }} imagePullPolicy: {{ .Values.initContainerImage.pullPolicy }} + restartPolicy: Never volumeMounts: - name: containerd-config - mountPath: /etc/containerd/config.toml + mountPath: /etc/containerd + - name: crio-config + mountPath: /etc/crio/crio.conf.d - name: dbus-socket mountPath: /var/run/dbus/system_bus_socket securityContext: @@ -91,11 +94,15 @@ spec: hostPath: path: /var/run/nri type: DirectoryOrCreate - {{- if .Values.nri.patchContainerdConfig }} + {{- if .Values.nri.patchRuntimeConfig }} - name: containerd-config hostPath: - path: /etc/containerd/config.toml - type: File + path: /etc/containerd/ + type: DirectoryOrCreate + - name: crio-config + hostPath: + path: /etc/crio/crio.conf.d/ + type: DirectoryOrCreate - name: dbus-socket hostPath: path: /var/run/dbus/system_bus_socket diff --git 
a/deployment/helm/topology-aware/values.yaml b/deployment/helm/topology-aware/values.yaml index 8393f4469..41e93d01b 100644 --- a/deployment/helm/topology-aware/values.yaml +++ b/deployment/helm/topology-aware/values.yaml @@ -20,7 +20,7 @@ resources: memory: 512Mi nri: - patchContainerdConfig: false + patchRuntimeConfig: false initContainerImage: name: ghcr.io/containers/nri-plugins/nri-config-manager diff --git a/docs/resource-policy/installation.md b/docs/resource-policy/installation.md index d7ff0424a..1d9c69227 100644 --- a/docs/resource-policy/installation.md +++ b/docs/resource-policy/installation.md @@ -18,23 +18,32 @@ following components: DaemonSet, ConfigMap, CustomResourceDefinition, and RBAC-r - Container runtime: - containerD: - At least [containerd 1.7.0](https://github.com/containerd/containerd/releases/tag/v1.7.0) - release version to use the NRI feature + release version to use the NRI feature. + - Enable NRI feature by following [these](https://github.com/containerd/containerd/blob/main/docs/NRI.md#enabling-nri-support-in-containerd) detailed instructions. You can optionally enable the NRI in containerd using the Helm chart - during the chart installation simply by setting the `nri.patchContainerdConfig` parameter. + during the chart installation simply by setting the `nri.patchRuntimeConfig` parameter. For instance, ```sh - helm install topology-aware --namespace kube-system --set nri.patchContainerdConfig=true deployment/helm/topology-aware/ + helm install topology-aware --namespace kube-system --set nri.patchRuntimeConfig=true deployment/helm/topology-aware/ ``` - Enabling `nri.patchContainerdConfig` creates an init container to turn on + Enabling `nri.patchRuntimeConfig` creates an init container to turn on NRI feature in containerd and only after that proceed the plugin installation. 
- CRI-O - At least [v1.26.0](https://github.com/cri-o/cri-o/releases/tag/v1.26.0) release version to use the NRI feature - Enable NRI feature by following [these](https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md#crionri-table) detailed instructions. + You can optionally enable the NRI in CRI-O using the Helm chart + during the chart installation simply by setting the `nri.patchRuntimeConfig` parameter. + For instance, + + ```sh + helm install topology-aware --namespace kube-system --set nri.patchRuntimeConfig=true deployment/helm/topology-aware/ + ``` + - Kubernetes 1.24+ - Helm 3.0.0+ @@ -94,14 +103,14 @@ along with the default values, for the Topology-aware and Balloons plugins Helm | Name | Default | Description | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------- | -| `image.name` | [ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware](ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware) | container image name | +| `image.name` | [ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware](ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware) | container image name | | `image.tag` | unstable | container image tag | | `image.pullPolicy` | Always | image pull policy | | `resources.cpu` | 500m | cpu resources for the Pod | | `resources.memory` | 512Mi | memory qouta for the Pod | | `hostPort` | 8891 | metrics port to expose on the host | | `config` |
ReservedResources:
cpu: 750m
| plugin configuration data | -| `nri.patchContainerdConfig` | false | enable/disable NRI in containerd. | +| `nri.patchRuntimeConfig` | false | enable NRI in containerd or CRI-O | | `initImage.name` | [ghcr.io/containers/nri-plugins/config-manager](ghcr.io/containers/nri-plugins/config-manager) | init container image name | | `initImage.tag` | unstable | init container image tag | | `initImage.pullPolicy` | Always | init container image pull policy | @@ -117,7 +126,7 @@ along with the default values, for the Topology-aware and Balloons plugins Helm | `resources.memory` | 512Mi | memory qouta for the Pod | | `hostPort` | 8891 | metrics port to expose on the host | | `config` |
ReservedResources:
cpu: 750m
| plugin configuration data | -| `nri.patchContainerdConfig` | false | enable/disable NRI in containerd. | +| `nri.patchRuntimeConfig` | false | enable NRI in containerd or CRI-O | | `initImage.name` | [ghcr.io/containers/nri-plugins/config-manager](ghcr.io/containers/nri-plugins/config-manager) | init container image name | | `initImage.tag` | unstable | init container image tag | | `initImage.pullPolicy` | Always | init container image pull policy |