From 292daebe05b36686f982fa30f942f9c1ce908a66 Mon Sep 17 00:00:00 2001 From: Feruzjon Muyassarov Date: Thu, 28 Sep 2023 13:30:21 +0300 Subject: [PATCH] deployment: refactor config manager to support NRI enabling in CRI-O This commit extends config manager code and the plugins helm charts so that CRI-O users are also able to enable NRI via our charts if they wish to. Same parameter is used to opt in for the feature in Helm charts and we don't require users to indicate what container runtime is being used. Instead the config manager auto-detects the runtime and does the necessary changes to its configuration file. In scenarios with multiple active runtimes (e.g., CRI-O and containerd), the manager gracefully exits and throws an error. Signed-off-by: Feruzjon Muyassarov --- cmd/config-manager/main.go | 106 ++++++++++++++---- .../balloons/templates/daemonset.yaml | 17 +-- .../balloons/values.yaml | 3 +- .../topology-aware/templates/daemonset.yaml | 17 +-- .../topology-aware/values.yaml | 2 +- docs/resource-policy/installation.md | 23 ++-- 6 files changed, 124 insertions(+), 44 deletions(-) diff --git a/cmd/config-manager/main.go b/cmd/config-manager/main.go index d81e37d97..2963e4455 100644 --- a/cmd/config-manager/main.go +++ b/cmd/config-manager/main.go @@ -29,33 +29,79 @@ import ( ) const ( - tomlFilePath = "/etc/containerd/config.toml" - nriPluginKey = "io.containerd.nri.v1.nri" - disableKey = "disable" - replaceMode = "replace" - resultDone = "done" - unit = "containerd.service" + containerdConfigFile = "/etc/containerd/config.toml" + crioConfigFile = "/etc/crio/crio.conf.d/10-enable-nri.conf" + nriPluginKey = "io.containerd.nri.v1.nri" + replaceMode = "replace" + resultDone = "done" + containerdUnit = "containerd.service" + crioUnit = "crio.service" ) func main() { - tomlMap, err := readConfig(tomlFilePath) + unit, err := detectRuntime() if err != nil { - log.Fatalf("Error reading TOML file: %v", err) + log.Fatalf("failed to autodetect container runtime: %v", 
err) } - updatedTomlMap := updateNRIPlugin(tomlMap) + switch unit { + case containerdUnit: + err = enableNriForContainerd() + case crioUnit: + err = enableNriForCrio() + default: + log.Fatalf("unknown container runtime %q", unit) + } + + if err != nil { + log.Fatalf("error enabling NRI: %v", err) + } + + if err = restartSystemdUnit(unit); err != nil { + log.Fatalf("failed to restart %q unit: %v", unit, err) + } + + log.Println("enabled NRI for", unit) +} - err = writeConfig(tomlFilePath, updatedTomlMap) +func enableNriForContainerd() error { + tomlMap, err := readConfig(containerdConfigFile) if err != nil { - log.Fatalf("Error reading TOML file: %v", err) + return fmt.Errorf("error reading TOML file: %w", err) } - err = restartSystemdUnit(unit) + updatedTomlMap := updateContainerdConfig(tomlMap) + + err = writeToContainerdConfig(containerdConfigFile, updatedTomlMap) if err != nil { - log.Fatalf("failed to restart containerd: %v", err) + return fmt.Errorf("failed to write updated config into a file %q: %w", containerdConfigFile, err) } + return nil } -func writeConfig(file string, config map[string]interface{}) error { + +func enableNriForCrio() error { + err := updateCrioConfig() + if err != nil { + return fmt.Errorf("failed to update the CRI-O configuration: %w", err) + } + return nil +} + +func updateCrioConfig() error { + f, err := os.Create(crioConfigFile) + if err != nil { + return fmt.Errorf("error creating a drop-in file for CRI-O: %w", err) + } + defer f.Close() + + _, err = f.WriteString("[crio.nri]\nenable_nri = true\n") + if err != nil { + return fmt.Errorf("error writing a drop-in file for CRI-O: %w", err) + } + return nil +} + +func writeToContainerdConfig(file string, config map[string]interface{}) error { var buf bytes.Buffer enc := tomlv2.NewEncoder(&buf) enc.SetIndentTables(true) @@ -90,10 +136,10 @@ func readConfig(file string) (map[string]interface{}, error) { return tomlMap, nil } -func updateNRIPlugin(config 
map[string]interface{}) map[string]interface{} { +func updateContainerdConfig(config map[string]interface{}) map[string]interface{} { plugins, exists := config["plugins"].(map[string]interface{}) if !exists { - log.Println("Top level plugins section not found, adding it to enable NRI...") + log.Println("top level plugins section not found, adding it to enable NRI...") plugins = make(map[string]interface{}) config["plugins"] = plugins } @@ -105,15 +151,44 @@ func updateNRIPlugin(config map[string]interface{}) map[string]interface{} { plugins[nriPluginKey] = nri } - nri[disableKey] = false - log.Println("Enabled NRI...") + nri["disable"] = false return config } +// detectRuntime returns the name of the single active container runtime systemd unit +// (containerdUnit or crioUnit). It returns an error when none or more than one is active. +func detectRuntime() (string, error) { + conn, err := dbus.NewSystemConnectionContext(context.Background()) + if err != nil { + return "", fmt.Errorf("failed to create DBus connection: %w", err) + } + defer conn.Close() + + // Filter out active container runtime (CRI-O or containerd) systemd units on the node. + // It is expected that only one container runtime systemd unit should be active at a time + // (either containerd or CRI-O). If none, or more than one, container runtime systemd unit is + // found to be in an active state, the process fails. 
+ units, err := conn.ListUnitsByPatternsContext(context.Background(), []string{"active"}, []string{containerdUnit, crioUnit}) + if err != nil { + return "", fmt.Errorf("failed to detect container runtime in use: %w", err) + } + + // Guard against an empty result: indexing units[0] below would panic otherwise. + if len(units) == 0 { + return "", fmt.Errorf("no active container runtime detected on the host, expected one") + } + + if len(units) > 1 { + return "", fmt.Errorf("detected more than one container runtime on the host, expected one") + } + + return units[0].Name, nil +} + func restartSystemdUnit(unit string) error { conn, err := dbus.NewSystemConnectionContext(context.Background()) if err != nil { - return fmt.Errorf("failed to create DBus connection for unit %q: %w", unit, err) + return fmt.Errorf("failed to create DBus connection: %w", err) } defer conn.Close() diff --git a/deployment/helm/resource-management-policies/balloons/templates/daemonset.yaml b/deployment/helm/resource-management-policies/balloons/templates/daemonset.yaml index 33133bdd6..9d7e52f72 100644 --- a/deployment/helm/resource-management-policies/balloons/templates/daemonset.yaml +++ b/deployment/helm/resource-management-policies/balloons/templates/daemonset.yaml @@ -17,14 +17,14 @@ spec: serviceAccount: nri-resource-policy-balloons nodeSelector: kubernetes.io/os: "linux" - {{- if .Values.nri.patchContainerdConfig }} + {{- if .Values.nri.patchRuntime }} initContainers: - - name: patch-containerd + - name: patch-runtime image: {{ .Values.initContainerImage.name }}:{{ .Values.initContainerImage.tag | default .Chart.AppVersion }} imagePullPolicy: {{ .Values.initContainerImage.pullPolicy }} volumeMounts: - - name: containerd-config - mountPath: /etc/containerd/config.toml + - name: etc + mountPath: /etc - name: dbus-socket mountPath: /var/run/dbus/system_bus_socket securityContext: @@ -91,11 +91,11 @@ spec: hostPath: path: /var/run/nri type: DirectoryOrCreate - {{- if .Values.nri.patchContainerdConfig }} - - name: containerd-config + {{- if .Values.nri.patchRuntime }} + - name: etc hostPath: - path: /etc/containerd/config.toml - type: File + path: /etc + type: 
Directory - name: dbus-socket hostPath: path: /var/run/dbus/system_bus_socket diff --git a/deployment/helm/resource-management-policies/balloons/values.yaml b/deployment/helm/resource-management-policies/balloons/values.yaml index 7c013a16c..6f392a912 100644 --- a/deployment/helm/resource-management-policies/balloons/values.yaml +++ b/deployment/helm/resource-management-policies/balloons/values.yaml @@ -20,7 +20,7 @@ resources: memory: 512Mi nri: - patchContainerdConfig: false + patchRuntime: false initContainerImage: name: ghcr.io/containers/nri-plugins/nri-resource-policy-config-manager diff --git a/deployment/helm/resource-management-policies/topology-aware/templates/daemonset.yaml b/deployment/helm/resource-management-policies/topology-aware/templates/daemonset.yaml index 467655acc..9164bdd17 100644 --- a/deployment/helm/resource-management-policies/topology-aware/templates/daemonset.yaml +++ b/deployment/helm/resource-management-policies/topology-aware/templates/daemonset.yaml @@ -17,14 +17,14 @@ spec: serviceAccount: nri-resource-policy-topology-aware nodeSelector: kubernetes.io/os: "linux" - {{- if .Values.nri.patchContainerdConfig }} + {{- if .Values.nri.patchRuntime }} initContainers: - - name: patch-containerd + - name: patch-runtime image: {{ .Values.initContainerImage.name }}:{{ .Values.initContainerImage.tag | default .Chart.AppVersion }} imagePullPolicy: {{ .Values.initContainerImage.pullPolicy }} volumeMounts: - - name: containerd-config - mountPath: /etc/containerd/config.toml + - name: etc + mountPath: /etc - name: dbus-socket mountPath: /var/run/dbus/system_bus_socket securityContext: @@ -91,11 +91,11 @@ spec: hostPath: path: /var/run/nri type: DirectoryOrCreate - {{- if .Values.nri.patchContainerdConfig }} - - name: containerd-config + {{- if .Values.nri.patchRuntime }} + - name: etc hostPath: - path: /etc/containerd/config.toml - type: File + path: /etc + type: Directory - name: dbus-socket hostPath: path: 
/var/run/dbus/system_bus_socket diff --git a/deployment/helm/resource-management-policies/topology-aware/values.yaml b/deployment/helm/resource-management-policies/topology-aware/values.yaml index e7948be3b..a63ad90cc 100644 --- a/deployment/helm/resource-management-policies/topology-aware/values.yaml +++ b/deployment/helm/resource-management-policies/topology-aware/values.yaml @@ -20,7 +20,7 @@ resources: memory: 512Mi nri: - patchContainerdConfig: false + patchRuntime: false initContainerImage: name: ghcr.io/containers/nri-plugins/nri-resource-policy-config-manager diff --git a/docs/resource-policy/installation.md b/docs/resource-policy/installation.md index 85bdc006d..aab7ae788 100644 --- a/docs/resource-policy/installation.md +++ b/docs/resource-policy/installation.md @@ -18,23 +18,32 @@ following components: DaemonSet, ConfigMap, CustomResourceDefinition, and RBAC-r - Container runtime: - containerD: - At least [containerd 1.7.0](https://github.com/containerd/containerd/releases/tag/v1.7.0) - release version to use the NRI feature + release version to use the NRI feature. + - Enable NRI feature by following [these](https://github.com/containerd/containerd/blob/main/docs/NRI.md#enabling-nri-support-in-containerd) detailed instructions. You can optionally enable the NRI in containerd using the Helm chart - during the chart installation simply by setting the `nri.patchContainerdConfig` parameter. + during the chart installation simply by setting the `nri.patchRuntime` parameter. 
For instance, ```sh - helm install topology-aware --namespace kube-system --set nri.patchContainerdConfig=true deployment/helm/resource-management-policies/topology-aware/ + helm install topology-aware --namespace kube-system --set nri.patchRuntime=true deployment/helm/resource-management-policies/topology-aware/ ``` - Enabling `nri.patchContainerdConfig` creates an init container to turn on + Enabling `nri.patchRuntime` creates an init container to turn on NRI feature in containerd and only after that proceed the plugin installation. - CRI-O - At least [v1.26.0](https://github.com/cri-o/cri-o/releases/tag/v1.26.0) release version to use the NRI feature - Enable NRI feature by following [these](https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md#crionri-table) detailed instructions. + You can optionally enable the NRI in CRI-O using the Helm chart + during the chart installation simply by setting the `nri.patchRuntime` parameter. + For instance, + + ```sh + helm install topology-aware --namespace kube-system --set nri.patchRuntime=true deployment/helm/resource-management-policies/topology-aware/ + ``` + - Kubernetes 1.24+ - Helm 3.0.0+ @@ -94,14 +103,14 @@ along with the default values, for the Topology-aware and Balloons plugins Helm | Name | Default | Description | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------- | -| `image.name` | [ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware](ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware) | container image name | +| `image.name` | [ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware](ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware) | container image name | | `image.tag` | unstable | container image tag | | `image.pullPolicy` | Always | image pull policy | | `resources.cpu` | 500m | cpu 
resources for the Pod | | `resources.memory` | 512Mi | memory qouta for the Pod | | `hostPort` | 8891 | metrics port to expose on the host | | `config` |
ReservedResources:
cpu: 750m
| plugin configuration data | -| `nri.patchContainerdConfig` | false | enable/disable NRI in containerd. | +| `nri.patchRuntime` | false | enable NRI in containerd or CRI-O | | `initImage.name` | [ghcr.io/containers/nri-plugins/config-manager](ghcr.io/containers/nri-plugins/config-manager) | init container image name | | `initImage.tag` | unstable | init container image tag | | `initImage.pullPolicy` | Always | init container image pull policy | @@ -117,7 +126,7 @@ along with the default values, for the Topology-aware and Balloons plugins Helm | `resources.memory` | 512Mi | memory qouta for the Pod | | `hostPort` | 8891 | metrics port to expose on the host | | `config` |
ReservedResources:
cpu: 750m
| plugin configuration data | -| `nri.patchContainerdConfig` | false | enable/disable NRI in containerd. | +| `nri.patchRuntime` | false | enable NRI in containerd or CRI-O | | `initImage.name` | [ghcr.io/containers/nri-plugins/config-manager](ghcr.io/containers/nri-plugins/config-manager) | init container image name | | `initImage.tag` | unstable | init container image tag | | `initImage.pullPolicy` | Always | init container image pull policy |