From 4f760f8ee35127b38c7ce571c7e113b9fc7e56c8 Mon Sep 17 00:00:00 2001 From: Feruzjon Muyassarov Date: Thu, 28 Sep 2023 13:30:21 +0300 Subject: [PATCH] deployment: refactor config manager to support NRI enabling in CRI-O This commit extends config manager code and the plugins helm charts so that CRI-O users are also able to enable NRI via our charts if they wish to. Same parameter is used to opt in for the feature in Helm charts and we don't require users to indicate what container runtime is being used. Instead the config manager auto-detects the runtime and does the necessary changes to its configuration file. In scenarios with multiple active runtimes (e.g., CRI-O and containerd), the manager gracefully exits and throws an error. Signed-off-by: Feruzjon Muyassarov --- cmd/config-manager/main.go | 91 +++++++++++++++---- .../balloons/templates/daemonset.yaml | 17 ++-- .../balloons/values.yaml | 3 +- .../topology-aware/templates/daemonset.yaml | 17 ++-- .../topology-aware/values.yaml | 2 +- docs/resource-policy/installation.md | 23 +++-- 6 files changed, 110 insertions(+), 43 deletions(-) diff --git a/cmd/config-manager/main.go b/cmd/config-manager/main.go index d81e37d97..1a9dc223d 100644 --- a/cmd/config-manager/main.go +++ b/cmd/config-manager/main.go @@ -29,33 +29,66 @@ import ( ) const ( - tomlFilePath = "/etc/containerd/config.toml" - nriPluginKey = "io.containerd.nri.v1.nri" - disableKey = "disable" - replaceMode = "replace" - resultDone = "done" - unit = "containerd.service" + containerdConfigFile = "/etc/containerd/config.toml" + crioConfigFile = "/etc/crio/crio.conf.d/10-enable-nri.conf" + nriPluginKey = "io.containerd.nri.v1.nri" + replaceMode = "replace" + resultDone = "done" + containerdUnit = "containerd.service" + crioUnit = "crio.service" ) func main() { - tomlMap, err := readConfig(tomlFilePath) + unit, err := detectRuntime() if err != nil { - log.Fatalf("Error reading TOML file: %v", err) + log.Fatalf("failed to autodetect container runtime: %v", 
err) } - updatedTomlMap := updateNRIPlugin(tomlMap) + // Edit containerd config only if containerd is detected to be the runtime. + if unit == containerdUnit { + tomlMap, err := readConfig(containerdConfigFile) + if err != nil { + log.Fatalf("Error reading TOML file: %v", err) + } - err = writeConfig(tomlFilePath, updatedTomlMap) + updatedTomlMap := updateContainerdConfig(tomlMap) + + err = writeToContainerdConfig(containerdConfigFile, updatedTomlMap) + if err != nil { + log.Fatalf("failed to write updated config into a file %q:, %v", containerdConfigFile, err) + } + } + + err = writeToCrioConfig() if err != nil { - log.Fatalf("failed to write updated config into a file %q:, %v", tomlFilePath, err) + log.Fatalf("failed to update the CRI-O configuration %v", err) } - err = restartSystemdUnit(unit) + // Restart whichever runtime was detected so that it picks up the NRI + // configuration. NOTE: the CRI-O drop-in file above is written even when + // containerd was detected, because that call is outside the unit check. 
+ if err = restartSystemdUnit(unit); err != nil { + log.Fatalf("failed to restart %q unit: %v", unit, err) + } + + log.Println("Enabled NRI for", unit) +} + +func writeToCrioConfig() error { + f, err := os.Create(crioConfigFile) if err != nil { - log.Fatalf("failed to restart containerd: %v", err) + return fmt.Errorf("error creating a drop-in file for CRI-O: %w", err) } + defer f.Close() + + _, err = f.WriteString("[crio.nri]\nenable_nri = true\n") + if err != nil { + return fmt.Errorf("error writing a drop-in file for CRI-O: %w", err) + } + return nil } -func writeConfig(file string, config map[string]interface{}) error { + +func writeToContainerdConfig(file string, config map[string]interface{}) error { var buf bytes.Buffer enc := tomlv2.NewEncoder(&buf) enc.SetIndentTables(true) @@ -90,7 +123,7 @@ func readConfig(file string) (map[string]interface{}, error) { return tomlMap, nil } -func updateNRIPlugin(config map[string]interface{}) map[string]interface{} { +func updateContainerdConfig(config map[string]interface{}) map[string]interface{} { plugins, exists := config["plugins"].(map[string]interface{}) if !exists { log.Println("Top level plugins section not found, adding it to enable NRI...") @@ -105,15 +138,37 @@ func updateNRIPlugin(config map[string]interface{}) map[string]interface{} { plugins[nriPluginKey] = nri } - nri[disableKey] = false - log.Println("Enabled NRI...") + nri["disable"] = false return config } +func detectRuntime() (string, error) { + conn, err := dbus.NewSystemConnectionContext(context.Background()) + if err != nil { + return "", fmt.Errorf("failed to create DBus connection: %w", err) + } + defer conn.Close() + + // List the active container runtime (CRI-O or containerd) systemd units on the node. + // It is expected that only one container runtime systemd unit should be active at a time + // (either containerd or CRI-O). If more than one container runtime systemd unit is found + // to be in an active state, the process fails. 
+ units, err := conn.ListUnitsByPatternsContext(context.Background(), []string{"active"}, []string{containerdUnit, crioUnit}) + if err != nil { + return "", fmt.Errorf("failed to detect container runtime in use: %w", err) + } + + if len(units) > 1 { + return "", fmt.Errorf("detected more than one container runtime on the host, expected one") + } + + return units[0].Name, nil +} + func restartSystemdUnit(unit string) error { conn, err := dbus.NewSystemConnectionContext(context.Background()) if err != nil { - return fmt.Errorf("failed to create DBus connection for unit %q: %w", unit, err) + return fmt.Errorf("failed to create DBus connection: %w", err) } defer conn.Close() diff --git a/deployment/helm/resource-management-policies/balloons/templates/daemonset.yaml b/deployment/helm/resource-management-policies/balloons/templates/daemonset.yaml index 2da97dd52..b6f06668f 100644 --- a/deployment/helm/resource-management-policies/balloons/templates/daemonset.yaml +++ b/deployment/helm/resource-management-policies/balloons/templates/daemonset.yaml @@ -17,14 +17,15 @@ spec: serviceAccount: nri-resource-policy-balloons nodeSelector: kubernetes.io/os: "linux" - {{- if .Values.nri.patchContainerdConfig }} + {{- if .Values.nri.patchRuntime }} initContainers: - - name: patch-containerd + - name: patch-runtime image: {{ .Values.initContainerImage.name }}:{{ .Values.initContainerImage.tag | default .Chart.AppVersion }} imagePullPolicy: {{ .Values.initContainerImage.pullPolicy }} + restartPolicy: Never volumeMounts: - - name: containerd-config - mountPath: /etc/containerd/config.toml + - name: etc + mountPath: /etc - name: dbus-socket mountPath: /var/run/dbus/system_bus_socket securityContext: @@ -91,11 +92,11 @@ spec: hostPath: path: /var/run/nri type: Directory - {{- if .Values.nri.patchContainerdConfig }} - - name: containerd-config + {{- if .Values.nri.patchRuntime }} + - name: etc hostPath: - path: /etc/containerd/config.toml - type: File + path: /etc + type: Directory - 
name: dbus-socket hostPath: path: /var/run/dbus/system_bus_socket diff --git a/deployment/helm/resource-management-policies/balloons/values.yaml b/deployment/helm/resource-management-policies/balloons/values.yaml index 7c013a16c..6f392a912 100644 --- a/deployment/helm/resource-management-policies/balloons/values.yaml +++ b/deployment/helm/resource-management-policies/balloons/values.yaml @@ -20,7 +20,8 @@ resources: memory: 512Mi nri: - patchContainerdConfig: false + patchRuntime: false + initContainerImage: name: ghcr.io/containers/nri-plugins/nri-resource-policy-config-manager diff --git a/deployment/helm/resource-management-policies/topology-aware/templates/daemonset.yaml b/deployment/helm/resource-management-policies/topology-aware/templates/daemonset.yaml index 4b6767091..d946b97ef 100644 --- a/deployment/helm/resource-management-policies/topology-aware/templates/daemonset.yaml +++ b/deployment/helm/resource-management-policies/topology-aware/templates/daemonset.yaml @@ -17,14 +17,15 @@ spec: serviceAccount: nri-resource-policy-topology-aware nodeSelector: kubernetes.io/os: "linux" - {{- if .Values.nri.patchContainerdConfig }} + {{- if .Values.nri.patchRuntime }} initContainers: - - name: patch-containerd + - name: patch-runtime image: {{ .Values.initContainerImage.name }}:{{ .Values.initContainerImage.tag | default .Chart.AppVersion }} imagePullPolicy: {{ .Values.initContainerImage.pullPolicy }} + restartPolicy: Never volumeMounts: - - name: containerd-config - mountPath: /etc/containerd/config.toml + - name: etc + mountPath: /etc - name: dbus-socket mountPath: /var/run/dbus/system_bus_socket securityContext: @@ -91,11 +92,11 @@ spec: hostPath: path: /var/run/nri type: Directory - {{- if .Values.nri.patchContainerdConfig }} - - name: containerd-config + {{- if .Values.nri.patchRuntime }} + - name: etc hostPath: - path: /etc/containerd/config.toml - type: File + path: /etc + type: Directory - name: dbus-socket hostPath: path: /var/run/dbus/system_bus_socket 
diff --git a/deployment/helm/resource-management-policies/topology-aware/values.yaml b/deployment/helm/resource-management-policies/topology-aware/values.yaml index e7948be3b..a63ad90cc 100644 --- a/deployment/helm/resource-management-policies/topology-aware/values.yaml +++ b/deployment/helm/resource-management-policies/topology-aware/values.yaml @@ -20,7 +20,7 @@ resources: memory: 512Mi nri: - patchContainerdConfig: false + patchRuntime: false initContainerImage: name: ghcr.io/containers/nri-plugins/nri-resource-policy-config-manager diff --git a/docs/resource-policy/installation.md b/docs/resource-policy/installation.md index 85bdc006d..aab7ae788 100644 --- a/docs/resource-policy/installation.md +++ b/docs/resource-policy/installation.md @@ -18,23 +18,32 @@ following components: DaemonSet, ConfigMap, CustomResourceDefinition, and RBAC-r - Container runtime: - containerD: - At least [containerd 1.7.0](https://github.com/containerd/containerd/releases/tag/v1.7.0) - release version to use the NRI feature + release version to use the NRI feature. + - Enable NRI feature by following [these](https://github.com/containerd/containerd/blob/main/docs/NRI.md#enabling-nri-support-in-containerd) detailed instructions. You can optionally enable the NRI in containerd using the Helm chart - during the chart installation simply by setting the `nri.patchContainerdConfig` parameter. + during the chart installation simply by setting the `nri.patchRuntime` parameter. 
For instance, ```sh - helm install topology-aware --namespace kube-system --set nri.patchContainerdConfig=true deployment/helm/resource-management-policies/topology-aware/ + helm install topology-aware --namespace kube-system --set nri.patchRuntime=true deployment/helm/resource-management-policies/topology-aware/ ``` - Enabling `nri.patchContainerdConfig` creates an init container to turn on + Enabling `nri.patchRuntime` creates an init container to turn on NRI feature in containerd and only after that proceed the plugin installation. - CRI-O - At least [v1.26.0](https://github.com/cri-o/cri-o/releases/tag/v1.26.0) release version to use the NRI feature - Enable NRI feature by following [these](https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md#crionri-table) detailed instructions. + You can optionally enable the NRI in CRI-O using the Helm chart + during the chart installation simply by setting the `nri.patchRuntime` parameter. + For instance, + + ```sh + helm install topology-aware --namespace kube-system --set nri.patchRuntime=true deployment/helm/resource-management-policies/topology-aware/ + ``` + - Kubernetes 1.24+ - Helm 3.0.0+ @@ -94,14 +103,14 @@ along with the default values, for the Topology-aware and Balloons plugins Helm | Name | Default | Description | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------- | -| `image.name` | [ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware](ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware) | container image name | +| `image.name` | [ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware](ghcr.io/containers/nri-plugins/nri-resource-policy-topology-aware) | container image name | | `image.tag` | unstable | container image tag | | `image.pullPolicy` | Always | image pull policy | | `resources.cpu` | 500m | cpu 
resources for the Pod | | `resources.memory` | 512Mi | memory qouta for the Pod | | `hostPort` | 8891 | metrics port to expose on the host | | `config` |
ReservedResources:
cpu: 750m
| plugin configuration data | -| `nri.patchContainerdConfig` | false | enable/disable NRI in containerd. | +| `nri.patchRuntime` | false | enable NRI in containerd or CRI-O | | `initImage.name` | [ghcr.io/containers/nri-plugins/config-manager](ghcr.io/containers/nri-plugins/config-manager) | init container image name | | `initImage.tag` | unstable | init container image tag | | `initImage.pullPolicy` | Always | init container image pull policy | @@ -117,7 +126,7 @@ along with the default values, for the Topology-aware and Balloons plugins Helm | `resources.memory` | 512Mi | memory qouta for the Pod | | `hostPort` | 8891 | metrics port to expose on the host | | `config` |
ReservedResources:
cpu: 750m
| plugin configuration data | -| `nri.patchContainerdConfig` | false | enable/disable NRI in containerd. | +| `nri.patchRuntime` | false | enable NRI in containerd or CRI-O | | `initImage.name` | [ghcr.io/containers/nri-plugins/config-manager](ghcr.io/containers/nri-plugins/config-manager) | init container image name | | `initImage.tag` | unstable | init container image tag | | `initImage.pullPolicy` | Always | init container image pull policy |