From a82ab0c9b0556daa1e7576ed23f960ce64606963 Mon Sep 17 00:00:00 2001 From: Antti Kervinen Date: Mon, 28 Aug 2023 17:03:26 +0300 Subject: [PATCH] Add nri-memtierd plugin The nri-memtierd plugin toggles swapping, starts/stops memtierd processes that track and manage memory of containers. Signed-off-by: Antti Kervinen Signed-off-by: Luukas Makila --- cmd/plugins/memtierd/Dockerfile | 32 ++ cmd/plugins/memtierd/main.go | 520 ++++++++++++++++++ .../memtierd/nri-memtierd-deployment.yaml.in | 93 ++++ cmd/plugins/memtierd/test-pod.yaml | 56 ++ deployment/overlays/memtierd/daemonset.yaml | 62 +++ .../overlays/memtierd/kustomization.yaml | 13 + .../overlays/memtierd/sample-configmap.yaml | 30 + docs/memory/memtierd.md | 176 ++++++ .../e2e/files/nri-memtierd-deployment.yaml.in | 99 ++++ 9 files changed, 1081 insertions(+) create mode 100644 cmd/plugins/memtierd/Dockerfile create mode 100644 cmd/plugins/memtierd/main.go create mode 100644 cmd/plugins/memtierd/nri-memtierd-deployment.yaml.in create mode 100644 cmd/plugins/memtierd/test-pod.yaml create mode 100644 deployment/overlays/memtierd/daemonset.yaml create mode 100644 deployment/overlays/memtierd/kustomization.yaml create mode 100644 deployment/overlays/memtierd/sample-configmap.yaml create mode 100644 docs/memory/memtierd.md create mode 100644 test/e2e/files/nri-memtierd-deployment.yaml.in diff --git a/cmd/plugins/memtierd/Dockerfile b/cmd/plugins/memtierd/Dockerfile new file mode 100644 index 000000000..bd33fd6a7 --- /dev/null +++ b/cmd/plugins/memtierd/Dockerfile @@ -0,0 +1,32 @@ +ARG GO_VERSION=1.21 + +FROM golang:${GO_VERSION}-bullseye as builder + +WORKDIR /go/builder + +RUN GOBIN=/bin go install github.com/intel/memtierd/cmd/memtierd@latest + +# Fetch go dependencies in a separate layer for caching +COPY go.mod go.sum ./ +COPY pkg/topology/ pkg/topology/ +RUN go mod download + +# Build nri-memtierd +COPY . . 
+ +RUN make clean +RUN make PLUGINS=nri-memtierd build-plugins-static + +# FROM gcr.io/distroless/static +# ENV PATH=/bin +# TODO: DEBUG why exec.Command("memtierd", "-c", ...).Start() fails on distroless: +# execve("/bin/memtierd", ["/bin/memtierd", "-c", "", "-config", "/host/tmp/memtierd/default/test-"...], 0xc000146000 /* 38 vars */) = -1 ENOENT (No such file or directory) +# However, execve works on busybox base image, so let's use it until +# the real problem is solved. + +FROM busybox + +COPY --from=builder /go/builder/build/bin/nri-memtierd /bin/nri-memtierd +COPY --from=builder /bin/memtierd /bin/memtierd + +ENTRYPOINT ["/bin/nri-memtierd", "--idx", "45"] diff --git a/cmd/plugins/memtierd/main.go b/cmd/plugins/memtierd/main.go new file mode 100644 index 000000000..8ee2e5685 --- /dev/null +++ b/cmd/plugins/memtierd/main.go @@ -0,0 +1,520 @@ +// Copyright 2023 Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package main

import (
	"bufio"
	"context"
	"flag"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"syscall"

	"gopkg.in/yaml.v2"

	"github.com/sirupsen/logrus"

	"github.com/containerd/nri/pkg/api"
	"github.com/containerd/nri/pkg/stub"
)

// plugin holds the state of the nri-memtierd NRI plugin: the NRI stub,
// the active configuration, the cgroup v2 mount point and the memtierd
// run environments that have been launched for containers.
type plugin struct {
	// stub is the NRI stub that main creates and runs.
	stub stub.Stub
	// NOTE(review): mask is not referenced anywhere else in this
	// file - confirm that it is still needed.
	mask stub.EventMask
	// config is the configuration applied by setConfig. It stays nil
	// until a configuration has been loaded.
	config *pluginConfig
	// cgroupsDir is the cgroup v2 mount point set by detectCgroupsDir.
	cgroupsDir string
	// ctrMemtierdEnv maps the pprintCtr() name of a container to the
	// memtierd run environment launched for it in StartContainer.
	ctrMemtierdEnv map[string]*memtierdEnv
}

// pluginConfig is the plugin configuration parsed from YAML by
// setConfig.
type pluginConfig struct {
	// Classes define how memory of all workloads in each QoS
	// class should be managed.
	Classes []qosClass
}

// qosClass describes one workload class of the plugin configuration.
// The sample configurations refer to its fields with the lower-case
// keys name, memtierdconfig and allowswap.
type qosClass struct {
	// Name of the QoS class, matches to annotations in
	// pods. Examples:
	// annotations:
	//   # The default for all containers in the pod:
	//   class.memtierd.nri.io: swap-idle-data
	//   # Override the default for CONTAINERNAME1:
	//   class.memtierd.nri.io/CONTAINERNAME1: noswap
	//   # Do not apply any class for CONTAINERNAME2:
	//   class.memtierd.nri.io/CONTAINERNAME2: ""
	Name string

	// MemtierdConfig is a string that contains full configuration
	// for memtierd. If non-empty, a separate memtierd will be
	// launched to track each container of this QoS class.
	MemtierdConfig string

	// AllowSwap: if true, set memory.swap.max to max, if false,
	// set memory.swap.max to 0. If undefined, do not touch
	// memory.swap.max. Direct annotation that defines value of
	// memory.swap.max overrides this option.
	AllowSwap *bool
}

// memtierdEnv is the run environment of the memtierd instance that
// manages one container. It is prepared by newMemtierdEnv.
type memtierdEnv struct {
	// NOTE(review): pid is never assigned or read in this file -
	// confirm that it is still needed.
	pid int
	// ctrDir is the container-specific run directory. StopContainer
	// removes it together with everything in it.
	ctrDir string
	// configFile is the memtierd configuration instantiated from
	// the class template, passed to memtierd with -config.
	configFile string
	// outputFile receives stdout and stderr of memtierd.
	outputFile string
	// pidFile is where startMemtierd writes the pid of memtierd.
	pidFile string
	// cmd is the launched memtierd command, set by startMemtierd.
	cmd *exec.Cmd
}

// options holds values of command line options that are needed outside
// of main.
type options struct {
	// HostRoot is the directory prefix under which the host's tmp,
	// etc. are mounted (command line option -host-root).
	HostRoot string
}

const (
	// annotationSuffix is the key suffix of pod annotations that
	// this plugin handles. A pod-level key ends with the suffix, a
	// container-specific key continues with "/CONTAINERNAME".
	annotationSuffix = ".memtierd.nri.io"
)

// opt holds command line options, filled in by main.
var opt = options{}

var (
	// log is the logger of the plugin, initialized in main.
	log *logrus.Logger
)

// Configure handles connecting to container runtime's NRI server.
+func (p *plugin) Configure(ctx context.Context, config, runtime, version string) (stub.EventMask, error) { + log.Infof("Connected to %s %s...", runtime, version) + if config != "" { + if err := p.setConfig([]byte(config)); err != nil { + return 0, loggedErrorf("Configure: loading configuration from NRI server failed: %s", err) + } + log.Debugf("Using configuration from NRI server") + } else { + log.Debugf("No configuration from NRI server") + } + return 0, nil +} + +// setConfig applies new plugin configuration. +func (p *plugin) setConfig(config []byte) error { + log.Tracef("setConfig: parsing\n---8<---\n%s\n--->8---", config) + cfg := pluginConfig{} + err := yaml.Unmarshal(config, &cfg) + if err != nil { + log.Tracef("setConfig: parsing failed: %s", err) + return fmt.Errorf("setConfig: cannot parse configuration: %w", err) + } + p.config = &cfg + if log.GetLevel() == logrus.TraceLevel { + log.Tracef("new configuration has %d classes:", len(p.config.Classes)) + for _, cls := range p.config.Classes { + log.Tracef("- %s", cls.Name) + } + } + return nil +} + +// pprintCtr() returns human readable container name that is +// unique to the node. +func pprintCtr(pod *api.PodSandbox, ctr *api.Container) string { + return fmt.Sprintf("%s/%s:%s", pod.GetNamespace(), pod.GetName(), ctr.GetName()) +} + +// loggedErrorf formats, logs and returns an error. +func loggedErrorf(s string, args ...any) error { + err := fmt.Errorf(s, args...) + log.Errorf("%s", err) + return err +} + +// associate adds new key-value pair to a map, or updates existing +// pair if called with the override set. Returns true if the pair was +// added/updated. +func associate(m *map[string]string, key, value string, override bool) bool { + if _, exists := (*m)[key]; override || !exists { + (*m)[key] = value + return true + } + return false +} + +// effectiveAnnotations returns map of annotation key prefixes and +// values that are effective for a container. 
Example: a +// container-specific pod annotation +// +// memory.high.memory-qos.nri.io/CTRNAME: 10000000 +// +// shows up as +// +// effAnn["memory.high"] = "10000000" +func effectiveAnnotations(pod *api.PodSandbox, ctr *api.Container) *map[string]string { + effAnn := map[string]string{} + for key, value := range pod.GetAnnotations() { + annPrefix, hasSuffix := strings.CutSuffix(key, annotationSuffix+"/"+ctr.Name) + if hasSuffix { + // Override possibly already found pod-level annotation. + log.Tracef("- found container-specific annotation %q", key) + associate(&effAnn, annPrefix, value, true) + continue + } + annPrefix, hasSuffix = strings.CutSuffix(key, annotationSuffix) + if hasSuffix { + // Do not override if there already is a + // container-level annotation. + if associate(&effAnn, annPrefix, value, false) { + log.Tracef("- found pod-level annotation %q", key) + } else { + log.Tracef("- ignoring pod-level annotation %q due to a container-level annotation", key) + } + continue + } + } + return &effAnn +} + +// CreateContainer responsibilities: +// - validate all annotations effective for a new container so that +// validation is no more needed in StartContainer. +// - configure cgroups unified parameters, for instance +// memory.swap.max. 
+func (p *plugin) CreateContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) (*api.ContainerAdjustment, []*api.ContainerUpdate, error) { + ppName := pprintCtr(pod, ctr) + unified := map[string]string{} + class := "" + for annPrefix, value := range *effectiveAnnotations(pod, ctr) { + switch annPrefix { + case "memory.swap.max": + unified["memory.swap.max"] = value + case "memory.high": + unified["memory.high"] = value + case "class": + class = value + if class != "" { + qoscls, err := p.qosClass(class) + if err != nil { + return nil, nil, loggedErrorf("CreateContainer: cannot search for class %q: %s", class, err) + } + if qoscls == nil { + return nil, nil, loggedErrorf("CreateContainer: unknown class %q", class) + } + if qoscls.AllowSwap != nil { + if *qoscls.AllowSwap { + associate(&unified, "memory.swap.max", "max", false) + } else { + associate(&unified, "memory.swap.max", "0", false) + } + } + } + default: + log.Errorf("CreateContainer %s: pod has invalid annotation: %q", ppName, annPrefix) + } + } + if len(unified) == 0 { + return nil, nil, nil + } + ca := api.ContainerAdjustment{ + Linux: &api.LinuxContainerAdjustment{ + Resources: &api.LinuxResources{ + Unified: unified, + }, + }, + } + log.Debugf("CreateContainer %s: class %q, LinuxResources.Unified=%v", ppName, class, ca.Linux.Resources.Unified) + return &ca, nil, nil +} + +// StartContainer launches a memtierd to manage container's memory if +// the container is associated with a QoS class that has memtierd +// configuration. 
+func (p *plugin) StartContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) error { + ppName := pprintCtr(pod, ctr) + log.Tracef("StartContainer: %s", ppName) + + hostRoot := opt.HostRoot + + namespace := pod.GetNamespace() + podName := pod.GetName() + containerName := ctr.GetName() + + annotatedClass, ok := (*effectiveAnnotations(pod, ctr))["class"] + if !ok || annotatedClass == "" { + log.Debugf("StartContainer: container %q has no QoS class", ppName) + return nil + } + + qoscls, err := p.qosClass(annotatedClass) + if qoscls == nil || err != nil { + return loggedErrorf("cannot find QoS class for %s: %s", ppName, err) + } + if qoscls.MemtierdConfig == "" { + log.Debugf("StartContainer: QoS class %q has no MemtierdConfig in the configuration", annotatedClass) + return nil + } + + fullCgroupsPath, err := p.getFullCgroupsPath(ctr) + if err != nil { + return loggedErrorf("cannot detect cgroup v2 path for container %q: %v", ppName, err) + } + mtdEnv, err := newMemtierdEnv(fullCgroupsPath, namespace, podName, containerName, qoscls.MemtierdConfig, hostRoot) + if err != nil || mtdEnv == nil { + return loggedErrorf("failed to prepare memtierd run environment: %v", err) + } + err = mtdEnv.startMemtierd() + if err != nil { + return loggedErrorf("failed to start memtierd: %v", err) + } + p.ctrMemtierdEnv[ppName] = mtdEnv + log.Infof("StartContainer: launched memtierd for %q with config %q", ppName, mtdEnv.configFile) + return nil +} + +// qosClass returns QoS class from plugin config based on class name. +func (p *plugin) qosClass(className string) (*qosClass, error) { + if p.config == nil { + return nil, fmt.Errorf("plugin is not configured") + } + for _, class := range p.config.Classes { + if class.Name == className { + return &class, nil + } + } + return nil, nil +} + +// StopContainer stops the memtierd that manages a container. 
+func (p *plugin) StopContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) ([]*api.ContainerUpdate, error) { + ppName := pprintCtr(pod, ctr) + + mtdEnv, ok := p.ctrMemtierdEnv[ppName] + if !ok || mtdEnv == nil { + log.Tracef("StopContainer: no memtierd environment for %s", ppName) + return nil, nil + } + delete(p.ctrMemtierdEnv, ppName) + + log.Debugf("StopContainer: stopping memtierd of %s, destroy %s", ppName, mtdEnv.ctrDir) + + if mtdEnv.cmd != nil && mtdEnv.cmd.Process != nil { + pid := mtdEnv.cmd.Process.Pid + log.Tracef("StopContainer: killing memtierd %d", pid) + if err := mtdEnv.cmd.Process.Kill(); err != nil { + log.Debugf("StopContainer: killing memtierd of %s (pid: %d) failed: %s", ppName, pid, err) + } + // Close files, read exit status (leave no zombie processes behind) + go mtdEnv.cmd.Wait() + } + + log.Tracef("StopContainer: removing memtierd run directory %s", mtdEnv.ctrDir) + if err := os.RemoveAll(mtdEnv.ctrDir); err != nil { + log.Debugf("StopContainer: removing memtierd run dir of %s (%q) failed: %s", + ppName, mtdEnv.ctrDir, err) + } + log.Infof("StopContainer: stopped memtierd of %s", ppName) + return nil, nil +} + +// onClose handles losing connection to the NRI server +func (p *plugin) onClose() { + log.Infof("Connection to the runtime lost, exiting...") + os.Exit(0) +} + +// detectCgroupsDir sets plugin's cgroups mount point +func (p *plugin) detectCgroupsDir() error { + file, err := os.Open("/proc/mounts") + if err != nil { + return fmt.Errorf("failed to open /proc/mounts: %v", err) + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + fields := strings.Fields(line) + + if fields[0] == "cgroup2" { + p.cgroupsDir = fields[1] + return nil + } + } + if err := scanner.Err(); err != nil { + return fmt.Errorf("failed to read /proc/mounts: %v", err) + } + return fmt.Errorf("cgroup2 missing in /proc/mounts") +} + +// getFullCgroupsPath returns container's cgroups 
directory. +func (p *plugin) getFullCgroupsPath(ctr *api.Container) (string, error) { + var fullCgroupsPath string + cgroupsPath := ctr.Linux.CgroupsPath + log.Tracef("getFullCgroupsPath: ctr.Id=%q ctr.cgroupsPath=%q", ctr.Id, cgroupsPath) + err := filepath.WalkDir(p.cgroupsDir, func(path string, info os.DirEntry, err error) error { + if err != nil { + return err + } + if info.IsDir() { + if strings.Contains(path, ctr.Id) { + log.Tracef("getFullCgroupsPath: container Id matches %s", path) + fullCgroupsPath = path + return io.EOF + } + } + return nil + }) + if err == io.EOF { + err = nil + } else { + log.Tracef("getFullCgroupsPath: could not find a directory matching *%s* anywhere under cgroups root %q", ctr.Id, p.cgroupsDir) + } + return fullCgroupsPath, err +} + +// newMemtierdEnv prepares new memtierd run environment with a +// configuration file template instantiated for managing a container. +func newMemtierdEnv(fullCgroupPath string, namespace string, podName string, containerName string, memtierdConfigIn string, hostRoot string) (*memtierdEnv, error) { + // Create container directory if it doesn't exist + ctrDir := fmt.Sprintf("%s%s/memtierd/%s/%s/%s", hostRoot, os.TempDir(), namespace, podName, containerName) + if err := os.MkdirAll(ctrDir, 0755); err != nil { + return nil, fmt.Errorf("cannot create memtierd run directory %q: %w", ctrDir, err) + } + + outputFilePath := fmt.Sprintf("%s/memtierd.output", ctrDir) + statsFilePath := fmt.Sprintf("%s/memtierd.stats", ctrDir) + pidFilePath := fmt.Sprintf("%s/memtierd.pid", ctrDir) + + // Instantiate memtierd configuration from configuration template + replace := map[string]string{ + "$CGROUP2_ABS_PATH": fullCgroupPath, + "$MEMTIERD_SWAP_STATS_PATH": statsFilePath, + } + memtierdConfigOut := string(memtierdConfigIn) + for key, value := range replace { + memtierdConfigOut = strings.Replace(memtierdConfigOut, key, value, -1) + } + + configFilePath := fmt.Sprintf("%s/memtierd.config.yaml", ctrDir) + if err := 
os.WriteFile(configFilePath, []byte(memtierdConfigOut), 0644); err != nil { + return nil, fmt.Errorf("cannot write memtierd configuration into file %q: %w", configFilePath, err) + } + + me := memtierdEnv{} + me.outputFile = outputFilePath + me.configFile = configFilePath + me.pidFile = pidFilePath + me.ctrDir = ctrDir + return &me, nil +} + +// startMemtierd launches memtierd in prepared environment. +func (me *memtierdEnv) startMemtierd() error { + outputFile, err := os.OpenFile(me.outputFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + if err != nil { + return fmt.Errorf("failed to create memtierd output file: %w", err) + } + + // Create the command and write its output to the output file + cmd := exec.Command("memtierd", "-c", "", "-config", me.configFile) + cmd.Stdout = outputFile + cmd.Stderr = outputFile + + // Start the command in a new session and process group + cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true} + + // Start the command in the background + if err := cmd.Start(); err != nil { + return fmt.Errorf("failed to start command %s: %q", cmd, err) + } + if cmd.Process != nil { + os.WriteFile(me.pidFile, + []byte(fmt.Sprintf("%d\n", cmd.Process.Pid)), + 0400) + } + me.cmd = cmd + return nil +} + +// main program to run the plugin. +func main() { + var ( + pluginName string + pluginIdx string + configFile string + err error + verbose bool + veryVerbose bool + ) + + log = logrus.StandardLogger() + log.SetFormatter(&logrus.TextFormatter{ + PadLevelText: true, + }) + + flag.StringVar(&pluginName, "name", "", "plugin name to register to NRI") + flag.StringVar(&pluginIdx, "idx", "", "plugin index to register to NRI") + flag.StringVar(&configFile, "config", "", "configuration file name") + flag.StringVar(&opt.HostRoot, "host-root", "", "Directory prefix under which the host's tmp, etc. 
are mounted.") + flag.BoolVar(&verbose, "v", false, "verbose output") + flag.BoolVar(&veryVerbose, "vv", false, "very verbose output") + flag.Parse() + + if verbose { + log.SetLevel(logrus.DebugLevel) + } + if veryVerbose { + log.SetLevel(logrus.TraceLevel) + } + + p := &plugin{ + ctrMemtierdEnv: map[string]*memtierdEnv{}, + } + + if configFile != "" { + log.Debugf("read configuration from %q", configFile) + config, err := os.ReadFile(configFile) + if err != nil { + log.Fatalf("error reading configuration file %q: %s", configFile, err) + } + if err = p.setConfig(config); err != nil { + log.Fatalf("error applying configuration from file %q: %s", configFile, err) + } + } + + if p.cgroupsDir == "" { + if err := p.detectCgroupsDir(); err != nil { + log.Fatalf("cannot find cgroup2 mount point. %s", err) + } + } + + opts := []stub.Option{ + stub.WithOnClose(p.onClose), + } + if pluginName != "" { + opts = append(opts, stub.WithPluginName(pluginName)) + } + if pluginIdx != "" { + opts = append(opts, stub.WithPluginIdx(pluginIdx)) + } + + if p.stub, err = stub.New(p, opts...); err != nil { + log.Fatalf("failed to create plugin stub: %v", err) + } + + if err = p.stub.Run(context.Background()); err != nil { + log.Errorf("plugin exited (%v)", err) + os.Exit(1) + } +} diff --git a/cmd/plugins/memtierd/nri-memtierd-deployment.yaml.in b/cmd/plugins/memtierd/nri-memtierd-deployment.yaml.in new file mode 100644 index 000000000..d4698baf5 --- /dev/null +++ b/cmd/plugins/memtierd/nri-memtierd-deployment.yaml.in @@ -0,0 +1,93 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app: nri-memtierd + name: nri-memtierd + namespace: kube-system +spec: + selector: + matchLabels: + app: nri-memtierd + template: + metadata: + labels: + app: nri-memtierd + spec: + nodeSelector: + kubernetes.io/os: "linux" + hostPID: true + containers: + - name: nri-memtierd + command: + - nri-memtierd + - --idx + - "45" + - --config + - /etc/nri/memtierd/config.yaml + - --host-root + - /host + - 
-v + image: IMAGE_PLACEHOLDER + imagePullPolicy: IfNotPresent + resources: + requests: + cpu: 250m + memory: 100Mi + securityContext: + privileged: true + volumeMounts: + - name: memtierd-config-vol + mountPath: /etc/nri/memtierd + - name: nri-sockets-vol + mountPath: /var/run/nri + - name: host-vol + mountPath: /host + - name: host-bitmap + mountPath: /sys/kernel/mm/page_idle/bitmap + volumes: + - name: memtierd-config-vol + configMap: + name: nri-memtierd-config.default + - name: nri-sockets-vol + hostPath: + path: /var/run/nri + type: Directory + - name: host-vol + hostPath: + path: / + type: Directory + - name: host-bitmap + hostPath: + path: /sys/kernel/mm/page_idle/bitmap +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: nri-memtierd-config.default + namespace: kube-system +data: + config.yaml: | + classes: + - name: swap-idle-data + allowswap: true + memtierdconfig: | + policy: + name: age + config: | + intervalms: 10000 + pidwatcher: + name: cgroups + config: | + cgroups: + - $CGROUP2_ABS_PATH + swapoutms: 10000 + tracker: + name: idlepage + config: | + pagesinregion: 512 + maxcountperregion: 1 + scanintervalms: 10000 + mover: + intervalms: 20 + bandwidth: 50 diff --git a/cmd/plugins/memtierd/test-pod.yaml b/cmd/plugins/memtierd/test-pod.yaml new file mode 100644 index 000000000..04f471740 --- /dev/null +++ b/cmd/plugins/memtierd/test-pod.yaml @@ -0,0 +1,56 @@ +apiVersion: v1 +kind: Pod +metadata: + name: test-pod + labels: + app: test-pod + annotations: + # Set the default memtierd class for all containers in this pod. + class.memtierd.nri.io: swap-idle-data + + # Clear class from the c2-noswap container. 
+ class.memtierd.nri.io/c2-noswap: "" + + # Always enable swap, but do not swap due to pressure from + # memory.high + memory.swap.max.memtierd.nri.io: max + memory.high.memtierd.nri.io: max +spec: + containers: + - name: c0-lowprio + image: busybox + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - dd count=1 bs=80M if=/dev/zero | sleep inf + resources: + requests: + memory: 64M + limits: + memory: 100M + - name: c1-normal + image: busybox + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - dd count=1 bs=80M if=/dev/zero | sleep inf + resources: + requests: + memory: 64M + limits: + memory: 100M + - name: c2-noswap + image: busybox + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - dd count=1 bs=80M if=/dev/zero | sleep inf + resources: + requests: + memory: 64M + limits: + memory: 100M + terminationGracePeriodSeconds: 1 diff --git a/deployment/overlays/memtierd/daemonset.yaml b/deployment/overlays/memtierd/daemonset.yaml new file mode 100644 index 000000000..ac7cc5045 --- /dev/null +++ b/deployment/overlays/memtierd/daemonset.yaml @@ -0,0 +1,62 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app: nri-memtierd + name: nri-memtierd + namespace: kube-system +spec: + selector: + matchLabels: + app: nri-memtierd + template: + metadata: + labels: + app: nri-memtierd + spec: + nodeSelector: + kubernetes.io/os: "linux" + hostPID: true + containers: + - name: nri-memtierd + command: + - nri-memtierd + - --idx + - "45" + - --config + - /etc/nri/memtierd/config.yaml + - --host-root + - /host + - -v + image: IMAGE_PLACEHOLDER + imagePullPolicy: IfNotPresent + resources: + requests: + cpu: 250m + memory: 100Mi + securityContext: + privileged: true + volumeMounts: + - name: memtierd-config-vol + mountPath: /etc/nri/memtierd + - name: nri-sockets-vol + mountPath: /var/run/nri + - name: host-vol + mountPath: /host + - name: host-bitmap + mountPath: /sys/kernel/mm/page_idle/bitmap + volumes: + - name: memtierd-config-vol + configMap: + 
name: nri-memtierd-config.default + - name: nri-sockets-vol + hostPath: + path: /var/run/nri + type: Directory + - name: host-vol + hostPath: + path: / + type: Directory + - name: host-bitmap + hostPath: + path: /sys/kernel/mm/page_idle/bitmap diff --git a/deployment/overlays/memtierd/kustomization.yaml b/deployment/overlays/memtierd/kustomization.yaml new file mode 100644 index 000000000..09af057da --- /dev/null +++ b/deployment/overlays/memtierd/kustomization.yaml @@ -0,0 +1,13 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: kube-system + +images: +- name: '*' + newName: ghcr.io/containers/nri-plugins/nri-memtierd + newTag: unstable + +resources: +- daemonset.yaml +- sample-configmap.yaml diff --git a/deployment/overlays/memtierd/sample-configmap.yaml b/deployment/overlays/memtierd/sample-configmap.yaml new file mode 100644 index 000000000..d0d8ebb6f --- /dev/null +++ b/deployment/overlays/memtierd/sample-configmap.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: nri-memtierd-config.default + namespace: kube-system +data: + config.yaml: | + classes: + - name: swap-idle-data + allowswap: true + memtierdconfig: | + policy: + name: age + config: | + intervalms: 10000 + pidwatcher: + name: cgroups + config: | + cgroups: + - $CGROUP2_ABS_PATH + swapoutms: 10000 + tracker: + name: idlepage + config: | + pagesinregion: 512 + maxcountperregion: 1 + scanintervalms: 10000 + mover: + intervalms: 20 + bandwidth: 50 diff --git a/docs/memory/memtierd.md b/docs/memory/memtierd.md new file mode 100644 index 000000000..16a7f2e43 --- /dev/null +++ b/docs/memory/memtierd.md @@ -0,0 +1,176 @@ +# Memtierd NRI plugin + +This plugin enables managing workloads with +[Memtierd](https://github.com/intel/memtierd) in Kubernetes. + +Plugin's configuration defines a set of workload classes and their +attributes. 
If a class is attributed with memtierd configuration, +then this plugin will launch memtierd with that configuration to track +and manage memory of each workload that belongs to the class. + +The class of a workload is given in pod annotations. + +## Workload configuration + +The class of a pod or a container is defined using annotations: +``` + annotations: + # Set the default class for all containers in this pod. + class.memtierd.nri.io: swap-idle-data + # Override the default class for the c0 container. + class.memtierd.nri.io/c0: track-working-set-size + # Do not associate any class on the c1 container. + class.memtierd.nri.io/c1: "" +``` + +## Plugin configuration + +### Classes + +Plugin configuration lists workload classes and their attributes. + +`classes:` is followed by list of maps with following keys and values: +- `name` (string): name of the QoS class, matches + `class.memtierd.nri.io` annotations. +- `allowswap` (`true` or `false`): if `true`, allow OS to swap the + workload. If `false` disallow swapping. If not set, the plugin will + not affect what will be written to `memory.swap.max` in cgroups v2. +- `memtierdconfig` (string): configuration template with which + memtierd will be launched to manage workloads in this + class. Variables that will be replaced with container-specific + values in this template: + - `$CGROUP2_ABS_PATH` absolute path to cgroups v2 directory into + which container's processes will belong to. + +### Example + +```yaml +classes: + - name: swap-idle-data + allowswap: true + memtierdconfig: | + policy: + name: age + config: | + intervalms: 10000 + pidwatcher: + name: cgroups + config: | + cgroups: + - $CGROUP2_ABS_PATH + swapoutms: 10000 + tracker: + name: idlepage + config: | + pagesinregion: 512 + maxcountperregion: 1 + scanintervalms: 10000 + mover: + intervalms: 20 + bandwidth: 50 +``` + +The configuration defines the `swap-idle-data` workload class. 
+ +`allowswap: true` makes sure that OS will allow swapping when memtierd +decides that data should be swapped out from memory. + +`memtierdconfig: ...` means that a memtierd will manage the memory of +a workload in this class. The `age` policy uses the `idlepage` tracker +to find data that has not been accessed in 10 seconds, and swaps out +that data `swapoutms: 10000`. The swapping will be done in 20 ms +interval (`mover.intervalms`), and no more than 50 MB/s +(`mover.bandwidth`). Refer to [memtierd +documentation](https://github.com/intel/memtierd/tree/main/cmd/memtierd) +for more configuration options. + +## Developer's guide + +### Prerequisites + +- Containerd v1.7+ +- Enable NRI in /etc/containerd/config.toml: + ``` + [plugins."io.containerd.nri.v1.nri"] + disable = false + disable_connections = false + plugin_config_path = "/etc/nri/conf.d" + plugin_path = "/opt/nri/plugins" + plugin_registration_timeout = "5s" + plugin_request_timeout = "2s" + socket_path = "/var/run/nri/nri.sock" + ``` +- To run the nri-memtierd plugin on a host, install memtierd on the host. + ``` + GOBIN=/usr/local/bin go install github.com/intel/memtierd/cmd/memtierd@latest + ``` + +### Build + +In `cmd/plugins/memtierd` run: + +```bash +go build . +``` + +### Run + +```bash +./memtierd -config sample-config.yaml -idx 40 -vv +``` + +### Manual test + +```bash +kubectl create -f ./test-pod.yaml +``` + +See swap status of dd processes, each allocating the same amount of +memory: + +```bash +for pid in $(pidof dd); do + grep VmSwap /proc/$pid/status +done +``` + +### Debug + +`-v` enables debug output from the plugin. `-vv` makes it even more verbose. + +The plugin stores `memtierd` config and output under `/tmp/memtierd/NAMESPACE/POD/CONTAINER/`. 
+ +Debugging the plugin with dlv: + +```bash +go install github.com/go-delve/delve/cmd/dlv@latest +dlv exec ./memtierd -- -config memtierd.conf -idx 40 +(dlv) break plugin.CreateContainer +(dlv) continue +``` + +### Deploy + +Build an image, import it on the node, and deploy the plugin by +running the following in `nri-plugins`: + +```bash +rm -rf build +make PLUGINS=nri-memtierd IMAGE_VERSION=devel images +ctr -n k8s.io images import build/images/nri-memtierd-image-*.tar +kubectl create -f build/images/nri-memtierd-deployment-e2e.yaml +``` + +The e2e deployment variant gives more debug output from both +`nri-memtierd` plugin (see `kubectl logs -n kube-system +nri-memtierd-*`) and `memtierd` to the output (see +`/tmp/memtierd/**/*.output`). + +## Security + +`memtierd` needs privileged access in order to find pids in other +containers, track memory activity, move pages and swap workload data +out and in. Therefore only privileged users must be allowed to create +and modify memtierd configuration files and ConfigMaps. Commands in +memtierd configurations will be executed by memtierd in privileged +mode. 
diff --git a/test/e2e/files/nri-memtierd-deployment.yaml.in b/test/e2e/files/nri-memtierd-deployment.yaml.in new file mode 100644 index 000000000..75ce5b79a --- /dev/null +++ b/test/e2e/files/nri-memtierd-deployment.yaml.in @@ -0,0 +1,99 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app: nri-memtierd + name: nri-memtierd + namespace: kube-system +spec: + selector: + matchLabels: + app: nri-memtierd + template: + metadata: + labels: + app: nri-memtierd + spec: + nodeSelector: + kubernetes.io/os: "linux" + hostPID: true + containers: + - name: nri-memtierd + command: + - nri-memtierd + - --idx + - "45" + - --config + - /etc/nri/memtierd/config.yaml + - --host-root + - /host + - -vv + image: IMAGE_PLACEHOLDER + imagePullPolicy: IfNotPresent + resources: + requests: + cpu: 250m + memory: 100Mi + securityContext: + privileged: true + volumeMounts: + - name: memtierd-config-vol + mountPath: /etc/nri/memtierd + - name: nri-sockets-vol + mountPath: /var/run/nri + - name: host-vol + mountPath: /host + - name: host-bitmap + mountPath: /sys/kernel/mm/page_idle/bitmap + volumes: + - name: memtierd-config-vol + configMap: + name: nri-memtierd-config.default + - name: nri-sockets-vol + hostPath: + path: /var/run/nri + type: Directory + - name: host-vol + hostPath: + path: / + type: Directory + - name: host-bitmap + hostPath: + path: /sys/kernel/mm/page_idle/bitmap +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: nri-memtierd-config.default + namespace: kube-system +data: + config.yaml: | + classes: + - name: swap-idle-data + allowswap: true + memtierdconfig: | + policy: + name: age + config: | + intervalms: 10000 + pidwatcher: + name: cgroups + config: | + cgroups: + - $CGROUP2_ABS_PATH + swapoutms: 10000 + tracker: + name: idlepage + config: | + pagesinregion: 512 + maxcountperregion: 1 + scanintervalms: 10000 + mover: + intervalms: 20 + bandwidth: 50 + routines: + - name: statactions + config: | + intervalms: 10000 + intervalcommand: ["stats", "-t", 
"memory_scans,process_madvise"] + intervalcommandrunner: memtier