From 42457cfedb7cfd89d258ac85698053734ad83195 Mon Sep 17 00:00:00 2001 From: Antti Kervinen Date: Fri, 9 Feb 2024 11:13:09 +0200 Subject: [PATCH] topology-aware: implement CPU and memory pinning preservation Signed-off-by: Antti Kervinen --- .../topology-aware/policy/pod-preferences.go | 8 ++++ cmd/plugins/topology-aware/policy/pools.go | 45 ++++++++++++------- .../topology-aware/policy/resources.go | 2 +- docs/resource-policy/policy/topology-aware.md | 16 +++++++ .../code.var.sh | 27 ++++++++++- 5 files changed, 80 insertions(+), 18 deletions(-) diff --git a/cmd/plugins/topology-aware/policy/pod-preferences.go b/cmd/plugins/topology-aware/policy/pod-preferences.go index 8f702f618..af1bb1b90 100644 --- a/cmd/plugins/topology-aware/policy/pod-preferences.go +++ b/cmd/plugins/topology-aware/policy/pod-preferences.go @@ -62,11 +62,13 @@ type cpuClass int var cpuClassNames = map[cpuClass]string{ cpuNormal: "normal", cpuReserved: "reserved", + cpuPreserve: "preserve", } const ( cpuNormal cpuClass = iota cpuReserved + cpuPreserve ) // types by memory type name @@ -93,6 +95,7 @@ const ( memoryDRAM memoryPMEM memoryHBM + memoryPreserve memoryFirstUnusedBit memoryAll = memoryFirstUnusedBit - 1 @@ -155,6 +158,9 @@ func sharedCPUsPreference(pod cache.Pod, container cache.Container) (bool, bool) // If the effective annotations are not found, this function falls back to // looking for the deprecated syntax by calling podMemoryTypePreference. func memoryTypePreference(pod cache.Pod, container cache.Container) memoryType { + if container.PreserveMemoryResources() { + return memoryPreserve + } key := preferMemoryTypeKey value, ok := pod.GetEffectiveAnnotation(key, container.GetName()) if !ok { @@ -437,6 +443,8 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in // easy cases: kube-system namespace, Burstable or BestEffort QoS class containers preferReserved, explicitReservation := checkReservedCPUsAnnotations(container) switch { + case container.PreserveCpuResources(): + return 0, fraction, false, cpuPreserve case preferReserved == true: return 0, fraction, false, cpuReserved case checkReservedPoolNamespaces(namespace) && !explicitReservation: diff --git a/cmd/plugins/topology-aware/policy/pools.go b/cmd/plugins/topology-aware/policy/pools.go index c0d8e47e0..e3975e522 100644 --- a/cmd/plugins/topology-aware/policy/pools.go +++ b/cmd/plugins/topology-aware/policy/pools.go @@ -347,7 +347,7 @@ func (p *policy) allocatePool(container cache.Container, poolHint string) (Grant // the same pool. This assumption can be relaxed later, requires separate // (but connected) scoring of memory and CPU. - if request.CPUType() == cpuReserved { + if request.CPUType() == cpuReserved || request.CPUType() == cpuPreserve { pool = p.root } else { affinity, err := p.calculatePoolAffinities(request.GetContainer()) @@ -601,7 +601,8 @@ func (p *policy) applyGrant(grant Grant) { cpus := "" kind := "" - if cpuType == cpuNormal { + switch cpuType { + case cpuNormal: if exclusive.IsEmpty() { cpus = shared.String() kind = "shared" @@ -614,11 +615,13 @@ func (p *policy) applyGrant(grant Grant) { cpus = exclusive.String() } } - } else if cpuType == cpuReserved { + case cpuReserved: kind = "reserved" cpus = reserved.String() cpuPortion = grant.ReservedPortion() - } else { + case cpuPreserve: + // Will skip CPU pinning, may still pin memory. 
+ default: log.Debug("unsupported granted cpuType %s", cpuType) return } @@ -629,13 +632,16 @@ func (p *policy) applyGrant(grant Grant) { } if opt.PinCPU { - if cpus != "" { - log.Info(" => pinning %s to (%s) cpuset %s", container.PrettyName(), kind, cpus) + if cpuType == cpuPreserve { + log.Info(" => preserving %s cpuset %s", container.PrettyName(), container.GetCpusetCpus()) } else { - log.Info(" => not pinning %s CPUs, cpuset is empty...", - container.PrettyName()) + if cpus != "" { + log.Info(" => pinning %s to (%s) cpuset %s", container.PrettyName(), kind, cpus) + } else { + log.Info(" => not pinning %s CPUs, cpuset is empty...", container.PrettyName()) + } + container.SetCpusetCpus(cpus) } - container.SetCpusetCpus(cpus) // Notes: // It is extremely important to ensure that the exclusive subset of mixed @@ -664,11 +670,15 @@ func (p *policy) applyGrant(grant Grant) { container.SetCPUShares(int64(cache.MilliCPUToShares(int64(milliCPU)))) } - if mems != "" { - log.Debug(" => pinning %s to memory %s", container.PrettyName(), mems) - container.SetCpusetMems(mems) + if grant.MemoryType() == memoryPreserve { + log.Debug(" => preserving %s memory pinning %s", container.PrettyName(), container.GetCpusetMems()) } else { - log.Debug(" => not pinning %s memory, memory set is empty...", container.PrettyName()) + if mems != "" { + log.Debug(" => pinning %s to memory %s", container.PrettyName(), mems) + } else { + log.Debug(" => not pinning %s memory, memory set is empty...", container.PrettyName()) + } + container.SetCpusetMems(mems) } } @@ -717,6 +727,11 @@ func (p *policy) updateSharedAllocations(grant *Grant) { continue } + if other.CPUType() == cpuPreserve { + log.Info(" => %s not affected (preserving CPU pinning)", other) + continue + } + if other.SharedPortion() == 0 && !other.ExclusiveCPUs().IsEmpty() { log.Info(" => %s not affected (only exclusive CPUs)...", other) continue @@ -750,7 +765,7 @@ func (p *policy) filterInsufficientResources(req Request, originals []Node) []No supply := node.FreeSupply() reqMemType := req.MemoryType() - if reqMemType == memoryUnspec { + if reqMemType == memoryUnspec || reqMemType == memoryPreserve { // The algorithm for handling unspecified memory allocations is the same as for handling a request // with memory type all. reqMemType = memoryAll @@ -883,7 +898,7 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco log.Debug(" - affinity is a TIE") // 3) matching memory type wins - if reqType := request.MemoryType(); reqType != memoryUnspec { + if reqType := request.MemoryType(); reqType != memoryUnspec && reqType != memoryPreserve { if node1.HasMemoryType(reqType) && !node2.HasMemoryType(reqType) { log.Debug(" => %s WINS on memory type", node1.Name()) return true diff --git a/cmd/plugins/topology-aware/policy/resources.go b/cmd/plugins/topology-aware/policy/resources.go index 497f8c1e7..b2e4ef210 100644 --- a/cmd/plugins/topology-aware/policy/resources.go +++ b/cmd/plugins/topology-aware/policy/resources.go @@ -456,7 +456,7 @@ func (cs *supply) AccountReleaseCPU(g Grant) { // allocateMemory tries to fulfill the memory allocation part of a request. 
func (cs *supply) allocateMemory(r Request) (memoryMap, error) { reqType := r.MemoryType() - if reqType == memoryUnspec { + if reqType == memoryUnspec || reqType == memoryPreserve { reqType = memoryAll } diff --git a/docs/resource-policy/policy/topology-aware.md b/docs/resource-policy/policy/topology-aware.md index dc6fdff1b..ec84a52d3 100644 --- a/docs/resource-policy/policy/topology-aware.md +++ b/docs/resource-policy/policy/topology-aware.md @@ -310,6 +310,22 @@ defined affinities with implicit co-location requires both careful consideration and a thorough understanding of affinity evaluation, or it should be avoided altogether. +## Disabling CPU or Memory Pinning of a Container + +Some containers may need to run on all CPUs or access all memories +without restrictions. Annotate these pods and containers to prevent +the resource policy from touching their CPU or memory pinning. + +```yaml +cpu.preserve.resource-policy.nri.io/container.CONTAINER_NAME: "true" +cpu.preserve.resource-policy.nri.io/pod: "true" +cpu.preserve.resource-policy.nri.io: "true" + +memory.preserve.resource-policy.nri.io/container.CONTAINER_NAME: "true" +memory.preserve.resource-policy.nri.io/pod: "true" +memory.preserve.resource-policy.nri.io: "true" +``` + ## Cold Start The `topology-aware` policy supports "cold start" functionality. When cold start diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh index b2768782c..b9c20ef79 100644 --- a/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh +++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh @@ -1,9 +1,12 @@ -# Test that -# - containers marked in Annotations pinned on Reserved CPUs. +# Test annotations: +# - prefer-reserved-cpus +# - cpu.preserve +# - memory.preserve cleanup-test-pods() { ( vm-command "kubectl delete pods pod0 --now" ) || true ( vm-command "kubectl delete pods pod1 --now" ) || true + ( vm-command "kubectl delete pods pod2 --now" ) || true } cleanup-test-pods @@ -24,6 +27,26 @@ report allowed verify 'cpus["pod0c0"] == {"cpu10", "cpu11"}' verify 'cpus["pod1c0"] == {"cpu08"}' +ANNOTATIONS=( + 'cpu.preserve.resource-policy.nri.io: "true"' + 'memory.preserve.resource-policy.nri.io/container.pod2c1: "true"' + 'memory.preserve.resource-policy.nri.io/container.pod2c2: "true"' + 'cpu.preserve.resource-policy.nri.io/container.pod2c2: "false"' + 'cpu.preserve.resource-policy.nri.io/container.pod2c3: "false"' + 'memory.preserve.resource-policy.nri.io/container.pod2c3: "false"' +) +CONTCOUNT=4 CPU=100m MEM=100M create reserved-annotated +report allowed + +verify 'len(cpus["pod2c0"]) == 16' \ + 'len(mems["pod2c0"]) == 4' \ + 'len(cpus["pod2c1"]) == 16' \ + 'len(mems["pod2c1"]) == 4' \ + 'len(cpus["pod2c2"]) == 1' \ + 'len(mems["pod2c2"]) == 4' \ + 'len(cpus["pod2c3"]) == 1' \ + 'len(mems["pod2c3"]) == 1' + cleanup-test-pods helm-terminate
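
A minimal usage sketch, not part of the patch itself: a pod that opts out of the policy's CPU pinning for all of its containers, and additionally keeps its existing memory pinning for one container, could be annotated roughly as below. The pod name, container name, and image are hypothetical and chosen only for illustration; the annotation keys are the ones added in the documentation and test above.

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: pinning-preserved            # hypothetical pod name
  annotations:
    # Leave CPU pinning of every container in this pod untouched.
    cpu.preserve.resource-policy.nri.io: "true"
    # Additionally leave memory pinning of container "ctr0" untouched.
    memory.preserve.resource-policy.nri.io/container.ctr0: "true"
spec:
  containers:
    - name: ctr0                     # hypothetical container name
      image: busybox
      command: ["sleep", "3600"]
      resources:
        requests:
          cpu: 100m
          memory: 100M
```

With annotations like these, the policy still accounts the container's CPU and memory requests against the root pool, but skips writing cpuset.cpus and/or cpuset.mems for it, as exercised by the pod2 containers in the e2e test above.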