From 883894b111b545f50a5876ed5796d5a27f054604 Mon Sep 17 00:00:00 2001 From: Krisztian Litkey Date: Tue, 25 Jun 2024 18:14:03 +0300 Subject: [PATCH] WiP: cpuallocator: use last useful cache for clustering. Try finding the last cache which provides non-trivial clustering of CPUs and use that for cache-group based allocation. Note that this current implementation is somewhat simplistic. It expects all CPUs to provide identical cache grouping and picks a single cache level to set up clusters. This might result in sub- optimal cache-based clustering on hybrid core architectures. We can address this shortcoming in the future. Signed-off-by: Krisztian Litkey --- pkg/cpuallocator/allocator.go | 153 +++++++++++++++++++++------------- 1 file changed, 95 insertions(+), 58 deletions(-) diff --git a/pkg/cpuallocator/allocator.go b/pkg/cpuallocator/allocator.go index 89a81e071..730814dfb 100644 --- a/pkg/cpuallocator/allocator.go +++ b/pkg/cpuallocator/allocator.go @@ -38,13 +38,13 @@ const ( AllocIdleNodes // AllocIdleClusters requests allocation of full idle CPU clusters. AllocIdleClusters - // AllocLLCGroups requests allocation and splitting of idle and used LLC groups - AllocLLCGroups + // AllocCacheGroups requests allocation and splitting of idle and used cache groups + AllocCacheGroups // AllocIdleCores requests allocation of full idle cores (all threads in core). AllocIdleCores // AllocDefault is the default allocation preferences. - AllocDefault = AllocIdlePackages | AllocIdleClusters | AllocLLCGroups | AllocIdleCores + AllocDefault = AllocIdlePackages | AllocIdleClusters | AllocCacheGroups | AllocIdleCores logSource = "cpuallocator" ) @@ -96,7 +96,7 @@ type topologyCache struct { cpuPriorities cpuPriorities // CPU priority mapping clusters []*cpuCluster // CPU clusters - llcGroups []*llcGroup // CPU last-level cache groups + cacheGroups []*cacheGroup // CPU cache groups } type cpuPriorities [NumCPUPriorities]cpuset.CPUSet @@ -109,7 +109,7 @@ type cpuCluster struct { kind sysfs.CoreKind } -type llcGroup struct { +type cacheGroup struct { id int pkg idset.ID die idset.ID @@ -403,17 +403,17 @@ func (a *allocatorHelper) takeIdleClusters() { } // Allocate idle or partial CPU last-level cache groups. -func (a *allocatorHelper) takeLLCGroups() { - log.Debug("* takeLLCGroups()...") +func (a *allocatorHelper) takeCacheGroups() { + log.Debug("* takeCacheGroups()...") - if len(a.topology.llcGroups) <= 1 { + if len(a.topology.cacheGroups) <= 1 { return } if a.cnt < 2 { // XXX TODO(klihub): we could also decide based on some criteria, if it was better // to handle such containers here and, for instance, pack them tightly into shared - // LLC groups. + // cache groups. 
return } @@ -451,7 +451,7 @@ func (a *allocatorHelper) takeLLCGroups() { var ( offline = a.sys.OfflineCPUs() - pickGroups = func(g *llcGroup) (pickVerdict, cpuset.CPUSet) { + pickGroups = func(g *cacheGroup) (pickVerdict, cpuset.CPUSet) { // only take E-groups for low-prio requests if a.prefer != PriorityLow && g.kind == sysfs.EfficientCore { log.Debug(" - ignore %s (CPU preference is %s)", g, a.prefer) @@ -483,7 +483,7 @@ func (a *allocatorHelper) takeLLCGroups() { return pickUsable, free } - sortIdle = func(gA, gB *llcGroup, s *llcGroupSorter) (r int) { + sortIdle = func(gA, gB *cacheGroup, s *cacheGroupSorter) (r int) { defer func() { switch { case r < 0: @@ -594,7 +594,7 @@ func (a *allocatorHelper) takeLLCGroups() { return gA.id - gB.id } - sortUsed = func(gA, gB *llcGroup, s *llcGroupSorter) (r int) { + sortUsed = func(gA, gB *cacheGroup, s *cacheGroupSorter) (r int) { defer func() { switch { case r < 0: @@ -616,7 +616,7 @@ func (a *allocatorHelper) takeLLCGroups() { csetB = s.cpus[gB] full = s.full part = s.part - idle *llcGroup + idle *cacheGroup ) if len(s.prefer) > 0 { @@ -683,7 +683,7 @@ func (a *allocatorHelper) takeLLCGroups() { return 0 } - sorter = &llcGroupSorter{ + sorter = &cacheGroupSorter{ pick: pickGroups, sortPrefer: sortIdle, sortUsable: sortUsed, @@ -692,7 +692,7 @@ func (a *allocatorHelper) takeLLCGroups() { log.Debug("looking for %d CPUs (prio %s) from %s", a.cnt, a.prefer, a.from) - sorter.sortLLCGroups(a) + sorter.sortCacheGroups(a) var ( preferPkgCPUs int @@ -792,7 +792,7 @@ func (a *allocatorHelper) takeLLCGroups() { log.Debug("%d more CPUs needed", cnt) var ( - groupsBySize = map[int][]*llcGroup{} + groupsBySize = map[int][]*cacheGroup{} totalByIndex = make([]int, 0, len(sorter.usable)) totalCPUs = 0 ) @@ -875,7 +875,7 @@ func (a *allocatorHelper) takeLLCGroups() { } // use up smallest number of groups possible (start with the largest group) - log.Debug("=> taking LLC groups in decreasing size order for %d more CPUs...", cnt) + log.Debug("=> taking cache groups in decreasing size order for %d more CPUs...", cnt) var ( grpCnt = 0 @@ -1148,8 +1148,8 @@ func (a *allocatorHelper) allocate() cpuset.CPUSet { if a.cnt > 0 && (a.flags&AllocIdleClusters) != 0 { a.takeIdleClusters() } - if a.cnt > 0 && (a.flags&AllocLLCGroups) != 0 { - a.takeLLCGroups() + if a.cnt > 0 && (a.flags&AllocCacheGroups) != 0 { + a.takeCacheGroups() } if a.cnt > 0 && (a.flags&AllocIdleCores) != 0 { a.takeIdleCores() @@ -1254,59 +1254,59 @@ const ( pickIgnore ) -type llcGroupSorter struct { +type cacheGroupSorter struct { // function to pick preferred and usable cache groups - pick func(*llcGroup) (pickVerdict, cpuset.CPUSet) + pick func(*cacheGroup) (pickVerdict, cpuset.CPUSet) // functions for sorting picked cache groups - sortPrefer func(a, b *llcGroup, s *llcGroupSorter) int - sortUsable func(a, b *llcGroup, s *llcGroupSorter) int + sortPrefer func(a, b *cacheGroup, s *cacheGroupSorter) int + sortUsable func(a, b *cacheGroup, s *cacheGroupSorter) int // preferred groups, available CPU count per package and die - prefer []*llcGroup + prefer []*cacheGroup preferPkg map[idset.ID]int preferDie map[idset.ID]map[idset.ID]int // other usable groups, available CPU count per package and die - usable []*llcGroup + usable []*cacheGroup usablePkg map[idset.ID]int usableDie map[idset.ID]map[idset.ID]int // available CPUs per group - cpus map[*llcGroup]cpuset.CPUSet + cpus map[*cacheGroup]cpuset.CPUSet // full and partial groups worth of requested CPUs full int part int } -func (s *llcGroupSorter) 
preferPkgCPUCount(pkg idset.ID) int { +func (s *cacheGroupSorter) preferPkgCPUCount(pkg idset.ID) int { return s.preferPkg[pkg] } -func (s *llcGroupSorter) preferDieCPUCount(pkg, die idset.ID) int { +func (s *cacheGroupSorter) preferDieCPUCount(pkg, die idset.ID) int { return s.preferDie[pkg][die] } -func (s *llcGroupSorter) usablePkgCPUCount(pkg idset.ID) int { +func (s *cacheGroupSorter) usablePkgCPUCount(pkg idset.ID) int { return s.usablePkg[pkg] } -func (s *llcGroupSorter) usableDieCPUCount(pkg, die idset.ID) int { +func (s *cacheGroupSorter) usableDieCPUCount(pkg, die idset.ID) int { return s.usableDie[pkg][die] } -func (s *llcGroupSorter) CPUSet(g *llcGroup) cpuset.CPUSet { +func (s *cacheGroupSorter) CPUSet(g *cacheGroup) cpuset.CPUSet { return s.cpus[g] } -func (s *llcGroupSorter) sortLLCGroups(a *allocatorHelper) { - s.prefer = []*llcGroup{} +func (s *cacheGroupSorter) sortCacheGroups(a *allocatorHelper) { + s.prefer = []*cacheGroup{} s.preferPkg = map[idset.ID]int{} s.preferDie = map[idset.ID]map[idset.ID]int{} - s.usable = []*llcGroup{} + s.usable = []*cacheGroup{} s.usablePkg = map[idset.ID]int{} s.usableDie = map[idset.ID]map[idset.ID]int{} - s.cpus = map[*llcGroup]cpuset.CPUSet{} + s.cpus = map[*cacheGroup]cpuset.CPUSet{} log.Debug("picking suitable cache groups") @@ -1315,11 +1315,11 @@ func (s *llcGroupSorter) sortLLCGroups(a *allocatorHelper) { // the same size and use this assumption to split the request into // full cache size multiples and the remaining partial allocation. - s.part = a.cnt % a.topology.llcGroups[0].cpus.Size() + s.part = a.cnt % a.topology.cacheGroups[0].cpus.Size() s.full = a.cnt - s.part // collect preferred and usable groups, count their CPUs per package and die - for _, g := range a.topology.llcGroups { + for _, g := range a.topology.cacheGroups { verdict, cset := s.pick(g) switch verdict { case pickPrefer: @@ -1353,7 +1353,7 @@ func (s *llcGroupSorter) sortLLCGroups(a *allocatorHelper) { if log.DebugEnabled() { if len(s.preferPkg) > 0 { - log.Debug("number of preferred LLC group CPUs per package/die:") + log.Debug("number of preferred cache group CPUs per package/die:") for pkg, cnt := range s.preferPkg { log.Debug(" - package #%d: %d", pkg, cnt) } @@ -1363,11 +1363,11 @@ func (s *llcGroupSorter) sortLLCGroups(a *allocatorHelper) { } } } else { - log.Debug("no preferred LLC groups found") + log.Debug("no preferred cache groups found") } if len(s.usablePkg) > 0 { - log.Debug("number of non-preferred but usable LLC group CPUs per package/die:") + log.Debug("number of non-preferred but usable cache group CPUs per package/die:") for pkg, cnt := range s.usablePkg { log.Debug(" - package #%d: %d", pkg, cnt) } @@ -1377,22 +1377,22 @@ func (s *llcGroupSorter) sortLLCGroups(a *allocatorHelper) { } } } else { - log.Debug("no non-preferred but usable LLC groups found") + log.Debug("no non-preferred but usable cache groups found") } } // sort preferred groups if len(s.prefer) > 0 { - log.Debug("sorting preferred LLC groups") - slices.SortFunc(s.prefer, func(gA, gB *llcGroup) int { + log.Debug("sorting preferred cache groups") + slices.SortFunc(s.prefer, func(gA, gB *cacheGroup) int { return s.sortPrefer(gA, gB, s) }) } // sort other usable groups if len(s.usable) > 0 { - log.Debug("sorting non-preferred but usable LLC groups") - slices.SortFunc(s.usable, func(gA, gB *llcGroup) int { + log.Debug("sorting non-preferred but usable cache groups") + slices.SortFunc(s.usable, func(gA, gB *cacheGroup) int { return s.sortUsable(gA, gB, s) }) } @@ -1467,7 +1467,7 @@ 
func newTopologyCache(sys sysfs.System) topologyCache { } c.discoverCPUClusters(sys) - c.discoverLLCGroups(sys) + c.discoverCacheGroups(sys) c.discoverCPUPriorities(sys) return c @@ -1736,15 +1736,53 @@ func (c *topologyCache) discoverCPUClusters(sys sysfs.System) { } } -func (c *topologyCache) discoverLLCGroups(sys sysfs.System) { +func (c *topologyCache) pickCacheLevelForClustering(sys sysfs.System) int { if sys == nil { + return -1 + } + + online := sys.OnlineCPUs() + for _, id := range online.List() { + cpu := sys.CPU(id) + pkg := sys.Package(cpu.PackageID()) + for n := cpu.CacheCount() - 1; n > 0; n-- { + cpus := cpu.GetNthLevelCacheCPUSet(n) + + switch { + case cpus.Size() == 0 || cpus.Size() == 1: + continue + case cpus.Equals(cpu.ThreadCPUSet().Intersection(online)): + continue + case cpus.Equals(pkg.DieCPUSet(cpu.DieID()).Intersection(online)): + continue + case cpus.Equals(pkg.CPUSet().Intersection(online)): + continue + } + + return n + } + } + + return -1 +} + +func (c *topologyCache) discoverCacheGroups(sys sysfs.System) { + if sys == nil { + return + } + + n := c.pickCacheLevelForClustering(sys) + if n < 0 { + log.Info("no cache level provides extra clustering") return } + log.Info("picked cache level %d for extra clustering", n) + online := sys.OnlineCPUs() for _, id := range sys.PackageIDs() { pkg := sys.Package(id) - groups := []*llcGroup{} + groups := []*cacheGroup{} assigned := idset.NewIDSet() for _, cpuID := range pkg.CPUSet().Intersection(online).List() { @@ -1753,7 +1791,7 @@ func (c *topologyCache) discoverLLCGroups(sys sysfs.System) { } cpu := sys.CPU(cpuID) - cpus := cpu.GetLastLevelCacheCPUSet().Intersection(online) + cpus := cpu.GetNthLevelCacheCPUSet(n).Intersection(online) switch { case cpus.Size() == 0 || cpus.Size() == 1: @@ -1764,10 +1802,9 @@ func (c *topologyCache) discoverLLCGroups(sys sysfs.System) { continue case cpus.Equals(pkg.CPUSet().Intersection(online)): continue - } - groups = append(groups, &llcGroup{ + groups = append(groups, &cacheGroup{ pkg: cpu.PackageID(), die: cpu.DieID(), node: cpu.NodeID(), @@ -1778,12 +1815,12 @@ func (c *topologyCache) discoverLLCGroups(sys sysfs.System) { } if len(groups) > 1 { - c.llcGroups = append(c.llcGroups, groups...) + c.cacheGroups = append(c.cacheGroups, groups...) } } // sort groups by package, die, NUMA node, and lowest CPU ID. - slices.SortFunc(c.llcGroups, func(a, b *llcGroup) int { + slices.SortFunc(c.cacheGroups, func(a, b *cacheGroup) int { if diff := a.pkg - b.pkg; diff != 0 { return diff } @@ -1796,7 +1833,7 @@ func (c *topologyCache) discoverLLCGroups(sys sysfs.System) { return a.cpus.List()[0] - b.cpus.List()[0] }) - for idx, g := range c.llcGroups { + for idx, g := range c.cacheGroups { g.id = idx for _, cpuID := range g.cpus.UnsortedList() { @@ -1920,20 +1957,20 @@ func (c *cpuCluster) String() string { c.cpus.Size(), c.kind, c.cpus) } -func (c *llcGroup) PackageID() int { +func (c *cacheGroup) PackageID() int { return c.pkg } -func (c *llcGroup) DieID(sys sysfs.System) int { +func (c *cacheGroup) DieID(sys sysfs.System) int { cpu := sys.CPU(c.cpus.List()[0]) return cpu.DieID() } -func (c *llcGroup) SmallestCoreID(sys sysfs.System) int { +func (c *cacheGroup) SmallestCoreID(sys sysfs.System) int { return c.cpus.List()[0] } -func (c *llcGroup) String() string { +func (c *cacheGroup) String() string { return fmt.Sprintf("group #%d/%d, %d %s CPUs (%s)", c.pkg, c.id, c.cpus.Size(), c.kind, c.cpus) }
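
As a reading aid for the heuristic this patch introduces: pickCacheLevelForClustering walks each CPU's cache hierarchy from the deepest level towards L1 and settles on the first level whose shared-CPU set is non-trivial, i.e. covers more than one CPU but does not simply coincide with the core's thread siblings, the die, or the whole package. Only such a level splits the package into more than one group and is therefore useful for cache-group based allocation. Below is a minimal, self-contained sketch of that idea; the types and helpers (cacheLevel, topo, pickClusteringLevel, equal) are hypothetical stand-ins for illustration only, not the sysfs.System interface used in the patch, and the die-level check is omitted for brevity.

package main

import "fmt"

// cacheLevel is a hypothetical stand-in for per-CPU cache information:
// one cache level and the set of CPUs sharing that cache.
type cacheLevel struct {
	level int
	cpus  map[int]struct{} // CPUs sharing this cache
}

// topo is a toy view of one package as seen from a single CPU: the CPUs in
// the package, the thread siblings of that CPU, and its cache hierarchy
// ordered from L1 upwards.
type topo struct {
	packageCPUs map[int]struct{}
	threadCPUs  map[int]struct{}
	caches      []cacheLevel // index 0 = L1
}

// equal reports whether two CPU sets contain exactly the same CPUs.
func equal(a, b map[int]struct{}) bool {
	if len(a) != len(b) {
		return false
	}
	for c := range a {
		if _, ok := b[c]; !ok {
			return false
		}
	}
	return true
}

// pickClusteringLevel mirrors the idea behind pickCacheLevelForClustering in
// the patch: scan from the deepest cache level down towards (but excluding)
// L1 and return the first level whose CPU set is larger than one CPU, wider
// than the core's thread siblings, and narrower than the whole package.
// Returns -1 if no level provides extra clustering.
func pickClusteringLevel(t topo) int {
	for i := len(t.caches) - 1; i > 0; i-- {
		cpus := t.caches[i].cpus
		switch {
		case len(cpus) <= 1:
			continue // trivial: cache private to a single CPU
		case equal(cpus, t.threadCPUs):
			continue // no wider than the core's hyperthreads
		case equal(cpus, t.packageCPUs):
			continue // as wide as the package, no extra clustering
		}
		return t.caches[i].level
	}
	return -1
}

func main() {
	set := func(ids ...int) map[int]struct{} {
		s := map[int]struct{}{}
		for _, id := range ids {
			s[id] = struct{}{}
		}
		return s
	}

	// An imaginary 8-CPU package: L3 spans the whole package (useless for
	// clustering), L1 only covers the thread siblings, but L2 is shared by
	// groups of four CPUs, so L2 is the level the heuristic should pick.
	t := topo{
		packageCPUs: set(0, 1, 2, 3, 4, 5, 6, 7),
		threadCPUs:  set(0, 1),
		caches: []cacheLevel{
			{level: 1, cpus: set(0, 1)},
			{level: 2, cpus: set(0, 1, 2, 3)},
			{level: 3, cpus: set(0, 1, 2, 3, 4, 5, 6, 7)},
		},
	}

	fmt.Println("clustering cache level:", pickClusteringLevel(t)) // prints 2
}

Run against the toy topology in main(), the sketch picks L2, since L3 spans the whole package and L1 only covers the thread siblings; that mirrors why the patch prefers the "last useful" cache level over unconditionally taking the last-level cache for grouping.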