From ea841f6268363b532f96f2e20856c0662285a5ff Mon Sep 17 00:00:00 2001
From: Antti Kervinen
Date: Fri, 6 Oct 2023 16:49:40 +0300
Subject: [PATCH] balloons: add PreferSpreadOnPhysicalCores

- Enable using different CPU allocators in different balloon types.
  Previously the same CPU allocator with the same options was used
  for all CPU allocations in the policy.

- Add a new PreferSpreadOnPhysicalCores option that toggles between
  packing CPU allocations onto and spreading them across physical CPU
  cores. This option is balloon-type-specific, with a policy-level
  default.

- Make the existing AllocatorTopologyBalancing option
  balloon-type-specific, too.

Signed-off-by: Antti Kervinen
---
 .../balloons/policy/balloons-policy.go        |  43 +++--
 cmd/plugins/balloons/policy/cputree.go        | 182 +++++++++++++++++-
 cmd/plugins/balloons/policy/cputree_test.go   | 130 +++++++++++--
 cmd/plugins/balloons/policy/flags.go          |  21 +-
 docs/resource-policy/policy/balloons.md       |  12 ++
 .../balloons-allocator-opts.cfg               |  31 +++
 .../n4c16/test10-allocator-opts/code.var.sh   |  46 +++++
 7 files changed, 434 insertions(+), 31 deletions(-)
 create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg
 create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh

diff --git a/cmd/plugins/balloons/policy/balloons-policy.go b/cmd/plugins/balloons/policy/balloons-policy.go
index 23122885c..019cdb293 100644
--- a/cmd/plugins/balloons/policy/balloons-policy.go
+++ b/cmd/plugins/balloons/policy/balloons-policy.go
@@ -90,7 +90,8 @@ type Balloon struct {
 	// - len(PodIDs) is the number of pods in the balloon.
 	// - len(PodIDs[podID]) is the number of containers of podID
 	//   currently assigned to the balloon.
-	PodIDs map[string][]string
+	PodIDs           map[string][]string
+	cpuTreeAllocator *cpuTreeAllocator
 }

 var log logger.Logger = logger.NewLogger("policy")
@@ -540,6 +541,24 @@ func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, erro
 			break
 		}
 	}
+	// Configure a new cpuTreeAllocator for this balloon if there
+	// are type-specific allocator options, otherwise use the
+	// policy default allocator.
+	cpuTreeAllocator := p.cpuTreeAllocator
+	if blnDef.AllocatorTopologyBalancing != nil || blnDef.PreferSpreadOnPhysicalCores != nil {
+		allocatorOptions := cpuTreeAllocatorOptions{
+			topologyBalancing:           p.bpoptions.AllocatorTopologyBalancing,
+			preferSpreadOnPhysicalCores: p.bpoptions.PreferSpreadOnPhysicalCores,
+		}
+		if blnDef.AllocatorTopologyBalancing != nil {
+			allocatorOptions.topologyBalancing = *blnDef.AllocatorTopologyBalancing
+		}
+		if blnDef.PreferSpreadOnPhysicalCores != nil {
+			allocatorOptions.preferSpreadOnPhysicalCores = *blnDef.PreferSpreadOnPhysicalCores
+		}
+		cpuTreeAllocator = p.cpuTree.NewAllocator(allocatorOptions)
+	}
+
 	// Allocate CPUs
 	if blnDef == p.reservedBalloonDef ||
 		(blnDef == p.defaultBalloonDef && blnDef.MinCpus == 0 && blnDef.MaxCpus == 0) {
 		// The reserved balloon uses reserved CPUs.
 		// So does the default balloon unless its CPU counts are tweaked.
cpus = p.reserved } else { - addFromCpus, _, err := p.cpuTreeAllocator.ResizeCpus(cpuset.New(), p.freeCpus, blnDef.MinCpus) + addFromCpus, _, err := cpuTreeAllocator.ResizeCpus(cpuset.New(), p.freeCpus, blnDef.MinCpus) if err != nil { return nil, balloonsError("failed to choose a cpuset for allocating first %d CPUs from %#s", blnDef.MinCpus, p.freeCpus) } @@ -558,12 +577,13 @@ func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, erro p.freeCpus = p.freeCpus.Difference(cpus) } bln := &Balloon{ - Def: blnDef, - Instance: freeInstance, - PodIDs: make(map[string][]string), - Cpus: cpus, - SharedIdleCpus: cpuset.New(), - Mems: p.closestMems(cpus), + Def: blnDef, + Instance: freeInstance, + PodIDs: make(map[string][]string), + Cpus: cpus, + SharedIdleCpus: cpuset.New(), + Mems: p.closestMems(cpus), + cpuTreeAllocator: cpuTreeAllocator, } if confCpus { if err = p.useCpuClass(bln); err != nil { @@ -1042,7 +1062,8 @@ func (p *balloons) setConfig(bpoptions *BalloonsOptions) error { p.freeCpus = p.allowed.Clone() p.freeCpus = p.freeCpus.Difference(p.reserved) p.cpuTreeAllocator = p.cpuTree.NewAllocator(cpuTreeAllocatorOptions{ - topologyBalancing: bpoptions.AllocatorTopologyBalancing, + topologyBalancing: bpoptions.AllocatorTopologyBalancing, + preferSpreadOnPhysicalCores: bpoptions.PreferSpreadOnPhysicalCores, }) // Instantiate built-in reserved and default balloons. reservedBalloon, err := p.newBalloon(p.reservedBalloonDef, false) @@ -1149,7 +1170,7 @@ func (p *balloons) resizeBalloon(bln *Balloon, newMilliCpus int) error { defer p.useCpuClass(bln) if cpuCountDelta > 0 { // Inflate the balloon. - addFromCpus, _, err := p.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta) + addFromCpus, _, err := bln.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta) if err != nil { return balloonsError("resize/inflate: failed to choose a cpuset for allocating additional %d CPUs: %w", cpuCountDelta, err) } @@ -1163,7 +1184,7 @@ func (p *balloons) resizeBalloon(bln *Balloon, newMilliCpus int) error { p.updatePinning(p.shareIdleCpus(p.freeCpus, newCpus)...) } else { // Deflate the balloon. - _, removeFromCpus, err := p.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta) + _, removeFromCpus, err := bln.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta) if err != nil { return balloonsError("resize/deflate: failed to choose a cpuset for releasing %d CPUs: %w", -cpuCountDelta, err) } diff --git a/cmd/plugins/balloons/policy/cputree.go b/cmd/plugins/balloons/policy/cputree.go index d2d0662ce..eb43987fe 100644 --- a/cmd/plugins/balloons/policy/cputree.go +++ b/cmd/plugins/balloons/policy/cputree.go @@ -79,7 +79,8 @@ type cpuTreeAllocatorOptions struct { // topologyBalancing true prefers allocating from branches // with most free CPUs (spread allocations), while false is // the opposite (packed allocations). - topologyBalancing bool + topologyBalancing bool + preferSpreadOnPhysicalCores bool } // Strings returns topology level as a string @@ -131,6 +132,19 @@ func (t *cpuTreeNode) String() string { return fmt.Sprintf("%s%v", t.name, t.children) } +func (t *cpuTreeNode) PrettyPrint() string { + origDepth := t.Depth() + lines := []string{} + t.DepthFirstWalk(func(tn *cpuTreeNode) error { + lines = append(lines, + fmt.Sprintf("%s%s: %q cpus: %s", + strings.Repeat(" ", (tn.Depth()-origDepth)*4), + tn.level, tn.name, tn.cpus)) + return nil + }) + return strings.Join(lines, "\n") +} + // String returns cpuTreeNodeAttributes as a string. 
 func (tna cpuTreeNodeAttributes) String() string {
 	return fmt.Sprintf("%s{%d,%v,%d,%d}", tna.t.name, tna.depth,
@@ -146,6 +160,34 @@ func NewCpuTree(name string) *cpuTreeNode {
 	}
 }

+func (t *cpuTreeNode) CopyTree() *cpuTreeNode {
+	newNode := t.CopyNode()
+	newNode.children = make([]*cpuTreeNode, 0, len(t.children))
+	for _, child := range t.children {
+		newNode.AddChild(child.CopyTree())
+	}
+	return newNode
+}
+
+func (t *cpuTreeNode) CopyNode() *cpuTreeNode {
+	newNode := cpuTreeNode{
+		name:     t.name,
+		level:    t.level,
+		parent:   t.parent,
+		children: t.children,
+		cpus:     t.cpus,
+	}
+	return &newNode
+}
+
+// Depth returns the distance from the root node.
+func (t *cpuTreeNode) Depth() int {
+	if t.parent == nil {
+		return 0
+	}
+	return t.parent.Depth() + 1
+}
+
 // AddChild adds new child node to a CPU tree node.
 func (t *cpuTreeNode) AddChild(child *cpuTreeNode) {
 	child.parent = t
@@ -165,6 +207,38 @@ func (t *cpuTreeNode) Cpus() cpuset.CPUSet {
 	return t.cpus
 }

+// SiblingIndex returns the index of this node among its parent's
+// children. Returns -1 for the root node, -2 if this node is not
+// listed among the children of its parent.
+func (t *cpuTreeNode) SiblingIndex() int {
+	if t.parent == nil {
+		return -1
+	}
+	for idx, child := range t.parent.children {
+		if child == t {
+			return idx
+		}
+	}
+	return -2
+}
+
+func (t *cpuTreeNode) FindLeafWithCpu(cpu int) *cpuTreeNode {
+	var found *cpuTreeNode
+	t.DepthFirstWalk(func(tn *cpuTreeNode) error {
+		if len(tn.children) > 0 {
+			return nil
+		}
+		for _, cpuHere := range tn.cpus.List() {
+			if cpu == cpuHere {
+				found = tn
+				return WalkStop
+			}
+		}
+		return nil // not found here, no more children to search
+	})
+	return found
+}
+
 // WalkSkipChildren error returned from a DepthFirstWalk handler
 // prevents walking deeper in the tree. The caller of the
 // DepthFirstWalk will get no error.
@@ -236,13 +310,18 @@ func NewCpuTreeFromSystem() (*cpuTreeNode, error) {
 			nodeTree.level = CPUTopologyLevelNuma
 			dieTree.AddChild(nodeTree)
 			node := sys.Node(nodeID)
+			threadsSeen := map[int]struct{}{}
 			for _, cpuID := range node.CPUSet().List() {
+				if _, alreadySeen := threadsSeen[cpuID]; alreadySeen {
+					continue
+				}
 				cpuTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dcpu%d", packageID, dieID, nodeID, cpuID))
 				cpuTree.level = CPUTopologyLevelCore
 				nodeTree.AddChild(cpuTree)
 				cpu := sys.CPU(cpuID)
 				for _, threadID := range cpu.ThreadCPUSet().List() {
+					threadsSeen[threadID] = struct{}{}
 					threadTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dcpu%dt%d", packageID, dieID, nodeID, cpuID, threadID))
 					threadTree.level = CPUTopologyLevelThread
 					cpuTree.AddChild(threadTree)
@@ -312,6 +391,61 @@ func (t *cpuTreeNode) toAttributedSlice(
 	}
 }

+// SplitLevel returns the root node of a new CPU tree where all
+// branches of a topology level have been split into new classes.
+func (t *cpuTreeNode) SplitLevel(splitLevel CPUTopologyLevel, cpuClassifier func(int) int) *cpuTreeNode {
+	newRoot := t.CopyTree()
+	newRoot.DepthFirstWalk(func(tn *cpuTreeNode) error {
+		// Dive into the level that will be split.
+		if tn.level != splitLevel {
+			return nil
+		}
+		// Classify CPUs into a map: class -> list of cpus.
+		classCpus := map[int][]int{}
+		for _, cpu := range t.cpus.List() {
+			class := cpuClassifier(cpu)
+			classCpus[class] = append(classCpus[class], cpu)
+		}
+		// Clear the existing children of this node. The new
+		// children will be classes whose children are masked
+		// versions of the original children of this node.
+		origChildren := tn.children
+		tn.children = make([]*cpuTreeNode, 0, len(classCpus))
+		// Add a new child corresponding to each class.
+		for class, cpus := range classCpus {
+			cpuMask := cpuset.New(cpus...)
+			newNode := NewCpuTree(fmt.Sprintf("%sclass%d", tn.name, class))
+			tn.AddChild(newNode)
+			newNode.cpus = tn.cpus.Intersection(cpuMask)
+			newNode.level = tn.level
+			newNode.parent = tn
+			for _, child := range origChildren {
+				newChild := child.CopyTree()
+				newChild.DepthFirstWalk(func(cn *cpuTreeNode) error {
+					cn.cpus = cn.cpus.Intersection(cpuMask)
+					if cn.cpus.Size() == 0 && cn.parent != nil {
+						// All cpus were
+						// masked out: cut off
+						// this branch.
+						newSiblings := []*cpuTreeNode{}
+						for _, child := range cn.parent.children {
+							if child != cn {
+								newSiblings = append(newSiblings, child)
+							}
+						}
+						cn.parent.children = newSiblings
+						return WalkSkipChildren
+					}
+					return nil
+				})
+				newNode.AddChild(newChild)
+			}
+		}
+		return WalkSkipChildren
+	})
+	return newRoot
+}
+
 // NewAllocator returns new CPU allocator for allocating CPUs from a
 // CPU tree branch.
 func (t *cpuTreeNode) NewAllocator(options cpuTreeAllocatorOptions) *cpuTreeAllocator {
 	ta := &cpuTreeAllocator{
 		root:    t,
 		options: options,
 	}
+	if options.preferSpreadOnPhysicalCores {
+		newTree := t.SplitLevel(CPUTopologyLevelNuma,
+			// CPU classifier: the class of a CPU equals its
+			// index in the child list of its parent node in
+			// the tree. This expects that a leaf node is a
+			// hyperthread and its parent a physical core.
+			func(cpu int) int {
+				leaf := t.FindLeafWithCpu(cpu)
+				if leaf == nil {
+					log.Fatalf("SplitLevel CPU classifier: cpu %d not in tree:\n%s\n\n", cpu, t.PrettyPrint())
+				}
+				return leaf.SiblingIndex()
+			})
+		ta.root = newTree
+	}
 	return ta
 }
@@ -409,7 +558,36 @@ func (ta *cpuTreeAllocator) sorterRelease(tnas []cpuTreeNodeAttributes) func(int
 // abs(delta) CPUs can be freed.
 func (ta *cpuTreeAllocator) ResizeCpus(currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
 	if delta > 0 {
-		return ta.resizeCpus(currentCpus, freeCpus, delta)
+		addFromSuperset, removeFromSuperset, err := ta.resizeCpus(currentCpus, freeCpus, delta)
+		if !ta.options.preferSpreadOnPhysicalCores || addFromSuperset.Size() == delta {
+			return addFromSuperset, removeFromSuperset, err
+		}
+		// addFromSuperset contains more CPUs (equally good
+		// choices) than actually needed. With
+		// preferSpreadOnPhysicalCores, however, selecting
+		// just any of them does not yield an equally good
+		// result. Therefore, in this case, construct the
+		// addFrom set by adding one CPU at a time.
+		addFrom := cpuset.New()
+		for n := 0; n < delta; n++ {
+			addSingleFrom, _, err := ta.resizeCpus(currentCpus, freeCpus, 1)
+			if err != nil {
+				return addFromSuperset, removeFromSuperset, err
+			}
+			if addSingleFrom.Size() != 1 {
+				return addFromSuperset, removeFromSuperset, fmt.Errorf("internal error: failed to find a single CPU to allocate, "+
+					"currentCpus=%s freeCpus=%s expectedSingle=%s",
+					currentCpus, freeCpus, addSingleFrom)
+			}
+			addFrom = addFrom.Union(addSingleFrom)
+			if addFrom.Size() != n+1 {
+				return addFromSuperset, removeFromSuperset, fmt.Errorf("internal error: added the same CPU (%s) twice into cpuset %s on round %d",
+					addSingleFrom, addFrom, n+1)
+			}
+			currentCpus = currentCpus.Union(addSingleFrom)
+			freeCpus = freeCpus.Difference(addSingleFrom)
+		}
+		return addFrom, removeFromSuperset, nil
 	}
 	// In multi-CPU removal, remove CPUs one by one instead of
 	// trying to find a single topology element from which all of
diff --git a/cmd/plugins/balloons/policy/cputree_test.go b/cmd/plugins/balloons/policy/cputree_test.go
index f2a2be7c7..b1021a2f2 100644
--- a/cmd/plugins/balloons/policy/cputree_test.go
+++ b/cmd/plugins/balloons/policy/cputree_test.go
@@ -119,7 +119,7 @@ func verifyNotOn(t *testing.T, nameContents string, cpus cpuset.CPUSet, csit cpu
 	}
 }

-func verifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology) {
+func doVerifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology, inversed bool) {
 	seenName := ""
 	seenCpuID := -1
 	for _, cpuID := range cpus.List() {
@@ -132,9 +132,16 @@ func verifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInT
 		if seenName == "" {
 			seenName = thisName
 			seenCpuID = cit.cpuID
+			continue
 		}
-		if seenName != thisName {
-			t.Errorf("expected the same %s, got: cpu%d in %s, cpu%d in %s",
+		if (seenName != thisName && !inversed) ||
+			(seenName == thisName && inversed) {
+			msg := "the same"
+			if inversed {
+				msg = "not the same"
+			}
+			t.Errorf("expected %s %s, got: cpu%d in %s, cpu%d in %s",
+				msg,
 				topoLevel,
 				seenCpuID, seenName,
 				thisCpuID, thisName)
@@ -142,6 +149,14 @@ func verifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInT
 	}
 }

+func verifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology) {
+	doVerifySame(t, topoLevel, cpus, csit, false)
+}
+
+func verifyNotSame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology) {
+	doVerifySame(t, topoLevel, cpus, csit, true)
+}
+
 func (csit cpusInTopology) getElements(topoLevel string, cpus cpuset.CPUSet) []string {
 	elts := []string{}
 	for _, cpuID := range cpus.List() {
@@ -214,19 +229,21 @@ func TestResizeCpus(t *testing.T) {
 		ccids []int
 	}
 	tcases := []struct {
-		name                string
-		topology            [5]int // package, die, numa, core, thread count
-		allocatorTB         bool   // allocator topologyBalancing
-		allocations         []int
-		deltas              []int
-		allocate            bool
-		operateOnCcid       []int // which ccid (currentCpus id) will be used on call
-		expectCurrentOnSame []string
-		expectAllOnSame     []string
-		expectCurrentNotOn  []string
-		expectAddSizes      []int
-		expectDisjoint      []TopoCcids // which ccids should be disjoint
-		expectErrors        []string
+		name                   string
+		topology               [5]int // package, die, numa, core, thread count
+		allocatorTB            bool   // allocator topologyBalancing
+		allocatorPSoPC         bool   // allocator preferSpreadOnPhysicalCores
+		allocations            []int
+		deltas                 []int
+		allocate               bool
+		operateOnCcid          []int // which ccid (currentCpus id) will be used on call
+		expectCurrentOnSame    []string
+		expectCurrentNotOnSame []string
+		expectAllOnSame        []string
+		expectCurrentNotOn     []string
+		expectAddSizes         []int
+		expectDisjoint         []TopoCcids // which ccids should be disjoint
+		expectErrors           []string
 	}{
 		{
 			name: "first allocations",
@@ -411,12 +428,56 @@ func TestResizeCpus(t *testing.T) {
 				"", "", "", "", "", "",
 				"package", "package",
 			},
 		},
+		{
+			name:           "prefer spread on physical cores",
+			topology:       [5]int{4, 1, 4, 8, 2},
+			allocatorTB:    true,
+			allocatorPSoPC: true,
+			deltas: []int{
+				2, 1, 4, 1, // allocate one thread from each core of the same NUMA
+				3, 9, 16, // allocate three other cpusets, each should come from a separate package (due to topology balancing)
+				3, 4, 3, // increase the size of the
+				// original: 3+4 fit into the same
+				// NUMA; in the last 3, the first cpu
+				// fills the NUMA and the remaining 2
+				// go to another NUMA on the same package.
+				-2, 2, // release the two CPUs that went to another NUMA on the same package, then put them back
+				-10, // release 2+8 CPUs, the rest should be single threads, each on its own core
+			},
+			allocate: true,
+			operateOnCcid: []int{
+				1, 1, 1, 1, // allocate one thread from each core of the same NUMA by always inflating the same cpuset
+				2, 3, 4, // three new cpusets
+				1, 1, 1, // increase size over one NUMA
+				1, 1,
+				1,
+			},
+			expectCurrentOnSame: []string{
+				"numa", "numa", "numa", "numa",
+				"numa", "numa", "numa",
+				"numa", "numa", "package",
+				"numa", "package",
+				"numa",
+			},
+			expectCurrentNotOnSame: []string{
+				"core", "core", "core", "core",
+				"core", "", "",
+				"", "", "",
+				"", "",
+				"core",
+			},
+			expectDisjoint: []TopoCcids{
+				{}, {}, {}, {},
+				{"package", []int{1, 2}}, {"package", []int{1, 2, 3}}, {"package", []int{1, 2, 3, 4}},
+			},
+		},
 	}
 	for _, tc := range tcases {
 		t.Run(tc.name, func(t *testing.T) {
 			tree, csit := newCpuTreeFromInt5(tc.topology)
 			treeA := tree.NewAllocator(cpuTreeAllocatorOptions{
-				topologyBalancing: tc.allocatorTB,
+				topologyBalancing:           tc.allocatorTB,
+				preferSpreadOnPhysicalCores: tc.allocatorPSoPC,
 			})
 			currentCpus := cpuset.New()
 			freeCpus := tree.Cpus()
@@ -482,6 +543,9 @@ func TestResizeCpus(t *testing.T) {
 			if i < len(tc.expectCurrentOnSame) && tc.expectCurrentOnSame[i] != "" {
 				verifySame(t, tc.expectCurrentOnSame[i], currentCpus, csit)
 			}
+			if i < len(tc.expectCurrentNotOnSame) && tc.expectCurrentNotOnSame[i] != "" {
+				verifyNotSame(t, tc.expectCurrentNotOnSame[i], currentCpus, csit)
+			}
 			if i < len(tc.expectCurrentNotOn) && tc.expectCurrentNotOn[i] != "" {
 				verifyNotOn(t, tc.expectCurrentNotOn[i], currentCpus, csit)
 			}
@@ -647,3 +711,35 @@ func TestCPUTopologyLevel(t *testing.T) {
 	}
 }
+
+func TestSplitLevel(t *testing.T) {
+	root, _ := newCpuTreeFromInt5([5]int{2, 2, 2, 4, 2})
+	newRoot := root.SplitLevel(CPUTopologyLevelNuma,
+		func(cpu int) int {
+			leaf := root.FindLeafWithCpu(cpu)
+			if leaf == nil {
+				t.Fatalf("cpu %d not in tree:\n%s\n\n", cpu, root.PrettyPrint())
+			}
+			return leaf.SiblingIndex()
+		})
+
+	oldc62 := root.FindLeafWithCpu(62)
+	oldc63 := root.FindLeafWithCpu(63)
+	if oldc62.parent != oldc63.parent {
+		t.Errorf("expected: 62 and 63 are hyperthreads of the same physical core in the original tree, observed parents %s and %s", oldc62.parent, oldc63.parent)
+	}
+	newc62 := newRoot.FindLeafWithCpu(62)
+	newc63 := newRoot.FindLeafWithCpu(63)
+	if newc62.parent == newc63.parent {
+		t.Errorf("expected: 62 and 63 have different parents (physical cores), but they have the same: %s", newc62.parent)
+	}
+	if newc62.parent.parent == newc63.parent.parent {
+		t.Errorf("expected: 62 and 63 have different grandparents (numa subclasses), but they have the same: %s", newc62.parent.parent)
+	}
+	if newc62.parent.parent.parent != newc63.parent.parent.parent {
+		t.Errorf("expected: 62 and 63 have the same great-grandparent (numa), but they differ: %s and %s", newc62.parent.parent.parent, newc63.parent.parent.parent)
+	}
+	if t.Failed() {
+		t.Logf("newRoot:\n%s\n", newRoot.PrettyPrint())
+	}
+}
diff --git a/cmd/plugins/balloons/policy/flags.go b/cmd/plugins/balloons/policy/flags.go
index c3764c63c..9e108ddf8 100644
--- a/cmd/plugins/balloons/policy/flags.go
+++ b/cmd/plugins/balloons/policy/flags.go
@@ -41,8 +41,21 @@ type balloonsOptionsWrapped struct {
 	// allocated and resized so that all topology elements
 	// (packages, dies, numa nodes, cores) have roughly same
 	// amount of allocations. The default is false: balloons are
-	// packed tightly to optimize power efficiency.
+	// packed tightly to optimize power efficiency. The value set
+	// here can be overridden with the balloon-type-specific
+	// setting with the same name.
 	AllocatorTopologyBalancing bool
+	// PreferSpreadOnPhysicalCores prefers allocating logical CPUs
+	// (possibly hyperthreads) for a balloon from separate physical CPU
+	// cores. This prevents workloads in the balloon from interfering with
+	// themselves, as they do not compete for the resources of the same CPU
+	// cores. On the other hand, it allows more interference between
+	// workloads in different balloons. The default is false: balloons
+	// are packed tightly onto a minimum number of physical CPU cores. The
+	// value set here is the default for all balloon types, but it can be
+	// overridden with the balloon-type-specific setting with the same
+	// name.
+	PreferSpreadOnPhysicalCores bool `json:"PreferSpreadOnPhysicalCores,omitempty"`
 	// BalloonDefs contains balloon type definitions.
 	BalloonDefs []*BalloonDef `json:"BalloonTypes,omitempty"`
 }
@@ -69,6 +82,12 @@ type BalloonDef struct {
 	// resizing a balloon. At init, balloons with highest priority
 	// CPUs are allocated first.
 	AllocatorPriority cpuallocator.CPUPriority `json:"AllocatorPriority"`
+	// PreferSpreadOnPhysicalCores is the balloon-type-specific
+	// counterpart of the policy-level parameter with the same name.
+	PreferSpreadOnPhysicalCores *bool `json:"PreferSpreadOnPhysicalCores,omitempty"`
+	// AllocatorTopologyBalancing is the balloon-type-specific
+	// counterpart of the policy-level parameter with the same name.
+	AllocatorTopologyBalancing *bool `json:"AllocatorTopologyBalancing,omitempty"`
 	// CpuClass controls how CPUs of a balloon are (re)configured
 	// whenever a balloon is created, inflated or deflated.
 	CpuClass string `json:"CpuClass"`
diff --git a/docs/resource-policy/policy/balloons.md b/docs/resource-policy/policy/balloons.md
index f6a9c80da..d87f5bd33 100644
--- a/docs/resource-policy/policy/balloons.md
+++ b/docs/resource-policy/policy/balloons.md
@@ -83,6 +83,16 @@ Balloons policy parameters:
   pack new balloons tightly into the same NUMAs/dies/packages. This
   helps keeping large portions of hardware idle and entering into
   deep power saving states.
+- `PreferSpreadOnPhysicalCores` prefers allocating logical CPUs
+  (possibly hyperthreads) for a balloon from separate physical CPU
+  cores. This prevents workloads in the balloon from interfering with
+  themselves, as they do not compete for the resources of the same CPU
+  cores. On the other hand, it allows more interference between
+  workloads in different balloons. The default is `false`: balloons
+  are packed tightly onto a minimum number of physical CPU cores. The
+  value set here is the default for all balloon types, but it can be
+  overridden with the balloon-type-specific setting with the same
+  name.
 - `BalloonTypes` is a list of balloon type definitions. Each type can
   be configured with the following parameters:
   - `Name` of the balloon type. This is used in pod annotations to
@@ -133,6 +143,8 @@ Balloons policy parameters:
     - `numa`: ...in the same numa node(s) as the balloon.
     - `core`: ...allowed to use idle CPU threads in the same cores
       with the balloon.
+  - `PreferSpreadOnPhysicalCores` overrides the policy-level option
+    with the same name in the scope of this balloon type.
   - `AllocatorPriority` (0: High, 1: Normal, 2: Low, 3: None). CPU
     allocator parameter, used when creating new or resizing existing
     balloons. If there are balloon types with pre-created balloons
diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg
new file mode 100644
index 000000000..89f3c42ef
--- /dev/null
+++ b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg
@@ -0,0 +1,31 @@
+policy:
+  Active: balloons
+  ReservedResources:
+    CPU: 1
+  balloons:
+    AllocatorTopologyBalancing: true
+    PreferSpreadOnPhysicalCores: true
+    BalloonTypes:
+      - Name: policydefaults
+        MinCPUs: 2
+        MinBalloons: 2
+      - Name: topo1cores0
+        MinCPUs: 2
+        MinBalloons: 2
+        PreferSpreadOnPhysicalCores: false
+      - Name: topo0cores1
+        AllocatorTopologyBalancing: false
+        PreferSpreadOnPhysicalCores: true
+      - Name: topo0cores0
+        AllocatorTopologyBalancing: false
+        PreferSpreadOnPhysicalCores: false
+      - Name: topo1cores1
+        AllocatorTopologyBalancing: true
+        PreferSpreadOnPhysicalCores: true
+instrumentation:
+  HTTPEndpoint: :8891
+  PrometheusExport: true
+logger:
+  Debug: policy
+  Klog:
+    skip_headers: true
diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh
new file mode 100644
index 000000000..11bfb1198
--- /dev/null
+++ b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh
@@ -0,0 +1,46 @@
+cleanup() {
+    vm-command "kubectl delete pods --all --now --wait"
+    return 0
+}
+
+cleanup
+
+# Launch nri-resource-policy with a configuration that opens the
+# instrumentation http server.
+terminate nri-resource-policy
+nri_resource_policy_cfg=${TEST_DIR}/balloons-allocator-opts.cfg launch nri-resource-policy
+
+# pod0 in a 2-CPU balloon
+CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M"
+POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: policydefaults" CONTCOUNT=1 create balloons-busybox
+report allowed
+verify 'len(cores["pod0c0"]) == 2' \
+       'len(cpus["pod0c0"]) == 2'
+
+
+# pod1 in a 2-CPU balloon
+CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M"
+POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: topo1cores0" CONTCOUNT=1 create balloons-busybox
+report allowed
+verify 'len(cores["pod1c0"]) == 1' \
+       'len(cpus["pod1c0"]) == 2'
+
+# pod2: container 0 resizes the balloon first from 0 to 1 CPUs,
+# container 1 from 1 to 2 CPUs; spread the CPUs on separate cores
+CPUREQ="1" MEMREQ="100M" CPULIM="1" MEMLIM="100M"
+POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: topo1cores1" CONTCOUNT=2 create balloons-busybox
+report allowed
+verify 'len(cores["pod2c0"]) == 2' \
+       'len(cpus["pod2c0"]) == 2' \
+       'cpus["pod2c0"] == cpus["pod2c1"]'
+
+# pod3: container 0 resizes the balloon first from 0 to 1 CPUs,
+# container 1 from 1 to 2 CPUs; pack the CPUs tightly on one core
+CPUREQ="1" MEMREQ="100M" CPULIM="1" MEMLIM="100M"
+POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: topo0cores0" CONTCOUNT=2 create balloons-busybox
+report allowed
+verify 'len(cores["pod3c0"]) == 1' \
+       'len(cpus["pod3c0"]) == 2' \
+       'cpus["pod3c0"] == cpus["pod3c1"]'
+
+cleanup
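
For reference, the override semantics introduced by this patch can be
summarized with a minimal configuration sketch. This is an
illustration only, not part of the patch: the balloon type name
"spread-bln" is hypothetical, while the option names and the pod
annotation key are taken from the configuration and e2e test above.

    policy:
      Active: balloons
      balloons:
        # Policy-level defaults, inherited by all balloon types.
        AllocatorTopologyBalancing: true
        PreferSpreadOnPhysicalCores: false
        BalloonTypes:
          - Name: spread-bln
            # Balloon-type-specific override: CPUs for balloons of
            # this type are preferably picked from separate physical
            # cores, while other types keep the packed default above.
            PreferSpreadOnPhysicalCores: true

A pod would then select this balloon type with the same annotation
that the e2e test uses, for example:

    metadata:
      annotations:
        balloon.balloons.resource-policy.nri.io: spread-bln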