From 062be7cb4facf9a4ba916219f642832c9b669c93 Mon Sep 17 00:00:00 2001
From: Antti Kervinen
Date: Fri, 6 Oct 2023 16:49:40 +0300
Subject: [PATCH] balloons: add PreferSpreadOnPhysicalCores

- Enable using different CPU allocators in different balloon types.
  Previously the same CPU allocator with the same options was used for
  all CPU allocations in the policy.

- Add a new PreferSpreadOnPhysicalCores option that toggles packing or
  spreading CPU allocations on physical CPU cores. The option is
  balloon type specific, with a policy level default.

- Make the existing AllocatorTopologyBalancing option balloon type
  specific, too.

Signed-off-by: Antti Kervinen
---
 .../balloons/policy/balloons-policy.go        |  43 +++--
 cmd/plugins/balloons/policy/cputree.go        | 182 +++++++++++++++++-
 cmd/plugins/balloons/policy/cputree_test.go   | 130 +++++++++++--
 cmd/plugins/balloons/policy/flags.go          |  21 +-
 docs/resource-policy/policy/balloons.md       |  12 ++
 .../balloons-allocator-opts.cfg               |  31 +++
 .../n4c16/test10-allocator-opts/code.var.sh   |  46 +++++
 7 files changed, 434 insertions(+), 31 deletions(-)
 create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg
 create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh
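The per-type override described above reduces to optional pointer fields: a
balloon type whose option is nil inherits the policy level value. A minimal,
self-contained sketch of that resolution rule (hypothetical names, not code
from this patch):

package main

import "fmt"

type policyOpts struct {
    AllocatorTopologyBalancing  bool
    PreferSpreadOnPhysicalCores bool
}

type typeOpts struct {
    AllocatorTopologyBalancing  *bool // nil: inherit the policy default
    PreferSpreadOnPhysicalCores *bool // nil: inherit the policy default
}

// resolve returns the effective allocator options of one balloon type.
func resolve(def typeOpts, policy policyOpts) (balancing, spread bool) {
    balancing = policy.AllocatorTopologyBalancing
    spread = policy.PreferSpreadOnPhysicalCores
    if def.AllocatorTopologyBalancing != nil {
        balancing = *def.AllocatorTopologyBalancing
    }
    if def.PreferSpreadOnPhysicalCores != nil {
        spread = *def.PreferSpreadOnPhysicalCores
    }
    return balancing, spread
}

func main() {
    f := false
    // Policy defaults to true/true; the type overrides only the spreading.
    fmt.Println(resolve(typeOpts{PreferSpreadOnPhysicalCores: &f},
        policyOpts{true, true})) // true false
}

This is why the new BalloonDef fields below are *bool while the policy level
fields remain plain bool: only a non-nil pointer counts as "set".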
diff --git a/cmd/plugins/balloons/policy/balloons-policy.go b/cmd/plugins/balloons/policy/balloons-policy.go
index 23122885c..019cdb293 100644
--- a/cmd/plugins/balloons/policy/balloons-policy.go
+++ b/cmd/plugins/balloons/policy/balloons-policy.go
@@ -90,7 +90,8 @@ type Balloon struct {
     // - len(PodIDs) is the number of pods in the balloon.
     // - len(PodIDs[podID]) is the number of containers of podID
     //   currently assigned to the balloon.
-    PodIDs map[string][]string
+    PodIDs           map[string][]string
+    cpuTreeAllocator *cpuTreeAllocator
 }
 
 var log logger.Logger = logger.NewLogger("policy")
@@ -540,6 +541,24 @@ func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, erro
             break
         }
     }
+    // Configure new cpuTreeAllocator for this balloon if there
+    // are type specific allocator options, otherwise use the
+    // policy default allocator.
+    cpuTreeAllocator := p.cpuTreeAllocator
+    if blnDef.AllocatorTopologyBalancing != nil || blnDef.PreferSpreadOnPhysicalCores != nil {
+        allocatorOptions := cpuTreeAllocatorOptions{
+            topologyBalancing:           p.bpoptions.AllocatorTopologyBalancing,
+            preferSpreadOnPhysicalCores: p.bpoptions.PreferSpreadOnPhysicalCores,
+        }
+        if blnDef.AllocatorTopologyBalancing != nil {
+            allocatorOptions.topologyBalancing = *blnDef.AllocatorTopologyBalancing
+        }
+        if blnDef.PreferSpreadOnPhysicalCores != nil {
+            allocatorOptions.preferSpreadOnPhysicalCores = *blnDef.PreferSpreadOnPhysicalCores
+        }
+        cpuTreeAllocator = p.cpuTree.NewAllocator(allocatorOptions)
+    }
+
     // Allocate CPUs
     if blnDef == p.reservedBalloonDef ||
         (blnDef == p.defaultBalloonDef && blnDef.MinCpus == 0 && blnDef.MaxCpus == 0) {
         // The reserved balloon uses reserved CPUs.
         // So does the default balloon unless its CPU counts are tweaked.
         cpus = p.reserved
     } else {
-        addFromCpus, _, err := p.cpuTreeAllocator.ResizeCpus(cpuset.New(), p.freeCpus, blnDef.MinCpus)
+        addFromCpus, _, err := cpuTreeAllocator.ResizeCpus(cpuset.New(), p.freeCpus, blnDef.MinCpus)
         if err != nil {
             return nil, balloonsError("failed to choose a cpuset for allocating first %d CPUs from %#s", blnDef.MinCpus, p.freeCpus)
         }
@@ -558,12 +577,13 @@ func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, erro
         p.freeCpus = p.freeCpus.Difference(cpus)
     }
     bln := &Balloon{
-        Def:            blnDef,
-        Instance:       freeInstance,
-        PodIDs:         make(map[string][]string),
-        Cpus:           cpus,
-        SharedIdleCpus: cpuset.New(),
-        Mems:           p.closestMems(cpus),
+        Def:              blnDef,
+        Instance:         freeInstance,
+        PodIDs:           make(map[string][]string),
+        Cpus:             cpus,
+        SharedIdleCpus:   cpuset.New(),
+        Mems:             p.closestMems(cpus),
+        cpuTreeAllocator: cpuTreeAllocator,
     }
     if confCpus {
         if err = p.useCpuClass(bln); err != nil {
@@ -1042,7 +1062,8 @@ func (p *balloons) setConfig(bpoptions *BalloonsOptions) error {
     p.freeCpus = p.allowed.Clone()
     p.freeCpus = p.freeCpus.Difference(p.reserved)
     p.cpuTreeAllocator = p.cpuTree.NewAllocator(cpuTreeAllocatorOptions{
-        topologyBalancing: bpoptions.AllocatorTopologyBalancing,
+        topologyBalancing:           bpoptions.AllocatorTopologyBalancing,
+        preferSpreadOnPhysicalCores: bpoptions.PreferSpreadOnPhysicalCores,
     })
     // Instantiate built-in reserved and default balloons.
     reservedBalloon, err := p.newBalloon(p.reservedBalloonDef, false)
@@ -1149,7 +1170,7 @@ func (p *balloons) resizeBalloon(bln *Balloon, newMilliCpus int) error {
     defer p.useCpuClass(bln)
     if cpuCountDelta > 0 {
         // Inflate the balloon.
-        addFromCpus, _, err := p.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta)
+        addFromCpus, _, err := bln.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta)
         if err != nil {
             return balloonsError("resize/inflate: failed to choose a cpuset for allocating additional %d CPUs: %w", cpuCountDelta, err)
         }
@@ -1163,7 +1184,7 @@ func (p *balloons) resizeBalloon(bln *Balloon, newMilliCpus int) error {
         p.updatePinning(p.shareIdleCpus(p.freeCpus, newCpus)...)
     } else {
         // Deflate the balloon.
-        _, removeFromCpus, err := p.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta)
+        _, removeFromCpus, err := bln.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta)
         if err != nil {
             return balloonsError("resize/deflate: failed to choose a cpuset for releasing %d CPUs: %w", -cpuCountDelta, err)
         }
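A note on the calling convention visible in the hunks above: ResizeCpus does
not modify anything itself. It returns candidate cpusets (CPUs that may be
added to, or removed from, the balloon), and the caller picks the final CPUs
and updates the balloon and the policy's free set. A toy sketch of that
contract, with a trivial stand-in for the allocator's real selection logic:

package main

import (
    "fmt"

    "k8s.io/utils/cpuset"
)

// resizeCpus is a stand-in for cpuTreeAllocator.ResizeCpus: for a
// positive delta it returns candidate CPUs to add (here: all free
// CPUs), for a negative delta candidate CPUs to remove.
func resizeCpus(current, free cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
    if delta > 0 {
        return free, cpuset.New(), nil
    }
    return cpuset.New(), current, nil
}

func main() {
    free := cpuset.New(0, 1, 2, 3)
    balloon := cpuset.New()
    // Inflate by two CPUs: choose two of the candidates, then update sets.
    addFrom, _, _ := resizeCpus(balloon, free, 2)
    picked := cpuset.New(addFrom.List()[:2]...)
    balloon = balloon.Union(picked)
    free = free.Difference(picked)
    fmt.Println("balloon:", balloon, "free:", free) // balloon: 0-1 free: 2-3
}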
diff --git a/cmd/plugins/balloons/policy/cputree.go b/cmd/plugins/balloons/policy/cputree.go
index d2d0662ce..eb43987fe 100644
--- a/cmd/plugins/balloons/policy/cputree.go
+++ b/cmd/plugins/balloons/policy/cputree.go
@@ -79,7 +79,8 @@ type cpuTreeAllocatorOptions struct {
     // topologyBalancing true prefers allocating from branches
     // with most free CPUs (spread allocations), while false is
     // the opposite (packed allocations).
-    topologyBalancing bool
+    topologyBalancing           bool
+    preferSpreadOnPhysicalCores bool
 }
 
 // Strings returns topology level as a string
@@ -131,6 +132,19 @@ func (t *cpuTreeNode) String() string {
     return fmt.Sprintf("%s%v", t.name, t.children)
 }
 
+func (t *cpuTreeNode) PrettyPrint() string {
+    origDepth := t.Depth()
+    lines := []string{}
+    t.DepthFirstWalk(func(tn *cpuTreeNode) error {
+        lines = append(lines,
+            fmt.Sprintf("%s%s: %q cpus: %s",
+                strings.Repeat(" ", (tn.Depth()-origDepth)*4),
+                tn.level, tn.name, tn.cpus))
+        return nil
+    })
+    return strings.Join(lines, "\n")
+}
+
 // String returns cpuTreeNodeAttributes as a string.
 func (tna cpuTreeNodeAttributes) String() string {
     return fmt.Sprintf("%s{%d,%v,%d,%d}", tna.t.name, tna.depth,
@@ -146,6 +160,34 @@ func NewCpuTree(name string) *cpuTreeNode {
     }
 }
 
+func (t *cpuTreeNode) CopyTree() *cpuTreeNode {
+    newNode := t.CopyNode()
+    newNode.children = make([]*cpuTreeNode, 0, len(t.children))
+    for _, child := range t.children {
+        newNode.AddChild(child.CopyTree())
+    }
+    return newNode
+}
+
+func (t *cpuTreeNode) CopyNode() *cpuTreeNode {
+    newNode := cpuTreeNode{
+        name:     t.name,
+        level:    t.level,
+        parent:   t.parent,
+        children: t.children,
+        cpus:     t.cpus,
+    }
+    return &newNode
+}
+
+// Depth returns the distance from the root node.
+func (t *cpuTreeNode) Depth() int {
+    if t.parent == nil {
+        return 0
+    }
+    return t.parent.Depth() + 1
+}
+
 // AddChild adds new child node to a CPU tree node.
 func (t *cpuTreeNode) AddChild(child *cpuTreeNode) {
     child.parent = t
@@ -165,6 +207,38 @@ func (t *cpuTreeNode) Cpus() cpuset.CPUSet {
     return t.cpus
 }
 
+// SiblingIndex returns the index of this node among its parent's
+// children. Returns -1 for the root node, -2 if this node is not
+// listed among the children of its parent.
+func (t *cpuTreeNode) SiblingIndex() int {
+    if t.parent == nil {
+        return -1
+    }
+    for idx, child := range t.parent.children {
+        if child == t {
+            return idx
+        }
+    }
+    return -2
+}
+
+func (t *cpuTreeNode) FindLeafWithCpu(cpu int) *cpuTreeNode {
+    var found *cpuTreeNode
+    t.DepthFirstWalk(func(tn *cpuTreeNode) error {
+        if len(tn.children) > 0 {
+            return nil
+        }
+        for _, cpuHere := range tn.cpus.List() {
+            if cpu == cpuHere {
+                found = tn
+                return WalkStop
+            }
+        }
+        return nil // not found here, no more children to search
+    })
+    return found
+}
+
 // WalkSkipChildren error returned from a DepthFirstWalk handler
 // prevents walking deeper in the tree. The caller of the
 // DepthFirstWalk will get no error.
@@ -236,13 +310,18 @@ func NewCpuTreeFromSystem() (*cpuTreeNode, error) {
             nodeTree.level = CPUTopologyLevelNuma
             dieTree.AddChild(nodeTree)
             node := sys.Node(nodeID)
+            threadsSeen := map[int]struct{}{}
             for _, cpuID := range node.CPUSet().List() {
+                if _, alreadySeen := threadsSeen[cpuID]; alreadySeen {
+                    continue
+                }
                 cpuTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dcpu%d", packageID, dieID, nodeID, cpuID))
                 cpuTree.level = CPUTopologyLevelCore
                 nodeTree.AddChild(cpuTree)
                 cpu := sys.CPU(cpuID)
                 for _, threadID := range cpu.ThreadCPUSet().List() {
+                    threadsSeen[threadID] = struct{}{}
                     threadTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dcpu%dt%d", packageID, dieID, nodeID, cpuID, threadID))
                     threadTree.level = CPUTopologyLevelThread
                     cpuTree.AddChild(threadTree)
@@ -312,6 +391,61 @@ func (t *cpuTreeNode) toAttributedSlice(
     }
 }
 
+// SplitLevel returns the root node of a new CPU tree where all
+// branches of a topology level have been split into new classes.
+func (t *cpuTreeNode) SplitLevel(splitLevel CPUTopologyLevel, cpuClassifier func(int) int) *cpuTreeNode {
+    newRoot := t.CopyTree()
+    newRoot.DepthFirstWalk(func(tn *cpuTreeNode) error {
+        // Dive into the level that will be split.
+        if tn.level != splitLevel {
+            return nil
+        }
+        // Classify CPUs to the map: class -> list of cpus.
+        classCpus := map[int][]int{}
+        for _, cpu := range t.cpus.List() {
+            class := cpuClassifier(cpu)
+            classCpus[class] = append(classCpus[class], cpu)
+        }
+        // Clear existing children of this node. New children
+        // will be classes whose children are masked versions
+        // of the original children of this node.
+        origChildren := tn.children
+        tn.children = make([]*cpuTreeNode, 0, len(classCpus))
+        // Add a new child corresponding to each class.
+        for class, cpus := range classCpus {
+            cpuMask := cpuset.New(cpus...)
+            newNode := NewCpuTree(fmt.Sprintf("%sclass%d", tn.name, class))
+            tn.AddChild(newNode)
+            newNode.cpus = tn.cpus.Intersection(cpuMask)
+            newNode.level = tn.level
+            newNode.parent = tn
+            for _, child := range origChildren {
+                newChild := child.CopyTree()
+                newChild.DepthFirstWalk(func(cn *cpuTreeNode) error {
+                    cn.cpus = cn.cpus.Intersection(cpuMask)
+                    if cn.cpus.Size() == 0 && cn.parent != nil {
+                        // All cpus masked out:
+                        // cut out this branch.
+                        newSiblings := []*cpuTreeNode{}
+                        for _, child := range cn.parent.children {
+                            if child != cn {
+                                newSiblings = append(newSiblings, child)
+                            }
+                        }
+                        cn.parent.children = newSiblings
+                        return WalkSkipChildren
+                    }
+                    return nil
+                })
+                newNode.AddChild(newChild)
+            }
+        }
+        return WalkSkipChildren
+    })
+    return newRoot
+}
+
 // NewAllocator returns new CPU allocator for allocating CPUs from a
 // CPU tree branch.
@@ -319,6 +453,21 @@ func (t *cpuTreeNode) NewAllocator(options cpuTreeAllocatorOptions) *cpuTreeAllo
     ta := &cpuTreeAllocator{
         root:    t,
         options: options,
     }
+    if options.preferSpreadOnPhysicalCores {
+        newTree := t.SplitLevel(CPUTopologyLevelNuma,
+            // CPU classifier: the class of a CPU is its
+            // index in the child list of its parent node
+            // in the tree. The leaf node is expected to be
+            // a hyperthread, its parent a physical core.
+            func(cpu int) int {
+                leaf := t.FindLeafWithCpu(cpu)
+                if leaf == nil {
+                    log.Fatalf("SplitLevel CPU classifier: cpu %d not in tree:\n%s\n\n", cpu, t.PrettyPrint())
+                }
+                return leaf.SiblingIndex()
+            })
+        ta.root = newTree
+    }
     return ta
 }
@@ -409,7 +558,36 @@ func (ta *cpuTreeAllocator) sorterRelease(tnas []cpuTreeNodeAttributes) func(int
 // abs(delta) CPUs can be freed.
 func (ta *cpuTreeAllocator) ResizeCpus(currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
     if delta > 0 {
-        return ta.resizeCpus(currentCpus, freeCpus, delta)
+        addFromSuperset, removeFromSuperset, err := ta.resizeCpus(currentCpus, freeCpus, delta)
+        if !ta.options.preferSpreadOnPhysicalCores || addFromSuperset.Size() == delta {
+            return addFromSuperset, removeFromSuperset, err
+        }
+        // addFromSuperset contains more CPUs (equally good
+        // choices) than actually needed. With
+        // preferSpreadOnPhysicalCores, however, not every
+        // choice yields an equally good result. Therefore,
+        // in this case, construct the addFrom set by adding
+        // one CPU at a time.
+        addFrom := cpuset.New()
+        for n := 0; n < delta; n++ {
+            addSingleFrom, _, err := ta.resizeCpus(currentCpus, freeCpus, 1)
+            if err != nil {
+                return addFromSuperset, removeFromSuperset, err
+            }
+            if addSingleFrom.Size() != 1 {
+                return addFromSuperset, removeFromSuperset, fmt.Errorf("internal error: failed to find a single CPU to allocate, "+
+                    "currentCpus=%s freeCpus=%s expectedSingle=%s",
+                    currentCpus, freeCpus, addSingleFrom)
+            }
+            addFrom = addFrom.Union(addSingleFrom)
+            if addFrom.Size() != n+1 {
+                return addFromSuperset, removeFromSuperset, fmt.Errorf("internal error: added the same CPU (%s) to cpuset %s twice on round %d",
+                    addSingleFrom, addFrom, n+1)
+            }
+            currentCpus = currentCpus.Union(addSingleFrom)
+            freeCpus = freeCpus.Difference(addSingleFrom)
+        }
+        return addFrom, removeFromSuperset, nil
     }
     // In multi-CPU removal, remove CPUs one by one instead of
     // trying to find a single topology element from which all of
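Why the one-CPU-at-a-time loop above matters: when each pick may prefer a CPU
on a still-unused physical core, delta picks land on delta different cores,
which a single delta-sized query over "equally good" candidates cannot
guarantee. A self-contained toy illustration of this greedy selection (toy
topology and preference rule, not the allocator's actual scoring):

package main

import (
    "fmt"
    "slices"
)

// coreOf maps a logical CPU to its physical core in a toy topology
// where cpus 2k and 2k+1 are hyperthreads of core k.
func coreOf(cpu int) int { return cpu / 2 }

// pickSpread chooses n CPUs one at a time, preferring a CPU whose
// physical core has not been picked from yet.
func pickSpread(free []int, n int) []int {
    picked := []int{}
    usedCores := map[int]bool{}
    for len(picked) < n && len(free) > 0 {
        best := free[0]
        for _, cpu := range free[1:] {
            // An idle physical core beats a half-used one.
            if !usedCores[coreOf(cpu)] && usedCores[coreOf(best)] {
                best = cpu
            }
        }
        picked = append(picked, best)
        usedCores[coreOf(best)] = true
        free = slices.DeleteFunc(slices.Clone(free), func(c int) bool { return c == best })
    }
    return picked
}

func main() {
    // Three picks from three 2-thread cores: one thread per core.
    fmt.Println(pickSpread([]int{0, 1, 2, 3, 4, 5}, 3)) // [0 2 4]
}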
diff --git a/cmd/plugins/balloons/policy/cputree_test.go b/cmd/plugins/balloons/policy/cputree_test.go
index f2a2be7c7..b1021a2f2 100644
--- a/cmd/plugins/balloons/policy/cputree_test.go
+++ b/cmd/plugins/balloons/policy/cputree_test.go
@@ -119,7 +119,7 @@ func verifyNotOn(t *testing.T, nameContents string, cpus cpuset.CPUSet, csit cpu
     }
 }
 
-func verifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology) {
+func doVerifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology, inversed bool) {
     seenName := ""
     seenCpuID := -1
     for _, cpuID := range cpus.List() {
@@ -132,9 +132,16 @@ func verifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInT
         if seenName == "" {
             seenName = thisName
             seenCpuID = cit.cpuID
+            continue
         }
-        if seenName != thisName {
-            t.Errorf("expected the same %s, got: cpu%d in %s, cpu%d in %s",
+        if (seenName != thisName && !inversed) ||
+            (seenName == thisName && inversed) {
+            msg := "the same"
+            if inversed {
+                msg = "not the same"
+            }
+            t.Errorf("expected %s %s, got: cpu%d in %s, cpu%d in %s",
+                msg,
                 topoLevel,
                 seenCpuID, seenName,
                 thisCpuID, thisName)
@@ -142,6 +149,14 @@ func verifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInT
     }
 }
 
+func verifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology) {
+    doVerifySame(t, topoLevel, cpus, csit, false)
+}
+
+func verifyNotSame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology) {
+    doVerifySame(t, topoLevel, cpus, csit, true)
+}
+
 func (csit cpusInTopology) getElements(topoLevel string, cpus cpuset.CPUSet) []string {
     elts := []string{}
     for _, cpuID := range cpus.List() {
@@ -214,19 +229,21 @@ func TestResizeCpus(t *testing.T) {
         ccids []int
     }
     tcases := []struct {
-        name                string
-        topology            [5]int // package, die, numa, core, thread count
-        allocatorTB         bool   // allocator topologyBalancing
-        allocations         []int
-        deltas              []int
-        allocate            bool
-        operateOnCcid       []int // which ccid (currentCpus id) will be used on call
-        expectCurrentOnSame []string
-        expectAllOnSame     []string
-        expectCurrentNotOn  []string
-        expectAddSizes      []int
-        expectDisjoint      []TopoCcids // which ccids should be disjoint
-        expectErrors        []string
+        name                   string
+        topology               [5]int // package, die, numa, core, thread count
+        allocatorTB            bool   // allocator topologyBalancing
+        allocatorPSoPC         bool   // allocator preferSpreadOnPhysicalCores
+        allocations            []int
+        deltas                 []int
+        allocate               bool
+        operateOnCcid          []int // which ccid (currentCpus id) will be used on call
+        expectCurrentOnSame    []string
+        expectCurrentNotOnSame []string
+        expectAllOnSame        []string
+        expectCurrentNotOn     []string
+        expectAddSizes         []int
+        expectDisjoint         []TopoCcids // which ccids should be disjoint
+        expectErrors           []string
     }{
         {
             name: "first allocations",
@@ -411,12 +428,56 @@ func TestResizeCpus(t *testing.T) {
                 "", "", "", "",
                 "", "", "package", "package",
             },
         },
+        {
+            name:           "prefer spread on physical cores",
+            topology:       [5]int{4, 1, 4, 8, 2},
+            allocatorTB:    true,
+            allocatorPSoPC: true,
+            deltas: []int{
+                2, 1, 4, 1, // allocate one thread from each core from the same NUMA
+                3, 9, 16, // allocate three other cpusets, each should be from a separate package (due to topology balancing)
+                3, 4, 3, // increase the size of the
+                // original, 3+4 fit into the same
+                // NUMA, in the last 3: the first cpu
+                // should fill the NUMA and the rest 2
+                // go to another NUMA on the same package.
+                -2, 2, // release two CPUs that went to another NUMA on the same package, and put them back
+                -10, // release 2+8 CPUs, the rest should be single threads each on their own core
+            },
+            allocate: true,
+            operateOnCcid: []int{
+                1, 1, 1, 1, // allocate one thread from each core from the same NUMA by inflating all the time the same cpuset
+                2, 3, 4, // three new cpusets
+                1, 1, 1, // increase size over one NUMA
+                1, 1,
+                1,
+            },
+            expectCurrentOnSame: []string{
+                "numa", "numa", "numa", "numa",
+                "numa", "numa", "numa",
+                "numa", "numa", "package",
+                "numa", "package",
+                "numa",
+            },
+            expectCurrentNotOnSame: []string{
+                "core", "core", "core", "core",
+                "core", "", "",
+                "", "", "",
+                "", "",
+                "core",
+            },
+            expectDisjoint: []TopoCcids{
+                {}, {}, {}, {},
+                {"package", []int{1, 2}}, {"package", []int{1, 2, 3}}, {"package", []int{1, 2, 3, 4}},
+            },
+        },
     }
     for _, tc := range tcases {
         t.Run(tc.name, func(t *testing.T) {
             tree, csit := newCpuTreeFromInt5(tc.topology)
             treeA := tree.NewAllocator(cpuTreeAllocatorOptions{
-                topologyBalancing: tc.allocatorTB,
+                topologyBalancing:           tc.allocatorTB,
+                preferSpreadOnPhysicalCores: tc.allocatorPSoPC,
             })
             currentCpus := cpuset.New()
             freeCpus := tree.Cpus()
@@ -482,6 +543,9 @@ func TestResizeCpus(t *testing.T) {
                 if i < len(tc.expectCurrentOnSame) && tc.expectCurrentOnSame[i] != "" {
                     verifySame(t, tc.expectCurrentOnSame[i], currentCpus, csit)
                 }
+                if i < len(tc.expectCurrentNotOnSame) && tc.expectCurrentNotOnSame[i] != "" {
+                    verifyNotSame(t, tc.expectCurrentNotOnSame[i], currentCpus, csit)
+                }
                 if i < len(tc.expectCurrentNotOn) && tc.expectCurrentNotOn[i] != "" {
                     verifyNotOn(t, tc.expectCurrentNotOn[i], currentCpus, csit)
                 }
@@ -647,3 +711,35 @@ func TestCPUTopologyLevel(t *testing.T) {
         }
     }
 }
+
+func TestSplitLevel(t *testing.T) {
+    root, _ := newCpuTreeFromInt5([5]int{2, 2, 2, 4, 2})
+    newRoot := root.SplitLevel(CPUTopologyLevelNuma,
+        func(cpu int) int {
+            leaf := root.FindLeafWithCpu(cpu)
+            if leaf == nil {
+                t.Fatalf("cpu %d not in tree:\n%s\n\n", cpu, root.PrettyPrint())
+            }
+            return leaf.SiblingIndex()
+        })
+
+    oldc62 := root.FindLeafWithCpu(62)
+    oldc63 := root.FindLeafWithCpu(63)
+    if oldc62.parent != oldc63.parent {
+        t.Errorf("expected: 62 and 63 are hyperthreads of the same physical core in the original tree, observed parents %s and %s", oldc62.parent, oldc63.parent)
+    }
+    newc62 := newRoot.FindLeafWithCpu(62)
+    newc63 := newRoot.FindLeafWithCpu(63)
+    if newc62.parent == newc63.parent {
+        t.Errorf("expected: 62 and 63 have different parents (physical cores), but they have the same %s", newc62.parent)
+    }
+    if newc62.parent.parent == newc63.parent.parent {
+        t.Errorf("expected: 62 and 63 have different grandparents (numa subclasses), but they have the same: %s", newc62.parent.parent)
+    }
+    if newc62.parent.parent.parent != newc63.parent.parent.parent {
+        t.Errorf("expected: 62 and 63 have the same great-grandparent (numa), but they differ: %s and %s", newc62.parent.parent.parent, newc63.parent.parent.parent)
+    }
+    if t.Failed() {
+        t.Logf("newRoot:\n%s\n", newRoot.PrettyPrint())
+    }
+}
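To complement TestSplitLevel above, a hypothetical extra test (not part of
this patch) spelling out the classifier contract on the smallest interesting
topology; it reuses newCpuTreeFromInt5, SplitLevel, FindLeafWithCpu and
SiblingIndex from this change and would live in the same package:

func TestSplitLevelSiblingClasses(t *testing.T) {
    // 1 package, 1 die, 1 numa node, 2 cores, 2 threads per core:
    // core 0 has cpus 0-1, core 1 has cpus 2-3.
    root, _ := newCpuTreeFromInt5([5]int{1, 1, 1, 2, 2})
    split := root.SplitLevel(CPUTopologyLevelNuma,
        func(cpu int) int {
            // Class = thread index within the physical core.
            return root.FindLeafWithCpu(cpu).SiblingIndex()
        })
    // First threads of both cores share a numa subclass...
    if split.FindLeafWithCpu(0).parent.parent != split.FindLeafWithCpu(2).parent.parent {
        t.Errorf("expected cpus 0 and 2 in the same numa subclass:\n%s", split.PrettyPrint())
    }
    // ...while hyperthreads of one core land in different subclasses.
    if split.FindLeafWithCpu(0).parent.parent == split.FindLeafWithCpu(1).parent.parent {
        t.Errorf("expected cpus 0 and 1 in different numa subclasses:\n%s", split.PrettyPrint())
    }
}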
diff --git a/cmd/plugins/balloons/policy/flags.go b/cmd/plugins/balloons/policy/flags.go
index c3764c63c..9e108ddf8 100644
--- a/cmd/plugins/balloons/policy/flags.go
+++ b/cmd/plugins/balloons/policy/flags.go
@@ -41,8 +41,21 @@ type balloonsOptionsWrapped struct {
     // allocated and resized so that all topology elements
     // (packages, dies, numa nodes, cores) have roughly same
     // amount of allocations. The default is false: balloons are
-    // packed tightly to optimize power efficiency.
+    // packed tightly to optimize power efficiency. The value set
+    // here can be overridden with the balloon type specific
+    // setting with the same name.
     AllocatorTopologyBalancing bool
+    // PreferSpreadOnPhysicalCores prefers allocating logical CPUs
+    // (possibly hyperthreads) for a balloon from separate physical CPU
+    // cores. This prevents workloads in the balloon from interfering with
+    // themselves, as they do not compete for the resources of the same CPU
+    // cores. On the other hand, it allows more interference between
+    // workloads in different balloons. The default is false: balloons
+    // are packed tightly onto a minimum number of physical CPU cores. The
+    // value set here is the default for all balloon types, but it can be
+    // overridden with the balloon type specific setting with the same
+    // name.
+    PreferSpreadOnPhysicalCores bool `json:"PreferSpreadOnPhysicalCores,omitempty"`
     // BallonDefs contains balloon type definitions.
     BalloonDefs []*BalloonDef `json:"BalloonTypes,omitempty"`
 }
@@ -69,6 +82,12 @@ type BalloonDef struct {
     // resizing a balloon. At init, balloons with highest priority
     // CPUs are allocated first.
     AllocatorPriority cpuallocator.CPUPriority `json:"AllocatorPriority"`
+    // PreferSpreadOnPhysicalCores is the balloon type specific
+    // variant of the policy level parameter with the same name.
+    PreferSpreadOnPhysicalCores *bool `json:"PreferSpreadOnPhysicalCores,omitempty"`
+    // AllocatorTopologyBalancing is the balloon type specific
+    // variant of the policy level parameter with the same name.
+    AllocatorTopologyBalancing *bool `json:"AllocatorTopologyBalancing,omitempty"`
     // CpuClass controls how CPUs of a balloon are (re)configured
     // whenever a balloon is created, inflated or deflated.
     CpuClass string `json:"CpuClass"`
diff --git a/docs/resource-policy/policy/balloons.md b/docs/resource-policy/policy/balloons.md
index f6a9c80da..d87f5bd33 100644
--- a/docs/resource-policy/policy/balloons.md
+++ b/docs/resource-policy/policy/balloons.md
@@ -83,6 +83,16 @@ Balloons policy parameters:
   pack new balloons tightly into the same NUMAs/dies/packages. This
   helps keeping large portions of hardware idle and entering into deep
   power saving states.
+- `PreferSpreadOnPhysicalCores` prefers allocating logical CPUs
+  (possibly hyperthreads) for a balloon from separate physical CPU
+  cores. This prevents workloads in the balloon from interfering with
+  themselves, as they do not compete for the resources of the same CPU
+  cores. On the other hand, it allows more interference between
+  workloads in different balloons. The default is `false`: balloons
+  are packed tightly onto a minimum number of physical CPU cores. The
+  value set here is the default for all balloon types, but it can be
+  overridden with the balloon type specific setting with the same
+  name.
 - `BalloonTypes` is a list of balloon type definitions. Each type can
   be configured with the following parameters:
   - `Name` of the balloon type. This is used in pod annotations to
@@ -133,6 +143,8 @@ Balloons policy parameters:
     - `numa`: ...in the same numa node(s) as the balloon.
     - `core`: ...allowed to use idle CPU threads in the same cores
       with the balloon.
+  - `PreferSpreadOnPhysicalCores` overrides the policy level option
+    with the same name in the scope of this balloon type.
   - `AllocatorPriority` (0: High, 1: Normal, 2: Low, 3: None). CPU
     allocator parameter, used when creating new or resizing existing
     balloons. If there are balloon types with pre-created balloons
diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg
new file mode 100644
index 000000000..89f3c42ef
--- /dev/null
+++ b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg
@@ -0,0 +1,31 @@
+policy:
+  Active: balloons
+  ReservedResources:
+    CPU: 1
+  balloons:
+    AllocatorTopologyBalancing: true
+    PreferSpreadOnPhysicalCores: true
+    BalloonTypes:
+      - Name: policydefaults
+        MinCPUs: 2
+        MinBalloons: 2
+      - Name: topo1cores0
+        MinCPUs: 2
+        MinBalloons: 2
+        PreferSpreadOnPhysicalCores: false
+      - Name: topo0cores1
+        AllocatorTopologyBalancing: false
+        PreferSpreadOnPhysicalCores: true
+      - Name: topo0cores0
+        AllocatorTopologyBalancing: false
+        PreferSpreadOnPhysicalCores: false
+      - Name: topo1cores1
+        AllocatorTopologyBalancing: true
+        PreferSpreadOnPhysicalCores: true
+instrumentation:
+  HTTPEndpoint: :8891
+  PrometheusExport: true
+logger:
+  Debug: policy
+  Klog:
+    skip_headers: true
diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh
new file mode 100644
index 000000000..11bfb1198
--- /dev/null
+++ b/test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh
@@ -0,0 +1,46 @@
+cleanup() {
+    vm-command "kubectl delete pods --all --now --wait"
+    return 0
+}
+
+cleanup
+
+# Launch nri-resource-policy with a configuration that opens the
+# instrumentation http server.
+terminate nri-resource-policy
+nri_resource_policy_cfg=${TEST_DIR}/balloons-allocator-opts.cfg launch nri-resource-policy
+
+# pod0 in a 2-CPU balloon
+CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M"
+POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: policydefaults" CONTCOUNT=1 create balloons-busybox
+report allowed
+verify 'len(cores["pod0c0"]) == 2' \
+       'len(cpus["pod0c0"]) == 2'
+
+
+# pod1 in a 2-CPU balloon
+CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M"
+POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: topo1cores0" CONTCOUNT=1 create balloons-busybox
+report allowed
+verify 'len(cores["pod1c0"]) == 1' \
+       'len(cpus["pod1c0"]) == 2'
+
+# pod2: container 0 resizes the balloon first from 0 to 1 CPUs,
+# container 1 from 1 to 2 CPUs; use more cores
+CPUREQ="1" MEMREQ="100M" CPULIM="1" MEMLIM="100M"
+POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: topo1cores1" CONTCOUNT=2 create balloons-busybox
+report allowed
+verify 'len(cores["pod2c0"]) == 2' \
+       'len(cpus["pod2c0"]) == 2' \
+       'cpus["pod2c0"] == cpus["pod2c1"]'
+
+# pod3: container 0 resizes the balloon first from 0 to 1 CPUs,
+# container 1 from 1 to 2 CPUs; pack tightly
+CPUREQ="1" MEMREQ="100M" CPULIM="1" MEMLIM="100M"
+POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: topo0cores0" CONTCOUNT=2 create balloons-busybox
+report allowed
+verify 'len(cores["pod3c0"]) == 1' \
+       'len(cpus["pod3c0"]) == 2' \
+       'cpus["pod3c0"] == cpus["pod3c1"]'
+
+cleanup