Add possibility to override the max size of groups #6229

Closed · wants to merge 1 commit
2 changes: 2 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -187,6 +187,8 @@ type AutoscalingOptions struct {
 	NodeAutoprovisioningEnabled bool
 	// MaxAutoprovisionedNodeGroupCount is the maximum number of autoprovisioned groups in the cluster.
 	MaxAutoprovisionedNodeGroupCount int
+	// MaxPodsPerGroup is the maximum number of Pods that are grouped in one group.
+	MaxPodsPerGroup int
 	// UnremovableNodeRecheckTimeout is the timeout before we check again a node that couldn't be removed before
 	UnremovableNodeRecheckTimeout time.Duration
 	// Pods with priority below cutoff are expendable. They can be killed without any consideration during scale down and they don't cause scale-up.
10 changes: 4 additions & 6 deletions cluster-autoscaler/core/scaleup/equivalence/groups.go

@@ -36,9 +36,9 @@ type PodGroup struct {
 }
 
 // BuildPodGroups prepares pod groups with equivalent scheduling properties.
-func BuildPodGroups(pods []*apiv1.Pod) []*PodGroup {
+func BuildPodGroups(pods []*apiv1.Pod, maxPodsPerGroup int) []*PodGroup {
 	podEquivalenceGroups := []*PodGroup{}
-	for _, pods := range groupPodsBySchedulingProperties(pods) {
+	for _, pods := range groupPodsBySchedulingProperties(pods, maxPodsPerGroup) {
 		podEquivalenceGroups = append(podEquivalenceGroups, &PodGroup{
 			Pods:             pods,
 			SchedulingErrors: map[string]status.Reasons{},
@@ -54,11 +54,9 @@ type equivalenceGroup struct {
 	representant *apiv1.Pod
 }
 
-const maxEquivalenceGroupsByController = 10
-
 // groupPodsBySchedulingProperties groups pods based on scheduling properties. Group ID is meaningless.
 // TODO(x13n): refactor this to have shared logic with PodSchedulableMap.
-func groupPodsBySchedulingProperties(pods []*apiv1.Pod) map[equivalenceGroupId][]*apiv1.Pod {
+func groupPodsBySchedulingProperties(pods []*apiv1.Pod, maxPodsPerGroup int) map[equivalenceGroupId][]*apiv1.Pod {
 	podEquivalenceGroups := map[equivalenceGroupId][]*apiv1.Pod{}
 	equivalenceGroupsByController := make(map[types.UID][]equivalenceGroup)
 
@@ -76,7 +74,7 @@ func groupPodsBySchedulingProperties(pods []*apiv1.Pod) map[equivalenceGroupId][
 			podEquivalenceGroups[*gid] = append(podEquivalenceGroups[*gid], pod)
 			continue
 		}
-		if len(egs) < maxEquivalenceGroupsByController {
+		if len(egs) < maxPodsPerGroup {
Review comment (Member), on the line above:
Wait, this used to be # of equivalence groups per controller and now you're suddenly using max pods per group here. What problem are you trying to solve?

 			// Avoid too many different pods per owner reference.
 			newGroup := equivalenceGroup{
 				id: nextGroupId,
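To make the reviewer's concern concrete: in the hunk above, `len(egs)` counts the distinct equivalence groups already tracked for a single controller UID, so the renamed parameter still caps groups per controller, not pods per group. Below is a minimal, self-contained sketch of those semantics (simplified types and invented names, not the actual autoscaler code; the over-cap fallback to singleton groups is inferred from the size-limiting test further down):

```go
package main

import "fmt"

type pod struct {
	controllerUID string // owner-reference UID
	properties    string // fingerprint of the pod's scheduling properties
}

// groupPods mimics the capping semantics: the limit applies to the number of
// distinct equivalence groups tracked per controller, not to how many pods a
// single group may hold.
func groupPods(pods []pod, maxGroupsPerController int) map[int][]pod {
	groups := map[int][]pod{}
	groupIDs := map[string]int{}      // controllerUID/properties -> group id
	perController := map[string]int{} // controllerUID -> tracked group count
	nextID := 0
	for _, p := range pods {
		key := p.controllerUID + "/" + p.properties
		if id, ok := groupIDs[key]; ok {
			groups[id] = append(groups[id], p) // an existing group grows without limit
			continue
		}
		if perController[p.controllerUID] < maxGroupsPerController {
			groupIDs[key] = nextID // track the new group for future matches
			perController[p.controllerUID]++
		}
		// Over the cap the group is not tracked, so each later pod of an
		// untracked shape lands in a fresh singleton group instead.
		groups[nextID] = []pod{p}
		nextID++
	}
	return groups
}

func main() {
	pods := []pod{
		{"rc-1", "spec-a"}, {"rc-1", "spec-a"}, // one shared group of two pods
		{"rc-1", "spec-b"},                     // second tracked group
		{"rc-1", "spec-c"}, {"rc-1", "spec-c"}, // over cap 2: two singleton groups
	}
	g := groupPods(pods, 2)
	fmt.Println(len(g), len(g[0])) // 4 2: groups are capped, pods per group are not
}
```

Under this reading, raising the new flag loosens how many distinct pod shapes per controller share cached scheduling results, which is a different knob than the flag's name and description suggest.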
15 changes: 8 additions & 7 deletions cluster-autoscaler/core/scaleup/equivalence/groups_test.go
@@ -92,7 +92,7 @@ func TestGroupSchedulablePodsForNode(t *testing.T) {
 	p5_2.OwnerReferences = GenerateOwnerReferences(rc4.Name, "ReplicationController", "extensions/v1beta1", rc4.UID)
 	unschedulablePods := []*apiv1.Pod{p1, p2_1, p2_2, p3_1, p3_2, p4_1, p4_2, p5_1, p5_2}
 
-	podGroups := groupPodsBySchedulingProperties(unschedulablePods)
+	podGroups := groupPodsBySchedulingProperties(unschedulablePods, 10)
 	assert.Equal(t, 6, len(podGroups))
 
 	wantedGroups := []struct {
@@ -141,6 +141,7 @@ func TestGroupSchedulablePodsForNode(t *testing.T) {
 }
 
 func TestEquivalenceGroupSizeLimiting(t *testing.T) {
+	maxGroupSize := 10
 	rc := apiv1.ReplicationController{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "rc",
@@ -149,18 +150,18 @@ func TestEquivalenceGroupSizeLimiting(t *testing.T) {
UID: "12345678-1234-1234-1234-123456789012",
},
}
pods := make([]*apiv1.Pod, 0, maxEquivalenceGroupsByController+1)
for i := 0; i < maxEquivalenceGroupsByController+1; i += 1 {
pods := make([]*apiv1.Pod, 0, maxGroupSize+1)
for i := 0; i < maxGroupSize+1; i += 1 {
p := BuildTestPod(fmt.Sprintf("p%d", i), 3000, 200000)
p.OwnerReferences = GenerateOwnerReferences(rc.Name, "ReplicationController", "extensions/v1beta1", rc.UID)
label := fmt.Sprintf("l%d", i)
if i > maxEquivalenceGroupsByController {
label = fmt.Sprintf("l%d", maxEquivalenceGroupsByController)
if i > maxGroupSize {
label = fmt.Sprintf("l%d", maxGroupSize)
}
p.Labels = map[string]string{"uniqueLabel": label}
pods = append(pods, p)
}
podGroups := groupPodsBySchedulingProperties(pods)
podGroups := groupPodsBySchedulingProperties(pods, maxGroupSize)
assert.Equal(t, len(pods), len(podGroups))
for i := range podGroups {
assert.Equal(t, 1, len(podGroups[i]))
@@ -181,6 +182,6 @@ func TestEquivalenceGroupIgnoresDaemonSets(t *testing.T) {
 	pods[0].OwnerReferences = GenerateOwnerReferences(ds.Name, "DaemonSet", "apps/v1", ds.UID)
 	pods[1] = BuildTestPod("p2", 3000, 200000)
 	pods[1].OwnerReferences = GenerateOwnerReferences(ds.Name, "DaemonSet", "apps/v1", ds.UID)
-	podGroups := groupPodsBySchedulingProperties(pods)
+	podGroups := groupPodsBySchedulingProperties(pods, 10)
 	assert.Equal(t, 2, len(podGroups))
 }
@@ -105,7 +105,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
 	klogx.V(1).Over(loggingQuota).Infof("%v other pods are also unschedulable", -loggingQuota.Left())
 
 	buildPodEquivalenceGroupsStart := time.Now()
-	podEquivalenceGroups := equivalence.BuildPodGroups(unschedulablePods)
+	podEquivalenceGroups := equivalence.BuildPodGroups(unschedulablePods, o.autoscalingContext.MaxPodsPerGroup)
 	metrics.UpdateDurationFromStart(metrics.BuildPodEquivalenceGroups, buildPodEquivalenceGroupsStart)
 
 	upcomingNodes, aErr := o.UpcomingNodes(nodeInfos)
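With this wiring, the value travels from the new command-line flag (defined in main.go below) into AutoscalingOptions and then, via the autoscaling context, into the equivalence-grouping code above.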
2 changes: 2 additions & 0 deletions cluster-autoscaler/main.go
@@ -192,6 +192,7 @@ var (
 	balanceSimilarNodeGroupsFlag     = flag.Bool("balance-similar-node-groups", false, "Detect similar node groups and balance the number of nodes between them")
 	nodeAutoprovisioningEnabled      = flag.Bool("node-autoprovisioning-enabled", false, "Should CA autoprovision node groups when needed")
 	maxAutoprovisionedNodeGroupCount = flag.Int("max-autoprovisioned-node-group-count", 15, "The maximum number of autoprovisioned groups in the cluster.")
+	maxPodsPerGroup                  = flag.Int("max-pods-per-group", 10, "The maximum number of pods per group in scale-up.")
 
 	unremovableNodeRecheckTimeout = flag.Duration("unremovable-node-recheck-timeout", 5*time.Minute, "The timeout before we check again a node that couldn't be removed before")
 	expendablePodsPriorityCutoff  = flag.Int("expendable-pods-priority-cutoff", -10, "Pods with priority below cutoff will be expendable. They can be killed without any consideration during scale down and they don't cause scale up. Pods with null priority (PodPriority disabled) are non expendable.")
@@ -345,6 +346,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 		ClusterName:                      *clusterName,
 		NodeAutoprovisioningEnabled:      *nodeAutoprovisioningEnabled,
 		MaxAutoprovisionedNodeGroupCount: *maxAutoprovisionedNodeGroupCount,
+		MaxPodsPerGroup:                  *maxPodsPerGroup,
 		UnremovableNodeRecheckTimeout:    *unremovableNodeRecheckTimeout,
 		ExpendablePodsPriorityCutoff:     *expendablePodsPriorityCutoff,
 		Regional:                         *regional,
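For illustration, a minimal standalone sketch of how the new flag is parsed with Go's standard flag package (only the flag name, default, and description mirror the diff; the rest is simplified):

```go
package main

import (
	"flag"
	"fmt"
)

func main() {
	// Mirrors the definition added to cluster-autoscaler/main.go above.
	maxPodsPerGroup := flag.Int("max-pods-per-group", 10,
		"The maximum number of pods per group in scale-up.")
	flag.Parse()

	// createAutoscalingOptions then copies the dereferenced value into
	// AutoscalingOptions.MaxPodsPerGroup, as shown in the diff above.
	fmt.Printf("MaxPodsPerGroup = %d\n", *maxPodsPerGroup)
}
```

An operator would override the former hard-coded limit of 10 by starting the autoscaler with, for example, --max-pods-per-group=20.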