From f6efdc9ccd918bb79c91a8788fe6c6f675e44bd5 Mon Sep 17 00:00:00 2001
From: Mahmoud Atwa
Date: Wed, 18 Oct 2023 09:21:08 +0000
Subject: [PATCH] Update autoscaler options & treat unknown pods as unschedulable
---
 .../config/autoscaling_options.go            |  7 +++++++
 cluster-autoscaler/core/static_autoscaler.go | 20 ++++++--------------
 cluster-autoscaler/main.go                   |  4 ++++
 cluster-autoscaler/metrics/metrics.go        | 10 ++++++----
 .../utils/kubernetes/listers.go              | 17 +++++++++++++++++
 5 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go
index 05e6cc2fb6e8..ce38224c557a 100644
--- a/cluster-autoscaler/config/autoscaling_options.go
+++ b/cluster-autoscaler/config/autoscaling_options.go
@@ -277,4 +277,11 @@ type AutoscalingOptions struct {
 	// dynamicNodeDeleteDelayAfterTaintEnabled is used to enable/disable dynamic adjustment of NodeDeleteDelayAfterTaint
 	// based on the latency between the CA and the api-server
 	DynamicNodeDeleteDelayAfterTaintEnabled bool
+	// UnschedulablePodTimeBuffer specifies how old the oldest unschedulable pod
+	// should be before starting scale up.
+	UnschedulablePodTimeBuffer time.Duration
+	// UnschedulablePodWithGpuTimeBuffer specifies how old the oldest unschedulable pod with GPU should be before starting scale up.
+	// The idea is that nodes with GPU are very expensive and we're ready to sacrifice
+	// a bit more latency to wait for more pods and make a more informed scale-up decision.
+	UnschedulablePodWithGpuTimeBuffer time.Duration
 }
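// Illustrative sketch (not part of the patch): what the two new duration
// options mean for scale-up timing. The oldestPodIsOldEnough helper below is
// hypothetical and only mirrors the age check that allPodsAreNew performs with
// these buffers; the durations shown match the flag defaults added in main.go.
package main

import (
	"fmt"
	"time"
)

// oldestPodIsOldEnough reports whether a pod created at createTime has been
// pending for at least buffer, i.e. whether a scale-up may be considered.
func oldestPodIsOldEnough(createTime, now time.Time, buffer time.Duration) bool {
	return !createTime.Add(buffer).After(now)
}

func main() {
	now := time.Now()
	created := now.Add(-10 * time.Second) // oldest pending pod is 10s old

	fmt.Println(oldestPodIsOldEnough(created, now, 2*time.Second))  // true: past UnschedulablePodTimeBuffer
	fmt.Println(oldestPodIsOldEnough(created, now, 30*time.Second)) // false: still within UnschedulablePodWithGpuTimeBuffer
}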
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index b5de00eb7e80..08ff9a8fc663 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -57,20 +57,12 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
 	caerrors "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 	scheduler_utils "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
-	"k8s.io/autoscaler/cluster-autoscaler/utils/tpu"
 	"k8s.io/utils/integer"
 
 	klog "k8s.io/klog/v2"
 )
 
 const (
-	// How old the oldest unschedulable pod should be before starting scale up.
-	unschedulablePodTimeBuffer = 2 * time.Second
-	// How old the oldest unschedulable pod with GPU should be before starting scale up.
-	// The idea is that nodes with GPU are very expensive and we're ready to sacrifice
-	// a bit more latency to wait for more pods and make a more informed scale-up decision.
-	unschedulablePodWithGpuTimeBuffer = 30 * time.Second
-
 	// NodeUpcomingAnnotation is an annotation CA adds to nodes which are upcoming.
 	NodeUpcomingAnnotation = "cluster-autoscaler.k8s.io/upcoming-node"
 
@@ -309,7 +301,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 		klog.Errorf("Failed to list pods: %v", err)
 		return caerrors.ToAutoscalerError(caerrors.ApiCallError, err)
 	}
-	originalScheduledPods, unschedulablePods := kube_util.ScheduledPods(pods), kube_util.UnschedulablePods(pods)
+	originalScheduledPods, unschedulablePods, unknownPods := kube_util.ScheduledPods(pods), kube_util.UnschedulablePods(pods), kube_util.UnknownPods(pods)
 
 	// Update cluster resource usage metrics
 	coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime)
@@ -451,9 +443,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 
 	metrics.UpdateLastTime(metrics.Autoscaling, time.Now())
 
-	metrics.UpdateUnschedulablePodsCount(len(unschedulablePods))
-
-	unschedulablePods = tpu.ClearTPURequests(unschedulablePods)
+	metrics.UpdateUnschedulablePodsCount(len(unschedulablePods), len(unknownPods))
+	// Treat unknown pods as unschedulable; the pod list processor will filter out any that turn out to be schedulable.
+	unschedulablePods = append(unschedulablePods, unknownPods...)
 
 	// Upcoming nodes are recently created nodes that haven't registered in the cluster yet, or haven't become ready yet.
 	upcomingCounts, registeredUpcoming := a.clusterStateRegistry.GetUpcomingNodes()
@@ -538,7 +530,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 	} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
 		scaleUpStatus.Result = status.ScaleUpNoOptionsAvailable
 		klog.V(1).Info("Max total nodes in cluster reached")
-	} else if allPodsAreNew(unschedulablePodsToHelp, currentTime) {
+	} else if allPodsAreNew(unschedulablePodsToHelp, currentTime, a.AutoscalingOptions.UnschedulablePodTimeBuffer, a.AutoscalingOptions.UnschedulablePodWithGpuTimeBuffer) {
 		// The assumption here is that these pods have been created very recently and probably there
 		// is more pods to come. In theory we could check the newest pod time but then if pod were created
 		// slowly but at the pace of 1 every 2 seconds then no scale up would be triggered for long time.
@@ -963,7 +955,7 @@ func (a *StaticAutoscaler) updateClusterState(allNodes []*apiv1.Node, nodeInfosF
 	return nil
 }
 
-func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool {
+func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time, unschedulablePodTimeBuffer, unschedulablePodWithGpuTimeBuffer time.Duration) bool {
 	if core_utils.GetOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) {
 		return true
 	}
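// Illustrative sketch (not part of the patch): the essence of the RunOnce
// changes above. Unknown pods are appended to the unschedulable list before
// the scale-up decision, and the age check takes the two buffers as parameters
// instead of package-level constants. The pod struct and this trimmed-down
// allPodsAreNew are simplified stand-ins for the real cluster-autoscaler types.
package main

import (
	"fmt"
	"time"
)

type pod struct {
	name      string
	createdAt time.Time
	wantsGPU  bool
}

// allPodsAreNew mirrors the patched signature: the buffers come from AutoscalingOptions.
func allPodsAreNew(pods []pod, now time.Time, buffer, gpuBuffer time.Duration) bool {
	oldest := now
	anyGPU := false
	for _, p := range pods {
		if p.createdAt.Before(oldest) {
			oldest = p.createdAt
		}
		anyGPU = anyGPU || p.wantsGPU
	}
	if oldest.Add(buffer).After(now) {
		return true
	}
	// The real code applies the GPU buffer to the oldest pod that requests a GPU;
	// this sketch simply uses the larger buffer whenever any pod wants a GPU.
	return anyGPU && oldest.Add(gpuBuffer).After(now)
}

func main() {
	now := time.Now()
	unschedulable := []pod{{name: "web-0", createdAt: now.Add(-5 * time.Second)}}
	unknown := []pod{{name: "web-1", createdAt: now.Add(-1 * time.Second)}}

	// As in the patch: unknown pods are treated as unschedulable for scale-up purposes.
	unschedulable = append(unschedulable, unknown...)

	fmt.Println(allPodsAreNew(unschedulable, now, 2*time.Second, 30*time.Second)) // false: the oldest pod is already 5s old
}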
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index 5f879522670a..c5c24a819553 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -243,6 +243,8 @@ var (
 	maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.")
 	forceDaemonSets               = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.")
 	dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server")
+	unschedulablePodTimeBuffer        = flag.Duration("unschedulable-pod-time-buffer", 2*time.Second, "How old the oldest unschedulable pod should be before starting scale up.")
+	unschedulablePodWithGpuTimeBuffer = flag.Duration("unschedulable-pod-with-gpu-time-buffer", 30*time.Second, "How old the oldest unschedulable pod with GPU should be before starting scale up.")
 )
 
 func isFlagPassed(name string) bool {
@@ -390,6 +392,8 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 			MaxFreeDifferenceRatio:        *maxFreeDifferenceRatio,
 		},
 		DynamicNodeDeleteDelayAfterTaintEnabled: *dynamicNodeDeleteDelayAfterTaintEnabled,
+		UnschedulablePodTimeBuffer:              *unschedulablePodTimeBuffer,
+		UnschedulablePodWithGpuTimeBuffer:       *unschedulablePodWithGpuTimeBuffer,
 	}
 }
 
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index 8f4e0d869ddd..6e5a79edd87c 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -135,12 +135,13 @@ var (
 		}, []string{"node_group_type"},
 	)
 
-	unschedulablePodsCount = k8smetrics.NewGauge(
+	// unschedulablePodsCount covers both pods the scheduler marked unschedulable and pods it has not processed yet ("unknown").
+	unschedulablePodsCount = k8smetrics.NewGaugeVec(
 		&k8smetrics.GaugeOpts{
 			Namespace: caNamespace,
 			Name:      "unschedulable_pods_count",
 			Help:      "Number of unschedulable pods in the cluster.",
-		},
+		}, []string{"count_type"},
 	)
 
 	maxNodesCount = k8smetrics.NewGauge(
@@ -462,8 +463,9 @@ func UpdateNodeGroupsCount(autoscaled, autoprovisioned int) {
 }
 
 // UpdateUnschedulablePodsCount records number of currently unschedulable pods
-func UpdateUnschedulablePodsCount(podsCount int) {
-	unschedulablePodsCount.Set(float64(podsCount))
+func UpdateUnschedulablePodsCount(unschedulableCount, unknownCount int) {
+	unschedulablePodsCount.WithLabelValues("unschedulable").Set(float64(unschedulableCount))
+	unschedulablePodsCount.WithLabelValues("unknown").Set(float64(unknownCount))
 }
 
 // UpdateMaxNodesCount records the current maximum number of nodes being set for all node groups
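// Illustrative sketch (not part of the patch): what the relabelled metric looks
// like to a consumer. With the GaugeVec above, both counts share one metric
// name and are distinguished by the "count_type" label, e.g. in PromQL
// cluster_autoscaler_unschedulable_pods_count{count_type="unknown"}, assuming
// the usual cluster_autoscaler namespace. This standalone example uses
// prometheus/client_golang directly rather than the k8smetrics wrappers used
// in metrics.go.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	unschedulablePodsCount := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "cluster_autoscaler",
			Name:      "unschedulable_pods_count",
			Help:      "Number of unschedulable pods in the cluster.",
		},
		[]string{"count_type"},
	)
	prometheus.MustRegister(unschedulablePodsCount)

	// Mirrors UpdateUnschedulablePodsCount: one time series per pod category.
	unschedulablePodsCount.WithLabelValues("unschedulable").Set(7)
	unschedulablePodsCount.WithLabelValues("unknown").Set(2)

	fmt.Println(testutil.ToFloat64(unschedulablePodsCount.WithLabelValues("unknown"))) // 2
}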
diff --git a/cluster-autoscaler/utils/kubernetes/listers.go b/cluster-autoscaler/utils/kubernetes/listers.go
index cbba21180682..4618c113d17b 100644
--- a/cluster-autoscaler/utils/kubernetes/listers.go
+++ b/cluster-autoscaler/utils/kubernetes/listers.go
@@ -156,6 +156,23 @@ func ScheduledPods(allPods []*apiv1.Pod) []*apiv1.Pod {
 	return scheduledPods
 }
 
+// UnknownPods is a helper method that returns all pods which have not yet been processed by the scheduler.
+func UnknownPods(allPods []*apiv1.Pod) []*apiv1.Pod {
+	var unknownPods []*apiv1.Pod
+	for _, pod := range allPods {
+		// Make sure it's neither scheduled nor being deleted
+		if pod.Spec.NodeName != "" || pod.GetDeletionTimestamp() != nil {
+			continue
+		}
+		// Make sure the scheduler hasn't already marked it unschedulable
+		_, condition := podv1.GetPodCondition(&pod.Status, apiv1.PodScheduled)
+		if condition == nil || (condition.Status == apiv1.ConditionFalse && condition.Reason != apiv1.PodReasonUnschedulable) {
+			unknownPods = append(unknownPods, pod)
+		}
+	}
+	return unknownPods
+}
+
 // UnschedulablePods is a helper method that returns all unschedulable pods from given pod list.
 func UnschedulablePods(allPods []*apiv1.Pod) []*apiv1.Pod {
 	var unschedulablePods []*apiv1.Pod
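// Illustrative sketch (not part of the patch): how the lister helpers partition
// pods. A pending pod with no definitive PodScheduled condition is "unknown"
// (the scheduler has not processed it yet), and the rest of the patch makes
// cluster-autoscaler treat such pods as unschedulable. The classify helper is
// hypothetical; it inlines the condition lookup that UnknownPods delegates to
// the pod utilities.
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func classify(pod *corev1.Pod) string {
	if pod.Spec.NodeName != "" {
		return "scheduled"
	}
	if pod.GetDeletionTimestamp() != nil {
		return "terminating"
	}
	for _, c := range pod.Status.Conditions {
		if c.Type != corev1.PodScheduled {
			continue
		}
		if c.Status == corev1.ConditionFalse && c.Reason == corev1.PodReasonUnschedulable {
			return "unschedulable"
		}
		if c.Status == corev1.ConditionFalse {
			return "unknown" // rejected for a reason other than Unschedulable
		}
		return "other" // PodScheduled is True/Unknown but no node is assigned yet
	}
	return "unknown" // no PodScheduled condition: the scheduler has not seen the pod
}

func main() {
	justCreated := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "just-created"},
		// No PodScheduled condition yet.
	}
	doesNotFit := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "does-not-fit"},
		Status: corev1.PodStatus{Conditions: []corev1.PodCondition{{
			Type:   corev1.PodScheduled,
			Status: corev1.ConditionFalse,
			Reason: corev1.PodReasonUnschedulable,
		}}},
	}
	fmt.Println(classify(justCreated), classify(doesNotFit)) // unknown unschedulable
}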