From f6efdc9ccd918bb79c91a8788fe6c6f675e44bd5 Mon Sep 17 00:00:00 2001
From: Mahmoud Atwa
Date: Wed, 18 Oct 2023 09:21:08 +0000
Subject: [PATCH] Update autoscaler options & treat unknown pods as unschedulable
---
 .../config/autoscaling_options.go            |  7 +++++++
 cluster-autoscaler/core/static_autoscaler.go | 20 ++++++--------------
 cluster-autoscaler/main.go                   |  4 ++++
 cluster-autoscaler/metrics/metrics.go        | 10 ++++++----
 .../utils/kubernetes/listers.go              | 17 +++++++++++++++++
 5 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go
index 05e6cc2fb6e8..ce38224c557a 100644
--- a/cluster-autoscaler/config/autoscaling_options.go
+++ b/cluster-autoscaler/config/autoscaling_options.go
@@ -277,4 +277,11 @@ type AutoscalingOptions struct {
 	// dynamicNodeDeleteDelayAfterTaintEnabled is used to enable/disable dynamic adjustment of NodeDeleteDelayAfterTaint
 	// based on the latency between the CA and the api-server
 	DynamicNodeDeleteDelayAfterTaintEnabled bool
+	// UnschedulablePodTimeBuffer specifies how old the oldest unschedulable pod
+	// should be before starting scale up.
+	UnschedulablePodTimeBuffer time.Duration
+	// UnschedulablePodWithGpuTimeBuffer specifies how old the oldest unschedulable pod with GPU should be before starting scale up.
+	// The idea is that nodes with GPU are very expensive and we're ready to sacrifice
+	// a bit more latency to wait for more pods and make a more informed scale-up decision.
+	UnschedulablePodWithGpuTimeBuffer time.Duration
 }
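// Illustrative sketch (not part of the patch): what the two new duration
// options mean for scale-up timing. The oldestPodIsOldEnough helper below is
// hypothetical and only mirrors the age check that allPodsAreNew performs with
// these buffers; the durations shown match the flag defaults added in main.go.
package main

import (
	"fmt"
	"time"
)

// oldestPodIsOldEnough reports whether a pod created at createTime has been
// pending for at least buffer, i.e. whether a scale-up may be considered.
func oldestPodIsOldEnough(createTime, now time.Time, buffer time.Duration) bool {
	return !createTime.Add(buffer).After(now)
}

func main() {
	now := time.Now()
	created := now.Add(-10 * time.Second) // oldest pending pod is 10s old

	fmt.Println(oldestPodIsOldEnough(created, now, 2*time.Second))  // true: past UnschedulablePodTimeBuffer
	fmt.Println(oldestPodIsOldEnough(created, now, 30*time.Second)) // false: still within UnschedulablePodWithGpuTimeBuffer
}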
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index b5de00eb7e80..08ff9a8fc663 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -57,20 +57,12 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
 	caerrors "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 	scheduler_utils "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
-	"k8s.io/autoscaler/cluster-autoscaler/utils/tpu"
 	"k8s.io/utils/integer"
 
 	klog "k8s.io/klog/v2"
 )
 
 const (
-	// How old the oldest unschedulable pod should be before starting scale up.
-	unschedulablePodTimeBuffer = 2 * time.Second
-	// How old the oldest unschedulable pod with GPU should be before starting scale up.
-	// The idea is that nodes with GPU are very expensive and we're ready to sacrifice
-	// a bit more latency to wait for more pods and make a more informed scale-up decision.
-	unschedulablePodWithGpuTimeBuffer = 30 * time.Second
-
 	// NodeUpcomingAnnotation is an annotation CA adds to nodes which are upcoming.
 	NodeUpcomingAnnotation = "cluster-autoscaler.k8s.io/upcoming-node"
 
@@ -309,7 +301,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 		klog.Errorf("Failed to list pods: %v", err)
 		return caerrors.ToAutoscalerError(caerrors.ApiCallError, err)
 	}
-	originalScheduledPods, unschedulablePods := kube_util.ScheduledPods(pods), kube_util.UnschedulablePods(pods)
+	originalScheduledPods, unschedulablePods, unknownPods := kube_util.ScheduledPods(pods), kube_util.UnschedulablePods(pods), kube_util.UnknownPods(pods)
 
 	// Update cluster resource usage metrics
 	coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime)
@@ -451,9 +443,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 
 	metrics.UpdateLastTime(metrics.Autoscaling, time.Now())
 
-	metrics.UpdateUnschedulablePodsCount(len(unschedulablePods))
-
-	unschedulablePods = tpu.ClearTPURequests(unschedulablePods)
+	metrics.UpdateUnschedulablePodsCount(len(unschedulablePods), len(unknownPods))
+	// Treat unknown pods as unschedulable; the pod list processor will filter out any that turn out to be schedulable.
+	unschedulablePods = append(unschedulablePods, unknownPods...)
 
 	// Upcoming nodes are recently created nodes that haven't registered in the cluster yet, or haven't become ready yet.
 	upcomingCounts, registeredUpcoming := a.clusterStateRegistry.GetUpcomingNodes()
@@ -538,7 +530,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 	} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
 		scaleUpStatus.Result = status.ScaleUpNoOptionsAvailable
 		klog.V(1).Info("Max total nodes in cluster reached")
-	} else if allPodsAreNew(unschedulablePodsToHelp, currentTime) {
+	} else if allPodsAreNew(unschedulablePodsToHelp, currentTime, a.AutoscalingOptions.UnschedulablePodTimeBuffer, a.AutoscalingOptions.UnschedulablePodWithGpuTimeBuffer) {
 		// The assumption here is that these pods have been created very recently and probably there
 		// is more pods to come. In theory we could check the newest pod time but then if pod were created
 		// slowly but at the pace of 1 every 2 seconds then no scale up would be triggered for long time.
@@ -963,7 +955,7 @@ func (a *StaticAutoscaler) updateClusterState(allNodes []*apiv1.Node, nodeInfosF
 	return nil
 }
 
-func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool {
+func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time, unschedulablePodTimeBuffer, unschedulablePodWithGpuTimeBuffer time.Duration) bool {
 	if core_utils.GetOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) {
 		return true
 	}
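// Illustrative sketch (not part of the patch): the essence of the RunOnce
// changes above. Unknown pods are appended to the unschedulable list before
// the scale-up decision, and the age check takes the two buffers as parameters
// instead of package-level constants. The pod struct and this trimmed-down
// allPodsAreNew are simplified stand-ins for the real cluster-autoscaler types.
package main

import (
	"fmt"
	"time"
)

type pod struct {
	name      string
	createdAt time.Time
	wantsGPU  bool
}

// allPodsAreNew mirrors the patched signature: the buffers come from AutoscalingOptions.
func allPodsAreNew(pods []pod, now time.Time, buffer, gpuBuffer time.Duration) bool {
	oldest := now
	anyGPU := false
	for _, p := range pods {
		if p.createdAt.Before(oldest) {
			oldest = p.createdAt
		}
		anyGPU = anyGPU || p.wantsGPU
	}
	if oldest.Add(buffer).After(now) {
		return true
	}
	// The real code applies the GPU buffer to the oldest pod that requests a GPU;
	// this sketch simply uses the larger buffer whenever any pod wants a GPU.
	return anyGPU && oldest.Add(gpuBuffer).After(now)
}

func main() {
	now := time.Now()
	unschedulable := []pod{{name: "web-0", createdAt: now.Add(-5 * time.Second)}}
	unknown := []pod{{name: "web-1", createdAt: now.Add(-1 * time.Second)}}

	// As in the patch: unknown pods are treated as unschedulable for scale-up purposes.
	unschedulable = append(unschedulable, unknown...)

	fmt.Println(allPodsAreNew(unschedulable, now, 2*time.Second, 30*time.Second)) // false: the oldest pod is already 5s old
}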
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index 5f879522670a..c5c24a819553 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -243,6 +243,8 @@ var (
 	maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.")
 	forceDaemonSets               = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.")
 	dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server")
+	unschedulablePodTimeBuffer        = flag.Duration("unschedulable-pod-time-buffer", 2*time.Second, "How old the oldest unschedulable pod should be before starting scale up.")
+	unschedulablePodWithGpuTimeBuffer = flag.Duration("unschedulable-pod-with-gpu-time-buffer", 30*time.Second, "How old the oldest unschedulable pod with GPU should be before starting scale up.")
 )
 
 func isFlagPassed(name string) bool {
@@ -390,6 +392,8 @@ func createAutoscalingOptions() config.AutoscalingOptions {
 			MaxFreeDifferenceRatio:        *maxFreeDifferenceRatio,
 		},
 		DynamicNodeDeleteDelayAfterTaintEnabled: *dynamicNodeDeleteDelayAfterTaintEnabled,
+		UnschedulablePodTimeBuffer:              *unschedulablePodTimeBuffer,
+		UnschedulablePodWithGpuTimeBuffer:       *unschedulablePodWithGpuTimeBuffer,
 	}
 }
 
diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
index 8f4e0d869ddd..6e5a79edd87c 100644
--- a/cluster-autoscaler/metrics/metrics.go
+++ b/cluster-autoscaler/metrics/metrics.go
@@ -135,12 +135,13 @@ var (
 		}, []string{"node_group_type"},
 	)
 
-	unschedulablePodsCount = k8smetrics.NewGauge(
+	// unschedulablePodsCount covers both pods the scheduler marked unschedulable and pods it has not processed yet ("unknown").
+	unschedulablePodsCount = k8smetrics.NewGaugeVec(
 		&k8smetrics.GaugeOpts{
 			Namespace: caNamespace,
 			Name:      "unschedulable_pods_count",
 			Help:      "Number of unschedulable pods in the cluster.",
-		},
+		}, []string{"count_type"},
 	)
 
 	maxNodesCount = k8smetrics.NewGauge(
@@ -462,8 +463,9 @@ func UpdateNodeGroupsCount(autoscaled, autoprovisioned int) {
 }
 
 // UpdateUnschedulablePodsCount records number of currently unschedulable pods
-func UpdateUnschedulablePodsCount(podsCount int) {
-	unschedulablePodsCount.Set(float64(podsCount))
+func UpdateUnschedulablePodsCount(unschedulableCount, unknownCount int) {
+	unschedulablePodsCount.WithLabelValues("unschedulable").Set(float64(unschedulableCount))
+	unschedulablePodsCount.WithLabelValues("unknown").Set(float64(unknownCount))
 }
 
 // UpdateMaxNodesCount records the current maximum number of nodes being set for all node groups
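// Illustrative sketch (not part of the patch): what the relabelled metric looks
// like to a consumer. With the GaugeVec above, both counts share one metric
// name and are distinguished by the "count_type" label, e.g. in PromQL
// cluster_autoscaler_unschedulable_pods_count{count_type="unknown"}, assuming
// the usual cluster_autoscaler namespace. This standalone example uses
// prometheus/client_golang directly rather than the k8smetrics wrappers used
// in metrics.go.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	unschedulablePodsCount := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "cluster_autoscaler",
			Name:      "unschedulable_pods_count",
			Help:      "Number of unschedulable pods in the cluster.",
		},
		[]string{"count_type"},
	)
	prometheus.MustRegister(unschedulablePodsCount)

	// Mirrors UpdateUnschedulablePodsCount: one time series per pod category.
	unschedulablePodsCount.WithLabelValues("unschedulable").Set(7)
	unschedulablePodsCount.WithLabelValues("unknown").Set(2)

	fmt.Println(testutil.ToFloat64(unschedulablePodsCount.WithLabelValues("unknown"))) // 2
}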
diff --git a/cluster-autoscaler/utils/kubernetes/listers.go b/cluster-autoscaler/utils/kubernetes/listers.go
index cbba21180682..4618c113d17b 100644
--- a/cluster-autoscaler/utils/kubernetes/listers.go
+++ b/cluster-autoscaler/utils/kubernetes/listers.go
@@ -156,6 +156,23 @@ func ScheduledPods(allPods []*apiv1.Pod) []*apiv1.Pod {
 	return scheduledPods
 }
 
+// UnknownPods is a helper method that returns all pods which have not yet been processed by the scheduler.
+func UnknownPods(allPods []*apiv1.Pod) []*apiv1.Pod {
+	var unknownPods []*apiv1.Pod
+	for _, pod := range allPods {
+		// Make sure it's neither scheduled nor being deleted
+		if pod.Spec.NodeName != "" || pod.GetDeletionTimestamp() != nil {
+			continue
+		}
+		// Make sure the scheduler hasn't already marked it unschedulable
+		_, condition := podv1.GetPodCondition(&pod.Status, apiv1.PodScheduled)
+		if condition == nil || (condition.Status == apiv1.ConditionFalse && condition.Reason != apiv1.PodReasonUnschedulable) {
+			unknownPods = append(unknownPods, pod)
+		}
+	}
+	return unknownPods
+}
+
 // UnschedulablePods is a helper method that returns all unschedulable pods from given pod list.
 func UnschedulablePods(allPods []*apiv1.Pod) []*apiv1.Pod {
 	var unschedulablePods []*apiv1.Pod
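// Illustrative sketch (not part of the patch): how the lister helpers partition
// pods. A pending pod with no definitive PodScheduled condition is "unknown"
// (the scheduler has not processed it yet), and the rest of the patch makes
// cluster-autoscaler treat such pods as unschedulable. The classify helper is
// hypothetical; it inlines the condition lookup that UnknownPods delegates to
// the pod utilities.
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func classify(pod *corev1.Pod) string {
	if pod.Spec.NodeName != "" {
		return "scheduled"
	}
	if pod.GetDeletionTimestamp() != nil {
		return "terminating"
	}
	for _, c := range pod.Status.Conditions {
		if c.Type != corev1.PodScheduled {
			continue
		}
		if c.Status == corev1.ConditionFalse && c.Reason == corev1.PodReasonUnschedulable {
			return "unschedulable"
		}
		if c.Status == corev1.ConditionFalse {
			return "unknown" // rejected for a reason other than Unschedulable
		}
		return "other" // PodScheduled is True/Unknown but no node is assigned yet
	}
	return "unknown" // no PodScheduled condition: the scheduler has not seen the pod
}

func main() {
	justCreated := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "just-created"},
		// No PodScheduled condition yet.
	}
	doesNotFit := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "does-not-fit"},
		Status: corev1.PodStatus{Conditions: []corev1.PodCondition{{
			Type:   corev1.PodScheduled,
			Status: corev1.ConditionFalse,
			Reason: corev1.PodReasonUnschedulable,
		}}},
	}
	fmt.Println(classify(justCreated), classify(doesNotFit)) // unknown unschedulable
}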