Skip to content

Commit

Permalink
Update autoscaler options & treat unknown pods as unschedulable
Browse files Browse the repository at this point in the history
  • Loading branch information
atwamahmoud committed Oct 18, 2023
1 parent a875e2b commit f6efdc9
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 18 deletions.
8 changes: 8 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,4 +277,12 @@ type AutoscalingOptions struct {
// dynamicNodeDeleteDelayAfterTaintEnabled is used to enable/disable dynamic adjustment of NodeDeleteDelayAfterTaint
// based on the latency between the CA and the api-server
DynamicNodeDeleteDelayAfterTaintEnabled bool
// UnschedulablePodTimeBuffer specifies how old the oldest unschedulable pod should be
// before a scale-up is triggered.
UnschedulablePodTimeBuffer time.Duration
// UnschedulablePodWithGpuTimeBuffer specifies how old should the oldest unschedulable pod with GPU be before starting scale up.
// The idea is that nodes with GPU are very expensive and we're ready to sacrifice
// a bit more latency to wait for more pods and make a more informed scale-up decision.
UnschedulablePodWithGpuTimeBuffer time.Duration
// Default is 30s, configurable via the --unschedulable-pod-with-gpu-time-buffer flag.
}
20 changes: 6 additions & 14 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,20 +57,12 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
caerrors "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
scheduler_utils "k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
"k8s.io/autoscaler/cluster-autoscaler/utils/tpu"
"k8s.io/utils/integer"

klog "k8s.io/klog/v2"
)

const (
// How old the oldest unschedulable pod should be before starting scale up.
unschedulablePodTimeBuffer = 2 * time.Second
// How old the oldest unschedulable pod with GPU should be before starting scale up.
// The idea is that nodes with GPU are very expensive and we're ready to sacrifice
// a bit more latency to wait for more pods and make a more informed scale-up decision.
unschedulablePodWithGpuTimeBuffer = 30 * time.Second

// NodeUpcomingAnnotation is an annotation CA adds to nodes which are upcoming.
NodeUpcomingAnnotation = "cluster-autoscaler.k8s.io/upcoming-node"

Expand Down Expand Up @@ -309,7 +301,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
klog.Errorf("Failed to list pods: %v", err)
return caerrors.ToAutoscalerError(caerrors.ApiCallError, err)
}
originalScheduledPods, unschedulablePods := kube_util.ScheduledPods(pods), kube_util.UnschedulablePods(pods)
originalScheduledPods, unschedulablePods, unknownPods := kube_util.ScheduledPods(pods), kube_util.UnschedulablePods(pods), kube_util.UnknownPods(pods)

// Update cluster resource usage metrics
coresTotal, memoryTotal := calculateCoresMemoryTotal(allNodes, currentTime)
Expand Down Expand Up @@ -451,9 +443,9 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr

metrics.UpdateLastTime(metrics.Autoscaling, time.Now())

metrics.UpdateUnschedulablePodsCount(len(unschedulablePods))

unschedulablePods = tpu.ClearTPURequests(unschedulablePods)
metrics.UpdateUnschedulablePodsCount(len(unschedulablePods), len(unknownPods))
// Treat unknown pods as unschedulable, pod list processor will remove schedulable pods
unschedulablePods = append(unschedulablePods, unknownPods...)

// Upcoming nodes are recently created nodes that haven't registered in the cluster yet, or haven't become ready yet.
upcomingCounts, registeredUpcoming := a.clusterStateRegistry.GetUpcomingNodes()
Expand Down Expand Up @@ -538,7 +530,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
scaleUpStatus.Result = status.ScaleUpNoOptionsAvailable
klog.V(1).Info("Max total nodes in cluster reached")
} else if allPodsAreNew(unschedulablePodsToHelp, currentTime) {
} else if allPodsAreNew(unschedulablePodsToHelp, currentTime, a.AutoscalingOptions.UnschedulablePodTimeBuffer, a.AutoscalingOptions.UnschedulablePodWithGpuTimeBuffer) {
// The assumption here is that these pods have been created very recently and probably there
// is more pods to come. In theory we could check the newest pod time but then if pod were created
// slowly but at the pace of 1 every 2 seconds then no scale up would be triggered for long time.
Expand Down Expand Up @@ -963,7 +955,7 @@ func (a *StaticAutoscaler) updateClusterState(allNodes []*apiv1.Node, nodeInfosF
return nil
}

func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool {
func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time, unschedulablePodTimeBuffer, unschedulablePodWithGpuTimeBuffer time.Duration) bool {
if core_utils.GetOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) {
return true
}
Expand Down
4 changes: 4 additions & 0 deletions cluster-autoscaler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,8 @@ var (
maxAllocatableDifferenceRatio = flag.Float64("max-allocatable-difference-ratio", config.DefaultMaxAllocatableDifferenceRatio, "Maximum difference in allocatable resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's allocatable resource.")
forceDaemonSets = flag.Bool("force-ds", false, "Blocks scale-up of node groups too small for all suitable Daemon Sets pods.")
dynamicNodeDeleteDelayAfterTaintEnabled = flag.Bool("dynamic-node-delete-delay-after-taint-enabled", false, "Enables dynamic adjustment of NodeDeleteDelayAfterTaint based of the latency between CA and api-server")
unschedulablePodTimeBuffer = flag.Duration("unschedulable-pod-time-buffer", 2*time.Second, "How old the oldest unschedulable pod should be before starting scale up.")
unschedulablePodWithGpuTimeBuffer = flag.Duration("unschedulable-pod-with-gpu-time-buffer", 30*time.Second, "How old the oldest unschedulable pod with GPU should be before starting scale up.")
)

func isFlagPassed(name string) bool {
Expand Down Expand Up @@ -390,6 +392,8 @@ func createAutoscalingOptions() config.AutoscalingOptions {
MaxFreeDifferenceRatio: *maxFreeDifferenceRatio,
},
DynamicNodeDeleteDelayAfterTaintEnabled: *dynamicNodeDeleteDelayAfterTaintEnabled,
UnschedulablePodTimeBuffer: *unschedulablePodTimeBuffer,
UnschedulablePodWithGpuTimeBuffer: *unschedulablePodWithGpuTimeBuffer,
}
}

Expand Down
10 changes: 6 additions & 4 deletions cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,13 @@ var (
}, []string{"node_group_type"},
)

unschedulablePodsCount = k8smetrics.NewGauge(
// Unschedulable pod count can be from scheduler-marked-unschedulable pods or not-yet-processed pods (unknown)
unschedulablePodsCount = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "unschedulable_pods_count",
Help: "Number of unschedulable pods in the cluster.",
},
}, []string{"count_type"},
)

maxNodesCount = k8smetrics.NewGauge(
Expand Down Expand Up @@ -462,8 +463,9 @@ func UpdateNodeGroupsCount(autoscaled, autoprovisioned int) {
}

// UpdateUnschedulablePodsCount records number of currently unschedulable pods
func UpdateUnschedulablePodsCount(podsCount int) {
unschedulablePodsCount.Set(float64(podsCount))
func UpdateUnschedulablePodsCount(uschedulablePodsCount, unknownPodsCount int) {
unschedulablePodsCount.WithLabelValues("unschedulable").Set(float64(uschedulablePodsCount))
unschedulablePodsCount.WithLabelValues("unknown").Set(float64(unknownPodsCount))
}

// UpdateMaxNodesCount records the current maximum number of nodes being set for all node groups
Expand Down
17 changes: 17 additions & 0 deletions cluster-autoscaler/utils/kubernetes/listers.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,23 @@ func ScheduledPods(allPods []*apiv1.Pod) []*apiv1.Pod {
return scheduledPods
}

// UnknownPods returns the pods from allPods that the scheduler has not yet
// processed: pods that are neither bound to a node, being deleted, nor
// explicitly marked unschedulable by the scheduler.
func UnknownPods(allPods []*apiv1.Pod) []*apiv1.Pod {
	var result []*apiv1.Pod
	for _, p := range allPods {
		// Already assigned to a node, or on its way out — not unknown.
		if p.Spec.NodeName != "" || p.GetDeletionTimestamp() != nil {
			continue
		}
		// A pod with PodScheduled=False and reason Unschedulable has been
		// processed (and rejected) by the scheduler, so it is excluded here.
		_, cond := podv1.GetPodCondition(&p.Status, apiv1.PodScheduled)
		switch {
		case cond == nil:
			result = append(result, p)
		case cond.Status == apiv1.ConditionFalse && cond.Reason != apiv1.PodReasonUnschedulable:
			result = append(result, p)
		}
	}
	return result
}

// UnschedulablePods is a helper method that returns all unschedulable pods from given pod list.
func UnschedulablePods(allPods []*apiv1.Pod) []*apiv1.Pod {
var unschedulablePods []*apiv1.Pod
Expand Down

0 comments on commit f6efdc9

Please sign in to comment.