Commit

fix incompatibility due to refactor from upstream
nvthongswansea committed Sep 3, 2024
1 parent b640369 commit 1f8be98
Showing 5 changed files with 58 additions and 43 deletions.
@@ -158,6 +158,10 @@ func (d *gridscaleCloudProvider) Refresh() error {
 	return d.manager.Refresh()
 }
 
+func (d *gridscaleCloudProvider) GetNodeGpuConfig(node *apiv1.Node) *cloudprovider.GpuConfig {
+	return nil
+}
+
 // BuildGridscale builds the gridscale cloud provider.
 func BuildGridscale(
 	opts config.AutoscalingOptions,
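
The new GetNodeGpuConfig method is presumably required because the upstream cloudprovider.CloudProvider interface gained this method in the refactor this commit tracks; returning nil simply opts the gridscale provider out of GPU-aware scaling. A minimal sketch of a compile-time guard that catches this kind of interface drift; the assertion is hypothetical and not part of this commit:

    package gridscale

    import (
        "k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
    )

    // Hypothetical guard: the build breaks here whenever gridscaleCloudProvider
    // stops satisfying the upstream interface, e.g. when upstream adds a method
    // such as GetNodeGpuConfig.
    var _ cloudprovider.CloudProvider = (*gridscaleCloudProvider)(nil)
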
73 changes: 46 additions & 27 deletions cluster-autoscaler/core/scaledown/actuation/actuator.go
@@ -149,15 +149,21 @@ func (a *Actuator) StartDeletion(empty, drain []*apiv1.Node) (status.ScaleDownRe
 // 2. Replace the to-be-deleted nodes with the last n nodes in the cluster.
 // 3. Taint & drain the to-be-deleted nodes.
 // 4. Delete the last n nodes in the cluster.
-func (a *Actuator) StartDeletionForGridscaleProvider(empty, drain, all []*apiv1.Node, currentTime time.Time) (*status.ScaleDownStatus, errors.AutoscalerError) {
-	defer func() { metrics.UpdateDuration(metrics.ScaleDownNodeDeletion, time.Now().Sub(currentTime)) }()
-	results, ts := a.nodeDeletionTracker.DeletionResults()
-	scaleDownStatus := &status.ScaleDownStatus{NodeDeleteResults: results, NodeDeleteResultsAsOf: ts}
-
-	emptyToDelete, drainToDelete := a.budgetProcessor.CropNodes(a.nodeDeletionTracker, empty, drain)
+func (a *Actuator) StartDeletionForGridscaleProvider(empty, drain, all []*apiv1.Node) (status.ScaleDownResult, []*status.ScaleDownNode, errors.AutoscalerError) {
+	a.nodeDeletionScheduler.ResetAndReportMetrics()
+	deletionStartTime := time.Now()
+	defer func() { metrics.UpdateDuration(metrics.ScaleDownNodeDeletion, time.Since(deletionStartTime)) }()
+	emptyToDelete := []*apiv1.Node{}
+	drainToDelete := []*apiv1.Node{}
+	emptyToDeleteNodeGroupViews, drainToDeleteNodeGroupViews := a.budgetProcessor.CropNodes(a.nodeDeletionTracker, empty, drain)
+	for _, bucket := range emptyToDeleteNodeGroupViews {
+		emptyToDelete = append(emptyToDelete, bucket.Nodes...)
+	}
+	for _, bucket := range drainToDeleteNodeGroupViews {
+		drainToDelete = append(drainToDelete, bucket.Nodes...)
+	}
 	if len(emptyToDelete) == 0 && len(drainToDelete) == 0 {
-		scaleDownStatus.Result = status.ScaleDownNoNodeDeleted
-		return scaleDownStatus, nil
+		return status.ScaleDownNoNodeDeleted, nil, nil
 	}
 
 	// Count the number of nodes to be deleted.
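
In the upstream refactor, budgetProcessor.CropNodes returns per-node-group buckets ([]*budgets.NodeGroupView) instead of flat node slices, which is why the new code above flattens the buckets first. A small sketch of that flattening as a standalone helper; the helper name is hypothetical, and the only NodeGroupView field relied on is Nodes, exactly as in the diff:

    package actuation

    import (
        apiv1 "k8s.io/api/core/v1"

        "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/budgets"
    )

    // flattenNodeGroupViews (hypothetical helper, not in this commit) collapses the
    // buckets returned by CropNodes back into the flat node slice that the
    // gridscale-specific deletion path works with.
    func flattenNodeGroupViews(buckets []*budgets.NodeGroupView) []*apiv1.Node {
        nodes := []*apiv1.Node{}
        for _, bucket := range buckets {
            nodes = append(nodes, bucket.Nodes...)
        }
        return nodes
    }
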
@@ -166,8 +172,7 @@ func (a *Actuator) StartDeletionForGridscaleProvider(empty, drain, all []*apiv1.
 	if nodesToDeleteCount >= len(all) {
 		// If the number of nodes to be deleted is greater than or equal to the number of nodes in the cluster,
 		// we cannot delete the nodes. Return an error.
-		scaleDownStatus.Result = status.ScaleDownError
-		return scaleDownStatus, errors.NewAutoscalerError(
+		return status.ScaleDownError, nil, errors.NewAutoscalerError(
 			errors.InternalError,
 			"cannot delete nodes because the number of nodes to be deleted is greater than or equal to the number of nodes in the cluster. There has to be at least one node left in the cluster.",
 		)
@@ -218,29 +223,32 @@ func (a *Actuator) StartDeletionForGridscaleProvider(empty, drain, all []*apiv1.
 
 	// do some sanity check
 	if len(nodesToDelete) <= 0 {
-		scaleDownStatus.Result = status.ScaleDownError
-		return scaleDownStatus, errors.NewAutoscalerError(
+		return status.ScaleDownError, nil, errors.NewAutoscalerError(
 			errors.InternalError,
 			"cannot delete nodes because there is no node to be deleted.",
 		)
 	}
 	for i, node := range nodesToDelete {
 		if node == nil {
-			scaleDownStatus.Result = status.ScaleDownError
-			return scaleDownStatus, errors.NewAutoscalerError(
+			return status.ScaleDownError, nil, errors.NewAutoscalerError(
 				errors.InternalError,
 				fmt.Sprintf("cannot delete nodes because the node at index %d of to-be-deleted nodes is nil.", i),
 			)
 		}
 	}
 
+	nodesToDeleteNodeGroupViews := []*budgets.NodeGroupView{
+		&budgets.NodeGroupView{
+			Nodes: nodesToDelete,
+		},
+	}
+
 	// Taint all nodes that need drain synchronously, but don't start any drain/deletion yet. Otherwise, pods evicted from one to-be-deleted node
 	// could get recreated on another.
 	klog.V(4).Infof("Tainting to-be-deleted nodes.")
-	err := a.taintNodesSync(nodesToDelete)
+	err := a.taintNodesSync(nodesToDeleteNodeGroupViews)
 	if err != nil {
-		scaleDownStatus.Result = status.ScaleDownError
-		return scaleDownStatus, err
+		return status.ScaleDownError, nil, err
 	}
 	// Clean taint from NEW to-be-deleted nodes after scale down. We don't care about the error here.
 	defer func() {
@@ -254,8 +262,7 @@ func (a *Actuator) StartDeletionForGridscaleProvider(empty, drain, all []*apiv1.
 	// Since gridscale provider only support single-node-group clusters, we just need to get nodeGroup from the first node of to-be-deleted nodes.
 	nodeGroup, cpErr := a.ctx.CloudProvider.NodeGroupForNode(nodesToDelete[0])
 	if cpErr != nil {
-		scaleDownStatus.Result = status.ScaleDownError
-		return scaleDownStatus, errors.NewAutoscalerError(errors.CloudProviderError, "failed to find node group for %s: %v", nodesToDelete[0].Name, cpErr)
+		return status.ScaleDownError, nil, errors.NewAutoscalerError(errors.CloudProviderError, "failed to find node group for %s: %v", nodesToDelete[0].Name, cpErr)
 	}
 
 	var scaledDownNodes []*status.ScaleDownNode
@@ -273,8 +280,7 @@ func (a *Actuator) StartDeletionForGridscaleProvider(empty, drain, all []*apiv1.
 	// Drain to-be-deleted nodes synchronously.
 	finishFuncList, cpErr := a.drainNodesSyncForGridscaleProvider(nodeGroup.Id(), nodesToDelete)
 	if cpErr != nil {
-		scaleDownStatus.Result = status.ScaleDownError
-		return scaleDownStatus, errors.NewAutoscalerError(errors.CloudProviderError, "failed to drain nodes: %v", cpErr)
+		return status.ScaleDownError, nil, errors.NewAutoscalerError(errors.CloudProviderError, "failed to drain nodes: %v", cpErr)
 	}
 	klog.V(4).Infof("Finish draining to-be-deleted nodes.")
 
@@ -285,16 +291,13 @@ func (a *Actuator) StartDeletionForGridscaleProvider(empty, drain, all []*apiv1.
 		for _, finishFunc := range finishFuncList {
 			finishFunc(status.NodeDeleteErrorFailedToDelete, cpErr)
 		}
-		scaleDownStatus.Result = status.ScaleDownError
-		return scaleDownStatus, errors.NewAutoscalerError(errors.CloudProviderError, "failed to delete nodes: %v", cpErr)
+		return status.ScaleDownError, nil, errors.NewAutoscalerError(errors.CloudProviderError, "failed to delete nodes: %v", cpErr)
 	}
 	for _, finishFunc := range finishFuncList {
 		finishFunc(status.NodeDeleteOk, nil)
 	}
-	scaleDownStatus.ScaledDownNodes = append(scaleDownStatus.ScaledDownNodes, scaledDownNodes...)
-	scaleDownStatus.Result = status.ScaleDownNodeDeleteStarted
 	klog.V(4).Infof("Finish scaling down nodes")
-	return scaleDownStatus, nil
+	return status.ScaleDownNodeDeleteStarted, scaledDownNodes, nil
 }
 
 // deleteAsyncEmpty immediately starts deletions asynchronously.
@@ -366,9 +369,25 @@ func (a *Actuator) taintNodesSync(NodeGroupViews []*budgets.NodeGroupView) error
 
 func (a *Actuator) drainNodesSyncForGridscaleProvider(nodeGroupID string, nodes []*apiv1.Node) ([]func(resultType status.NodeDeleteResultType, err error), errors.AutoscalerError) {
 	var finishFuncList []func(resultType status.NodeDeleteResultType, err error)
+	clusterSnapshot, err := a.createSnapshot(nodes)
+	if err != nil {
+		klog.Errorf("Scale-down: couldn't create delete snapshot, err: %v", err)
+		nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerError(errors.InternalError, "createSnapshot returned error %v", err)}
+		for _, node := range nodes {
+			a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroupID, true, "failed to create delete snapshot", nodeDeleteResult)
+		}
+		return nil, errors.NewAutoscalerError(errors.InternalError, "couldn't create delete snapshot, err: %v", err)
+	}
 	for _, node := range nodes {
+		nodeInfo, err := clusterSnapshot.NodeInfos().Get(node.Name)
+		if err != nil {
+			klog.Errorf("Scale-down: can't retrieve node %q from snapshot, err: %v", node.Name, err)
+			nodeDeleteResult := status.NodeDeleteResult{ResultType: status.NodeDeleteErrorInternal, Err: errors.NewAutoscalerError(errors.InternalError, "nodeInfos.Get for %q returned error: %v", node.Name, err)}
+			a.nodeDeletionScheduler.AbortNodeDeletion(node, nodeGroupID, true, "failed to get node info", nodeDeleteResult)
+			continue
+		}
 		a.nodeDeletionTracker.StartDeletionWithDrain(nodeGroupID, node.Name)
-		evictionResults, err := a.evictor.DrainNode(a.ctx, node)
+		evictionResults, err := a.nodeDeletionScheduler.evictor.DrainNode(a.ctx, nodeInfo)
 		klog.V(4).Infof("Scale-down: drain results for node %s: %v", node.Name, evictionResults)
 		if err != nil {
 			a.nodeDeletionTracker.EndDeletion(nodeGroupID, node.Name, status.NodeDeleteResult{
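
The snapshot plumbing added in drainNodesSyncForGridscaleProvider above exists because the refactor moved the evictor behind the actuator's nodeDeletionScheduler and DrainNode now takes a snapshot-backed NodeInfo rather than the raw *apiv1.Node. A compressed sketch of the resulting per-node flow, with the AbortNodeDeletion/EndDeletion error reporting elided; the wrapper function and its name are hypothetical, while createSnapshot, NodeInfos().Get and DrainNode are the calls the diff itself uses:

    // drainOne (hypothetical, not in this commit) shows the order of operations the
    // refactored drain loop follows for a single node.
    func drainOne(a *Actuator, nodeGroupID string, node *apiv1.Node) error {
        snapshot, err := a.createSnapshot([]*apiv1.Node{node})
        if err != nil {
            return err
        }
        nodeInfo, err := snapshot.NodeInfos().Get(node.Name)
        if err != nil {
            return err
        }
        a.nodeDeletionTracker.StartDeletionWithDrain(nodeGroupID, node.Name)
        _, err = a.nodeDeletionScheduler.evictor.DrainNode(a.ctx, nodeInfo)
        return err
    }
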
16 changes: 4 additions & 12 deletions cluster-autoscaler/core/scaledown/legacy/wrapper.go
@@ -99,21 +99,13 @@ func (p *ScaleDownWrapper) StartDeletion(empty, needDrain []*apiv1.Node) (status
 }
 
 // StartDeletionForGridscaleProvider triggers an actual scale down logic for gridscale provider.
-func (p *ScaleDownWrapper) StartDeletionForGridscaleProvider(empty, needDrain, all []*apiv1.Node, currentTime time.Time) (*status.ScaleDownStatus, errors.AutoscalerError) {
+func (p *ScaleDownWrapper) StartDeletionForGridscaleProvider(empty, needDrain, all []*apiv1.Node) (status.ScaleDownResult, []*status.ScaleDownNode, errors.AutoscalerError) {
 	// Done to preserve legacy behavior, see comment on NodesToDelete.
 	if p.lastNodesToDeleteErr != nil || p.lastNodesToDeleteResult != status.ScaleDownNodeDeleteStarted {
-		// When there is no need for scale-down, p.lastNodesToDeleteResult is set to ScaleDownNoUnneeded. We have to still report node delete
-		// results in this case, otherwise they wouldn't get reported until the next call to actuator.StartDeletion (i.e. until the next scale-down
-		// attempt).
-		// Run actuator.StartDeletion with no nodes just to grab the delete results.
-		origStatus, _ := p.actuator.StartDeletionForGridscaleProvider(nil, nil, nil, currentTime)
-		return &status.ScaleDownStatus{
-			Result:                p.lastNodesToDeleteResult,
-			NodeDeleteResults:     origStatus.NodeDeleteResults,
-			NodeDeleteResultsAsOf: origStatus.NodeDeleteResultsAsOf,
-		}, p.lastNodesToDeleteErr
+		return p.lastNodesToDeleteResult, []*status.ScaleDownNode{}, p.lastNodesToDeleteErr
 	}
-	return p.actuator.StartDeletionForGridscaleProvider(empty, needDrain, all, currentTime)
+
+	return p.actuator.StartDeletionForGridscaleProvider(empty, needDrain, all)
 }
 
 // CheckStatus snapshots current deletion status
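
The block removed above existed only to surface per-node delete results through the returned ScaleDownStatus; with the refactored signature the wrapper returns just the result and the scaled-down node list. A sketch of how delete results are presumably collected now, through the CheckStatus path referenced by the surrounding comment; the DeletionResults accessor shown is an assumption, not confirmed by this diff:

    // Presumed post-refactor pattern for collecting node delete results.
    actuationStatus := p.actuator.CheckStatus()
    nodeDeleteResults, asOf := actuationStatus.DeletionResults() // assumed accessor
    _ = nodeDeleteResults // e.g. folded into a status.ScaleDownStatus by the caller
    _ = asOf
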
2 changes: 1 addition & 1 deletion cluster-autoscaler/core/scaledown/scaledown.go
@@ -59,7 +59,7 @@ type Actuator interface {
 	StartDeletion(empty, needDrain []*apiv1.Node) (status.ScaleDownResult, []*status.ScaleDownNode, errors.AutoscalerError)
 	// StartDeletionForGridscaleProvider is similar to StartDeletion but
 	// it is used for gridscale provider.
-	StartDeletionForGridscaleProvider(empty, needDrain, all []*apiv1.Node, currentTime time.Time) (status.ScaleDownResult, []*status.ScaleDownNode, errors.AutoscalerError)
+	StartDeletionForGridscaleProvider(empty, needDrain, all []*apiv1.Node) (status.ScaleDownResult, []*status.ScaleDownNode, errors.AutoscalerError)
 	// CheckStatus returns an immutable snapshot of ongoing deletions.
 	CheckStatus() ActuationStatus
 	// ClearResultsNotNewerThan removes information about deletions finished
6 changes: 3 additions & 3 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -261,11 +261,11 @@ func (a *StaticAutoscaler) cleanUpTaintsForAllNodes() {
 	if readyNodes, err := a.ReadyNodeLister().List(); err != nil {
 		klog.Errorf("Failed to list ready nodes, not cleaning up taints: %v", err)
 	} else {
-		deletetaint.CleanAllToBeDeleted(readyNodes,
+		taints.CleanAllToBeDeleted(readyNodes,
 			a.AutoscalingContext.ClientSet, a.Recorder, a.CordonNodeBeforeTerminate)
 		if a.AutoscalingContext.AutoscalingOptions.MaxBulkSoftTaintCount == 0 {
 			// Clean old taints if soft taints handling is disabled
-			deletetaint.CleanAllDeletionCandidates(readyNodes,
+			taints.CleanAllDeletionCandidates(readyNodes,
 				a.AutoscalingContext.ClientSet, a.Recorder)
 		}
 	}
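
The switch from deletetaint to taints reflects upstream folding the old taint-cleanup helpers into the taints utility package; the CleanAllToBeDeleted and CleanAllDeletionCandidates calls keep the same arguments, only the package prefix changes. A sketch of the matching import swap elsewhere in this file; the old import path is recalled from the pre-refactor tree and should be treated as an assumption:

    import (
        // before (assumed): taint-cleanup helpers lived in their own package
        // "k8s.io/autoscaler/cluster-autoscaler/utils/deletetaint"

        // after: the same helpers are exposed by the taints package
        "k8s.io/autoscaler/cluster-autoscaler/utils/taints"
    )
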
@@ -670,7 +670,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 		scaleDownStart := time.Now()
 		metrics.UpdateLastTime(metrics.ScaleDown, scaleDownStart)
 		empty, needDrain := a.scaleDownPlanner.NodesToDelete(currentTime)
-		scaleDownResult, scaledDownNodes, typedErr := a.scaleDownActuator.StartDeletionForGridscaleProvider(empty, needDrain, scaleDownCandidates, currentTime)
+		scaleDownResult, scaledDownNodes, typedErr := a.scaleDownActuator.StartDeletionForGridscaleProvider(empty, needDrain, scaleDownCandidates)
 		scaleDownStatus.Result = scaleDownResult
 		scaleDownStatus.ScaledDownNodes = scaledDownNodes
 		metrics.UpdateDurationFromStart(metrics.ScaleDown, scaleDownStart)
