
Commit

wip
nvthongswansea committed Sep 8, 2024
1 parent cf29854 commit cc2ad62
Showing 2 changed files with 70 additions and 54 deletions.
@@ -128,9 +128,7 @@ func (n *NodeGroup) AtomicIncreaseSize(delta int) error {
// given node doesn't belong to this node group. This function should wait
// until node group size is updated. Implementation required.
func (n *NodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
for _, node := range nodes {
klog.V(4).Infof("Deleting node %s from node group", node.Name)
}
klog.V(4).Infof("Deleting nodes: %v from node group %s", nodes, n.name)

targetSize := n.nodeCount - len(nodes)
ctx := context.Background()
@@ -262,7 +260,7 @@ SERVERLISTLOOP:
}
nodeList := toInstances(gskNodeList)
klog.V(4).Infof("Node list: %v ", nodeList)
return toInstances(gskNodeList), nil
return nodeList, nil
}

// TemplateNodeInfo returns a schedulerframework.NodeInfo structure of an empty
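A side note on the consolidated log line in the first hunk above: formatting a []*apiv1.Node with %v prints entire node objects at V(4). The sketch below shows the common alternative of logging only the node names. It is an illustration only, not part of this commit; the package name and the logNodeDeletion helper are assumptions.

```go
package gridscale

import (
	apiv1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"
)

// logNodeDeletion is a hypothetical helper (not part of this commit): it logs
// only the node names rather than formatting the full node objects with %v.
func logNodeDeletion(nodes []*apiv1.Node, groupName string) {
	names := make([]string, 0, len(nodes))
	for _, node := range nodes {
		names = append(names, node.Name)
	}
	klog.V(4).Infof("Deleting nodes %v from node group %s", names, groupName)
}
```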
118 changes: 68 additions & 50 deletions cluster-autoscaler/core/scaledown/actuation/actuator.go
@@ -66,6 +66,11 @@ type actuatorNodeGroupConfigGetter interface {
GetIgnoreDaemonSetsUtilization(nodeGroup cloudprovider.NodeGroup) (bool, error)
}

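// NodeGroupWithNodes pairs a cloud-provider node group with a set of nodes belonging to it.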
type NodeGroupWithNodes struct {
Group cloudprovider.NodeGroup
Nodes []*apiv1.Node
}

// NewActuator returns a new instance of Actuator.
func NewActuator(ctx *context.AutoscalingContext, scaleStateNotifier nodegroupchange.NodeGroupChangeObserver, ndt *deletiontracker.NodeDeletionTracker, deleteOptions options.NodeDeleteOptions, drainabilityRules rules.Rules, configGetter actuatorNodeGroupConfigGetter) *Actuator {
ndb := NewNodeDeletionBatcher(ctx, scaleStateNotifier, ndt, ctx.NodeDeletionBatcherInterval)
@@ -237,64 +242,77 @@ func (a *Actuator) StartDeletionForGridscaleProvider(empty, drain, all []*apiv1.
}
}

nodesToDeleteNodeGroupViews := []*budgets.NodeGroupView{
&budgets.NodeGroupView{
Nodes: nodesToDelete,
},
}

// Taint all nodes that need drain synchronously, but don't start any drain/deletion yet. Otherwise, pods evicted from one to-be-deleted node
// could get recreated on another.
klog.V(4).Infof("Tainting to-be-deleted nodes.")
err := a.taintNodesSync(nodesToDeleteNodeGroupViews)
if err != nil {
return status.ScaleDownError, nil, err
}
// Clean taint from NEW to-be-deleted nodes after scale down. We don't care about the error here.
defer func() {
klog.V(4).Infof("Cleaning taint from to-be-deleted nodes.")
for _, node := range nodesToDelete {
taints.CleanToBeDeleted(node, a.ctx.ClientSet, a.ctx.CordonNodeBeforeTerminate)
nodesToDeleteByNodeGroup := make(map[string]NodeGroupWithNodes)
// Get all node groups which contain nodes to be deleted.
for _, node := range nodesToDelete {
nodeGroup, cpErr := a.ctx.CloudProvider.NodeGroupForNode(node)
if cpErr != nil {
return status.ScaleDownError, nil, errors.NewAutoscalerError(errors.CloudProviderError, "failed to find node group for %s: %v", node.Name, cpErr)
}
}()
klog.V(4).Infof("Finish tainting to-be-deleted nodes.")

// Since the gridscale provider only supports single-node-group clusters, we just need to get nodeGroup from the first node of to-be-deleted nodes.
nodeGroup, cpErr := a.ctx.CloudProvider.NodeGroupForNode(nodesToDelete[0])
if cpErr != nil {
return status.ScaleDownError, nil, errors.NewAutoscalerError(errors.CloudProviderError, "failed to find node group for %s: %v", nodesToDelete[0].Name, cpErr)
if _, ok := nodesToDeleteByNodeGroup[nodeGroup.Id()]; !ok {
nodesToDeleteByNodeGroup[nodeGroup.Id()] = NodeGroupWithNodes{
Group: nodeGroup,
Nodes: []*apiv1.Node{},
}
}
currentNodeGroupWithNodes := nodesToDeleteByNodeGroup[nodeGroup.Id()]
currentNodeGroupWithNodes.Nodes = append(currentNodeGroupWithNodes.Nodes, node)
nodesToDeleteByNodeGroup[nodeGroup.Id()] = currentNodeGroupWithNodes
}

var scaledDownNodes []*status.ScaleDownNode
for _, drainNode := range nodesToDelete {
if sdNode, err := a.scaleDownNodeToReport(drainNode, true); err == nil {
klog.V(0).Infof("Scale-down: removing node %s, utilization: %v, pods to reschedule: %s", drainNode.Name, sdNode.UtilInfo, joinPodNames(sdNode.EvictedPods))
a.ctx.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaleDown", "Scale-down: removing node %s, utilization: %v, pods to reschedule: %s", drainNode.Name, sdNode.UtilInfo, joinPodNames(sdNode.EvictedPods))
scaledDownNodes = append(scaledDownNodes, sdNode)
} else {
klog.Errorf("Scale-down: couldn't report scaled down node, err: %v", err)
for nodeGroupID, nodesToDeleteBucket := range nodesToDeleteByNodeGroup {
nodesToDeleteNodeGroupViews := []*budgets.NodeGroupView{
{
Nodes: nodesToDeleteBucket.Nodes,
},
}
}

klog.V(4).Infof("Draining to-be-deleted nodes.")
// Drain to-be-deleted nodes synchronously.
finishFuncList, cpErr := a.drainNodesSyncForGridscaleProvider(nodeGroup.Id(), nodesToDelete)
if cpErr != nil {
return status.ScaleDownError, nil, errors.NewAutoscalerError(errors.CloudProviderError, "failed to drain nodes: %v", cpErr)
}
klog.V(4).Infof("Finish draining to-be-deleted nodes.")
// Taint all nodes that need drain synchronously, but don't start any drain/deletion yet. Otherwise, pods evicted from one to-be-deleted node
// could get recreated on another.
klog.V(4).Infof("Tainting to-be-deleted nodes for node group %s", nodeGroupID)
err := a.taintNodesSync(nodesToDeleteNodeGroupViews)
if err != nil {
return status.ScaleDownError, nil, err
}
// Clean taint from NEW to-be-deleted nodes after scale down. We don't care about the error here.
defer func() {
klog.V(4).Infof("Cleaning taint from to-be-deleted nodes for node group %s", nodeGroupID)
for _, node := range nodesToDeleteBucket.Nodes {
taints.CleanToBeDeleted(node, a.ctx.ClientSet, a.ctx.CordonNodeBeforeTerminate)
}
}()
klog.V(4).Infof("Finish tainting to-be-deleted nodes for node group %s", nodeGroupID)

klog.V(4).Infof("Start scaling down nodes")
// Delete the last n nodes in the cluster.
cpErr = nodeGroup.DeleteNodes(nodesToDelete)
if cpErr != nil {
for _, drainNode := range nodesToDeleteBucket.Nodes {
if sdNode, err := a.scaleDownNodeToReport(drainNode, true); err == nil {
klog.V(0).Infof("Scale-down: removing node %s, utilization: %v, pods to reschedule: %s", drainNode.Name, sdNode.UtilInfo, joinPodNames(sdNode.EvictedPods))
a.ctx.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaleDown", "Scale-down: removing node %s, utilization: %v, pods to reschedule: %s", drainNode.Name, sdNode.UtilInfo, joinPodNames(sdNode.EvictedPods))
scaledDownNodes = append(scaledDownNodes, sdNode)
} else {
klog.Errorf("Scale-down: couldn't report scaled down node, err: %v", err)
}
}

klog.V(4).Infof("Draining to-be-deleted nodes for node group %s", nodeGroupID)
// Drain to-be-deleted nodes synchronously.
finishFuncList, cpErr := a.drainNodesSyncForGridscaleProvider(nodeGroupID, nodesToDeleteBucket.Nodes)
if cpErr != nil {
return status.ScaleDownError, nil, errors.NewAutoscalerError(errors.CloudProviderError, "failed to drain nodes: %v", cpErr)
}
klog.V(4).Infof("Finish draining to-be-deleted nodes for node group %s", nodeGroupID)

klog.V(4).Infof("Start scaling down nodes for node group %s", nodeGroupID)
// Delete the last n nodes in the cluster.
dErr := nodesToDeleteBucket.Group.DeleteNodes(nodesToDeleteBucket.Nodes)
if dErr != nil {
for _, finishFunc := range finishFuncList {
finishFunc(status.NodeDeleteErrorFailedToDelete, dErr)
}
return status.ScaleDownError, nil, errors.NewAutoscalerError(errors.CloudProviderError, "failed to delete nodes: %v", dErr)
}
for _, finishFunc := range finishFuncList {
finishFunc(status.NodeDeleteErrorFailedToDelete, cpErr)
finishFunc(status.NodeDeleteOk, nil)
}
return status.ScaleDownError, nil, errors.NewAutoscalerError(errors.CloudProviderError, "failed to delete nodes: %v", cpErr)
}
for _, finishFunc := range finishFuncList {
finishFunc(status.NodeDeleteOk, nil)
}
klog.V(4).Infof("Finish scaling down nodes")
return status.ScaleDownNodeDeleteStarted, scaledDownNodes, nil
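
The substance of this WIP change to StartDeletionForGridscaleProvider is that the to-be-deleted nodes are no longer assumed to come from a single node group: they are first bucketed by node group ID, and each bucket is then tainted, drained, and deleted in turn. The self-contained sketch below illustrates just the bucketing step, using simplified stand-in types (nodeGroup, node, nodeGroupWithNodes) in place of cloudprovider.NodeGroup, apiv1.Node, and the NodeGroupWithNodes struct added above; it is not the actuator's actual code.

```go
package main

import "fmt"

// Simplified stand-ins for cloudprovider.NodeGroup and apiv1.Node,
// used only to illustrate the grouping pattern from this commit.
type nodeGroup struct{ id string }

func (g nodeGroup) Id() string { return g.id }

type node struct {
	Name  string
	group nodeGroup
}

// nodeGroupWithNodes mirrors the NodeGroupWithNodes bucket type.
type nodeGroupWithNodes struct {
	Group nodeGroup
	Nodes []node
}

// groupNodesByNodeGroup buckets nodes by the ID of the node group they belong
// to, which is the shape the actuator now builds before tainting, draining,
// and deleting each bucket.
func groupNodesByNodeGroup(nodes []node) map[string]nodeGroupWithNodes {
	buckets := make(map[string]nodeGroupWithNodes)
	for _, n := range nodes {
		g := n.group // the real code resolves this via CloudProvider.NodeGroupForNode
		bucket, ok := buckets[g.Id()]
		if !ok {
			bucket = nodeGroupWithNodes{Group: g}
		}
		bucket.Nodes = append(bucket.Nodes, n)
		buckets[g.Id()] = bucket
	}
	return buckets
}

func main() {
	g1, g2 := nodeGroup{id: "pool-1"}, nodeGroup{id: "pool-2"}
	nodes := []node{{Name: "n1", group: g1}, {Name: "n2", group: g2}, {Name: "n3", group: g1}}
	for id, bucket := range groupNodesByNodeGroup(nodes) {
		fmt.Printf("group %s: %d node(s)\n", id, len(bucket.Nodes))
	}
}
```

One behavioural detail worth noting: because the taint-cleanup closure is registered with defer inside the per-group loop, cleanup for every bucket runs only when StartDeletionForGridscaleProvider returns, not at the end of each loop iteration.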