Skip to content

Commit

Permalink
Return nodes with create errors by node group id
Browse files Browse the repository at this point in the history
In order to simplify the deleteNodesWithErrors code, return nodeGroupID
as well as nodes with create errors. That way we avoid the additional
node group matching code.
  • Loading branch information
bskiba authored and domenicbozzuto committed Sep 23, 2024
1 parent d0c3053 commit bbbd4ef
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 47 deletions.
10 changes: 5 additions & 5 deletions cluster-autoscaler/clusterstate/clusterstate.go
Original file line number Diff line number Diff line change
Expand Up @@ -1171,17 +1171,17 @@ func (csr *ClusterStateRegistry) buildInstanceToErrorCodeMappings(instances []cl
return
}

// GetCreatedNodesWithErrors returns list of nodes being created which reported create error.
func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() []*apiv1.Node {
// GetCreatedNodesWithErrors returns a map from node group id to list of nodes which reported a create error.
func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() map[string][]*apiv1.Node {
csr.Lock()
defer csr.Unlock()

nodesWithCreateErrors := make([]*apiv1.Node, 0, 0)
for _, nodeGroupInstances := range csr.cloudProviderNodeInstances {
nodesWithCreateErrors := make(map[string][]*apiv1.Node)
for nodeGroupId, nodeGroupInstances := range csr.cloudProviderNodeInstances {
_, _, instancesByErrorCode := csr.buildInstanceToErrorCodeMappings(nodeGroupInstances)
for _, instances := range instancesByErrorCode {
for _, instance := range instances {
nodesWithCreateErrors = append(nodesWithCreateErrors, FakeNode(instance, cloudprovider.FakeNodeCreateError))
nodesWithCreateErrors[nodeGroupId] = append(nodesWithCreateErrors[nodeGroupId], FakeNode(instance, cloudprovider.FakeNodeCreateError))
}
}
}
Expand Down
38 changes: 7 additions & 31 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -421,12 +421,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
return nil
}

danglingNodes, err := a.deleteCreatedNodesWithErrors()
if err != nil {
klog.Warningf("Failed to remove nodes that were created with errors, skipping iteration: %v", err)
return nil
}
if danglingNodes {
if deletedNodes := a.deleteCreatedNodesWithErrors(); deletedNodes {
klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
return nil
}
Expand Down Expand Up @@ -813,30 +808,11 @@ func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node {
return nodes
}

func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) {
func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
// We always schedule deleting of incoming errornous nodes
// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors()

nodeGroups := a.nodeGroupsById()
nodesToBeDeletedByNodeGroupId := make(map[string][]*apiv1.Node)

for _, node := range nodes {
nodeGroup, err := a.CloudProvider.NodeGroupForNode(node)
if err != nil {
id := "<nil>"
if node != nil {
id = node.Spec.ProviderID
}
klog.Warningf("Cannot determine nodeGroup for node %v; %v", id, err)
continue
}
if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
a.clusterStateRegistry.RefreshCloudProviderNodeInstancesCache()
return false, fmt.Errorf("node %s has no known nodegroup", node.GetName())
}
nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node)
}
nodesToBeDeletedByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors()

deletedAny := false

Expand Down Expand Up @@ -868,13 +844,13 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) {

if err != nil {
klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err)
} else {
deletedAny = true
a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
}

deletedAny = deletedAny || err == nil
a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
}

return deletedAny, nil
return deletedAny
}

// instancesToNodes returns a list of fake nodes with just names populated,
Expand Down
17 changes: 6 additions & 11 deletions cluster-autoscaler/core/static_autoscaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1164,9 +1164,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
removedNodes, err := autoscaler.deleteCreatedNodesWithErrors()
removedNodes := autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, removedNodes)
assert.NoError(t, err)

// check delete was called on correct nodes
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand All @@ -1190,9 +1189,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
removedNodes = autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, removedNodes)
assert.NoError(t, err)

// nodes should be deleted again
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand Down Expand Up @@ -1255,9 +1253,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
removedNodes = autoscaler.deleteCreatedNodesWithErrors()
assert.False(t, removedNodes)
assert.NoError(t, err)

// we expect no more Delete Nodes
nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)
Expand Down Expand Up @@ -1297,10 +1294,9 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
// update cluster state
clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now())

// return early on failed nodes without matching nodegroups
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
// No nodes are deleted when failed nodes don't have matching node groups
removedNodes = autoscaler.deleteCreatedNodesWithErrors()
assert.False(t, removedNodes)
assert.Error(t, err)
nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0)

nodeGroupAtomic := &mockprovider.NodeGroup{}
Expand Down Expand Up @@ -1355,9 +1351,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
removedNodes = autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, removedNodes)
assert.NoError(t, err)

nodeGroupAtomic.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
func(nodes []*apiv1.Node) bool {
Expand Down

0 comments on commit bbbd4ef

Please sign in to comment.