Skip to content

Commit

Permalink
Merge pull request #7065 from bskiba/cleanup-for-stockout
Browse files Browse the repository at this point in the history
Return nodes with create errors mapped to node group id
  • Loading branch information
k8s-ci-robot authored Jul 18, 2024
2 parents 328244d + 9ed9b46 commit dee3865
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 47 deletions.
10 changes: 5 additions & 5 deletions cluster-autoscaler/clusterstate/clusterstate.go
Original file line number Diff line number Diff line change
Expand Up @@ -1199,17 +1199,17 @@ func (csr *ClusterStateRegistry) buildInstanceToErrorCodeMappings(instances []cl
return
}

// GetCreatedNodesWithErrors returns list of nodes being created which reported create error.
func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() []*apiv1.Node {
// GetCreatedNodesWithErrors returns a map from node group id to list of nodes which reported a create error.
func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() map[string][]*apiv1.Node {
csr.Lock()
defer csr.Unlock()

nodesWithCreateErrors := make([]*apiv1.Node, 0, 0)
for _, nodeGroupInstances := range csr.cloudProviderNodeInstances {
nodesWithCreateErrors := make(map[string][]*apiv1.Node)
for nodeGroupId, nodeGroupInstances := range csr.cloudProviderNodeInstances {
_, _, instancesByErrorCode := csr.buildInstanceToErrorCodeMappings(nodeGroupInstances)
for _, instances := range instancesByErrorCode {
for _, instance := range instances {
nodesWithCreateErrors = append(nodesWithCreateErrors, FakeNode(instance, cloudprovider.FakeNodeCreateError))
nodesWithCreateErrors[nodeGroupId] = append(nodesWithCreateErrors[nodeGroupId], FakeNode(instance, cloudprovider.FakeNodeCreateError))
}
}
}
Expand Down
38 changes: 7 additions & 31 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -448,12 +448,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
return nil
}

danglingNodes, err := a.deleteCreatedNodesWithErrors()
if err != nil {
klog.Warningf("Failed to remove nodes that were created with errors, skipping iteration: %v", err)
return nil
}
if danglingNodes {
if deletedNodes := a.deleteCreatedNodesWithErrors(); deletedNodes {
klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
return nil
}
Expand Down Expand Up @@ -836,30 +831,11 @@ func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node {
return nodes
}

func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) {
func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
// We always schedule deleting of incoming errornous nodes
// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors()

nodeGroups := a.nodeGroupsById()
nodesToBeDeletedByNodeGroupId := make(map[string][]*apiv1.Node)

for _, node := range nodes {
nodeGroup, err := a.CloudProvider.NodeGroupForNode(node)
if err != nil {
id := "<nil>"
if node != nil {
id = node.Spec.ProviderID
}
klog.Warningf("Cannot determine nodeGroup for node %v; %v", id, err)
continue
}
if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
a.clusterStateRegistry.RefreshCloudProviderNodeInstancesCache()
return false, fmt.Errorf("node %s has no known nodegroup", node.GetName())
}
nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node)
}
nodesToBeDeletedByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors()

deletedAny := false

Expand Down Expand Up @@ -892,13 +868,13 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) {

if err != nil {
klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err)
} else {
deletedAny = true
a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
}

deletedAny = deletedAny || err == nil
a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
}

return deletedAny, nil
return deletedAny
}

// instancesToNodes returns a list of fake nodes with just names populated,
Expand Down
17 changes: 6 additions & 11 deletions cluster-autoscaler/core/static_autoscaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1709,9 +1709,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
removedNodes, err := autoscaler.deleteCreatedNodesWithErrors()
removedNodes := autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, removedNodes)
assert.NoError(t, err)

// check delete was called on correct nodes
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand All @@ -1735,9 +1734,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
removedNodes = autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, removedNodes)
assert.NoError(t, err)

// nodes should be deleted again
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand Down Expand Up @@ -1800,9 +1798,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
removedNodes = autoscaler.deleteCreatedNodesWithErrors()
assert.False(t, removedNodes)
assert.NoError(t, err)

// we expect no more Delete Nodes
nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)
Expand Down Expand Up @@ -1842,10 +1839,9 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
// update cluster state
clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now())

// return early on failed nodes without matching nodegroups
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
// No nodes are deleted when failed nodes don't have matching node groups
removedNodes = autoscaler.deleteCreatedNodesWithErrors()
assert.False(t, removedNodes)
assert.Error(t, err)
nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0)

nodeGroupAtomic := &mockprovider.NodeGroup{}
Expand Down Expand Up @@ -1900,9 +1896,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
removedNodes = autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, removedNodes)
assert.NoError(t, err)

nodeGroupAtomic.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
func(nodes []*apiv1.Node) bool {
Expand Down

0 comments on commit dee3865

Please sign in to comment.