diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 68477081f8b0..894fe1008aa2 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -1199,17 +1199,17 @@ func (csr *ClusterStateRegistry) buildInstanceToErrorCodeMappings(instances []cl return } -// GetCreatedNodesWithErrors returns list of nodes being created which reported create error. -func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() []*apiv1.Node { +// GetCreatedNodesWithErrors returns a map from node group id to list of nodes which reported a create error. +func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() map[string][]*apiv1.Node { csr.Lock() defer csr.Unlock() - nodesWithCreateErrors := make([]*apiv1.Node, 0, 0) - for _, nodeGroupInstances := range csr.cloudProviderNodeInstances { + nodesWithCreateErrors := make(map[string][]*apiv1.Node) + for nodeGroupId, nodeGroupInstances := range csr.cloudProviderNodeInstances { _, _, instancesByErrorCode := csr.buildInstanceToErrorCodeMappings(nodeGroupInstances) for _, instances := range instancesByErrorCode { for _, instance := range instances { - nodesWithCreateErrors = append(nodesWithCreateErrors, FakeNode(instance, cloudprovider.FakeNodeCreateError)) + nodesWithCreateErrors[nodeGroupId] = append(nodesWithCreateErrors[nodeGroupId], FakeNode(instance, cloudprovider.FakeNodeCreateError)) } } } diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 1fcc08c08ad8..d3c59b56c59c 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -448,12 +448,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr return nil } - danglingNodes, err := a.deleteCreatedNodesWithErrors() - if err != nil { - klog.Warningf("Failed to remove nodes that were created with errors, skipping iteration: %v", err) - return nil - } - if danglingNodes { + if deletedNodes := a.deleteCreatedNodesWithErrors(); deletedNodes { klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration") return nil } @@ -836,30 +831,11 @@ func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node { return nodes } -func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) { +func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool { // We always schedule deleting of incoming errornous nodes // TODO[lukaszos] Consider adding logic to not retry delete every loop iteration - nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors() - nodeGroups := a.nodeGroupsById() - nodesToBeDeletedByNodeGroupId := make(map[string][]*apiv1.Node) - - for _, node := range nodes { - nodeGroup, err := a.CloudProvider.NodeGroupForNode(node) - if err != nil { - id := "" - if node != nil { - id = node.Spec.ProviderID - } - klog.Warningf("Cannot determine nodeGroup for node %v; %v", id, err) - continue - } - if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() { - a.clusterStateRegistry.RefreshCloudProviderNodeInstancesCache() - return false, fmt.Errorf("node %s has no known nodegroup", node.GetName()) - } - nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node) - } + nodesToBeDeletedByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors() deletedAny := false @@ -892,13 +868,13 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) { if err != nil { klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err) + } else { + deletedAny = true + a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup) } - - deletedAny = deletedAny || err == nil - a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup) } - return deletedAny, nil + return deletedAny } // instancesToNodes returns a list of fake nodes with just names populated, diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index f0d3ed512047..f47e88f2021f 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -1709,9 +1709,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes, err := autoscaler.deleteCreatedNodesWithErrors() + removedNodes := autoscaler.deleteCreatedNodesWithErrors() assert.True(t, removedNodes) - assert.NoError(t, err) // check delete was called on correct nodes nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( @@ -1735,9 +1734,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes, err = autoscaler.deleteCreatedNodesWithErrors() + removedNodes = autoscaler.deleteCreatedNodesWithErrors() assert.True(t, removedNodes) - assert.NoError(t, err) // nodes should be deleted again nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( @@ -1800,9 +1798,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes, err = autoscaler.deleteCreatedNodesWithErrors() + removedNodes = autoscaler.deleteCreatedNodesWithErrors() assert.False(t, removedNodes) - assert.NoError(t, err) // we expect no more Delete Nodes nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2) @@ -1842,10 +1839,9 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { // update cluster state clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now()) - // return early on failed nodes without matching nodegroups - removedNodes, err = autoscaler.deleteCreatedNodesWithErrors() + // No nodes are deleted when failed nodes don't have matching node groups + removedNodes = autoscaler.deleteCreatedNodesWithErrors() assert.False(t, removedNodes) - assert.Error(t, err) nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0) nodeGroupAtomic := &mockprovider.NodeGroup{} @@ -1900,9 +1896,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes, err = autoscaler.deleteCreatedNodesWithErrors() + removedNodes = autoscaler.deleteCreatedNodesWithErrors() assert.True(t, removedNodes) - assert.NoError(t, err) nodeGroupAtomic.AssertCalled(t, "DeleteNodes", mock.MatchedBy( func(nodes []*apiv1.Node) bool {