From 9ed9b461377b41bf70c0460addc89084810bd418 Mon Sep 17 00:00:00 2001 From: "Beata Lach (Skiba)" Date: Fri, 12 Jul 2024 11:08:54 +0000 Subject: [PATCH] Return nodes with create errors by node group id In order to simplify the deleteNodesWithErrors code, return nodeGroupID as well as nodes with create errors. That way we avoid the additional node group matching code. --- .../clusterstate/clusterstate.go | 10 ++--- cluster-autoscaler/core/static_autoscaler.go | 38 ++++--------------- .../core/static_autoscaler_test.go | 17 +++------ 3 files changed, 18 insertions(+), 47 deletions(-) diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go index 68477081f8b0..894fe1008aa2 100644 --- a/cluster-autoscaler/clusterstate/clusterstate.go +++ b/cluster-autoscaler/clusterstate/clusterstate.go @@ -1199,17 +1199,17 @@ func (csr *ClusterStateRegistry) buildInstanceToErrorCodeMappings(instances []cl return } -// GetCreatedNodesWithErrors returns list of nodes being created which reported create error. -func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() []*apiv1.Node { +// GetCreatedNodesWithErrors returns a map from node group id to list of nodes which reported a create error. +func (csr *ClusterStateRegistry) GetCreatedNodesWithErrors() map[string][]*apiv1.Node { csr.Lock() defer csr.Unlock() - nodesWithCreateErrors := make([]*apiv1.Node, 0, 0) - for _, nodeGroupInstances := range csr.cloudProviderNodeInstances { + nodesWithCreateErrors := make(map[string][]*apiv1.Node) + for nodeGroupId, nodeGroupInstances := range csr.cloudProviderNodeInstances { _, _, instancesByErrorCode := csr.buildInstanceToErrorCodeMappings(nodeGroupInstances) for _, instances := range instancesByErrorCode { for _, instance := range instances { - nodesWithCreateErrors = append(nodesWithCreateErrors, FakeNode(instance, cloudprovider.FakeNodeCreateError)) + nodesWithCreateErrors[nodeGroupId] = append(nodesWithCreateErrors[nodeGroupId], FakeNode(instance, cloudprovider.FakeNodeCreateError)) } } } diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index b45733bdbb20..5a1245fcf89c 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -448,12 +448,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr return nil } - danglingNodes, err := a.deleteCreatedNodesWithErrors() - if err != nil { - klog.Warningf("Failed to remove nodes that were created with errors, skipping iteration: %v", err) - return nil - } - if danglingNodes { + if deletedNodes := a.deleteCreatedNodesWithErrors(); deletedNodes { klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration") return nil } @@ -837,30 +832,11 @@ func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node { return nodes } -func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) { +func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool { // We always schedule deleting of incoming errornous nodes // TODO[lukaszos] Consider adding logic to not retry delete every loop iteration - nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors() - nodeGroups := a.nodeGroupsById() - nodesToBeDeletedByNodeGroupId := make(map[string][]*apiv1.Node) - - for _, node := range nodes { - nodeGroup, err := a.CloudProvider.NodeGroupForNode(node) - if err != nil { - id := "" - if node != nil { - id = node.Spec.ProviderID - } - klog.Warningf("Cannot determine nodeGroup for node %v; %v", id, err) - continue - } - if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() { - a.clusterStateRegistry.RefreshCloudProviderNodeInstancesCache() - return false, fmt.Errorf("node %s has no known nodegroup", node.GetName()) - } - nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node) - } + nodesToBeDeletedByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors() deletedAny := false @@ -893,13 +869,13 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) { if err != nil { klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err) + } else { + deletedAny = true + a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup) } - - deletedAny = deletedAny || err == nil - a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup) } - return deletedAny, nil + return deletedAny } // instancesToNodes returns a list of fake nodes with just names populated, diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index f0d3ed512047..f47e88f2021f 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -1709,9 +1709,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes, err := autoscaler.deleteCreatedNodesWithErrors() + removedNodes := autoscaler.deleteCreatedNodesWithErrors() assert.True(t, removedNodes) - assert.NoError(t, err) // check delete was called on correct nodes nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( @@ -1735,9 +1734,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes, err = autoscaler.deleteCreatedNodesWithErrors() + removedNodes = autoscaler.deleteCreatedNodesWithErrors() assert.True(t, removedNodes) - assert.NoError(t, err) // nodes should be deleted again nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( @@ -1800,9 +1798,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes, err = autoscaler.deleteCreatedNodesWithErrors() + removedNodes = autoscaler.deleteCreatedNodesWithErrors() assert.False(t, removedNodes) - assert.NoError(t, err) // we expect no more Delete Nodes nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2) @@ -1842,10 +1839,9 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { // update cluster state clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now()) - // return early on failed nodes without matching nodegroups - removedNodes, err = autoscaler.deleteCreatedNodesWithErrors() + // No nodes are deleted when failed nodes don't have matching node groups + removedNodes = autoscaler.deleteCreatedNodesWithErrors() assert.False(t, removedNodes) - assert.Error(t, err) nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0) nodeGroupAtomic := &mockprovider.NodeGroup{} @@ -1900,9 +1896,8 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes, err = autoscaler.deleteCreatedNodesWithErrors() + removedNodes = autoscaler.deleteCreatedNodesWithErrors() assert.True(t, removedNodes) - assert.NoError(t, err) nodeGroupAtomic.AssertCalled(t, "DeleteNodes", mock.MatchedBy( func(nodes []*apiv1.Node) bool {