From f424743658882adfec903abaeed04b90b0053256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Wr=C3=B3tniak?= Date: Tue, 17 Oct 2023 14:36:14 +0000 Subject: [PATCH] Reports node taints. --- cluster-autoscaler/core/static_autoscaler.go | 8 ++ cluster-autoscaler/metrics/metrics.go | 14 +++ cluster-autoscaler/utils/taints/taints.go | 73 +++++++++++++ .../utils/taints/taints_test.go | 101 ++++++++++++++++++ 4 files changed, 196 insertions(+) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 68b870786d02..ac3103973931 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -955,6 +955,7 @@ func (a *StaticAutoscaler) obtainNodeLists(cp cloudprovider.CloudProvider) ([]*a klog.Errorf("Failed to list ready nodes: %v", err) return nil, nil, caerrors.ToAutoscalerError(caerrors.ApiCallError, err) } + a.reportTaintsCount(allNodes) // Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after // node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959 @@ -978,6 +979,13 @@ func (a *StaticAutoscaler) updateClusterState(allNodes []*apiv1.Node, nodeInfosF return nil } +func (a *StaticAutoscaler) reportTaintsCount(nodes []*apiv1.Node) { + foundTaints := taints.CountNodeTaints(nodes, a.taintConfig) + for taintType, count := range foundTaints { + metrics.ObserveNodeTaintsCount(taintType, float64(count)) + } +} + func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool { if core_utils.GetOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) { return true diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go index 8f4e0d869ddd..a62f68aaa398 100644 --- a/cluster-autoscaler/metrics/metrics.go +++ b/cluster-autoscaler/metrics/metrics.go @@ -373,6 +373,15 @@ var ( Help: "Number of node groups deleted by Node Autoprovisioning.", }, ) + + nodeTaintsCount = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: caNamespace, + Name: "node_taints_count", + Help: "Number of taints per type used in the cluster.", + }, + []string{"type"}, + ) ) // RegisterAll registers all metrics. @@ -615,3 +624,8 @@ func RegisterSkippedScaleUpMemory() { func ObservePendingNodeDeletions(value int) { pendingNodeDeletions.Set(float64(value)) } + +// ObserveNodeTaintsCount records the node taints count of given type. +func ObserveNodeTaintsCount(taintType string, count float64) { + nodeTaintsCount.WithLabelValues(taintType).Set(count) +} diff --git a/cluster-autoscaler/utils/taints/taints.go b/cluster-autoscaler/utils/taints/taints.go index c3a09db60e12..8e06a44fbcc8 100644 --- a/cluster-autoscaler/utils/taints/taints.go +++ b/cluster-autoscaler/utils/taints/taints.go @@ -54,6 +54,15 @@ const ( // AWS: Indicates that a node has volumes stuck in attaching state and hence it is not fit for scheduling more pods awsNodeWithImpairedVolumesTaint = "NodeWithImpairedVolumes" + + // statusNodeTaintReportedType is the value used when reporting node taint count defined as status taint in given taintConfig. + statusNodeTaintReportedType = "status-taint" + + // startupNodeTaintReportedType is the value used when reporting node taint count defined as startup taint in given taintConfig. + startupNodeTaintReportedType = "startup-taint" + + // unlistedNodeTaintReportedType is the value used when reporting node taint count in case taint key is other than defined in explicitlyReportedNodeTaints and taintConfig. + unlistedNodeTaintReportedType = "other" ) // TaintKeySet is a set of taint key @@ -108,6 +117,23 @@ var ( // Mutable only in unit tests maxRetryDeadline time.Duration = 5 * time.Second conflictRetryInterval time.Duration = 750 * time.Millisecond + + explicitlyReportedNodeTaints = TaintKeySet{ + apiv1.TaintNodeNotReady: true, + apiv1.TaintNodeUnreachable: true, + apiv1.TaintNodeUnschedulable: true, + apiv1.TaintNodeMemoryPressure: true, + apiv1.TaintNodeDiskPressure: true, + apiv1.TaintNodeNetworkUnavailable: true, + apiv1.TaintNodePIDPressure: true, + apiv1.TaintNodeOutOfService: true, + cloudproviderapi.TaintExternalCloudProvider: true, + cloudproviderapi.TaintNodeShutdown: true, + gkeNodeTerminationHandlerTaint: true, + awsNodeWithImpairedVolumesTaint: true, + ToBeDeletedTaint: true, + DeletionCandidateTaint: true, + } ) // getKeyShortName converts taint key to short name for logging @@ -416,3 +442,50 @@ func FilterOutNodesWithStartupTaints(taintConfig TaintConfig, allNodes, readyNod } return newAllNodes, newReadyNodes } + +// CountNodeTaints counts used node taints. +func CountNodeTaints(nodes []*apiv1.Node, taintConfig TaintConfig) map[string]int { + foundTaintsCount := make(map[string]int) + for _, node := range nodes { + for _, taint := range node.Spec.Taints { + key := getTaintTypeToReport(taint.Key, taintConfig) + if _, ok := foundTaintsCount[key]; ok { + foundTaintsCount[key] += 1 + } else { + foundTaintsCount[key] = 1 + } + } + } + return foundTaintsCount +} + +func getTaintTypeToReport(key string, taintConfig TaintConfig) string { + // Track deprecated taints. + if strings.HasPrefix(key, IgnoreTaintPrefix) { + return IgnoreTaintPrefix + } + + if _, ok := explicitlyReportedNodeTaints[key]; ok { + return key + } + + if _, ok := taintConfig.StartupTaints[key]; ok { + return startupNodeTaintReportedType + } + for _, pref := range taintConfig.StartupTaintPrefixes { + if strings.HasPrefix(key, pref) { + return startupNodeTaintReportedType + } + } + + if _, ok := taintConfig.StatusTaints[key]; ok { + return statusNodeTaintReportedType + } + for _, pref := range taintConfig.StatusTaintPrefixes { + if strings.HasPrefix(key, pref) { + return statusNodeTaintReportedType + } + } + + return unlistedNodeTaintReportedType +} diff --git a/cluster-autoscaler/utils/taints/taints_test.go b/cluster-autoscaler/utils/taints/taints_test.go index fff009c81792..141728607709 100644 --- a/cluster-autoscaler/utils/taints/taints_test.go +++ b/cluster-autoscaler/utils/taints/taints_test.go @@ -572,3 +572,104 @@ func TestSanitizeTaints(t *testing.T) { assert.Equal(t, newTaints[0].Key, StatusTaintPrefix+"some-taint") assert.Equal(t, newTaints[1].Key, "test-taint") } + +func TestCountNodeTaints(t *testing.T) { + node := &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-count-node-taints", + CreationTimestamp: metav1.NewTime(time.Now()), + }, + Spec: apiv1.NodeSpec{ + Taints: []apiv1.Taint{ + { + Key: IgnoreTaintPrefix + "another-taint", + Value: "myValue", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: StatusTaintPrefix + "some-taint", + Value: "myValue", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: StartupTaintPrefix + "some-taint", + Value: "myValue", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "test-taint", + Value: "test2", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: ToBeDeletedTaint, + Value: "1", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "ignore-me", + Value: "1", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "status-me", + Value: "1", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "node.kubernetes.io/memory-pressure", + Value: "1", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "ignore-taint.cluster-autoscaler.kubernetes.io/to-be-ignored", + Value: "myValue2", + Effect: apiv1.TaintEffectNoSchedule, + }, + }, + }, + Status: apiv1.NodeStatus{ + Conditions: []apiv1.NodeCondition{}, + }, + } + node2 := &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-count-node-taints", + CreationTimestamp: metav1.NewTime(time.Now()), + }, + Spec: apiv1.NodeSpec{ + Taints: []apiv1.Taint{ + { + Key: StatusTaintPrefix + "some-taint", + Value: "myValue", + Effect: apiv1.TaintEffectNoSchedule, + }, + { + Key: "node.kubernetes.io/unschedulable", + Value: "1", + Effect: apiv1.TaintEffectNoSchedule, + }, + }, + }, + Status: apiv1.NodeStatus{ + Conditions: []apiv1.NodeCondition{}, + }, + } + taintConfig := TaintConfig{ + StartupTaints: map[string]bool{"ignore-me": true}, + StatusTaints: map[string]bool{"status-me": true}, + StartupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix}, + StatusTaintPrefixes: []string{StatusTaintPrefix}, + } + want := map[string]int{ + "ignore-taint.cluster-autoscaler.kubernetes.io/": 2, + "ToBeDeletedByClusterAutoscaler": 1, + "node.kubernetes.io/memory-pressure": 1, + "node.kubernetes.io/unschedulable": 1, + "other": 1, + "startup-taint": 2, + "status-taint": 3, + } + got := CountNodeTaints([]*apiv1.Node{node, node2}, taintConfig) + assert.Equal(t, want, got) +}