Skip to content

Commit

Permalink
Reports node taints.
Browse files Browse the repository at this point in the history
  • Loading branch information
piotrwrotniak committed Oct 17, 2023
1 parent cc888a1 commit 8a6e156
Show file tree
Hide file tree
Showing 5 changed files with 209 additions and 11 deletions.
8 changes: 8 additions & 0 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,7 @@ func (a *StaticAutoscaler) obtainNodeLists(cp cloudprovider.CloudProvider) ([]*a
klog.Errorf("Failed to list ready nodes: %v", err)
return nil, nil, caerrors.ToAutoscalerError(caerrors.ApiCallError, err)
}
a.reportTaintsCount(allNodes)

// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
Expand All @@ -978,6 +979,13 @@ func (a *StaticAutoscaler) updateClusterState(allNodes []*apiv1.Node, nodeInfosF
return nil
}

// reportTaintsCount counts the taints present on the given nodes, grouped by
// reported taint type, and publishes each per-type count as a metric.
//
// NOTE(review): only types found in this pass are observed. A type that was
// reported earlier but is absent now is not touched here, so its previously
// published value presumably stays as-is — confirm whether it should be reset.
func (a *StaticAutoscaler) reportTaintsCount(nodes []*apiv1.Node) {
	countsByType := taints.CountNodeTaints(nodes, a.taintConfig)
	for taintType := range countsByType {
		metrics.ObserveNodeTaintsCount(taintType, float64(countsByType[taintType]))
	}
}

func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool {
if core_utils.GetOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) {
return true
Expand Down
15 changes: 15 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,15 @@ var (
Help: "Number of node groups deleted by Node Autoprovisioning.",
},
)

// nodeTaintsCount is a gauge vector, labeled by "type", that holds the number of node taints of each type.
nodeTaintsCount = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_taints_count",
Help: "Number of taints per type used in the cluster.",
},
[]string{"type"},
)
)

// RegisterAll registers all metrics.
Expand Down Expand Up @@ -407,6 +416,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
legacyregistry.MustRegister(nodeGroupCreationCount)
legacyregistry.MustRegister(nodeGroupDeletionCount)
legacyregistry.MustRegister(pendingNodeDeletions)
legacyregistry.MustRegister(nodeTaintsCount)

if emitPerNodeGroupMetrics {
legacyregistry.MustRegister(nodesGroupMinNodes)
Expand Down Expand Up @@ -615,3 +625,8 @@ func RegisterSkippedScaleUpMemory() {
func ObservePendingNodeDeletions(value int) {
	// Set is given a float64, so the integer count is converted first.
	pending := float64(value)
	pendingNodeDeletions.Set(pending)
}

// ObserveNodeTaintsCount sets the node taints count metric for the given taint type to count.
func ObserveNodeTaintsCount(taintType string, count float64) {
	// Select the series labeled with this taint type, then overwrite its value.
	gauge := nodeTaintsCount.WithLabelValues(taintType)
	gauge.Set(count)
}
23 changes: 12 additions & 11 deletions cluster-autoscaler/proposals/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,18 @@ All the metrics are prefixed with `cluster_autoscaler_`.

### Cluster state

| Metric name | Metric type | Labels | Description |
| ----------- | ----------- | ------ | ----------- |
| cluster_safe_to_autoscale | Gauge | | Whether or not cluster is healthy enough for autoscaling. 1 if it is, 0 otherwise. |
| nodes_count | Gauge | `state`=<node-state> | Number of nodes in cluster. |
| unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. |
| node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. |
| max_nodes_count | Gauge | | Maximum number of nodes in all node groups. |
| cluster_cpu_current_cores | Gauge | | | Current number of cores in the cluster, minus deleting nodes. |
| cpu_limits_cores | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of cores in the cluster. |
| cluster_memory_current_bytes | Gauge | | Current number of bytes of memory in the cluster, minus deleting nodes. |
| memory_limits_bytes | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of bytes of memory in cluster. |
| Metric name | Metric type | Labels | Description |
| ----------- | ----------- |--------------------------------------------|------------------------------------------------------------------------------------|
| cluster_safe_to_autoscale | Gauge | | Whether or not cluster is healthy enough for autoscaling. 1 if it is, 0 otherwise. |
| nodes_count | Gauge | `state`=<node-state> | Number of nodes in cluster. |
| unschedulable_pods_count | Gauge | | Number of unschedulable ("Pending") pods in the cluster. |
| node_groups_count | Gauge | `node_group_type`=<node-group-type> | Number of node groups managed by CA. |
| max_nodes_count | Gauge | | Maximum number of nodes in all node groups. |
| cluster_cpu_current_cores | Gauge | | Current number of cores in the cluster, minus deleting nodes. |
| cpu_limits_cores | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of cores in the cluster. |
| cluster_memory_current_bytes | Gauge | | Current number of bytes of memory in the cluster, minus deleting nodes. |
| memory_limits_bytes | Gauge | `direction`=<`minimum` or `maximum`> | Minimum and maximum number of bytes of memory in cluster. |
| node_taints_count | Gauge | `type`=<taint-type> | Number of taints per type in cluster. |

* `cluster_safe_to_autoscale` indicates whether cluster is healthy enough for autoscaling. CA stops all operations if significant number of nodes are unready (by default 33% as of CA 0.5.4).
* `nodes_count` records the total number of nodes, labeled by node state. Possible
Expand Down
73 changes: 73 additions & 0 deletions cluster-autoscaler/utils/taints/taints.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,15 @@ const (

// AWS: Indicates that a node has volumes stuck in attaching state and hence it is not fit for scheduling more pods
awsNodeWithImpairedVolumesTaint = "NodeWithImpairedVolumes"

// statusNodeTaintReportedType is the type reported in node taint counts for taints that the given taintConfig defines as status taints.
statusNodeTaintReportedType = "status-taint"

// startupNodeTaintReportedType is the type reported in node taint counts for taints that the given taintConfig defines as startup taints.
startupNodeTaintReportedType = "startup-taint"

// unlistedNodeTaintReportedType is the type reported in node taint counts for taints whose key matches none of the other
// reporting rules: the deprecated ignore-taint prefix, explicitlyReportedNodeTaints, or the given taintConfig.
unlistedNodeTaintReportedType = "other"
)

// TaintKeySet is a set of taint key
Expand Down Expand Up @@ -108,6 +117,23 @@ var (
// Mutable only in unit tests
maxRetryDeadline time.Duration = 5 * time.Second
conflictRetryInterval time.Duration = 750 * time.Millisecond

// explicitlyReportedNodeTaints is the set of taint keys that getTaintTypeToReport
// reports under their own key rather than under an aggregated type.
explicitlyReportedNodeTaints = TaintKeySet{
apiv1.TaintNodeNotReady: true,
apiv1.TaintNodeUnreachable: true,
apiv1.TaintNodeUnschedulable: true,
apiv1.TaintNodeMemoryPressure: true,
apiv1.TaintNodeDiskPressure: true,
apiv1.TaintNodeNetworkUnavailable: true,
apiv1.TaintNodePIDPressure: true,
apiv1.TaintNodeOutOfService: true,
cloudproviderapi.TaintExternalCloudProvider: true,
cloudproviderapi.TaintNodeShutdown: true,
gkeNodeTerminationHandlerTaint: true,
awsNodeWithImpairedVolumesTaint: true,
ToBeDeletedTaint: true,
DeletionCandidateTaint: true,
}
)

// getKeyShortName converts taint key to short name for logging
Expand Down Expand Up @@ -416,3 +442,50 @@ func FilterOutNodesWithStartupTaints(taintConfig TaintConfig, allNodes, readyNod
}
return newAllNodes, newReadyNodes
}

// CountNodeTaints counts the taints found on the given nodes, grouped by
// reported taint type.
//
// Every taint on every node is mapped to a type by getTaintTypeToReport using
// taintConfig and adds one to that type's count. The returned map is never
// nil; types for which no taint was found are absent from it.
func CountNodeTaints(nodes []*apiv1.Node, taintConfig TaintConfig) map[string]int {
	foundTaintsCount := make(map[string]int)
	for _, node := range nodes {
		for _, taint := range node.Spec.Taints {
			// A missing key reads as 0, so incrementing needs no existence check.
			foundTaintsCount[getTaintTypeToReport(taint.Key, taintConfig)]++
		}
	}
	return foundTaintsCount
}

// getTaintTypeToReport maps a taint key to the type under which it is counted.
//
// Rules are tried in order and the first match wins:
//  1. keys starting with IgnoreTaintPrefix are reported as IgnoreTaintPrefix;
//  2. keys present in explicitlyReportedNodeTaints are reported as themselves;
//  3. keys present in taintConfig.StartupTaints, or starting with one of
//     taintConfig.StartupTaintPrefixes, are reported as startupNodeTaintReportedType;
//  4. keys present in taintConfig.StatusTaints, or starting with one of
//     taintConfig.StatusTaintPrefixes, are reported as statusNodeTaintReportedType;
//  5. everything else is reported as unlistedNodeTaintReportedType.
func getTaintTypeToReport(key string, taintConfig TaintConfig) string {
	// hasAnyPrefix reports whether key starts with at least one of the prefixes.
	hasAnyPrefix := func(prefixes []string) bool {
		for _, prefix := range prefixes {
			if strings.HasPrefix(key, prefix) {
				return true
			}
		}
		return false
	}

	// Set membership is decided by key presence, not by the stored value.
	_, isExplicit := explicitlyReportedNodeTaints[key]
	_, isStartup := taintConfig.StartupTaints[key]
	_, isStatus := taintConfig.StatusTaints[key]

	switch {
	case strings.HasPrefix(key, IgnoreTaintPrefix):
		// Track deprecated taints.
		return IgnoreTaintPrefix
	case isExplicit:
		return key
	case isStartup || hasAnyPrefix(taintConfig.StartupTaintPrefixes):
		return startupNodeTaintReportedType
	case isStatus || hasAnyPrefix(taintConfig.StatusTaintPrefixes):
		return statusNodeTaintReportedType
	default:
		return unlistedNodeTaintReportedType
	}
}
101 changes: 101 additions & 0 deletions cluster-autoscaler/utils/taints/taints_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -572,3 +572,104 @@ func TestSanitizeTaints(t *testing.T) {
assert.Equal(t, newTaints[0].Key, StatusTaintPrefix+"some-taint")
assert.Equal(t, newTaints[1].Key, "test-taint")
}

// TestCountNodeTaints checks that CountNodeTaints aggregates the taints of
// several nodes into per-type counts, covering the deprecated ignore prefix,
// explicitly reported keys, startup and status taints (matched both by exact
// key and by prefix), and unlisted keys.
func TestCountNodeTaints(t *testing.T) {
	// Every taint in this test uses the NoSchedule effect; only key and value vary.
	noSchedule := func(key, value string) apiv1.Taint {
		return apiv1.Taint{Key: key, Value: value, Effect: apiv1.TaintEffectNoSchedule}
	}
	// newNode builds a node carrying the given taints and no conditions.
	newNode := func(nodeTaints ...apiv1.Taint) *apiv1.Node {
		return &apiv1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name:              "node-count-node-taints",
				CreationTimestamp: metav1.NewTime(time.Now()),
			},
			Spec:   apiv1.NodeSpec{Taints: nodeTaints},
			Status: apiv1.NodeStatus{Conditions: []apiv1.NodeCondition{}},
		}
	}

	firstNode := newNode(
		noSchedule(IgnoreTaintPrefix+"another-taint", "myValue"),
		noSchedule(StatusTaintPrefix+"some-taint", "myValue"),
		noSchedule(StartupTaintPrefix+"some-taint", "myValue"),
		noSchedule("test-taint", "test2"),
		noSchedule(ToBeDeletedTaint, "1"),
		noSchedule("ignore-me", "1"),
		noSchedule("status-me", "1"),
		noSchedule("node.kubernetes.io/memory-pressure", "1"),
		noSchedule("ignore-taint.cluster-autoscaler.kubernetes.io/to-be-ignored", "myValue2"),
	)
	secondNode := newNode(
		noSchedule(StatusTaintPrefix+"some-taint", "myValue"),
		noSchedule("node.kubernetes.io/unschedulable", "1"),
	)

	config := TaintConfig{
		StartupTaints:        map[string]bool{"ignore-me": true},
		StatusTaints:         map[string]bool{"status-me": true},
		StartupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix},
		StatusTaintPrefixes:  []string{StatusTaintPrefix},
	}

	expected := map[string]int{
		// Both ignore-prefixed taints on the first node.
		"ignore-taint.cluster-autoscaler.kubernetes.io/": 2,
		// Explicitly reported keys are counted under their own key.
		"ToBeDeletedByClusterAutoscaler":     1,
		"node.kubernetes.io/memory-pressure": 1,
		"node.kubernetes.io/unschedulable":   1,
		// "test-taint" matches no rule.
		"other": 1,
		// One startup-prefixed taint plus the exact key "ignore-me".
		"startup-taint": 2,
		// Two status-prefixed taints (one per node) plus the exact key "status-me".
		"status-taint": 3,
	}

	actual := CountNodeTaints([]*apiv1.Node{firstNode, secondNode}, config)
	assert.Equal(t, expected, actual)
}

0 comments on commit 8a6e156

Please sign in to comment.