Skip to content

Commit

Permalink
Merge pull request #6201 from piotrwrotniak/addmetrics
Browse files Browse the repository at this point in the history
Adds and implements node taints count metric.
  • Loading branch information
k8s-ci-robot authored Oct 23, 2023
2 parents 4872bdd + fe6eae5 commit 5cea330
Show file tree
Hide file tree
Showing 4 changed files with 226 additions and 45 deletions.
8 changes: 8 additions & 0 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,7 @@ func (a *StaticAutoscaler) obtainNodeLists(cp cloudprovider.CloudProvider) ([]*a
klog.Errorf("Failed to list ready nodes: %v", err)
return nil, nil, caerrors.ToAutoscalerError(caerrors.ApiCallError, err)
}
a.reportTaintsCount(allNodes)

// Handle GPU case - allocatable GPU may be equal to 0 up to 15 minutes after
// node registers as ready. See https://github.com/kubernetes/kubernetes/issues/54959
Expand All @@ -978,6 +979,13 @@ func (a *StaticAutoscaler) updateClusterState(allNodes []*apiv1.Node, nodeInfosF
return nil
}

// reportTaintsCount tallies the taints present on the given nodes with
// taints.CountNodeTaints (using the autoscaler's taint config) and records
// each per-type tally through metrics.ObserveNodeTaintsCount.
func (a *StaticAutoscaler) reportTaintsCount(nodes []*apiv1.Node) {
	// NOTE(review): only types present in the current tally are observed;
	// a type that was reported earlier but is absent now is not touched by
	// this loop — confirm whether stale values need to be cleared.
	for reportedType, total := range taints.CountNodeTaints(nodes, a.taintConfig) {
		metrics.ObserveNodeTaintsCount(reportedType, float64(total))
	}
}

func allPodsAreNew(pods []*apiv1.Pod, currentTime time.Time) bool {
if core_utils.GetOldestCreateTime(pods).Add(unschedulablePodTimeBuffer).After(currentTime) {
return true
Expand Down
15 changes: 15 additions & 0 deletions cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,15 @@ var (
Help: "Number of node groups deleted by Node Autoprovisioning.",
},
)

nodeTaintsCount = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_taints_count",
Help: "Number of taints per type used in the cluster.",
},
[]string{"type"},
)
)

// RegisterAll registers all metrics.
Expand Down Expand Up @@ -407,6 +416,7 @@ func RegisterAll(emitPerNodeGroupMetrics bool) {
legacyregistry.MustRegister(nodeGroupCreationCount)
legacyregistry.MustRegister(nodeGroupDeletionCount)
legacyregistry.MustRegister(pendingNodeDeletions)
legacyregistry.MustRegister(nodeTaintsCount)

if emitPerNodeGroupMetrics {
legacyregistry.MustRegister(nodesGroupMinNodes)
Expand Down Expand Up @@ -615,3 +625,8 @@ func RegisterSkippedScaleUpMemory() {
func ObservePendingNodeDeletions(value int) {
	// The gauge takes float64 values, so convert the integer count first.
	pending := float64(value)
	pendingNodeDeletions.Set(pending)
}

// ObserveNodeTaintsCount sets the node taints count gauge labelled with the
// given taint type to count.
func ObserveNodeTaintsCount(taintType string, count float64) {
	gauge := nodeTaintsCount.WithLabelValues(taintType)
	gauge.Set(count)
}
138 changes: 98 additions & 40 deletions cluster-autoscaler/utils/taints/taints.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package taints
import (
"context"
"fmt"
"maps"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -54,17 +55,48 @@ const (

// AWS: Indicates that a node has volumes stuck in attaching state and hence it is not fit for scheduling more pods
awsNodeWithImpairedVolumesTaint = "NodeWithImpairedVolumes"

// statusNodeTaintReportedType is the value used when reporting node taint count defined as status taint in given taintConfig.
statusNodeTaintReportedType = "status-taint"

// startupNodeTaintReportedType is the value used when reporting node taint count defined as startup taint in given taintConfig.
startupNodeTaintReportedType = "startup-taint"

// unlistedNodeTaintReportedType is the value used when reporting the count of node taints whose key is neither explicitly reported (see explicitlyReportedTaints) nor a startup or status taint in the given taintConfig.
unlistedNodeTaintReportedType = "other"
)

// NodeConditionTaints is the set of taint keys that are used as node conditions.
var NodeConditionTaints = TaintKeySet{
	apiv1.TaintNodeNotReady:                     true,
	apiv1.TaintNodeUnreachable:                  true,
	apiv1.TaintNodeUnschedulable:                true,
	apiv1.TaintNodeMemoryPressure:               true,
	apiv1.TaintNodeDiskPressure:                 true,
	apiv1.TaintNodeNetworkUnavailable:           true,
	apiv1.TaintNodePIDPressure:                  true,
	cloudproviderapi.TaintExternalCloudProvider: true,
	cloudproviderapi.TaintNodeShutdown:          true,
	gkeNodeTerminationHandlerTaint:              true,
	awsNodeWithImpairedVolumesTaint:             true,
}

// Timing knobs; mutable only in unit tests. Both are of type time.Duration,
// which is inferred from the initializers.
var (
	maxRetryDeadline      = 5 * time.Second
	conflictRetryInterval = 750 * time.Millisecond
)

// TaintKeySet is a set of taint keys, represented as a map keyed by taint key.
type TaintKeySet map[string]bool

// TaintConfig is a config of taints that require special handling
type TaintConfig struct {
StartupTaints TaintKeySet
StatusTaints TaintKeySet
StartupTaintPrefixes []string
StatusTaintPrefixes []string
startupTaints TaintKeySet
statusTaints TaintKeySet
startupTaintPrefixes []string
statusTaintPrefixes []string
explicitlyReportedTaints TaintKeySet
}

// NewTaintConfig returns the taint config extracted from options
Expand All @@ -81,34 +113,41 @@ func NewTaintConfig(opts config.AutoscalingOptions) TaintConfig {
statusTaints[taintKey] = true
}

explicitlyReportedTaints := TaintKeySet{
ToBeDeletedTaint: true,
DeletionCandidateTaint: true,
}
maps.Copy(explicitlyReportedTaints, NodeConditionTaints)

return TaintConfig{
StartupTaints: startupTaints,
StatusTaints: statusTaints,
StartupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix},
StatusTaintPrefixes: []string{StatusTaintPrefix},
startupTaints: startupTaints,
statusTaints: statusTaints,
startupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix},
statusTaintPrefixes: []string{StatusTaintPrefix},
explicitlyReportedTaints: explicitlyReportedTaints,
}
}

var (
// NodeConditionTaints lists taint keys used as node conditions
NodeConditionTaints = TaintKeySet{
apiv1.TaintNodeNotReady: true,
apiv1.TaintNodeUnreachable: true,
apiv1.TaintNodeUnschedulable: true,
apiv1.TaintNodeMemoryPressure: true,
apiv1.TaintNodeDiskPressure: true,
apiv1.TaintNodeNetworkUnavailable: true,
apiv1.TaintNodePIDPressure: true,
cloudproviderapi.TaintExternalCloudProvider: true,
cloudproviderapi.TaintNodeShutdown: true,
gkeNodeTerminationHandlerTaint: true,
awsNodeWithImpairedVolumesTaint: true,
// IsStartupTaint reports whether the given taint key is a startup taint: it
// is either present in the config's startup taint set or accepted by
// matchesAnyPrefix against the config's startup taint prefixes.
func (tc TaintConfig) IsStartupTaint(taint string) bool {
	// Presence of the key is what counts, not the value stored under it.
	_, listed := tc.startupTaints[taint]
	return listed || matchesAnyPrefix(tc.startupTaintPrefixes, taint)
}

// Mutable only in unit tests
maxRetryDeadline time.Duration = 5 * time.Second
conflictRetryInterval time.Duration = 750 * time.Millisecond
)
// IsStatusTaint reports whether the given taint key is a status taint: it is
// either present in the config's status taint set or accepted by
// matchesAnyPrefix against the config's status taint prefixes.
func (tc TaintConfig) IsStatusTaint(taint string) bool {
	// Presence of the key is what counts, not the value stored under it.
	_, listed := tc.statusTaints[taint]
	return listed || matchesAnyPrefix(tc.statusTaintPrefixes, taint)
}

// isExplicitlyReportedTaint reports whether the given taint key is present in
// the config's explicitlyReportedTaints set.
func (tc TaintConfig) isExplicitlyReportedTaint(taint string) bool {
	_, listed := tc.explicitlyReportedTaints[taint]
	return listed
}

// getKeyShortName converts taint key to short name for logging
func getKeyShortName(key string) string {
Expand Down Expand Up @@ -361,18 +400,8 @@ func SanitizeTaints(taints []apiv1.Taint, taintConfig TaintConfig) []apiv1.Taint
continue
}

if _, exists := taintConfig.StartupTaints[taint.Key]; exists {
klog.V(4).Infof("Removing startup taint %s, when creating template from node", taint.Key)
continue
}
shouldRemoveBasedOnPrefix := matchesAnyPrefix(taintConfig.StartupTaintPrefixes, taint.Key) || matchesAnyPrefix(taintConfig.StatusTaintPrefixes, taint.Key)
if shouldRemoveBasedOnPrefix {
klog.V(4).Infof("Removing taint %s based on prefix, when creation template from node", taint.Key)
continue
}

if _, exists := taintConfig.StatusTaints[taint.Key]; exists {
klog.V(4).Infof("Removing status taint %s, when creating template from node", taint.Key)
if taintConfig.IsStartupTaint(taint.Key) || taintConfig.IsStatusTaint(taint.Key) {
klog.V(4).Infof("Removing taint %s, when creating template from node", taint.Key)
continue
}

Expand All @@ -394,8 +423,7 @@ func FilterOutNodesWithStartupTaints(taintConfig TaintConfig, allNodes, readyNod
}
ready := true
for _, t := range node.Spec.Taints {
_, hasStartupTaint := taintConfig.StartupTaints[t.Key]
if hasStartupTaint || matchesAnyPrefix(taintConfig.StartupTaintPrefixes, t.Key) {
if taintConfig.IsStartupTaint(t.Key) {
ready = false
nodesWithStartupTaints[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.StartupNodes)
klog.V(3).Infof("Overriding status of node %v, which seems to have startup taint %q", node.Name, t.Key)
Expand All @@ -416,3 +444,33 @@ func FilterOutNodesWithStartupTaints(taintConfig TaintConfig, allNodes, readyNod
}
return newAllNodes, newReadyNodes
}

// CountNodeTaints tallies the taints found on the given nodes. Each taint key
// is mapped to a reported type by getTaintTypeToReport, and the returned map
// holds the number of taints seen per reported type. The result is an empty,
// non-nil map when no taints are found.
func CountNodeTaints(nodes []*apiv1.Node, taintConfig TaintConfig) map[string]int {
	counts := map[string]int{}
	for i := range nodes {
		nodeTaints := nodes[i].Spec.Taints
		for j := range nodeTaints {
			reportedType := getTaintTypeToReport(nodeTaints[j].Key, taintConfig)
			counts[reportedType]++
		}
	}
	return counts
}

// getTaintTypeToReport maps a taint key to the type under which it is
// counted. Cases are checked in order, so an earlier match wins:
//   - keys starting with IgnoreTaintPrefix are reported as IgnoreTaintPrefix;
//   - explicitly reported keys are reported as the key itself;
//   - startup taints are reported as startupNodeTaintReportedType;
//   - status taints are reported as statusNodeTaintReportedType;
//   - anything else is reported as unlistedNodeTaintReportedType.
func getTaintTypeToReport(key string, taintConfig TaintConfig) string {
	switch {
	case strings.HasPrefix(key, IgnoreTaintPrefix):
		// Track deprecated taints.
		return IgnoreTaintPrefix
	case taintConfig.isExplicitlyReportedTaint(key):
		return key
	case taintConfig.IsStartupTaint(key):
		return startupNodeTaintReportedType
	case taintConfig.IsStatusTaint(key):
		return statusNodeTaintReportedType
	default:
		return unlistedNodeTaintReportedType
	}
}
110 changes: 105 additions & 5 deletions cluster-autoscaler/utils/taints/taints_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"testing"
"time"

"k8s.io/autoscaler/cluster-autoscaler/config"
. "k8s.io/autoscaler/cluster-autoscaler/utils/test"

apiv1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -472,8 +473,8 @@ func TestFilterOutNodesWithStartupTaints(t *testing.T) {
nodes = append(nodes, tc.node)
}
taintConfig := TaintConfig{
StartupTaints: tc.startupTaints,
StartupTaintPrefixes: tc.startupTaintsPrefixes,
startupTaints: tc.startupTaints,
startupTaintPrefixes: tc.startupTaintsPrefixes,
}
allNodes, readyNodes := FilterOutNodesWithStartupTaints(taintConfig, nodes, nodes)
assert.Equal(t, tc.allNodes, len(allNodes))
Expand Down Expand Up @@ -562,13 +563,112 @@ func TestSanitizeTaints(t *testing.T) {
},
}
taintConfig := TaintConfig{
StartupTaints: map[string]bool{"ignore-me": true},
StatusTaints: map[string]bool{"status-me": true},
StartupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix},
startupTaints: map[string]bool{"ignore-me": true},
statusTaints: map[string]bool{"status-me": true},
startupTaintPrefixes: []string{IgnoreTaintPrefix, StartupTaintPrefix},
}

newTaints := SanitizeTaints(node.Spec.Taints, taintConfig)
require.Equal(t, 2, len(newTaints))
assert.Equal(t, newTaints[0].Key, StatusTaintPrefix+"some-taint")
assert.Equal(t, newTaints[1].Key, "test-taint")
}

// TestCountNodeTaints checks that CountNodeTaints aggregates the taints of
// several nodes under the expected reported types.
func TestCountNodeTaints(t *testing.T) {
	// All taints in this test use the NoSchedule effect.
	noSchedule := func(key, value string) apiv1.Taint {
		return apiv1.Taint{Key: key, Value: value, Effect: apiv1.TaintEffectNoSchedule}
	}
	// Both fixture nodes share the same name and carry no conditions.
	newNode := func(nodeTaints ...apiv1.Taint) *apiv1.Node {
		return &apiv1.Node{
			ObjectMeta: metav1.ObjectMeta{
				Name:              "node-count-node-taints",
				CreationTimestamp: metav1.NewTime(time.Now()),
			},
			Spec: apiv1.NodeSpec{
				Taints: nodeTaints,
			},
			Status: apiv1.NodeStatus{
				Conditions: []apiv1.NodeCondition{},
			},
		}
	}

	firstNode := newNode(
		noSchedule(IgnoreTaintPrefix+"another-taint", "myValue"),
		noSchedule(StatusTaintPrefix+"some-taint", "myValue"),
		noSchedule(StartupTaintPrefix+"some-taint", "myValue"),
		noSchedule("test-taint", "test2"),
		noSchedule(ToBeDeletedTaint, "1"),
		noSchedule("ignore-me", "1"),
		noSchedule("status-me", "1"),
		noSchedule("node.kubernetes.io/memory-pressure", "1"),
		noSchedule("ignore-taint.cluster-autoscaler.kubernetes.io/to-be-ignored", "myValue2"),
	)
	secondNode := newNode(
		noSchedule(StatusTaintPrefix+"some-taint", "myValue"),
		noSchedule("node.kubernetes.io/unschedulable", "1"),
	)

	taintConfig := NewTaintConfig(config.AutoscalingOptions{
		StatusTaints:  []string{"status-me"},
		StartupTaints: []string{"ignore-me"},
	})

	expected := map[string]int{
		"ignore-taint.cluster-autoscaler.kubernetes.io/": 2,
		"ToBeDeletedByClusterAutoscaler":                 1,
		"node.kubernetes.io/memory-pressure":             1,
		"node.kubernetes.io/unschedulable":               1,
		"other":                                          1,
		"startup-taint":                                  2,
		"status-taint":                                   3,
	}
	actual := CountNodeTaints([]*apiv1.Node{firstNode, secondNode}, taintConfig)
	assert.Equal(t, expected, actual)
}

0 comments on commit 5cea330

Please sign in to comment.