test: fix clusterstate tests failing
Signed-off-by: vadasambar <[email protected]>
vadasambar committed May 18, 2023
1 parent 7b6f538 commit 4a22c5e
Showing 3 changed files with 26 additions and 3 deletions.
22 changes: 20 additions & 2 deletions cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 	"time"
 
+	"k8s.io/autoscaler/cluster-autoscaler/config"
 	"k8s.io/autoscaler/cluster-autoscaler/metrics"
 
 	apiv1 "k8s.io/api/core/v1"
@@ -1041,13 +1042,21 @@ func TestScaleUpFailures(t *testing.T) {
 	fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
 	clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, newBackoff(), NewStaticMaxNodeProvisionTimeProvider(15*time.Minute))
 
+	// happened before current/now time
+	oldTime := now.Add(-config.DefaultScaleDownDelayAfterFailure).Add(-time.Minute)
+	clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, oldTime)
+
+	// happened 'now'
 	clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.Timeout, now)
 	clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng2"), metrics.Timeout, now)
 	clusterstate.RegisterFailedScaleUp(provider.GetNodeGroup("ng1"), metrics.APIError, now.Add(time.Minute))
 
 	failures := clusterstate.GetScaleUpFailures()
 	assert.Equal(t, map[string][]ScaleUpFailure{
 		"ng1": {
+			// failure time has crossed DefaultScaleDownDelayAfterFailure duration
+			// should be removed when `clearScaleUpFailures` is called
+			{NodeGroup: provider.GetNodeGroup("ng1"), Reason: metrics.CloudProviderError, Time: oldTime},
 			{NodeGroup: provider.GetNodeGroup("ng1"), Reason: metrics.Timeout, Time: now},
 			{NodeGroup: provider.GetNodeGroup("ng1"), Reason: metrics.APIError, Time: now.Add(time.Minute)},
 		},
@@ -1056,8 +1065,17 @@ func TestScaleUpFailures(t *testing.T) {
 		},
 	}, failures)
 
-	clusterstate.clearScaleUpFailures()
-	assert.Empty(t, clusterstate.GetScaleUpFailures())
+	clusterstate.clearScaleUpFailures(config.DefaultScaleDownDelayAfterFailure, now)
+	fmt.Printf("scale up failures: %v", clusterstate.GetScaleUpFailures())
+	assert.Equal(t, map[string][]ScaleUpFailure{
+		"ng1": {
+			{NodeGroup: provider.GetNodeGroup("ng1"), Reason: metrics.Timeout, Time: now},
+			{NodeGroup: provider.GetNodeGroup("ng1"), Reason: metrics.APIError, Time: now.Add(time.Minute)},
+		},
+		"ng2": {
+			{NodeGroup: provider.GetNodeGroup("ng2"), Reason: metrics.Timeout, Time: now},
+		},
+	}, clusterstate.GetScaleUpFailures())
 }
 
 func newBackoff() backoff.Backoff {
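The updated test passes an expiry window and a reference time to clearScaleUpFailures and expects only the failure registered at oldTime (older than DefaultScaleDownDelayAfterFailure) to be pruned, while the failures registered at now and later survive. Below is a minimal sketch of the pruning behaviour those expectations imply; the field name scaleUpFailures and the exact comparison are assumptions, and the real implementation lives in clusterstate.go, which is not part of this diff.

// Sketch only: inferred from the test expectations above, not the actual implementation.
func (csr *ClusterStateRegistry) clearScaleUpFailures(expiration time.Duration, now time.Time) {
	cutoff := now.Add(-expiration)
	for nodeGroupName, failures := range csr.scaleUpFailures { // field name assumed
		kept := make([]ScaleUpFailure, 0, len(failures))
		for _, failure := range failures {
			// Failures older than the window (oldTime in the test) are dropped;
			// failures at 'now' or later are kept.
			if !failure.Time.Before(cutoff) {
				kept = append(kept, failure)
			}
		}
		csr.scaleUpFailures[nodeGroupName] = kept
	}
}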
5 changes: 5 additions & 0 deletions cluster-autoscaler/config/const.go
@@ -16,6 +16,8 @@ limitations under the License.
 
 package config
 
+import "time"
+
 const (
 	// DefaultMaxClusterCores is the default maximum number of cores in the cluster.
 	DefaultMaxClusterCores = 5000 * 64
@@ -32,4 +34,7 @@ const (
 	DefaultScaleDownUnreadyTimeKey = "scaledownunreadytime"
 	// DefaultMaxNodeProvisionTimeKey identifies MaxNodeProvisionTime autoscaling option
 	DefaultMaxNodeProvisionTimeKey = "maxnodeprovisiontime"
+
+	// DefaultScaleDownDelayAfterFailure is the default value for ScaleDownDelayAfterFailure autoscaling option
+	DefaultScaleDownDelayAfterFailure = 3 * time.Minute
 )
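Promoting the 3-minute default into the config package gives the test above and the flag definition in main.go below a single source of truth. A small illustrative snippet, assuming only that the constant is importable as shown in this diff (the program itself is hypothetical):

package main

import (
	"fmt"
	"time"

	"k8s.io/autoscaler/cluster-autoscaler/config"
)

func main() {
	// Failures registered before this cutoff fall outside the
	// scale-down-delay-after-failure window.
	cutoff := time.Now().Add(-config.DefaultScaleDownDelayAfterFailure)
	fmt.Println("expiry cutoff:", cutoff.Format(time.RFC3339))
}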
2 changes: 1 addition & 1 deletion cluster-autoscaler/main.go
@@ -104,7 +104,7 @@ var (
 		"Should --scale-down-delay-after-* flags be applied locally per nodegroup or globally across all nodegroups")
 	scaleDownDelayAfterDelete = flag.Duration("scale-down-delay-after-delete", 0,
 		"How long after node deletion that scale down evaluation resumes, defaults to scanInterval")
-	scaleDownDelayAfterFailure = flag.Duration("scale-down-delay-after-failure", 3*time.Minute,
+	scaleDownDelayAfterFailure = flag.Duration("scale-down-delay-after-failure", config.DefaultScaleDownDelayAfterFailure,
 		"How long after scale down failure that scale down evaluation resumes")
 	scaleDownUnneededTime = flag.Duration("scale-down-unneeded-time", 10*time.Minute,
 		"How long a node should be unneeded before it is eligible for scale down")
