From 41676aee05bf48930823857823eebc8c16f48780 Mon Sep 17 00:00:00 2001 From: prachigandhi Date: Wed, 24 Jan 2024 11:29:09 -0800 Subject: [PATCH 01/11] introduce getVmssSizeRefreshPeriod env var --- cluster-autoscaler/cloudprovider/azure/README.md | 6 ++++++ .../cloudprovider/azure/azure_config.go | 15 +++++++++++++++ .../cloudprovider/azure/azure_manager_test.go | 9 +++++++++ 3 files changed, 30 insertions(+) diff --git a/cluster-autoscaler/cloudprovider/azure/README.md b/cluster-autoscaler/cloudprovider/azure/README.md index bfe964fe176f..557814d74b3c 100644 --- a/cluster-autoscaler/cloudprovider/azure/README.md +++ b/cluster-autoscaler/cloudprovider/azure/README.md @@ -197,6 +197,12 @@ The `AZURE_ENABLE_VMSS_FLEX` environment variable enables VMSS Flex support. By |---------------------------|---------|-----------------------------------------|---------------------------| | enableVmssFlex | false | AZURE_ENABLE_VMSS_FLEX | enableVmssFlex | +The `AZURE_GET_VMSS_SIZE_REFRESH_PERIOD` environment variable defines in seconds how frequently to call GET VMSS API to fetch VMSS info per nodegroup instance. By default, value is 30 seconds. + +| Config Name | Default | Environment Variable | Cloud Config File | +|---------------------------|---------|-------------------------------------|---------------------------| +| getVmssSizeRefreshPeriod | 30 | AZURE_GET_VMSS_SIZE_REFRESH_PERIOD | getVmssSizeRefreshPeriod | + When using K8s 1.18 or higher, it is also recommended to configure backoff and retries on the client as described [here](#rate-limit-and-back-off-retries) ### Standard deployment diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config.go b/cluster-autoscaler/cloudprovider/azure/azure_config.go index 2f6f5f279ab2..92eea7a3e3ee 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config.go @@ -56,6 +56,9 @@ const ( rateLimitWriteQPSEnvVar = "RATE_LIMIT_WRITE_QPS" rateLimitWriteBucketsEnvVar = "RATE_LIMIT_WRITE_BUCKETS" + // refresh period in seconds + vmssSizeRefreshPeriodDefault = 30 + // auth methods authMethodPrincipal = "principal" authMethodCLI = "cli" @@ -120,6 +123,9 @@ type Config struct { // Jitter in seconds subtracted from the VMSS cache TTL before the first refresh VmssVmsCacheJitter int `json:"vmssVmsCacheJitter" yaml:"vmssVmsCacheJitter"` + // GetVmssSizeRefreshPeriod (seconds) defines how frequently to call GET VMSS API to fetch VMSS info per nodegroup instance + GetVmssSizeRefreshPeriod int `json:"getVmssSizeRefreshPeriod" yaml:"getVmssSizeRefreshPeriod"` + // number of latest deployments that will not be deleted MaxDeploymentsCount int64 `json:"maxDeploymentsCount" yaml:"maxDeploymentsCount"` @@ -248,6 +254,15 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { cfg.EnableDynamicInstanceList = dynamicInstanceListDefault } + if getVmssSizeRefreshPeriod := os.Getenv("AZURE_GET_VMSS_SIZE_REFRESH_PERIOD"); getVmssSizeRefreshPeriod != "" { + cfg.GetVmssSizeRefreshPeriod, err = strconv.Atoi(getVmssSizeRefreshPeriod) + if err != nil { + return nil, fmt.Errorf("failed to parse AZURE_GET_VMSS_SIZE_REFRESH_PERIOD %q: %v", getVmssSizeRefreshPeriod, err) + } + } else { + cfg.GetVmssSizeRefreshPeriod = vmssSizeRefreshPeriodDefault + } + if enableVmssFlex := os.Getenv("AZURE_ENABLE_VMSS_FLEX"); enableVmssFlex != "" { cfg.EnableVmssFlex, err = strconv.ParseBool(enableVmssFlex) if err != nil { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go index 3bad46343f50..9320b6ab0ac5 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go @@ -360,6 +360,7 @@ func TestCreateAzureManagerWithNilConfig(t *testing.T) { VmssCacheTTL: 100, VmssVmsCacheTTL: 110, VmssVmsCacheJitter: 90, + GetVmssSizeRefreshPeriod: 30, MaxDeploymentsCount: 8, CloudProviderBackoff: true, CloudProviderBackoffRetries: 1, @@ -471,6 +472,14 @@ func TestCreateAzureManagerWithNilConfig(t *testing.T) { assert.Equal(t, expectedErr, err, "Return err does not match, expected: %v, actual: %v", expectedErr, err) }) + t.Run("invalid int for AZURE_GET_VMSS_SIZE_REFRESH_PERIOD", func(t *testing.T) { + t.Setenv("AZURE_GET_VMSS_SIZE_REFRESH_PERIOD", "invalidint") + manager, err := createAzureManagerInternal(nil, cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient) + expectedErr := fmt.Errorf("failed to parse AZURE_GET_VMSS_SIZE_REFRESH_PERIOD \"invalidint\": strconv.Atoi: parsing \"invalidint\": invalid syntax") + assert.Nil(t, manager) + assert.Equal(t, expectedErr, err, "Return err does not match, expected: %v, actual: %v", expectedErr, err) + }) + t.Run("invalid int for AZURE_MAX_DEPLOYMENT_COUNT", func(t *testing.T) { t.Setenv("AZURE_MAX_DEPLOYMENT_COUNT", "invalidint") manager, err := createAzureManagerInternal(nil, cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient) From a97ffab02acc903d4dc125af7ba24c587fc6315e Mon Sep 17 00:00:00 2001 From: prachigandhi Date: Wed, 24 Jan 2024 12:37:35 -0800 Subject: [PATCH 02/11] logic for getvmss call for spot, uniform vmss --- .../cloudprovider/azure/azure_config.go | 4 +- .../cloudprovider/azure/azure_manager_test.go | 15 ++--- .../cloudprovider/azure/azure_scale_set.go | 55 ++++++++++++++++--- .../azure/azure_scale_set_test.go | 1 - 4 files changed, 58 insertions(+), 17 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config.go b/cluster-autoscaler/cloudprovider/azure/azure_config.go index 92eea7a3e3ee..c418b3ac4578 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config.go @@ -57,7 +57,7 @@ const ( rateLimitWriteBucketsEnvVar = "RATE_LIMIT_WRITE_BUCKETS" // refresh period in seconds - vmssSizeRefreshPeriodDefault = 30 + VmssSizeRefreshPeriodDefault = 30 // auth methods authMethodPrincipal = "principal" @@ -260,7 +260,7 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { return nil, fmt.Errorf("failed to parse AZURE_GET_VMSS_SIZE_REFRESH_PERIOD %q: %v", getVmssSizeRefreshPeriod, err) } } else { - cfg.GetVmssSizeRefreshPeriod = vmssSizeRefreshPeriodDefault + cfg.GetVmssSizeRefreshPeriod = VmssSizeRefreshPeriodDefault } if enableVmssFlex := os.Getenv("AZURE_ENABLE_VMSS_FLEX"); enableVmssFlex != "" { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go index 9320b6ab0ac5..3caf8ae0f721 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go @@ -685,13 +685,14 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) { azureRef: azureRef{ Name: vmssName, }, - minSize: minVal, - maxSize: maxVal, - manager: manager, - enableForceDelete: manager.config.EnableForceDelete, - curSize: 3, - sizeRefreshPeriod: manager.azureCache.refreshInterval, - instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod, + minSize: minVal, + maxSize: maxVal, + manager: manager, + enableForceDelete: manager.config.EnableForceDelete, + curSize: 3, + sizeRefreshPeriod: manager.azureCache.refreshInterval, + instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod, + getVmssSizeRefreshPeriod: time.Duration(VmssSizeRefreshPeriodDefault) * time.Second, }} assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs) } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index e7c6079c2206..b848bf099143 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -39,6 +39,7 @@ var ( defaultVmssInstancesRefreshPeriod = 5 * time.Minute vmssContextTimeout = 3 * time.Minute vmssSizeMutex sync.Mutex + defaultGetVmssSizeRefreshPeriod = VmssSizeRefreshPeriodDefault * time.Second ) const ( @@ -65,8 +66,9 @@ type ScaleSet struct { enableDynamicInstanceList bool - lastSizeRefresh time.Time - sizeRefreshPeriod time.Duration + lastSizeRefresh time.Time + sizeRefreshPeriod time.Duration + getVmssSizeRefreshPeriod time.Duration instancesRefreshPeriod time.Duration instancesRefreshJitter int @@ -98,6 +100,12 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) ( scaleSet.instancesRefreshPeriod = defaultVmssInstancesRefreshPeriod } + if az.config.GetVmssSizeRefreshPeriod != 0 { + scaleSet.getVmssSizeRefreshPeriod = time.Duration(az.config.GetVmssSizeRefreshPeriod) * time.Second + } else { + scaleSet.getVmssSizeRefreshPeriod = time.Duration(VmssSizeRefreshPeriodDefault) * time.Second + } + return scaleSet, nil } @@ -157,17 +165,44 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { scaleSet.sizeMutex.Lock() defer scaleSet.sizeMutex.Unlock() - if scaleSet.lastSizeRefresh.Add(scaleSet.sizeRefreshPeriod).After(time.Now()) { - klog.V(3).Infof("VMSS: %s, returning in-memory size: %d", scaleSet.Name, scaleSet.curSize) - return scaleSet.curSize, nil - } - set, err := scaleSet.getVMSSFromCache() if err != nil { klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err) return -1, err } + effectiveSizeRefreshPeriod := scaleSet.sizeRefreshPeriod + + // If the scale set is Spot, we want to have a more fresh view of the Sku.Capacity field. + // This is because evictions can happen at any given point in time, + // even before VMs are materialized as nodes. We should be able to + // react to those and have the autoscaler readjust the goal again to force restoration. + // Taking into account only if orchestrationMode == Uniform because flex mode can have + // combination of spot and regular vms + if isSpotAndUniform(&set) { + effectiveSizeRefreshPeriod = scaleSet.getVmssSizeRefreshPeriod + } + + if scaleSet.lastSizeRefresh.Add(effectiveSizeRefreshPeriod).After(time.Now()) { + klog.V(3).Infof("VMSS: %s, returning in-memory size: %d", scaleSet.Name, scaleSet.curSize) + return scaleSet.curSize, nil + } + + // If the toggle to utilize the GET VMSS is enabled or the scale set is on Spot, + // make a GET VMSS call to fetch more updated fresh info + if isSpotAndUniform(&set) { + ctx, cancel := getContextWithCancel() + defer cancel() + + var rerr *retry.Error + set, rerr = scaleSet.manager.azClient.virtualMachineScaleSetsClient.Get(ctx, scaleSet.manager.config.ResourceGroup, + scaleSet.Name) + if rerr != nil { + klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, rerr) + return -1, err + } + } + vmssSizeMutex.Lock() curSize := *set.Sku.Capacity vmssSizeMutex.Unlock() @@ -184,6 +219,12 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { return scaleSet.curSize, nil } +func isSpotAndUniform(vmss *compute.VirtualMachineScaleSet) bool { + return vmss != nil && vmss.VirtualMachineScaleSetProperties != nil && + vmss.VirtualMachineScaleSetProperties.VirtualMachineProfile != nil && + vmss.VirtualMachineScaleSetProperties.VirtualMachineProfile.Priority == compute.Spot +} + // GetScaleSetSize gets Scale Set size. func (scaleSet *ScaleSet) GetScaleSetSize() (int64, error) { return scaleSet.getCurSize() diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index edfdb1fa08eb..b2f702a63310 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -181,7 +181,6 @@ func TestTargetSize(t *testing.T) { provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient if orchMode == compute.Uniform { - mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient From fe9c5b0cd51e9d29a4673a44502c0c8956146482 Mon Sep 17 00:00:00 2001 From: prachigandhi Date: Wed, 24 Jan 2024 14:42:11 -0800 Subject: [PATCH 03/11] modify test for spot verification --- .../azure/azure_scale_set_test.go | 60 ++++++++++--------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index b2f702a63310..1c761b700ec4 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -168,42 +168,44 @@ func TestTargetSize(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() - orchestrationModes := [2]compute.OrchestrationMode{compute.Uniform, compute.Flexible} - expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", compute.Uniform) - expectedVMSSVMs := newTestVMSSVMList(3) - expectedVMs := newTestVMList(3) + spotScaleSet := newTestVMSSList(5, "spot-vmss", "eastus", compute.Uniform)[0] + spotScaleSet.VirtualMachineProfile = &compute.VirtualMachineScaleSetVMProfile{ + Priority: compute.Spot, + } + expectedScaleSets = append(expectedScaleSets, spotScaleSet) - for _, orchMode := range orchestrationModes { - provider := newTestProvider(t) - mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) - mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes() - provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient + provider := newTestProvider(t) + mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) + mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes() + provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient - if orchMode == compute.Uniform { - mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) - mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() - provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient + err := provider.azureManager.forceRefresh() + assert.NoError(t, err) - } else { - provider.azureManager.config.EnableVmssFlex = true - mockVMClient := mockvmclient.NewMockInterface(ctrl) - mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes() - provider.azureManager.azClient.virtualMachinesClient = mockVMClient - } + // non-spot nodepool + registered := provider.azureManager.RegisterNodeGroup( + newTestScaleSet(provider.azureManager, "test-asg")) + assert.True(t, registered) + assert.Equal(t, len(provider.NodeGroups()), 1) - err := provider.azureManager.forceRefresh() - assert.NoError(t, err) + targetSize, err := provider.NodeGroups()[0].TargetSize() + assert.NoError(t, err) + assert.Equal(t, 3, targetSize) - registered := provider.azureManager.RegisterNodeGroup( - newTestScaleSet(provider.azureManager, "test-asg")) - assert.True(t, registered) - assert.Equal(t, len(provider.NodeGroups()), 1) + // Register a spot nodepool + spotregistered := provider.azureManager.RegisterNodeGroup( + newTestScaleSet(provider.azureManager, "spot-vmss")) + assert.True(t, spotregistered) + assert.Equal(t, len(provider.NodeGroups()), 2) - targetSize, err := provider.NodeGroups()[0].TargetSize() - assert.NoError(t, err) - assert.Equal(t, 3, targetSize) - } + // mock getvmss call for spotnode pool returning different capacity + spotScaleSet.Sku.Capacity = to.Int64Ptr(1) + mockVMSSClient.EXPECT().Get(gomock.Any(), provider.azureManager.config.ResourceGroup, "spot-vmss").Return(spotScaleSet, nil).Times(1) + + targetSize, err = provider.NodeGroups()[1].TargetSize() + assert.NoError(t, err) + assert.Equal(t, 1, targetSize) } func TestIncreaseSize(t *testing.T) { From 017e88d02b595181b415b694316f80793ff6d99a Mon Sep 17 00:00:00 2001 From: prachigandhi Date: Wed, 24 Jan 2024 14:46:32 -0800 Subject: [PATCH 04/11] golint fixes --- cluster-autoscaler/cloudprovider/azure/azure_config.go | 2 +- cluster-autoscaler/cloudprovider/azure/azure_scale_set.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config.go b/cluster-autoscaler/cloudprovider/azure/azure_config.go index c418b3ac4578..65493e5bc476 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config.go @@ -56,7 +56,7 @@ const ( rateLimitWriteQPSEnvVar = "RATE_LIMIT_WRITE_QPS" rateLimitWriteBucketsEnvVar = "RATE_LIMIT_WRITE_BUCKETS" - // refresh period in seconds + // VmssSizeRefreshPeriodDefault in seconds VmssSizeRefreshPeriodDefault = 30 // auth methods diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index b848bf099143..3ae1b6857d0b 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -179,7 +179,7 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { // react to those and have the autoscaler readjust the goal again to force restoration. // Taking into account only if orchestrationMode == Uniform because flex mode can have // combination of spot and regular vms - if isSpotAndUniform(&set) { + if isSpot(&set) { effectiveSizeRefreshPeriod = scaleSet.getVmssSizeRefreshPeriod } @@ -190,7 +190,7 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { // If the toggle to utilize the GET VMSS is enabled or the scale set is on Spot, // make a GET VMSS call to fetch more updated fresh info - if isSpotAndUniform(&set) { + if isSpot(&set) { ctx, cancel := getContextWithCancel() defer cancel() @@ -219,7 +219,7 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { return scaleSet.curSize, nil } -func isSpotAndUniform(vmss *compute.VirtualMachineScaleSet) bool { +func isSpot(vmss *compute.VirtualMachineScaleSet) bool { return vmss != nil && vmss.VirtualMachineScaleSetProperties != nil && vmss.VirtualMachineScaleSetProperties.VirtualMachineProfile != nil && vmss.VirtualMachineScaleSetProperties.VirtualMachineProfile.Priority == compute.Spot From c45e45d0b3af66b2127757d7cfbd489e2d069528 Mon Sep 17 00:00:00 2001 From: prachigandhi Date: Tue, 19 Mar 2024 09:48:32 -0700 Subject: [PATCH 05/11] update comment --- cluster-autoscaler/cloudprovider/azure/azure_scale_set.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 3ae1b6857d0b..87886e57d367 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -188,8 +188,7 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { return scaleSet.curSize, nil } - // If the toggle to utilize the GET VMSS is enabled or the scale set is on Spot, - // make a GET VMSS call to fetch more updated fresh info + // If the scale set is on Spot, make a GET VMSS call to fetch more updated fresh info if isSpot(&set) { ctx, cancel := getContextWithCancel() defer cancel() From eb99a37b48998a8fc55b9dff19b33ea00234c3b7 Mon Sep 17 00:00:00 2001 From: prachigandhi Date: Thu, 28 Mar 2024 08:12:43 -0700 Subject: [PATCH 06/11] review comments --- .../cloudprovider/azure/README.md | 3 ++- .../cloudprovider/azure/azure_scale_set.go | 19 ++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/README.md b/cluster-autoscaler/cloudprovider/azure/README.md index 557814d74b3c..7b018c4f4643 100644 --- a/cluster-autoscaler/cloudprovider/azure/README.md +++ b/cluster-autoscaler/cloudprovider/azure/README.md @@ -197,7 +197,8 @@ The `AZURE_ENABLE_VMSS_FLEX` environment variable enables VMSS Flex support. By |---------------------------|---------|-----------------------------------------|---------------------------| | enableVmssFlex | false | AZURE_ENABLE_VMSS_FLEX | enableVmssFlex | -The `AZURE_GET_VMSS_SIZE_REFRESH_PERIOD` environment variable defines in seconds how frequently to call GET VMSS API to fetch VMSS info per nodegroup instance. By default, value is 30 seconds. +The AZURE_GET_VMSS_SIZE_REFRESH_PERIOD environment variable defines, in seconds, the frequency to call the GET VMSS API to fetch VMSS info per spot nodegroup instance. +By default, the value is 30 seconds. Currently, this API is only utilized for spot instances. | Config Name | Default | Environment Variable | Cloud Config File | |---------------------------|---------|-------------------------------------|---------------------------| diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 87886e57d367..6b6a39168366 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -39,7 +39,6 @@ var ( defaultVmssInstancesRefreshPeriod = 5 * time.Minute vmssContextTimeout = 3 * time.Minute vmssSizeMutex sync.Mutex - defaultGetVmssSizeRefreshPeriod = VmssSizeRefreshPeriodDefault * time.Second ) const ( @@ -60,19 +59,29 @@ type ScaleSet struct { maxSize int enableForceDelete bool - - sizeMutex sync.Mutex - curSize int64 - enableDynamicInstanceList bool + + // curSize tracks (and caches) the number of VMs in this ScaleSet. + // It is periodically updated from vmss.Sku.Capacity, with VMSS itself coming + // either from azure.Cache (which periodically does VMSS.List) + // or from direct VMSS.Get (used for Spot). + curSize int64 + // lastSizeRefresh is the time curSize was last refreshed from vmss.Sku.Capacity. + // Together with sizeRefreshPeriod, it is used to determine if it is time to refresh curSize. lastSizeRefresh time.Time + // sizeRefreshPeriod is how often curSize is refreshed from vmss.Sku.Capacity. + // (Set from azureCache.refreshInterval = VmssCacheTTL or [defaultMetadataCache]refreshInterval = 1min) sizeRefreshPeriod time.Duration + // getVmssSizeRefreshPeriod is how often curSize should be refreshed in case VMSS.Get call is used (only spot instances). + // (Set from GetVmssSizeRefreshPeriod, if specified = get-vmss-size-refresh-period = 30s, + // or override from autoscalerProfile.GetVmssSizeRefreshPeriod) getVmssSizeRefreshPeriod time.Duration instancesRefreshPeriod time.Duration instancesRefreshJitter int + sizeMutex sync.Mutex instanceMutex sync.Mutex instanceCache []cloudprovider.Instance lastInstanceRefresh time.Time From 124a27383a6d5b8c836139af927b094bfe8709c1 Mon Sep 17 00:00:00 2001 From: prachigandhi Date: Wed, 17 Apr 2024 12:10:53 -0700 Subject: [PATCH 07/11] update gofmt --- .../cloudprovider/azure/azure_scale_set.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 6b6a39168366..a506d54df869 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -58,21 +58,20 @@ type ScaleSet struct { minSize int maxSize int - enableForceDelete bool + enableForceDelete bool enableDynamicInstanceList bool - // curSize tracks (and caches) the number of VMs in this ScaleSet. // It is periodically updated from vmss.Sku.Capacity, with VMSS itself coming // either from azure.Cache (which periodically does VMSS.List) // or from direct VMSS.Get (used for Spot). - curSize int64 + curSize int64 // lastSizeRefresh is the time curSize was last refreshed from vmss.Sku.Capacity. // Together with sizeRefreshPeriod, it is used to determine if it is time to refresh curSize. - lastSizeRefresh time.Time + lastSizeRefresh time.Time // sizeRefreshPeriod is how often curSize is refreshed from vmss.Sku.Capacity. // (Set from azureCache.refreshInterval = VmssCacheTTL or [defaultMetadataCache]refreshInterval = 1min) - sizeRefreshPeriod time.Duration + sizeRefreshPeriod time.Duration // getVmssSizeRefreshPeriod is how often curSize should be refreshed in case VMSS.Get call is used (only spot instances). // (Set from GetVmssSizeRefreshPeriod, if specified = get-vmss-size-refresh-period = 30s, // or override from autoscalerProfile.GetVmssSizeRefreshPeriod) @@ -81,7 +80,7 @@ type ScaleSet struct { instancesRefreshPeriod time.Duration instancesRefreshJitter int - sizeMutex sync.Mutex + sizeMutex sync.Mutex instanceMutex sync.Mutex instanceCache []cloudprovider.Instance lastInstanceRefresh time.Time From 317fd783d0b62f94e04c7902b7bbb077457fac82 Mon Sep 17 00:00:00 2001 From: Rachel Gregory Date: Wed, 3 Jul 2024 16:42:46 -0700 Subject: [PATCH 08/11] Add VMs pool checks back into target size test --- cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index 484c0f2b2d52..c9ba730852f9 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -179,6 +179,9 @@ func TestTargetSize(t *testing.T) { mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes() provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient + mockVMClient := mockvmclient.NewMockInterface(ctrl) + mockVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return([]compute.VirtualMachine{}, nil).AnyTimes() + provider.azureManager.azClient.virtualMachinesClient = mockVMClient err := provider.azureManager.forceRefresh() assert.NoError(t, err) From 6829c332dd6bf908e5172507aa409510d3b87868 Mon Sep 17 00:00:00 2001 From: Rachel Gregory Date: Sun, 7 Jul 2024 22:55:02 -0700 Subject: [PATCH 09/11] Add getVmssSizeRefreshPeriod default val to azure_manager_test for autodiscovery --- .../cloudprovider/azure/azure_manager_test.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go index 642d6ca62439..61bfd28ad87f 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go @@ -742,13 +742,14 @@ func TestGetFilteredAutoscalingGroupsVmssWithConfiguredSizes(t *testing.T) { azureRef: azureRef{ Name: vmssName, }, - minSize: minVal, - maxSize: maxVal, - manager: manager, - enableForceDelete: manager.config.EnableForceDelete, - curSize: 3, - sizeRefreshPeriod: manager.azureCache.refreshInterval, - instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod, + minSize: minVal, + maxSize: maxVal, + manager: manager, + enableForceDelete: manager.config.EnableForceDelete, + curSize: 3, + sizeRefreshPeriod: manager.azureCache.refreshInterval, + instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod, + getVmssSizeRefreshPeriod: VmssSizeRefreshPeriodDefault, }} assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs) } From 2551d8648ebe6c83479b219f985c29bc8ec4c935 Mon Sep 17 00:00:00 2001 From: Rachel Gregory Date: Wed, 17 Jul 2024 21:24:01 +0000 Subject: [PATCH 10/11] Update getVmssSizeRefreshPeriod to use time conversions --- cluster-autoscaler/cloudprovider/azure/azure_manager_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go index 61bfd28ad87f..eb5eda27a8ac 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go @@ -749,7 +749,7 @@ func TestGetFilteredAutoscalingGroupsVmssWithConfiguredSizes(t *testing.T) { curSize: 3, sizeRefreshPeriod: manager.azureCache.refreshInterval, instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod, - getVmssSizeRefreshPeriod: VmssSizeRefreshPeriodDefault, + getVmssSizeRefreshPeriod: time.Duration(VmssSizeRefreshPeriodDefault) * time.Second, }} assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs) } From b69fd45000f93470916c4cbeb59e1f865e18310c Mon Sep 17 00:00:00 2001 From: Rachel Gregory Date: Fri, 19 Jul 2024 18:37:50 +0000 Subject: [PATCH 11/11] remove changes to README --- cluster-autoscaler/cloudprovider/azure/README.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/README.md b/cluster-autoscaler/cloudprovider/azure/README.md index fcfa856d5097..f8ed746dfa7f 100644 --- a/cluster-autoscaler/cloudprovider/azure/README.md +++ b/cluster-autoscaler/cloudprovider/azure/README.md @@ -197,13 +197,6 @@ The `AZURE_ENABLE_VMSS_FLEX` environment variable enables VMSS Flex support. By |---------------------------|---------|-----------------------------------------|---------------------------| | enableVmssFlex | false | AZURE_ENABLE_VMSS_FLEX | enableVmssFlex | -The AZURE_GET_VMSS_SIZE_REFRESH_PERIOD environment variable defines, in seconds, the frequency to call the GET VMSS API to fetch VMSS info per spot nodegroup instance. -By default, the value is 30 seconds. Currently, this API is only utilized for spot instances. - -| Config Name | Default | Environment Variable | Cloud Config File | -|---------------------------|---------|-------------------------------------|---------------------------| -| getVmssSizeRefreshPeriod | 30 | AZURE_GET_VMSS_SIZE_REFRESH_PERIOD | getVmssSizeRefreshPeriod | - When using K8s 1.18 or higher, it is also recommended to configure backoff and retries on the client as described [here](#rate-limit-and-back-off-retries) ### Standard deployment