Skip to content

Commit

Permalink
Reject increase that would be less than current capacity, add env var
Browse files Browse the repository at this point in the history
  • Loading branch information
domenicbozzuto committed Jun 25, 2024
1 parent 8f70376 commit 4461ff2
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 11 deletions.
8 changes: 5 additions & 3 deletions cluster-autoscaler/cloudprovider/azure/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,10 +173,12 @@ When using K8s versions older than v1.18, we recommend using at least **v.1.17.5
As for CA versions older than 1.18, we recommend using at least **v.1.17.2, v1.16.5, v1.15.6**.

In addition, cluster-autoscaler exposes a `AZURE_VMSS_CACHE_TTL` environment variable which controls the rate of `GetVMScaleSet` being made. By default, this is 15 seconds but setting this to a higher value such as 60 seconds can protect against API throttling. The caches used are proactively incremented and decremented with the scale up and down operations and this higher value doesn't have any noticeable impact on performance. **Note that the value is in seconds**
`AZURE_VMSS_CACHE_FORCE_REFRESH` can also be set to always refresh the cache before an upscale.

| Config Name | Default | Environment Variable | Cloud Config File |
| ----------- | ------- | -------------------- | ----------------- |
| VmssCacheTTL | 60 | AZURE_VMSS_CACHE_TTL | vmssCacheTTL |
| Config Name | Default | Environment Variable | Cloud Config File |
|-----------------------|--------|--------------------------------|------------------------|
| VmssCacheTTL | 60 | AZURE_VMSS_CACHE_TTL | vmssCacheTTL |
| VmssCacheForceRefresh | false | AZURE_VMSS_CACHE_FORCE_REFRESH | vmssCacheForceRefresh |

The `AZURE_VMSS_VMS_CACHE_TTL` environment variable affects the `GetScaleSetVms` (VMSS VM List) calls rate. The default value is 300 seconds.
A configurable jitter (`AZURE_VMSS_VMS_CACHE_JITTER` environment variable, default 0) expresses the maximum number of second that will be subtracted from that initial VMSS cache TTL after a new VMSS is discovered by the cluster-autoscaler: this can prevent a dogpile effect on clusters having many VMSS.
Expand Down
10 changes: 10 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ type Config struct {
// VMSS metadata cache TTL in seconds, only applies for vmss type
VmssCacheTTL int64 `json:"vmssCacheTTL" yaml:"vmssCacheTTL"`

// VMSS cache option to force refresh of the vmss capacity before an upscale
VmssCacheForceRefresh bool `json:"vmssCacheForceRefresh" yaml:"vmssCacheForceRefresh"`

// VMSS instances cache TTL in seconds, only applies for vmss type
VmssVmsCacheTTL int64 `json:"vmssVmsCacheTTL" yaml:"vmssVmsCacheTTL"`

Expand Down Expand Up @@ -215,6 +218,13 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
}
}

if vmssCacheForceRefresh := os.Getenv("AZURE_VMSS_CACHE_FORCE_REFRESH"); vmssCacheForceRefresh != "" {
cfg.VmssCacheForceRefresh, err = strconv.ParseBool(vmssCacheForceRefresh)
if err != nil {
return nil, fmt.Errorf("failed to parse AZURE_VMSS_CACHE_FORCE_REFRESH %q: %v", vmssCacheForceRefresh, err)
}
}

if vmssVmsCacheTTL := os.Getenv("AZURE_VMSS_VMS_CACHE_TTL"); vmssVmsCacheTTL != "" {
cfg.VmssVmsCacheTTL, err = strconv.ParseInt(vmssVmsCacheTTL, 10, 0)
if err != nil {
Expand Down
27 changes: 20 additions & 7 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,10 @@ type ScaleSet struct {
instancesRefreshPeriod time.Duration
instancesRefreshJitter int

instanceMutex sync.Mutex
instanceCache []cloudprovider.Instance
lastInstanceRefresh time.Time
instanceMutex sync.Mutex
instanceCache []cloudprovider.Instance
lastInstanceRefresh time.Time
forceRefreshOnUpscale bool
}

// NewScaleSet creates a new NewScaleSet.
Expand Down Expand Up @@ -231,11 +232,22 @@ func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error {
return err
}

// Update the new capacity to cache.
// Update the new capacity to cache, if the size increase is a net increase
vmssSizeMutex.Lock()
vmssInfo.Sku.Capacity = &size
rejectIncrease := false
lastCapacity := *vmssInfo.Sku.Capacity
if size < lastCapacity {
rejectIncrease = true
} else {
vmssInfo.Sku.Capacity = &size
}
vmssSizeMutex.Unlock()

if rejectIncrease {
klog.Warningf("VMSS %s: rejecting size decrease from %d to %d", scaleSet.Name, lastCapacity, size)
return fmt.Errorf("vmss %s: rejected size decrease from %d to %d", scaleSet.Name, lastCapacity, size)
}

// Compose a new VMSS for updating.
op := compute.VirtualMachineScaleSet{
Name: vmssInfo.Name,
Expand All @@ -254,6 +266,7 @@ func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error {
// Proactively set the VMSS size so autoscaler makes better decisions.
scaleSet.curSize = size
scaleSet.lastSizeRefresh = time.Now()
klog.V(3).Infof("VMSS %s: requested size increase from %d to %d", scaleSet.Name, lastCapacity, size)

go scaleSet.updateVMSSCapacity(future)
return nil
Expand All @@ -272,8 +285,7 @@ func (scaleSet *ScaleSet) IncreaseSize(delta int) error {
return fmt.Errorf("size increase must be positive")
}

forceRefresh := true
if forceRefresh {
if scaleSet.manager.config.VmssCacheForceRefresh {
if err := scaleSet.manager.forceRefresh(); err != nil {
klog.Errorf("VMSS %s: force refresh of VMSSs failed: %v", scaleSet.Name, err)
return err
Expand Down Expand Up @@ -448,6 +460,7 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregistered
scaleSet.sizeMutex.Lock()
scaleSet.curSize -= int64(len(instanceIDs))
scaleSet.lastSizeRefresh = time.Now()
klog.V(3).Infof("VMSS %s: had unregistered nodes, current size decremented by %d to %d", scaleSet.Name, len(instanceIDs), scaleSet.curSize)
scaleSet.sizeMutex.Unlock()
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ func TestIncreaseSizeOnVMSSUpdating(t *testing.T) {
assert.NoError(t, err)
}

func TestIncreaseSizeOnUpdatedSize(t *testing.T) {
func TestIncreaseSizeWithForceRefresh(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

Expand All @@ -359,6 +359,7 @@ func TestIncreaseSizeOnUpdatedSize(t *testing.T) {
provider := newTestProvider(t)
expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", orchMode)
provider.azureManager.explicitlyConfigured["test-asg"] = true
provider.azureManager.config.VmssCacheForceRefresh = true

mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil).Times(2)
Expand Down

0 comments on commit 4461ff2

Please sign in to comment.