Skip to content

Commit

Permalink
azure: enable vmss deallocate
Browse files Browse the repository at this point in the history
Signed-off-by: Jack Francis <[email protected]>
  • Loading branch information
jackfrancis committed Dec 12, 2023
1 parent 98a77a8 commit b33b8fb
Show file tree
Hide file tree
Showing 15 changed files with 878 additions and 36 deletions.
2 changes: 1 addition & 1 deletion charts/cluster-autoscaler/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ name: cluster-autoscaler
sources:
- https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler
type: application
version: 9.34.0
version: 9.35.0
1 change: 1 addition & 0 deletions charts/cluster-autoscaler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,7 @@ vpa:
| azureClientID | string | `""` | Service Principal ClientID with contributor permission to Cluster and Node ResourceGroup. Required if `cloudProvider=azure` |
| azureClientSecret | string | `""` | Service Principal ClientSecret with contributor permission to Cluster and Node ResourceGroup. Required if `cloudProvider=azure` |
| azureResourceGroup | string | `""` | Azure resource group in which the cluster is located. Required if `cloudProvider=azure` |
| azureScaleDownPolicy | string | `"Delete"` | Azure ScaleDownPolicy, either "Delete" (default) or "Deallocate". Only relevant if `cloudProvider=azure` |
| azureSubscriptionID | string | `""` | Azure subscription where the resources are located. Required if `cloudProvider=azure` |
| azureTenantID | string | `""` | Azure tenant where the resources are located. Required if `cloudProvider=azure` |
| azureUseManagedIdentityExtension | bool | `false` | Whether to use Azure's managed identity extension for credentials. If using MSI, ensure subscription ID, resource group, and azure AKS cluster name are set. You can only use one authentication method at a time, either azureUseWorkloadIdentityExtension or azureUseManagedIdentityExtension should be set. |
Expand Down
5 changes: 5 additions & 0 deletions charts/cluster-autoscaler/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,11 @@ spec:
secretKeyRef:
key: NodeResourceGroup
name: {{ default (include "cluster-autoscaler.fullname" .) .Values.secretKeyRefNameOverride }}
- name: AZURE_SCALE_DOWN_POLICY
valueFrom:
secretKeyRef:
key: ScaleDownPolicy
name: {{ default (include "cluster-autoscaler.fullname" .) .Values.secretKeyRefNameOverride }}
{{- end }}
{{- else if eq .Values.cloudProvider "exoscale" }}
- name: EXOSCALE_API_KEY
Expand Down
1 change: 1 addition & 0 deletions charts/cluster-autoscaler/templates/secret.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ data:
SubscriptionID: "{{ .Values.azureSubscriptionID | b64enc }}"
TenantID: "{{ .Values.azureTenantID | b64enc }}"
VMType: "{{ .Values.azureVMType | b64enc }}"
ScaleDownPolicy: "{{ .Values.azureScaleDownPolicy | b64enc }}"
{{- else if $isAws }}
AwsAccessKeyId: "{{ .Values.awsAccessKeyID | b64enc }}"
AwsSecretAccessKey: "{{ .Values.awsSecretAccessKey | b64enc }}"
Expand Down
4 changes: 4 additions & 0 deletions charts/cluster-autoscaler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ azureUseWorkloadIdentityExtension: false
# azureVMType -- Azure VM type.
azureVMType: "AKS"

# azureScaleDownPolicy -- Azure ScaleDownPolicy, either "Delete" (default) or "Deallocate".
# Only relevant if `cloudProvider=azure`
azureScaleDownPolicy: "Delete"

# cloudConfigPath -- Configuration file for cloud provider.
cloudConfigPath: ""

Expand Down
4 changes: 4 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ type Config struct {

// EnableVmssFlex defines whether to enable Vmss Flex support or not
EnableVmssFlex bool `json:"enableVmssFlex,omitempty" yaml:"enableVmssFlex,omitempty"`

// ScaleDownPolicy is the VMSS scale down policy, either "Delete" or "Deallocate"
ScaleDownPolicy string `json:"scaleDownPolicy" yaml:"scaleDownPolicy"`
}

// BuildAzureConfig returns a Config object for the Azure clients
Expand Down Expand Up @@ -169,6 +172,7 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
cfg.AADClientCertPath = os.Getenv("ARM_CLIENT_CERT_PATH")
cfg.AADClientCertPassword = os.Getenv("ARM_CLIENT_CERT_PASSWORD")
cfg.Deployment = os.Getenv("ARM_DEPLOYMENT")
cfg.ScaleDownPolicy = os.Getenv("AZURE_SCALE_DOWN_POLICY")

subscriptionID, err := getSubscriptionIdFromInstanceMetadata()
if err != nil {
Expand Down
204 changes: 204 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_error.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package azure

import (
"encoding/json"
"fmt"
"strings"

"github.com/Azure/go-autorest/autorest/azure"
"sigs.k8s.io/cloud-provider-azure/pkg/retry"
)

// Unknown is the reason reported when an Azure error carries no usable
// detail: the RawError is nil, its body cannot be decoded into a
// ServiceRawError, the decoded service error is missing, or its code is empty.
const Unknown CloudProviderErrorReason = "Unknown"

// Errors on the sync path
const (
// QuotaExceeded falls under the OperationNotAllowed error code but we make it
// more specific here; it is chosen when the service error message mentions
// "Quota increase".
QuotaExceeded CloudProviderErrorReason = "QuotaExceeded"
// OperationNotAllowed is an umbrella for a lot of errors returned by Azure.
// It is declared as a plain string (not a CloudProviderErrorReason) and is
// compared directly against the service error's Code.
OperationNotAllowed string = "OperationNotAllowed"
)

// AutoscalerErrorType describes a high-level category of a given error.
type AutoscalerErrorType string

// AutoscalerErrorReason is a more detailed reason for the failed operation.
type AutoscalerErrorReason string

// CloudProviderErrorReason provides more details on errors of type
// CloudProviderError. Its underlying type is AutoscalerErrorReason, so values
// can be converted between the two with a plain type conversion.
type CloudProviderErrorReason AutoscalerErrorReason

// AutoscalerError contains information about Autoscaler errors: a message,
// a high-level type and an optional, more detailed reason.
type AutoscalerError interface {
// Error implements golang error interface
Error() string

// Type returns the type of AutoscalerError
Type() AutoscalerErrorType

// Reason returns the reason of the AutoscalerError
Reason() AutoscalerErrorReason

// AddPrefix adds a prefix to error message. The prefix is built from msg
// and args with fmt-style formatting.
// Returns the resulting error for convenient inline use.
// Example:
// if err := DoSomething(myObject); err != nil {
// return err.AddPrefix("can't do something with %v: ", myObject)
// }
AddPrefix(msg string, args ...interface{}) AutoscalerError
}

// autoscalerErrorImpl is the package's implementation of AutoscalerError.
type autoscalerErrorImpl struct {
// errorType is the high-level category returned by Type.
errorType AutoscalerErrorType
// errorReason is the detailed reason returned by Reason; it is the empty
// string when the error was built without a reason.
errorReason AutoscalerErrorReason
// msg is the already-formatted message returned by Error.
msg string
}

// Values of AutoscalerErrorType.
const (
// CloudProviderError is an error related to underlying infrastructure
CloudProviderError AutoscalerErrorType = "CloudProviderError"
// ApiCallError is an error related to communication with k8s API server
ApiCallError AutoscalerErrorType = "ApiCallError"
// Timeout is an error related to nodes not joining the cluster in maxNodeProvisionTime
Timeout AutoscalerErrorType = "Timeout"
// InternalError is an error inside Cluster Autoscaler
InternalError AutoscalerErrorType = "InternalError"
// TransientError is an error that causes us to skip a single loop, but
// does not require any additional action.
TransientError AutoscalerErrorType = "TransientError"
// ConfigurationError is an error related to bad configuration provided
// by a user.
ConfigurationError AutoscalerErrorType = "ConfigurationError"
// NodeGroupDoesNotExistError signifies that a NodeGroup
// does not exist. Note that, unlike the other values in this group, its
// string value starts with a lower-case letter.
NodeGroupDoesNotExistError AutoscalerErrorType = "nodeGroupDoesNotExistError"
)

// Values of AutoscalerErrorReason that are not specific to a cloud provider.
const (
// NodeRegistration signifies an error with node registering
NodeRegistration AutoscalerErrorReason = "NodeRegistration"
)

// NewAutoscalerError builds an AutoscalerError of the given type whose
// message is msg formatted with args (fmt.Sprintf semantics). The reason is
// left empty.
func NewAutoscalerError(errorType AutoscalerErrorType, msg string, args ...interface{}) AutoscalerError {
	var result autoscalerErrorImpl
	result.errorType = errorType
	result.msg = fmt.Sprintf(msg, args...)
	return result
}

// NewAutoscalerErrorWithReason builds an AutoscalerError carrying both a
// type and a reason. The message is msg formatted with args (fmt.Sprintf
// semantics).
func NewAutoscalerErrorWithReason(errorType AutoscalerErrorType, reason AutoscalerErrorReason, msg string, args ...interface{}) AutoscalerError {
	var result autoscalerErrorImpl
	result.errorType = errorType
	result.errorReason = reason
	result.msg = fmt.Sprintf(msg, args...)
	return result
}

// NewAutoscalerCloudProviderError builds an AutoscalerError whose type is
// always CloudProviderError. errorReason is stored as the error's reason and
// the message is msg formatted with args (fmt.Sprintf semantics).
func NewAutoscalerCloudProviderError(errorReason CloudProviderErrorReason, msg string, args ...interface{}) AutoscalerError {
	var result autoscalerErrorImpl
	result.errorType = CloudProviderError
	result.errorReason = AutoscalerErrorReason(errorReason)
	result.msg = fmt.Sprintf(msg, args...)
	return result
}

// ToAutoscalerError turns any error into an AutoscalerError.
//
// A nil error yields nil, and an error that already implements
// AutoscalerError is returned unchanged. Anything else is wrapped in a new
// AutoscalerError of type defaultType whose message is the error formatted
// with %v.
func ToAutoscalerError(defaultType AutoscalerErrorType, err error) AutoscalerError {
	switch typed := err.(type) {
	case nil:
		return nil
	case AutoscalerError:
		return typed
	default:
		return NewAutoscalerError(defaultType, "%v", err)
	}
}

// Error implements golang error interface by returning the stored message,
// including any prefix previously added through AddPrefix.
func (e autoscalerErrorImpl) Error() string {
return e.msg
}

// Type returns the type of AutoscalerError, i.e. the high-level category the
// error was constructed with.
func (e autoscalerErrorImpl) Type() AutoscalerErrorType {
return e.errorType
}

// Reason returns the detailed reason of the AutoscalerError. It is the empty
// string for errors built without a reason (e.g. via NewAutoscalerError).
func (e autoscalerErrorImpl) Reason() AutoscalerErrorReason {
return e.errorReason
}

// AddPrefix prepends a prefix, built from msg and args with fmt.Sprintf, to
// the error message. The receiver is a value, so the error it is called on
// is left untouched and a prefixed copy is returned for convenient inline
// use.
// Example:
//
//	if err := DoSomething(myObject); err != nil {
//		return err.AddPrefix("can't do something with %v: ", myObject)
//	}
func (e autoscalerErrorImpl) AddPrefix(msg string, args ...interface{}) AutoscalerError {
	prefix := fmt.Sprintf(msg, args...)
	prefixed := e
	prefixed.msg = prefix + prefixed.msg
	return prefixed
}

// ServiceRawError wraps the RawError returned by the k8s/cloudprovider
// Azure clients. The error body should satisfy the autorest.ServiceError type.
// It is the JSON decoding target used by azureToAutoscalerError: the service
// error is read from the top-level "error" key and stays nil when that key is
// absent.
type ServiceRawError struct {
ServiceError *azure.ServiceError `json:"error,omitempty"`
}

// azureToAutoscalerError converts an error returned by the Azure clients into
// an AutoscalerError of type CloudProviderError.
//
// A nil rerr yields nil. When the raw error is missing, its text cannot be
// decoded as a ServiceRawError, or the decoded body has no service error, the
// reason is Unknown and the message is rerr's own error text. Otherwise the
// message is the service error's Message and the reason is derived from its
// Code: an empty code maps to Unknown, OperationNotAllowed is refined by
// getOperationNotAllowedReason, and any other code is used verbatim.
func azureToAutoscalerError(rerr *retry.Error) AutoscalerError {
	if rerr == nil {
		return nil
	}

	// Text coming from Azure is always passed as a "%s" argument, never as the
	// format string itself: NewAutoscalerCloudProviderError formats its message
	// with fmt.Sprintf, which would otherwise misinterpret any literal '%'.
	unknown := func() AutoscalerError {
		return NewAutoscalerCloudProviderError(Unknown, "%s", rerr.Error().Error())
	}

	if rerr.RawError == nil {
		return unknown()
	}

	re := ServiceRawError{}
	if err := json.Unmarshal([]byte(rerr.RawError.Error()), &re); err != nil {
		return unknown()
	}
	se := re.ServiceError
	if se == nil {
		return unknown()
	}

	var errCode CloudProviderErrorReason
	switch se.Code {
	case "":
		errCode = Unknown
	case OperationNotAllowed:
		errCode = getOperationNotAllowedReason(se)
	default:
		errCode = CloudProviderErrorReason(se.Code)
	}
	return NewAutoscalerCloudProviderError(errCode, "%s", se.Message)
}

// getOperationNotAllowedReason refines the OperationNotAllowed error code:
// when the service error message contains "Quota increase" the more
// human-readable QuotaExceeded reason is returned, otherwise the reason is
// OperationNotAllowed itself.
func getOperationNotAllowedReason(se *azure.ServiceError) CloudProviderErrorReason {
	reason := CloudProviderErrorReason(OperationNotAllowed)
	if strings.Contains(se.Message, "Quota increase") {
		reason = QuotaExceeded
	}
	return reason
}
109 changes: 109 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_logger.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package azure

import (
"runtime"
"time"

"crypto/rand"

"github.com/sirupsen/logrus"
)

// Logger is a type that can log QOSEventInfo
type Logger struct {
// QOSLogger is the logrus entry that carries the fields attached by
// NewLogger.
QOSLogger *logrus.Entry
// EnableQOSLogging holds the flag passed to NewLogger.
// NOTE(review): presumably consulted by callers before emitting QOS
// events; no reader is visible in this file - confirm.
EnableQOSLogging bool
}

// QOSEventInfo is a type used to store qosEvents' info when logging is delayed
type QOSEventInfo struct {
// Start and End are the recorded start and end times.
// NOTE(review): presumably of the operation the event describes - confirm
// against the code that populates them.
Start time.Time
End time.Time
// Properties holds additional key/value data stored with the event.
Properties map[string]interface{}
}

var (
// logger is the package-level Logger. It has no initializer, so it is nil
// until NewLogger assigns it.
logger *Logger
)

// Field names and fixed values used when building QOS log entries.
const (
// SourceFieldName source
SourceFieldName = "source"
// ClusterAutoscalerQosLog source field name for the QosLogger; NewLogger
// stores it as the value of the SourceFieldName field.
ClusterAutoscalerQosLog = "ClusterAutoscalerQosLog"
// epochFieldName is the field under which NewLogger records the random
// epoch string.
epochFieldName = "env_epoch"
// fileNameFieldName and lineNumberFieldName are filled in by withCallerInfo.
fileNameFieldName = "fileName"
lineNumberFieldName = "lineNumber"
methodNameFieldName = "methodName"
durationInMillisecondsFieldName = "durationInMilliseconds"
resultFieldName = "result"
// errorDetailsFieldName, errorTypeFieldName and errorReasonFieldName are the
// keys of the map returned by getAutoscalerErrorInfo.
errorDetailsFieldName = "errorDetails"
errorTypeFieldName = "errorType"
errorReasonFieldName = "errorReason"
startTimeFieldName = "startTime"
endTimeFieldName = "endTime"
// upperCaseAlphanumeric is the alphabet getEpochRandomString draws its
// characters from.
upperCaseAlphanumeric = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"
)

// NewLogger creates the package-level Logger used for QOS events and stores
// it in the logger variable; it returns nothing.
//
// The underlying logrus logger emits JSON. Every entry carries a random epoch
// string, the caller information gathered by withCallerInfo, and the source
// field set to ClusterAutoscalerQosLog. enableQOSLogging is stored on the
// Logger as EnableQOSLogging.
func NewLogger(enableQOSLogging bool) {
	entryLogger := logrus.New()
	entryLogger.Formatter = &logrus.JSONFormatter{}
	log := logrus.NewEntry(entryLogger)

	epoch, err := getEpochRandomString()
	if err != nil {
		// Best effort: the logger stays usable, the epoch is just left empty.
		entryLogger.Warnf("generating QOS logger epoch: %v", err)
	}
	// WithField returns a new entry instead of mutating its receiver, so the
	// result has to be kept or the epoch field would be lost.
	log = log.WithField(epochFieldName, epoch)
	log = withCallerInfo(log)

	logger = &Logger{
		QOSLogger:        log.WithField(SourceFieldName, ClusterAutoscalerQosLog),
		EnableQOSLogging: enableQOSLogging,
	}
}

// withCallerInfo returns a copy of the given entry with the fileName and
// lineNumber fields set from runtime.Caller(3), i.e. the frame three levels
// above this function. If the frame cannot be resolved the fields hold the
// zero values (empty file name, line 0), because the ok result is ignored.
func withCallerInfo(logger *logrus.Entry) *logrus.Entry {
	_, callerFile, callerLine, _ := runtime.Caller(3)
	return logger.WithFields(map[string]interface{}{
		fileNameFieldName:   callerFile,
		lineNumberFieldName: callerLine,
	})
}

// getAutoscalerErrorInfo turns an AutoscalerError into the error fields of a
// QosEvent plus a result string.
//
// For a nil error all three fields (errorDetails, errorType, errorReason) are
// nil and the result is "Succeeded". Otherwise they hold the error's message,
// type and reason, and the result is "Failed".
func getAutoscalerErrorInfo(autoscalerErrors AutoscalerError) (map[string]interface{}, string) {
	if autoscalerErrors == nil {
		return map[string]interface{}{
			errorDetailsFieldName: nil,
			errorTypeFieldName:    nil,
			errorReasonFieldName:  nil,
		}, "Succeeded"
	}
	return map[string]interface{}{
		errorDetailsFieldName: autoscalerErrors.Error(),
		errorTypeFieldName:    autoscalerErrors.Type(),
		errorReasonFieldName:  autoscalerErrors.Reason(),
	}, "Failed"
}

// getEpochRandomString returns a random 5-character string drawn from
// upperCaseAlphanumeric. Each character is chosen by reducing one byte from
// crypto/rand modulo the alphabet length. If reading random bytes fails, the
// empty string and the error are returned.
func getEpochRandomString() (string, error) {
	const epochLength = 5
	buf := make([]byte, epochLength)
	if _, err := rand.Read(buf); err != nil {
		return "", err
	}
	alphabetSize := byte(len(upperCaseAlphanumeric))
	for i := range buf {
		buf[i] = upperCaseAlphanumeric[buf[i]%alphabetSize]
	}
	return string(buf), nil
}
2 changes: 2 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ import (
)

const (
azurePrefix = "azure://"

vmTypeVMSS = "vmss"
vmTypeStandard = "standard"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,7 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) {
curSize: 3,
sizeRefreshPeriod: manager.azureCache.refreshInterval,
instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod,
scaleDownPolicy: ScaleDownPolicyDelete,
}}
assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs)
}
Expand Down
Loading

0 comments on commit b33b8fb

Please sign in to comment.