Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: node hibernate #6202

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions charts/cluster-autoscaler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,7 @@ vpa:
| azureClientID | string | `""` | Service Principal ClientID with contributor permission to Cluster and Node ResourceGroup. Required if `cloudProvider=azure` |
| azureClientSecret | string | `""` | Service Principal ClientSecret with contributor permission to Cluster and Node ResourceGroup. Required if `cloudProvider=azure` |
| azureResourceGroup | string | `""` | Azure resource group that the cluster is located. Required if `cloudProvider=azure` |
| azureScaleDownPolicy | string | `"Delete"` | Azure ScaleDownPolicy, either "Delete" (default) or "Hibernate". Only relevant if `cloudProvider=azure` |
| azureSubscriptionID | string | `""` | Azure subscription where the resources are located. Required if `cloudProvider=azure` |
| azureTenantID | string | `""` | Azure tenant where the resources are located. Required if `cloudProvider=azure` |
| azureUseManagedIdentityExtension | bool | `false` | Whether to use Azure's managed identity extension for credentials. If using MSI, ensure subscription ID, resource group, and azure AKS cluster name are set. You can only use one authentication method at a time, either azureUseWorkloadIdentityExtension or azureUseManagedIdentityExtension should be set. |
Expand Down
5 changes: 5 additions & 0 deletions charts/cluster-autoscaler/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,11 @@ spec:
secretKeyRef:
key: VMType
name: {{ default (include "cluster-autoscaler.fullname" .) .Values.secretKeyRefNameOverride }}
- name: AZURE_SCALE_DOWN_POLICY
valueFrom:
secretKeyRef:
key: ScaleDownPolicy
name: {{ default (include "cluster-autoscaler.fullname" .) .Values.secretKeyRefNameOverride }}
{{- if .Values.azureUseWorkloadIdentityExtension }}
- name: ARM_USE_WORKLOAD_IDENTITY_EXTENSION
value: "true"
Expand Down
1 change: 1 addition & 0 deletions charts/cluster-autoscaler/templates/secret.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ data:
SubscriptionID: "{{ .Values.azureSubscriptionID | b64enc }}"
TenantID: "{{ .Values.azureTenantID | b64enc }}"
VMType: "{{ .Values.azureVMType | b64enc }}"
ScaleDownPolicy: "{{ .Values.azureScaleDownPolicy | b64enc }}"
{{- else if $isAws }}
AwsAccessKeyId: "{{ .Values.awsAccessKeyID | b64enc }}"
AwsSecretAccessKey: "{{ .Values.awsSecretAccessKey | b64enc }}"
Expand Down
4 changes: 4 additions & 0 deletions charts/cluster-autoscaler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ azureUseWorkloadIdentityExtension: false
# azureVMType -- Azure VM type.
azureVMType: "vmss"

# azureScaleDownPolicy -- Azure ScaleDownPolicy, either "Delete" (default) or "Hibernate".
# Only relevant if `cloudProvider=azure`.
azureScaleDownPolicy: "Delete"

# cloudConfigPath -- Configuration file for cloud provider.
cloudConfigPath: ""

Expand Down
4 changes: 4 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ type Config struct {

// EnableVmssFlex defines whether to enable Vmss Flex support or not
EnableVmssFlex bool `json:"enableVmssFlex,omitempty" yaml:"enableVmssFlex,omitempty"`

// ScaleDownPolicy is the VMSS scale down policy, either "Delete" or "Hibernate"
ScaleDownPolicy string `json:"scaleDownPolicy" yaml:"scaleDownPolicy"`
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Jont828 FYI this is where a new feature config lives for the azure provider

}

// BuildAzureConfig returns a Config object for the Azure clients
Expand Down Expand Up @@ -169,6 +172,7 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
cfg.AADClientCertPath = os.Getenv("ARM_CLIENT_CERT_PATH")
cfg.AADClientCertPassword = os.Getenv("ARM_CLIENT_CERT_PASSWORD")
cfg.Deployment = os.Getenv("ARM_DEPLOYMENT")
cfg.ScaleDownPolicy = os.Getenv("AZURE_SCALE_DOWN_POLICY")

subscriptionID, err := getSubscriptionIdFromInstanceMetadata()
if err != nil {
Expand Down
204 changes: 204 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_error.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package azure

import (
"encoding/json"
"fmt"
"strings"

"github.com/Azure/go-autorest/autorest/azure"
"sigs.k8s.io/cloud-provider-azure/pkg/retry"
)

// Unknown is the reason reported when no specific Azure error code can be
// extracted from an error: the RawError is nil, its body cannot be decoded,
// the decoded body carries no service error, or the service error code is empty.
const Unknown CloudProviderErrorReason = "Unknown"

// Errors on the sync path
const (
// QuotaExceeded falls under the OperationNotAllowed error code, but we make it more specific here.
// It is selected when the service error message mentions "Quota increase".
QuotaExceeded CloudProviderErrorReason = "QuotaExceeded"
// OperationNotAllowed is an umbrella for a lot of errors returned by Azure.
// It is declared as a plain string because it is compared directly against
// the service error's Code field; it is converted to a
// CloudProviderErrorReason when used as a reason.
OperationNotAllowed string = "OperationNotAllowed"
)

// AutoscalerErrorType describes a high-level category of a given error
type AutoscalerErrorType string

// AutoscalerErrorReason is a more detailed reason for the failed operation
type AutoscalerErrorReason string

// CloudProviderErrorReason provides more details on errors of type CloudProviderError.
// Its underlying type is AutoscalerErrorReason, so values convert directly between the two.
type CloudProviderErrorReason AutoscalerErrorReason

// AutoscalerError is an error that additionally carries a high-level
// category (Type) and a more detailed Reason.
type AutoscalerError interface {
	// error contributes Error() string, so every AutoscalerError can be used
	// wherever a plain Go error is expected.
	error

	// Type reports the high-level category of the error.
	Type() AutoscalerErrorType

	// Reason reports the detailed reason for the failed operation.
	Reason() AutoscalerErrorReason

	// AddPrefix prepends a prefix, built from msg and args as a format string,
	// to the error message and returns the resulting error so the call can be
	// used inline:
	//
	//	if err := DoSomething(myObject); err != nil {
	//		return err.AddPrefix("can't do something with %v: ", myObject)
	//	}
	AddPrefix(msg string, args ...interface{}) AutoscalerError
}

// autoscalerErrorImpl is the AutoscalerError implementation returned by the
// constructors in this file. Its methods use value receivers, so method calls
// operate on a copy of the struct.
type autoscalerErrorImpl struct {
// errorType is the category returned by Type.
errorType AutoscalerErrorType
// errorReason is the detailed reason returned by Reason; it is empty when
// the error was built without a reason.
errorReason AutoscalerErrorReason
// msg is the fully formatted message returned by Error.
msg string
}

// Values of AutoscalerErrorType.
const (
// CloudProviderError is an error related to underlying infrastructure
CloudProviderError AutoscalerErrorType = "CloudProviderError"
// ApiCallError is an error related to communication with k8s API server
ApiCallError AutoscalerErrorType = "ApiCallError"
// Timeout is an error related to nodes not joining the cluster in maxNodeProvisionTime
Timeout AutoscalerErrorType = "Timeout"
// InternalError is an error inside Cluster Autoscaler
InternalError AutoscalerErrorType = "InternalError"
// TransientError is an error that causes us to skip a single loop, but
// does not require any additional action.
TransientError AutoscalerErrorType = "TransientError"
// ConfigurationError is an error related to bad configuration provided
// by a user.
ConfigurationError AutoscalerErrorType = "ConfigurationError"
// NodeGroupDoesNotExistError signifies that a NodeGroup
// does not exist.
// NOTE(review): unlike the other values, this string starts with a
// lower-case letter; presumably kept for compatibility — confirm before changing.
NodeGroupDoesNotExistError AutoscalerErrorType = "nodeGroupDoesNotExistError"
)

// Values of AutoscalerErrorReason that are not specific to the cloud provider.
const (
// NodeRegistration signifies an error with node registering
NodeRegistration AutoscalerErrorReason = "NodeRegistration"
)

// NewAutoscalerError builds an AutoscalerError of the given type whose message
// is msg formatted with args (fmt.Sprintf semantics). The reason is left empty.
func NewAutoscalerError(errorType AutoscalerErrorType, msg string, args ...interface{}) AutoscalerError {
	formatted := fmt.Sprintf(msg, args...)
	return autoscalerErrorImpl{errorType: errorType, msg: formatted}
}

// NewAutoscalerErrorWithReason builds an AutoscalerError of the given type and
// reason whose message is msg formatted with args (fmt.Sprintf semantics).
func NewAutoscalerErrorWithReason(errorType AutoscalerErrorType, reason AutoscalerErrorReason, msg string, args ...interface{}) AutoscalerError {
	var result autoscalerErrorImpl
	result.errorType = errorType
	result.errorReason = reason
	result.msg = fmt.Sprintf(msg, args...)
	return result
}

// NewAutoscalerCloudProviderError builds an AutoscalerError whose type is
// always CloudProviderError. errorReason is stored as the error's reason and
// the message is msg formatted with args (fmt.Sprintf semantics).
func NewAutoscalerCloudProviderError(errorReason CloudProviderErrorReason, msg string, args ...interface{}) AutoscalerError {
	text := fmt.Sprintf(msg, args...)
	reason := AutoscalerErrorReason(errorReason)
	return autoscalerErrorImpl{errorType: CloudProviderError, errorReason: reason, msg: text}
}

// ToAutoscalerError turns an arbitrary error into an AutoscalerError.
//
// A nil err yields nil. An err that already implements AutoscalerError is
// returned unchanged. Any other error is wrapped in a new AutoscalerError of
// defaultType whose message is the "%v" rendering of err.
func ToAutoscalerError(defaultType AutoscalerErrorType, err error) AutoscalerError {
	switch typed := err.(type) {
	case nil:
		return nil
	case AutoscalerError:
		return typed
	default:
		return NewAutoscalerError(defaultType, "%v", err)
	}
}

// Error implements the golang error interface by returning the stored,
// already-formatted message.
func (e autoscalerErrorImpl) Error() string {
return e.msg
}

// Type returns the type (high-level category) of the AutoscalerError.
func (e autoscalerErrorImpl) Type() AutoscalerErrorType {
return e.errorType
}

// Reason returns the reason of the AutoscalerError. It is the empty string
// when the error was created without a reason.
func (e autoscalerErrorImpl) Reason() AutoscalerErrorReason {
return e.errorReason
}

// AddPrefix returns an error whose message is the receiver's message preceded
// by msg formatted with args (fmt.Sprintf semantics). Type and reason are
// carried over unchanged. The receiver is passed by value, so the error the
// method is called on is not modified.
//
// Example:
//
//	if err := DoSomething(myObject); err != nil {
//		return err.AddPrefix("can't do something with %v: ", myObject)
//	}
func (e autoscalerErrorImpl) AddPrefix(msg string, args ...interface{}) AutoscalerError {
	prefix := fmt.Sprintf(msg, args...)
	return autoscalerErrorImpl{
		errorType:   e.errorType,
		errorReason: e.errorReason,
		msg:         prefix + e.msg,
	}
}

// ServiceRawError wraps the RawError returned by the k8s/cloudprovider
// Azure clients. The error body is decoded as JSON, and the object under its
// top-level "error" key should satisfy the azure.ServiceError type.
type ServiceRawError struct {
// ServiceError is the decoded "error" object; it stays nil when the body
// has no such key.
ServiceError *azure.ServiceError `json:"error,omitempty"`
}

// azureToAutoscalerError converts a retry.Error returned by the Azure clients
// into an AutoscalerError of type CloudProviderError.
//
// It returns nil when rerr is nil. Otherwise the text of rerr.RawError is
// decoded as JSON into a ServiceRawError and the reason is derived from the
// service error code:
//   - an empty code maps to Unknown;
//   - OperationNotAllowed is refined by getOperationNotAllowedReason;
//   - any other code is used verbatim as the reason.
//
// If RawError is nil, the body is not valid JSON, or it holds no service
// error, the result has reason Unknown and carries the message of rerr itself.
//
// Messages coming from Azure are always passed through an explicit "%s"
// format: they are arbitrary text and must never be interpreted as a format
// string, otherwise any '%' they contain would be mangled by fmt.Sprintf.
func azureToAutoscalerError(rerr *retry.Error) AutoscalerError {
	if rerr == nil {
		return nil
	}

	// unknown builds the fallback error used whenever no service error code
	// can be extracted.
	unknown := func() AutoscalerError {
		return NewAutoscalerCloudProviderError(Unknown, "%s", rerr.Error().Error())
	}

	if rerr.RawError == nil {
		return unknown()
	}

	re := ServiceRawError{}
	if err := json.Unmarshal([]byte(rerr.RawError.Error()), &re); err != nil {
		return unknown()
	}
	se := re.ServiceError
	if se == nil {
		return unknown()
	}

	var errCode CloudProviderErrorReason
	switch se.Code {
	case "":
		errCode = Unknown
	case OperationNotAllowed:
		errCode = getOperationNotAllowedReason(se)
	default:
		errCode = CloudProviderErrorReason(se.Code)
	}
	return NewAutoscalerCloudProviderError(errCode, "%s", se.Message)
}

// getOperationNotAllowedReason picks the reason for a service error whose code
// is OperationNotAllowed: QuotaExceeded when the message contains
// "Quota increase", and OperationNotAllowed itself otherwise.
func getOperationNotAllowedReason(se *azure.ServiceError) CloudProviderErrorReason {
	reason := CloudProviderErrorReason(OperationNotAllowed)
	if strings.Contains(se.Message, "Quota increase") {
		reason = QuotaExceeded
	}
	return reason
}
2 changes: 2 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ import (
)

const (
azurePrefix = "azure://"

vmTypeVMSS = "vmss"
vmTypeStandard = "standard"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,7 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) {
curSize: 3,
sizeRefreshPeriod: manager.azureCache.refreshInterval,
instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod,
scaleDownPolicy: cloudprovider.Delete,
}}
assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs)
}
Expand Down
Loading
Loading