diff --git a/cluster-autoscaler/cloudprovider/hetzner/README.md b/cluster-autoscaler/cloudprovider/hetzner/README.md index 066d87b7cddb..b4af539f2ccb 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/README.md +++ b/cluster-autoscaler/cloudprovider/hetzner/README.md @@ -10,6 +10,38 @@ The cluster autoscaler for Hetzner Cloud scales worker nodes. `HCLOUD_IMAGE` Defaults to `ubuntu-20.04`, @see https://docs.hetzner.cloud/#images. You can also use an image ID here (e.g. `15512617`), or a label selector associated with a custom snapshot (e.g. `customized_ubuntu=true`). The most recent snapshot will be used in the latter case. +`HCLOUD_CLUSTER_CONFIG` This is the new format replacing + * `HCLOUD_CLOUD_INIT` + * `HCLOUD_IMAGE` + + Base64 encoded JSON according to the following structure + +```json +{ + "imagesForArch": { // These should be the same format as HCLOUD_IMAGE + "arm64": "", + "amd64": "" + }, + "nodeConfigs": { + "pool1": { // This equals the pool name. Required for each pool that you have + "cloudInit": "", // Plain-text cloud-init (the decoded content of HCLOUD_CLOUD_INIT); do not base64 encode it here, the whole JSON document is already base64 encoded + "labels": { + "node.kubernetes.io/role": "autoscaler-node" + }, + "taints": + [ + { + "key": "node.kubernetes.io/role", + "value": "autoscaler-node", + "effect": "NoExecute" + } + ] + } + } +} +``` + + `HCLOUD_NETWORK` Default empty , The name of the network that is used in the cluster , @see https://docs.hetzner.cloud/#networks `HCLOUD_FIREWALL` Default empty , The name of the firewall that is used in the cluster , @see https://docs.hetzner.cloud/#firewalls diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go index 0deab54b679a..ad9e35d2e85e 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_cloud_provider.go @@ -191,9 +191,12 @@ func BuildHetzner(_ config.AutoscalingOptions, do 
cloudprovider.NodeGroupDiscove klog.Fatalf("Failed to create Hetzner cloud provider: %v", err) } + if manager.clusterConfig.IsUsingNewFormat && len(manager.clusterConfig.NodeConfigs) == 0 { + klog.Fatalf("No cluster config present provider: %v", err) + } + validNodePoolName := regexp.MustCompile(`^[a-z0-9A-Z]+[a-z0-9A-Z\-\.\_]*[a-z0-9A-Z]+$|^[a-z0-9A-Z]{1}$`) clusterUpdateLock := sync.Mutex{} - for _, nodegroupSpec := range do.NodeGroupSpecs { spec, err := createNodePoolSpec(nodegroupSpec) if err != nil { @@ -206,6 +209,13 @@ func BuildHetzner(_ config.AutoscalingOptions, do cloudprovider.NodeGroupDiscove klog.Fatalf("Failed to get servers for for node pool %s error: %v", nodegroupSpec, err) } + if manager.clusterConfig.IsUsingNewFormat { + _, ok := manager.clusterConfig.NodeConfigs[spec.name] + if !ok { + klog.Fatalf("No node config present for node group id `%s` error: %v", spec.name, err) + } + } + manager.nodeGroups[spec.name] = &hetznerNodeGroup{ manager: manager, id: spec.name, diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_manager.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_manager.go index 2d2afd44a5c1..89c1f12b6196 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_manager.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_manager.go @@ -19,6 +19,7 @@ package hetzner import ( "context" "encoding/base64" + "encoding/json" "errors" "fmt" "net/http" @@ -45,8 +46,7 @@ type hetznerManager struct { client *hcloud.Client nodeGroups map[string]*hetznerNodeGroup apiCallContext context.Context - cloudInit string - image string + clusterConfig *ClusterConfig sshKey *hcloud.SSHKey network *hcloud.Network firewall *hcloud.Firewall @@ -57,6 +57,33 @@ type hetznerManager struct { cachedServers *serversCache } +// ClusterConfig holds the configuration for all the nodepools +type ClusterConfig struct { + ImagesForArch ImageList + NodeConfigs map[string]*NodeConfig + IsUsingNewFormat bool + LegacyConfig LegacyConfig +} + +// 
ImageList holds the image id/names for the different architectures +type ImageList struct { + Arm64 string + Amd64 string +} + +// NodeConfig holds the configuration for a single nodepool +type NodeConfig struct { + CloudInit string + Taints []apiv1.Taint + Labels map[string]string +} + +// LegacyConfig holds the configuration in the legacy format +type LegacyConfig struct { + CloudInit string + ImageName string +} + func newManager() (*hetznerManager, error) { token := os.Getenv("HCLOUD_TOKEN") if token == "" { @@ -71,19 +98,44 @@ func newManager() (*hetznerManager, error) { ) ctx := context.Background() + var err error + clusterConfigBase64 := os.Getenv("HCLOUD_CLUSTER_CONFIG") cloudInitBase64 := os.Getenv("HCLOUD_CLOUD_INIT") - if cloudInitBase64 == "" { - return nil, errors.New("`HCLOUD_CLOUD_INIT` is not specified") + + if clusterConfigBase64 == "" && cloudInitBase64 == "" { + return nil, errors.New("`HCLOUD_CLUSTER_CONFIG` or `HCLOUD_CLOUD_INIT` is not specified") } - cloudInit, err := base64.StdEncoding.DecodeString(cloudInitBase64) - if err != nil { - return nil, fmt.Errorf("failed to parse cloud init error: %s", err) + var clusterConfig *ClusterConfig = &ClusterConfig{} + + if clusterConfigBase64 != "" { + clusterConfig.IsUsingNewFormat = true } - imageName := os.Getenv("HCLOUD_IMAGE") - if imageName == "" { - imageName = "ubuntu-20.04" + if clusterConfig.IsUsingNewFormat { + clusterConfigEnv, err := base64.StdEncoding.DecodeString(clusterConfigBase64) + if err != nil { + return nil, fmt.Errorf("failed to parse cluster config error: %s", err) + } + err = json.Unmarshal(clusterConfigEnv, &clusterConfig) + if err != nil { + return nil, fmt.Errorf("failed to unmarshal cluster config JSON: %s", err) + } + } + + if !clusterConfig.IsUsingNewFormat { + cloudInit, err := base64.StdEncoding.DecodeString(cloudInitBase64) + if err != nil { + return nil, fmt.Errorf("failed to parse cloud init error: %s", err) + } + + imageName := os.Getenv("HCLOUD_IMAGE") + if 
imageName == "" { + imageName = "ubuntu-20.04" + } + + clusterConfig.LegacyConfig.CloudInit = string(cloudInit) + clusterConfig.LegacyConfig.ImageName = imageName } publicIPv4 := true @@ -141,8 +193,6 @@ func newManager() (*hetznerManager, error) { m := &hetznerManager{ client: client, nodeGroups: make(map[string]*hetznerNodeGroup), - cloudInit: string(cloudInit), - image: imageName, sshKey: sshKey, network: network, firewall: firewall, @@ -150,6 +200,7 @@ func newManager() (*hetznerManager, error) { apiCallContext: ctx, publicIPv4: publicIPv4, publicIPv6: publicIPv6, + clusterConfig: clusterConfig, cachedServerType: newServerTypeCache(ctx, client), cachedServers: newServersCache(ctx, client), } diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go index af79f94c8987..c819cfb4886f 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go @@ -19,6 +19,7 @@ package hetzner import ( "context" "fmt" + "maps" "math/rand" "strings" "sync" @@ -241,6 +242,16 @@ func (n *hetznerNodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, err } node.Labels = cloudprovider.JoinStringMaps(node.Labels, nodeGroupLabels) + if n.manager.clusterConfig.IsUsingNewFormat && n.id != drainingNodePoolId { + for _, taint := range n.manager.clusterConfig.NodeConfigs[n.id].Taints { + node.Spec.Taints = append(node.Spec.Taints, apiv1.Taint{ + Key: taint.Key, + Value: taint.Value, + Effect: taint.Effect, + }) + } + } + nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(n.id)) nodeInfo.SetNode(&node) @@ -325,14 +336,23 @@ func buildNodeGroupLabels(n *hetznerNodeGroup) (map[string]string, error) { if err != nil { return nil, err } + klog.V(4).Infof("Build node group label for %s", n.id) - return map[string]string{ + labels := map[string]string{ apiv1.LabelInstanceType: n.instanceType, 
apiv1.LabelTopologyRegion: n.region, apiv1.LabelArchStable: archLabel, "csi.hetzner.cloud/location": n.region, nodeGroupLabel: n.id, - }, nil + } + + if n.manager.clusterConfig.IsUsingNewFormat && n.id != drainingNodePoolId { + maps.Copy(labels, n.manager.clusterConfig.NodeConfigs[n.id].Labels) + } + + klog.V(4).Infof("%s nodegroup labels: %s", n.id, labels) + + return labels, nil } func getMachineTypeResourceList(m *hetznerManager, instanceType string) (apiv1.ResourceList, error) { @@ -392,10 +412,16 @@ func createServer(n *hetznerNodeGroup) error { return err } + cloudInit := n.manager.clusterConfig.LegacyConfig.CloudInit + + if n.manager.clusterConfig.IsUsingNewFormat { + cloudInit = n.manager.clusterConfig.NodeConfigs[n.id].CloudInit + } + StartAfterCreate := true opts := hcloud.ServerCreateOpts{ Name: newNodeName(n), - UserData: n.manager.cloudInit, + UserData: cloudInit, Location: &hcloud.Location{Name: n.region}, ServerType: serverType, Image: image, @@ -443,7 +469,18 @@ func createServer(n *hetznerNodeGroup) error { // server. 
func findImage(n *hetznerNodeGroup, serverType *hcloud.ServerType) (*hcloud.Image, error) { // Select correct image based on server type architecture - image, _, err := n.manager.client.Image.GetForArchitecture(context.TODO(), n.manager.image, serverType.Architecture) + imageName := n.manager.clusterConfig.LegacyConfig.ImageName + if n.manager.clusterConfig.IsUsingNewFormat { + if serverType.Architecture == hcloud.ArchitectureARM { + imageName = n.manager.clusterConfig.ImagesForArch.Arm64 + } + + if serverType.Architecture == hcloud.ArchitectureX86 { + imageName = n.manager.clusterConfig.ImagesForArch.Amd64 + } + } + + image, _, err := n.manager.client.Image.GetForArchitecture(context.TODO(), imageName, serverType.Architecture) if err != nil { // Keep looking for label if image was not found by id or name if !strings.HasPrefix(err.Error(), "image not found") { @@ -462,12 +499,12 @@ func findImage(n *hetznerNodeGroup, serverType *hcloud.ServerType) (*hcloud.Imag Sort: []string{"created:desc"}, Architecture: []hcloud.Architecture{serverType.Architecture}, ListOpts: hcloud.ListOpts{ - LabelSelector: n.manager.image, + LabelSelector: imageName, }, }) if err != nil || len(images) == 0 { - return nil, fmt.Errorf("unable to find image %s with architecture %s: %v", n.manager.image, serverType.Architecture, err) + return nil, fmt.Errorf("unable to find image %s with architecture %s: %v", imageName, serverType.Architecture, err) } return images[0], nil