From e450c57ed260c63bff62d21ada76bbd19fb44bd7 Mon Sep 17 00:00:00 2001 From: Brian Mendoza Date: Mon, 8 Apr 2024 09:57:55 -0400 Subject: [PATCH] requeue during machine preflight --- cloud/scope/common.go | 8 +- cloud/scope/machine.go | 5 +- controller/linodemachine_controller.go | 188 ++++++++++++++++--------- util/reconciler/conditions.go | 20 +++ 4 files changed, 150 insertions(+), 71 deletions(-) create mode 100644 util/reconciler/conditions.go diff --git a/cloud/scope/common.go b/cloud/scope/common.go index 3e912f23e..23e42b600 100644 --- a/cloud/scope/common.go +++ b/cloud/scope/common.go @@ -14,7 +14,9 @@ import ( "github.com/linode/cluster-api-provider-linode/version" ) -func CreateLinodeClient(apiKey string) (*linodego.Client, error) { +type ClientOpt func(*linodego.Client) + +func CreateLinodeClient(apiKey string, opts ...ClientOpt) (*linodego.Client, error) { if apiKey == "" { return nil, errors.New("missing Linode API key") } @@ -28,6 +30,10 @@ func CreateLinodeClient(apiKey string) (*linodego.Client, error) { } linodeClient := linodego.NewClient(oauth2Client) + for _, opt := range opts { + opt(&linodeClient) + } + linodeClient.SetUserAgent(fmt.Sprintf("CAPL/%s", version.GetVersion())) return &linodeClient, nil diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 37ee24b38..2ecbc1aca 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" + "github.com/linode/linodego" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -82,7 +83,9 @@ func NewMachineScope(ctx context.Context, apiKey string, params MachineScopePara } apiKey = string(data) } - linodeClient, err := CreateLinodeClient(apiKey) + linodeClient, err := CreateLinodeClient(apiKey, func(linodeClient *linodego.Client) { + linodeClient.SetRetryCount(0) + }) if err != nil { return nil, fmt.Errorf("failed to create linode client: %w", err) } diff --git a/controller/linodemachine_controller.go b/controller/linodemachine_controller.go index 9c3fea0b5..d74fd3348 100644 --- a/controller/linodemachine_controller.go +++ b/controller/linodemachine_controller.go @@ -49,10 +49,18 @@ import ( "github.com/linode/cluster-api-provider-linode/util/reconciler" ) -// default etcd Disk size in MB const ( + // default etcd disk size in MB defaultEtcdDiskSize = 10240 - defaultResizeTimeoutSeconds = 30 + defaultResizeTimeoutSeconds = 5 + + // conditions for preflight instance creation + ConditionPreflightRootDiskResizing clusterv1.ConditionType = "PreflightRootDiskResizing" + ConditionPreflightRootDiskResized clusterv1.ConditionType = "PreflightRootDiskResized" + ConditionPreflightEtcdDiskCreated clusterv1.ConditionType = "PreflightEtcdDiskCreated" + ConditionPreflightConfigured clusterv1.ConditionType = "PreflightConfigured" + ConditionPreflightBootStarted clusterv1.ConditionType = "PreflightBootStarted" + ConditionPreflightReady clusterv1.ConditionType = "PreflightReady" ) var skippedMachinePhases = map[string]bool{ @@ -61,6 +69,7 @@ var skippedMachinePhases = map[string]bool{ string(clusterv1.MachinePhaseUnknown): true, } +// statuses to keep requeueing on while an instance is booting var requeueInstanceStatuses = map[linodego.InstanceStatus]bool{ linodego.InstanceOffline: true, linodego.InstanceBooting: true, @@ -204,16 +213,15 @@ func (r *LinodeMachineReconciler) reconcile( return } - var linodeInstance *linodego.Instance - defer func() { - machineScope.LinodeMachine.Status.InstanceState = util.Pointer(linodego.InstanceOffline) - if linodeInstance != nil { - machineScope.LinodeMachine.Status.InstanceState = &linodeInstance.Status - } - }() - // Update - if machineScope.LinodeMachine.Spec.InstanceID != nil { + if machineScope.LinodeMachine.Status.InstanceState != nil { + var linodeInstance *linodego.Instance + defer func() { + if linodeInstance != nil { + machineScope.LinodeMachine.Status.InstanceState = &linodeInstance.Status + } + }() + failureReason = cerrs.UpdateMachineError logger = logger.WithValues("ID", *machineScope.LinodeMachine.Spec.InstanceID) @@ -232,7 +240,7 @@ func (r *LinodeMachineReconciler) reconcile( return } - linodeInstance, err = r.reconcileCreate(ctx, logger, machineScope) + res, err = r.reconcileCreate(ctx, logger, machineScope) return } @@ -241,7 +249,7 @@ func (r *LinodeMachineReconciler) reconcileCreate( ctx context.Context, logger logr.Logger, machineScope *scope.MachineScope, -) (*linodego.Instance, error) { +) (ctrl.Result, error) { logger.Info("creating machine") tags := []string{machineScope.LinodeCluster.Name} @@ -253,13 +261,14 @@ func (r *LinodeMachineReconciler) reconcileCreate( } filter, err := listFilter.String() if err != nil { - return nil, err + return ctrl.Result{}, err } linodeInstances, err := machineScope.LinodeClient.ListInstances(ctx, linodego.NewListOptions(1, filter)) if err != nil { logger.Error(err, "Failed to list Linode machine instances") - return nil, err + // TODO: What terminal errors should we not requeue for, and just return an error? + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } var linodeInstance *linodego.Instance @@ -274,43 +283,66 @@ func (r *LinodeMachineReconciler) reconcileCreate( if err != nil { logger.Error(err, "Failed to create Linode machine InstanceCreateOptions") - return nil, err + return ctrl.Result{}, err } + linodeInstance, err = machineScope.LinodeClient.CreateInstance(ctx, *createOpts) - if err != nil || linodeInstance == nil { + if err != nil { logger.Error(err, "Failed to create Linode machine instance") - return nil, err + // TODO: What terminal errors should we not requeue for, and just return an error? + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } + + machineScope.LinodeMachine.Spec.InstanceID = &linodeInstance.ID + + default: + err = errors.New("multiple instances") + logger.Error(err, "multiple instances found", "tags", tags) + + return ctrl.Result{}, err + } + + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightConfigured) { if err = r.configureDisksControlPlane(ctx, logger, machineScope, linodeInstance.ID); err != nil { logger.Error(err, "Failed to configure instance disks") - return nil, err + // TODO: What terminal errors should we not requeue for, and just return an error? + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } + + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightConfigured) + } + + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightBootStarted) { if err = machineScope.LinodeClient.BootInstance(ctx, linodeInstance.ID, 0); err != nil { logger.Error(err, "Failed to boot instance") - return nil, err + // TODO: What terminal errors should we not requeue for, and just return an error? + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } - default: - err = errors.New("multiple instances") - logger.Error(err, "multiple instances found", "tags", tags) - return nil, err + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightBootStarted) } - machineScope.LinodeMachine.Status.Ready = true - machineScope.LinodeMachine.Spec.InstanceID = &linodeInstance.ID - machineScope.LinodeMachine.Spec.ProviderID = util.Pointer(fmt.Sprintf("linode://%d", linodeInstance.ID)) - machineScope.LinodeMachine.Status.Addresses = buildInstanceAddrs(linodeInstance) + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightReady) { + if err = services.AddNodeToNB(ctx, logger, machineScope); err != nil { + logger.Error(err, "Failed to add instance to Node Balancer backend") - if err = services.AddNodeToNB(ctx, logger, machineScope); err != nil { - logger.Error(err, "Failed to add instance to Node Balancer backend") + // TODO: What terminal errors should we not requeue for, and just return an error? + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil + } - return linodeInstance, err + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightReady) } - return linodeInstance, nil + machineScope.LinodeMachine.Spec.ProviderID = util.Pointer(fmt.Sprintf("linode://%d", linodeInstance.ID)) + machineScope.LinodeMachine.Status.Addresses = buildInstanceAddrs(linodeInstance) + + // Set the instance state to signal preflight process is done + machineScope.LinodeMachine.Status.InstanceState = util.Pointer(linodego.InstanceOffline) + + return ctrl.Result{}, nil } func (r *LinodeMachineReconciler) configureDisksControlPlane( @@ -322,51 +354,69 @@ func (r *LinodeMachineReconciler) configureDisksControlPlane( if !kutil.IsControlPlaneMachine(machineScope.Machine) { return nil } - // get the default instance config - configs, err := machineScope.LinodeClient.ListInstanceConfigs(ctx, linodeInstanceID, &linodego.ListOptions{}) - if err != nil || len(configs) == 0 { - logger.Error(err, "Failed to list instance configs") - return err - } - instanceConfig := &configs[0] + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResized) { + // get the default instance config + configs, err := machineScope.LinodeClient.ListInstanceConfigs(ctx, linodeInstanceID, &linodego.ListOptions{}) + if err != nil || len(configs) == 0 { + logger.Error(err, "Failed to list instance configs") - // carve out space for the etcd disk - rootDiskID := instanceConfig.Devices.SDA.DiskID - rootDisk, err := machineScope.LinodeClient.GetInstanceDisk(ctx, linodeInstanceID, rootDiskID) - if err != nil { - logger.Error(err, "Failed to get root disk for instance") + return err + } + instanceConfig := configs[0] - return err - } - diskSize := rootDisk.Size - defaultEtcdDiskSize - if err = machineScope.LinodeClient.ResizeInstanceDisk(ctx, linodeInstanceID, rootDiskID, diskSize); err != nil { - logger.Error(err, "Failed to resize root disk") + if instanceConfig.Devices.SDA == nil { + return errors.New("root disk not yet ready") + } - return err - } - // wait for the disk to resize - _, err = machineScope.LinodeClient.WaitForInstanceDiskStatus(ctx, linodeInstanceID, rootDiskID, linodego.DiskReady, defaultResizeTimeoutSeconds) - if err != nil { - logger.Error(err, fmt.Sprintf("Failed to resize root disk within resize timeout of %d seconds", defaultResizeTimeoutSeconds)) + rootDiskID := instanceConfig.Devices.SDA.DiskID - return err + // carve out space for the etcd disk + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) { + rootDisk, err := machineScope.LinodeClient.GetInstanceDisk(ctx, linodeInstanceID, rootDiskID) + if err != nil { + logger.Error(err, "Failed to get root disk for instance") + + return err + } + diskSize := rootDisk.Size - defaultEtcdDiskSize + if err = machineScope.LinodeClient.ResizeInstanceDisk(ctx, linodeInstanceID, rootDiskID, diskSize); err != nil { + logger.Error(err, "Failed to resize root disk") + + return err + } + + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) + } + + // wait for the disk to resize + if _, err := machineScope.LinodeClient.WaitForInstanceDiskStatus(ctx, linodeInstanceID, rootDiskID, linodego.DiskReady, defaultResizeTimeoutSeconds); err != nil { + logger.Error(err, fmt.Sprintf("Failed to resize root disk within resize timeout of %d seconds", defaultResizeTimeoutSeconds)) + + return err + } + + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResized) + conditions.Delete(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) } - // create the etcd disk - _, err = machineScope.LinodeClient.CreateInstanceDisk( - ctx, - linodeInstanceID, - linodego.InstanceDiskCreateOptions{ - Label: "etcd-data", - Size: defaultEtcdDiskSize, - Filesystem: string(linodego.FilesystemExt4), - }, - ) - if err != nil { - logger.Error(err, "Failed to create etcd disk") + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated) { + // create the etcd disk + if _, err := machineScope.LinodeClient.CreateInstanceDisk( + ctx, + linodeInstanceID, + linodego.InstanceDiskCreateOptions{ + Label: "etcd-data", + Size: defaultEtcdDiskSize, + Filesystem: string(linodego.FilesystemExt4), + }, + ); err != nil { + logger.Error(err, "Failed to create etcd disk") - return err + return err + } + + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated) } return nil diff --git a/util/reconciler/conditions.go b/util/reconciler/conditions.go new file mode 100644 index 000000000..402ee32c5 --- /dev/null +++ b/util/reconciler/conditions.go @@ -0,0 +1,20 @@ +package reconciler + +import ( + corev1 "k8s.io/api/core/v1" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/conditions" +) + +func ConditionTrue(from conditions.Getter, typ clusterv1.ConditionType) bool { + return HasConditionStatus(from, typ, "True") +} + +func HasConditionStatus(from conditions.Getter, typ clusterv1.ConditionType, status corev1.ConditionStatus) bool { + cond := conditions.Get(from, typ) + if cond == nil { + return false + } + + return cond.Status == status +}