From e450c57ed260c63bff62d21ada76bbd19fb44bd7 Mon Sep 17 00:00:00 2001 From: Brian Mendoza Date: Mon, 8 Apr 2024 09:57:55 -0400 Subject: [PATCH 1/8] requeue during machine preflight --- cloud/scope/common.go | 8 +- cloud/scope/machine.go | 5 +- controller/linodemachine_controller.go | 188 ++++++++++++++++--------- util/reconciler/conditions.go | 20 +++ 4 files changed, 150 insertions(+), 71 deletions(-) create mode 100644 util/reconciler/conditions.go diff --git a/cloud/scope/common.go b/cloud/scope/common.go index 3e912f23e..23e42b600 100644 --- a/cloud/scope/common.go +++ b/cloud/scope/common.go @@ -14,7 +14,9 @@ import ( "github.com/linode/cluster-api-provider-linode/version" ) -func CreateLinodeClient(apiKey string) (*linodego.Client, error) { +type ClientOpt func(*linodego.Client) + +func CreateLinodeClient(apiKey string, opts ...ClientOpt) (*linodego.Client, error) { if apiKey == "" { return nil, errors.New("missing Linode API key") } @@ -28,6 +30,10 @@ func CreateLinodeClient(apiKey string) (*linodego.Client, error) { } linodeClient := linodego.NewClient(oauth2Client) + for _, opt := range opts { + opt(&linodeClient) + } + linodeClient.SetUserAgent(fmt.Sprintf("CAPL/%s", version.GetVersion())) return &linodeClient, nil diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 37ee24b38..2ecbc1aca 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" + "github.com/linode/linodego" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -82,7 +83,9 @@ func NewMachineScope(ctx context.Context, apiKey string, params MachineScopePara } apiKey = string(data) } - linodeClient, err := CreateLinodeClient(apiKey) + linodeClient, err := CreateLinodeClient(apiKey, func(linodeClient *linodego.Client) { + linodeClient.SetRetryCount(0) + }) if err != nil { return nil, fmt.Errorf("failed to create linode client: %w", err) } diff --git a/controller/linodemachine_controller.go b/controller/linodemachine_controller.go index 9c3fea0b5..d74fd3348 100644 --- a/controller/linodemachine_controller.go +++ b/controller/linodemachine_controller.go @@ -49,10 +49,18 @@ import ( "github.com/linode/cluster-api-provider-linode/util/reconciler" ) -// default etcd Disk size in MB const ( + // default etcd disk size in MB defaultEtcdDiskSize = 10240 - defaultResizeTimeoutSeconds = 30 + defaultResizeTimeoutSeconds = 5 + + // conditions for preflight instance creation + ConditionPreflightRootDiskResizing clusterv1.ConditionType = "PreflightRootDiskResizing" + ConditionPreflightRootDiskResized clusterv1.ConditionType = "PreflightRootDiskResized" + ConditionPreflightEtcdDiskCreated clusterv1.ConditionType = "PreflightEtcdDiskCreated" + ConditionPreflightConfigured clusterv1.ConditionType = "PreflightConfigured" + ConditionPreflightBootStarted clusterv1.ConditionType = "PreflightBootStarted" + ConditionPreflightReady clusterv1.ConditionType = "PreflightReady" ) var skippedMachinePhases = map[string]bool{ @@ -61,6 +69,7 @@ var skippedMachinePhases = map[string]bool{ string(clusterv1.MachinePhaseUnknown): true, } +// statuses to keep requeueing on while an instance is booting var requeueInstanceStatuses = map[linodego.InstanceStatus]bool{ linodego.InstanceOffline: true, linodego.InstanceBooting: true, @@ -204,16 +213,15 @@ func (r *LinodeMachineReconciler) reconcile( return } - var linodeInstance *linodego.Instance - defer func() { - machineScope.LinodeMachine.Status.InstanceState = util.Pointer(linodego.InstanceOffline) - if linodeInstance != nil { - machineScope.LinodeMachine.Status.InstanceState = &linodeInstance.Status - } - }() - // Update - if machineScope.LinodeMachine.Spec.InstanceID != nil { + if machineScope.LinodeMachine.Status.InstanceState != nil { + var linodeInstance *linodego.Instance + defer func() { + if linodeInstance != nil { + machineScope.LinodeMachine.Status.InstanceState = &linodeInstance.Status + } + }() + failureReason = cerrs.UpdateMachineError logger = logger.WithValues("ID", *machineScope.LinodeMachine.Spec.InstanceID) @@ -232,7 +240,7 @@ func (r *LinodeMachineReconciler) reconcile( return } - linodeInstance, err = r.reconcileCreate(ctx, logger, machineScope) + res, err = r.reconcileCreate(ctx, logger, machineScope) return } @@ -241,7 +249,7 @@ func (r *LinodeMachineReconciler) reconcileCreate( ctx context.Context, logger logr.Logger, machineScope *scope.MachineScope, -) (*linodego.Instance, error) { +) (ctrl.Result, error) { logger.Info("creating machine") tags := []string{machineScope.LinodeCluster.Name} @@ -253,13 +261,14 @@ func (r *LinodeMachineReconciler) reconcileCreate( } filter, err := listFilter.String() if err != nil { - return nil, err + return ctrl.Result{}, err } linodeInstances, err := machineScope.LinodeClient.ListInstances(ctx, linodego.NewListOptions(1, filter)) if err != nil { logger.Error(err, "Failed to list Linode machine instances") - return nil, err + // TODO: What terminal errors should we not requeue for, and just return an error? + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } var linodeInstance *linodego.Instance @@ -274,43 +283,66 @@ func (r *LinodeMachineReconciler) reconcileCreate( if err != nil { logger.Error(err, "Failed to create Linode machine InstanceCreateOptions") - return nil, err + return ctrl.Result{}, err } + linodeInstance, err = machineScope.LinodeClient.CreateInstance(ctx, *createOpts) - if err != nil || linodeInstance == nil { + if err != nil { logger.Error(err, "Failed to create Linode machine instance") - return nil, err + // TODO: What terminal errors should we not requeue for, and just return an error? + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } + + machineScope.LinodeMachine.Spec.InstanceID = &linodeInstance.ID + + default: + err = errors.New("multiple instances") + logger.Error(err, "multiple instances found", "tags", tags) + + return ctrl.Result{}, err + } + + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightConfigured) { if err = r.configureDisksControlPlane(ctx, logger, machineScope, linodeInstance.ID); err != nil { logger.Error(err, "Failed to configure instance disks") - return nil, err + // TODO: What terminal errors should we not requeue for, and just return an error? + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } + + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightConfigured) + } + + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightBootStarted) { if err = machineScope.LinodeClient.BootInstance(ctx, linodeInstance.ID, 0); err != nil { logger.Error(err, "Failed to boot instance") - return nil, err + // TODO: What terminal errors should we not requeue for, and just return an error? + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } - default: - err = errors.New("multiple instances") - logger.Error(err, "multiple instances found", "tags", tags) - return nil, err + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightBootStarted) } - machineScope.LinodeMachine.Status.Ready = true - machineScope.LinodeMachine.Spec.InstanceID = &linodeInstance.ID - machineScope.LinodeMachine.Spec.ProviderID = util.Pointer(fmt.Sprintf("linode://%d", linodeInstance.ID)) - machineScope.LinodeMachine.Status.Addresses = buildInstanceAddrs(linodeInstance) + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightReady) { + if err = services.AddNodeToNB(ctx, logger, machineScope); err != nil { + logger.Error(err, "Failed to add instance to Node Balancer backend") - if err = services.AddNodeToNB(ctx, logger, machineScope); err != nil { - logger.Error(err, "Failed to add instance to Node Balancer backend") + // TODO: What terminal errors should we not requeue for, and just return an error? + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil + } - return linodeInstance, err + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightReady) } - return linodeInstance, nil + machineScope.LinodeMachine.Spec.ProviderID = util.Pointer(fmt.Sprintf("linode://%d", linodeInstance.ID)) + machineScope.LinodeMachine.Status.Addresses = buildInstanceAddrs(linodeInstance) + + // Set the instance state to signal preflight process is done + machineScope.LinodeMachine.Status.InstanceState = util.Pointer(linodego.InstanceOffline) + + return ctrl.Result{}, nil } func (r *LinodeMachineReconciler) configureDisksControlPlane( @@ -322,51 +354,69 @@ func (r *LinodeMachineReconciler) configureDisksControlPlane( if !kutil.IsControlPlaneMachine(machineScope.Machine) { return nil } - // get the default instance config - configs, err := machineScope.LinodeClient.ListInstanceConfigs(ctx, linodeInstanceID, &linodego.ListOptions{}) - if err != nil || len(configs) == 0 { - logger.Error(err, "Failed to list instance configs") - return err - } - instanceConfig := &configs[0] + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResized) { + // get the default instance config + configs, err := machineScope.LinodeClient.ListInstanceConfigs(ctx, linodeInstanceID, &linodego.ListOptions{}) + if err != nil || len(configs) == 0 { + logger.Error(err, "Failed to list instance configs") - // carve out space for the etcd disk - rootDiskID := instanceConfig.Devices.SDA.DiskID - rootDisk, err := machineScope.LinodeClient.GetInstanceDisk(ctx, linodeInstanceID, rootDiskID) - if err != nil { - logger.Error(err, "Failed to get root disk for instance") + return err + } + instanceConfig := configs[0] - return err - } - diskSize := rootDisk.Size - defaultEtcdDiskSize - if err = machineScope.LinodeClient.ResizeInstanceDisk(ctx, linodeInstanceID, rootDiskID, diskSize); err != nil { - logger.Error(err, "Failed to resize root disk") + if instanceConfig.Devices.SDA == nil { + return errors.New("root disk not yet ready") + } - return err - } - // wait for the disk to resize - _, err = machineScope.LinodeClient.WaitForInstanceDiskStatus(ctx, linodeInstanceID, rootDiskID, linodego.DiskReady, defaultResizeTimeoutSeconds) - if err != nil { - logger.Error(err, fmt.Sprintf("Failed to resize root disk within resize timeout of %d seconds", defaultResizeTimeoutSeconds)) + rootDiskID := instanceConfig.Devices.SDA.DiskID - return err + // carve out space for the etcd disk + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) { + rootDisk, err := machineScope.LinodeClient.GetInstanceDisk(ctx, linodeInstanceID, rootDiskID) + if err != nil { + logger.Error(err, "Failed to get root disk for instance") + + return err + } + diskSize := rootDisk.Size - defaultEtcdDiskSize + if err = machineScope.LinodeClient.ResizeInstanceDisk(ctx, linodeInstanceID, rootDiskID, diskSize); err != nil { + logger.Error(err, "Failed to resize root disk") + + return err + } + + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) + } + + // wait for the disk to resize + if _, err := machineScope.LinodeClient.WaitForInstanceDiskStatus(ctx, linodeInstanceID, rootDiskID, linodego.DiskReady, defaultResizeTimeoutSeconds); err != nil { + logger.Error(err, fmt.Sprintf("Failed to resize root disk within resize timeout of %d seconds", defaultResizeTimeoutSeconds)) + + return err + } + + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResized) + conditions.Delete(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) } - // create the etcd disk - _, err = machineScope.LinodeClient.CreateInstanceDisk( - ctx, - linodeInstanceID, - linodego.InstanceDiskCreateOptions{ - Label: "etcd-data", - Size: defaultEtcdDiskSize, - Filesystem: string(linodego.FilesystemExt4), - }, - ) - if err != nil { - logger.Error(err, "Failed to create etcd disk") + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated) { + // create the etcd disk + if _, err := machineScope.LinodeClient.CreateInstanceDisk( + ctx, + linodeInstanceID, + linodego.InstanceDiskCreateOptions{ + Label: "etcd-data", + Size: defaultEtcdDiskSize, + Filesystem: string(linodego.FilesystemExt4), + }, + ); err != nil { + logger.Error(err, "Failed to create etcd disk") - return err + return err + } + + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated) } return nil diff --git a/util/reconciler/conditions.go b/util/reconciler/conditions.go new file mode 100644 index 000000000..402ee32c5 --- /dev/null +++ b/util/reconciler/conditions.go @@ -0,0 +1,20 @@ +package reconciler + +import ( + corev1 "k8s.io/api/core/v1" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/conditions" +) + +func ConditionTrue(from conditions.Getter, typ clusterv1.ConditionType) bool { + return HasConditionStatus(from, typ, "True") +} + +func HasConditionStatus(from conditions.Getter, typ clusterv1.ConditionType, status corev1.ConditionStatus) bool { + cond := conditions.Get(from, typ) + if cond == nil { + return false + } + + return cond.Status == status +} From a682ccc734cdfa789a096abc185a352929022a8f Mon Sep 17 00:00:00 2001 From: Brian Mendoza Date: Mon, 8 Apr 2024 10:54:57 -0400 Subject: [PATCH 2/8] test happy path --- cloud/scope/machine.go | 7 +- cloud/scope/machine_test.go | 2 +- controller/linodemachine_controller_test.go | 403 ++++++++++++++++++++ 3 files changed, 407 insertions(+), 5 deletions(-) create mode 100644 controller/linodemachine_controller_test.go diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 2ecbc1aca..4d2566409 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -24,8 +24,7 @@ type MachineScopeParams struct { } type MachineScope struct { - client k8sClient - + Client k8sClient PatchHelper *patch.Helper Cluster *clusterv1.Cluster Machine *clusterv1.Machine @@ -96,7 +95,7 @@ func NewMachineScope(ctx context.Context, apiKey string, params MachineScopePara } return &MachineScope{ - client: params.Client, + Client: params.Client, PatchHelper: helper, Cluster: params.Cluster, Machine: params.Machine, @@ -138,7 +137,7 @@ func (m *MachineScope) GetBootstrapData(ctx context.Context) ([]byte, error) { secret := &corev1.Secret{} key := types.NamespacedName{Namespace: m.LinodeMachine.Namespace, Name: *m.Machine.Spec.Bootstrap.DataSecretName} - if err := m.client.Get(ctx, key, secret); err != nil { + if err := m.Client.Get(ctx, key, secret); err != nil { return []byte{}, fmt.Errorf( "failed to retrieve bootstrap data secret for LinodeMachine %s/%s", m.LinodeMachine.Namespace, diff --git a/cloud/scope/machine_test.go b/cloud/scope/machine_test.go index 5705c6ef8..a355a69d8 100644 --- a/cloud/scope/machine_test.go +++ b/cloud/scope/machine_test.go @@ -543,7 +543,7 @@ func TestMachineScopeGetBootstrapData(t *testing.T) { testcase.expects(mockK8sClient) mScope := &MachineScope{ - client: mockK8sClient, + Client: mockK8sClient, PatchHelper: &patch.Helper{}, // empty patch helper Cluster: testcase.fields.Cluster, Machine: testcase.fields.Machine, diff --git a/controller/linodemachine_controller_test.go b/controller/linodemachine_controller_test.go new file mode 100644 index 000000000..397339cd7 --- /dev/null +++ b/controller/linodemachine_controller_test.go @@ -0,0 +1,403 @@ +// /* +// Copyright 2023 Akamai Technologies, Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// */ + +package controller + +import ( + "bytes" + "errors" + "net" + + "github.com/go-logr/logr" + "github.com/linode/linodego" + "go.uber.org/mock/gomock" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/record" + "k8s.io/utils/ptr" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + infrav1alpha1 "github.com/linode/cluster-api-provider-linode/api/v1alpha1" + "github.com/linode/cluster-api-provider-linode/cloud/scope" + "github.com/linode/cluster-api-provider-linode/mock" + rutil "github.com/linode/cluster-api-provider-linode/util/reconciler" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("create", Label("machine", "create"), func() { + var machine clusterv1.Machine + var linodeMachine infrav1alpha1.LinodeMachine + var secret corev1.Secret + + var mockCtrl *gomock.Controller + var testLogs *bytes.Buffer + var logger logr.Logger + + cluster := clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mock", + Namespace: "default", + }, + } + + linodeCluster := infrav1alpha1.LinodeCluster{ + Spec: infrav1alpha1.LinodeClusterSpec{ + Network: infrav1alpha1.NetworkSpec{ + NodeBalancerID: ptr.To(1), + NodeBalancerConfigID: ptr.To(2), + }, + }, + } + + recorder := record.NewFakeRecorder(10) + reconciler := &LinodeMachineReconciler{ + Recorder: recorder, + } + + BeforeEach(func(ctx SpecContext) { + secret = corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bootstrap-secret", + Namespace: "default", + }, + Data: map[string][]byte{ + "value": []byte("userdata"), + }, + } + Expect(k8sClient.Create(ctx, &secret)).To(Succeed()) + + machine = clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "default", + Labels: make(map[string]string), + }, + Spec: clusterv1.MachineSpec{ + Bootstrap: clusterv1.Bootstrap{ + DataSecretName: ptr.To("bootstrap-secret"), + }, + }, + } + linodeMachine = infrav1alpha1.LinodeMachine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mock", + Namespace: "default", + UID: "12345", + }, + Spec: infrav1alpha1.LinodeMachineSpec{ + InstanceID: ptr.To(0), + Type: "g6-nanode-1", + Image: rutil.DefaultMachineControllerLinodeImage, + }, + } + mockCtrl = gomock.NewController(GinkgoT()) + testLogs = &bytes.Buffer{} + logger = zap.New( + zap.WriteTo(GinkgoWriter), + zap.WriteTo(testLogs), + zap.UseDevMode(true), + ) + }) + + AfterEach(func(ctx SpecContext) { + Expect(k8sClient.Delete(ctx, &secret)).To(Succeed()) + + mockCtrl.Finish() + for len(recorder.Events) > 0 { + <-recorder.Events + } + }) + + It("creates a worker instance", func(ctx SpecContext) { + mockLinodeClient := mock.NewMockLinodeMachineClient(mockCtrl) + listInst := mockLinodeClient.EXPECT(). + ListInstances(ctx, gomock.Any()). + Return([]linodego.Instance{}, nil) + getRegion := mockLinodeClient.EXPECT(). + GetRegion(ctx, gomock.Any()). + After(listInst). + Return(&linodego.Region{Capabilities: []string{"Metadata"}}, nil) + getImage := mockLinodeClient.EXPECT(). + GetImage(ctx, gomock.Any()). + After(getRegion). + Return(&linodego.Image{Capabilities: []string{"cloud-init"}}, nil) + createInst := mockLinodeClient.EXPECT(). + CreateInstance(ctx, gomock.Any()). + After(getImage). + Return(&linodego.Instance{ + ID: 123, + IPv4: []*net.IP{ptr.To(net.IPv4(192, 168, 0, 2))}, + Status: linodego.InstanceOffline, + }, nil) + mockLinodeClient.EXPECT(). + BootInstance(ctx, 123, 0). + After(createInst). + Return(nil) + + mScope := scope.MachineScope{ + Client: k8sClient, + LinodeClient: mockLinodeClient, + Cluster: &cluster, + Machine: &machine, + LinodeCluster: &linodeCluster, + LinodeMachine: &linodeMachine, + } + + _, err := reconciler.reconcileCreate(ctx, logger, &mScope) + Expect(err).NotTo(HaveOccurred()) + + Expect(*linodeMachine.Status.InstanceState).To(Equal(linodego.InstanceOffline)) + Expect(*linodeMachine.Spec.InstanceID).To(Equal(123)) + Expect(*linodeMachine.Spec.ProviderID).To(Equal("linode://123")) + Expect(linodeMachine.Status.Addresses).To(Equal([]clusterv1.MachineAddress{{ + Type: clusterv1.MachineInternalIP, + Address: "192.168.0.2", + }})) + + Expect(testLogs.String()).To(ContainSubstring("creating machine")) + Expect(testLogs.String()).NotTo(ContainSubstring("Failed to list Linode machine instance")) + Expect(testLogs.String()).NotTo(ContainSubstring("Linode instance already exists")) + Expect(testLogs.String()).NotTo(ContainSubstring("Failed to create Linode machine InstanceCreateOptions")) + Expect(testLogs.String()).NotTo(ContainSubstring("Failed to create Linode machine instance")) + Expect(testLogs.String()).NotTo(ContainSubstring("Failed to boot instance")) + Expect(testLogs.String()).NotTo(ContainSubstring("multiple instances found")) + Expect(testLogs.String()).NotTo(ContainSubstring("Failed to add instance to Node Balancer backend")) + }) + + Context("creates a control plane instance", func() { + It("in a single call when disks aren't delayed", func(ctx SpecContext) { + machine.Labels[clusterv1.MachineControlPlaneLabel] = "true" + + mockLinodeClient := mock.NewMockLinodeMachineClient(mockCtrl) + listInst := mockLinodeClient.EXPECT(). + ListInstances(ctx, gomock.Any()). + Return([]linodego.Instance{}, nil) + getRegion := mockLinodeClient.EXPECT(). + GetRegion(ctx, gomock.Any()). + After(listInst). + Return(&linodego.Region{Capabilities: []string{"Metadata"}}, nil) + getImage := mockLinodeClient.EXPECT(). + GetImage(ctx, gomock.Any()). + After(getRegion). + Return(&linodego.Image{Capabilities: []string{"cloud-init"}}, nil) + createInst := mockLinodeClient.EXPECT(). + CreateInstance(ctx, gomock.Any()). + After(getImage). + Return(&linodego.Instance{ + ID: 123, + IPv4: []*net.IP{ptr.To(net.IPv4(192, 168, 0, 2))}, + Status: linodego.InstanceOffline, + }, nil) + listInstConfs := mockLinodeClient.EXPECT(). + ListInstanceConfigs(ctx, 123, gomock.Any()). + After(createInst). + Return([]linodego.InstanceConfig{{ + Devices: &linodego.InstanceConfigDeviceMap{ + SDA: &linodego.InstanceConfigDevice{DiskID: 100}, + }, + }}, nil) + getInstDisk := mockLinodeClient.EXPECT(). + GetInstanceDisk(ctx, 123, 100). + After(listInstConfs). + Return(&linodego.InstanceDisk{ID: 100, Size: 15000}, nil) + resizeInstDisk := mockLinodeClient.EXPECT(). + ResizeInstanceDisk(ctx, 123, 100, 15000-defaultEtcdDiskSize). + After(getInstDisk). + Return(nil) + waitForInstDisk := mockLinodeClient.EXPECT(). + WaitForInstanceDiskStatus(ctx, 123, 100, linodego.DiskReady, defaultResizeTimeoutSeconds). + After(resizeInstDisk). + Return(nil, nil) + createEtcdDisk := mockLinodeClient.EXPECT(). + CreateInstanceDisk(ctx, 123, linodego.InstanceDiskCreateOptions{ + Label: "etcd-data", + Size: defaultEtcdDiskSize, + Filesystem: string(linodego.FilesystemExt4), + }). + After(waitForInstDisk). + Return(&linodego.InstanceDisk{ID: 101}, nil) + bootInst := mockLinodeClient.EXPECT(). + BootInstance(ctx, 123, 0). + After(createEtcdDisk). + Return(nil) + getAddrs := mockLinodeClient.EXPECT(). + GetInstanceIPAddresses(ctx, 123). + After(bootInst). + Return(&linodego.InstanceIPAddressResponse{ + IPv4: &linodego.InstanceIPv4Response{ + Private: []*linodego.InstanceIP{{Address: "192.168.0.2"}}, + }, + }, nil) + mockLinodeClient.EXPECT(). + CreateNodeBalancerNode(ctx, 1, 2, linodego.NodeBalancerNodeCreateOptions{ + Label: "mock", + Address: "192.168.0.2:6443", + Mode: linodego.ModeAccept, + }). + After(getAddrs). + Return(nil, nil) + + mScope := scope.MachineScope{ + Client: k8sClient, + LinodeClient: mockLinodeClient, + Cluster: &cluster, + Machine: &machine, + LinodeCluster: &linodeCluster, + LinodeMachine: &linodeMachine, + } + + _, err := reconciler.reconcileCreate(ctx, logger, &mScope) + Expect(err).NotTo(HaveOccurred()) + + Expect(*linodeMachine.Status.InstanceState).To(Equal(linodego.InstanceOffline)) + Expect(*linodeMachine.Spec.InstanceID).To(Equal(123)) + Expect(*linodeMachine.Spec.ProviderID).To(Equal("linode://123")) + Expect(linodeMachine.Status.Addresses).To(Equal([]clusterv1.MachineAddress{{ + Type: clusterv1.MachineInternalIP, + Address: "192.168.0.2", + }})) + + Expect(testLogs.String()).To(ContainSubstring("creating machine")) + Expect(testLogs.String()).NotTo(ContainSubstring("Failed to list Linode machine instance")) + Expect(testLogs.String()).NotTo(ContainSubstring("Linode instance already exists")) + Expect(testLogs.String()).NotTo(ContainSubstring("Failed to create Linode machine InstanceCreateOptions")) + Expect(testLogs.String()).NotTo(ContainSubstring("Failed to create Linode machine instance")) + Expect(testLogs.String()).NotTo(ContainSubstring("Failed to configure instance profile")) + Expect(testLogs.String()).NotTo(ContainSubstring("Waiting for control plane disks to be ready")) + Expect(testLogs.String()).NotTo(ContainSubstring("Failed to boot instance")) + Expect(testLogs.String()).NotTo(ContainSubstring("multiple instances found")) + Expect(testLogs.String()).NotTo(ContainSubstring("Failed to add instance to Node Balancer backend")) + }) + + It("in multiple calls when disks are delayed", func(ctx SpecContext) { + machine.Labels[clusterv1.MachineControlPlaneLabel] = "true" + + mockLinodeClient := mock.NewMockLinodeMachineClient(mockCtrl) + listInst := mockLinodeClient.EXPECT(). + ListInstances(ctx, gomock.Any()). + Return([]linodego.Instance{}, nil) + getRegion := mockLinodeClient.EXPECT(). + GetRegion(ctx, gomock.Any()). + After(listInst). + Return(&linodego.Region{Capabilities: []string{"Metadata"}}, nil) + getImage := mockLinodeClient.EXPECT(). + GetImage(ctx, gomock.Any()). + After(getRegion). + Return(&linodego.Image{Capabilities: []string{"cloud-init"}}, nil) + createInst := mockLinodeClient.EXPECT(). + CreateInstance(ctx, gomock.Any()). + After(getImage). + Return(&linodego.Instance{ + ID: 123, + IPv4: []*net.IP{ptr.To(net.IPv4(192, 168, 0, 2))}, + Status: linodego.InstanceOffline, + }, nil) + listInstConfs := mockLinodeClient.EXPECT(). + ListInstanceConfigs(ctx, 123, gomock.Any()). + After(createInst). + Return([]linodego.InstanceConfig{{ + Devices: &linodego.InstanceConfigDeviceMap{ + SDA: &linodego.InstanceConfigDevice{DiskID: 100}, + }, + }}, nil) + getInstDisk := mockLinodeClient.EXPECT(). + GetInstanceDisk(ctx, 123, 100). + After(listInstConfs). + Return(&linodego.InstanceDisk{ID: 100, Size: 15000}, nil) + resizeInstDisk := mockLinodeClient.EXPECT(). + ResizeInstanceDisk(ctx, 123, 100, 15000-defaultEtcdDiskSize). + After(getInstDisk). + Return(nil) + mockLinodeClient.EXPECT(). + WaitForInstanceDiskStatus(ctx, 123, 100, linodego.DiskReady, defaultResizeTimeoutSeconds). + After(resizeInstDisk). + Return(nil, errors.New("Waiting for Instance 123 Disk 100 status ready: not yet")) + + mScope := scope.MachineScope{ + Client: k8sClient, + LinodeClient: mockLinodeClient, + Cluster: &cluster, + Machine: &machine, + LinodeCluster: &linodeCluster, + LinodeMachine: &linodeMachine, + } + + res, err := reconciler.reconcileCreate(ctx, logger, &mScope) + Expect(res.RequeueAfter).To(Equal(rutil.DefaultMachineControllerWaitForBootstrapDelay)) + Expect(err).ToNot(HaveOccurred()) + + listInst = mockLinodeClient.EXPECT(). + ListInstances(ctx, gomock.Any()). + Return([]linodego.Instance{{ + ID: 123, + IPv4: []*net.IP{ptr.To(net.IPv4(192, 168, 0, 2))}, + Status: linodego.InstanceOffline, + }}, nil) + listInstConfs = mockLinodeClient.EXPECT(). + ListInstanceConfigs(ctx, 123, gomock.Any()). + After(listInst). + Return([]linodego.InstanceConfig{{ + Devices: &linodego.InstanceConfigDeviceMap{ + SDA: &linodego.InstanceConfigDevice{DiskID: 100}, + }, + }}, nil) + waitForInstDisk := mockLinodeClient.EXPECT(). + WaitForInstanceDiskStatus(ctx, 123, 100, linodego.DiskReady, defaultResizeTimeoutSeconds). + After(listInstConfs). + Return(nil, nil) + createEtcdDisk := mockLinodeClient.EXPECT(). + CreateInstanceDisk(ctx, 123, gomock.Any()). + After(waitForInstDisk). + Return(nil, nil) + bootInst := mockLinodeClient.EXPECT(). + BootInstance(ctx, 123, 0). + After(createEtcdDisk). + Return(nil) + getAddrs := mockLinodeClient.EXPECT(). + GetInstanceIPAddresses(ctx, 123). + After(bootInst). + Return(&linodego.InstanceIPAddressResponse{ + IPv4: &linodego.InstanceIPv4Response{ + Private: []*linodego.InstanceIP{{Address: "192.168.0.2"}}, + }, + }, nil) + mockLinodeClient.EXPECT(). + CreateNodeBalancerNode(ctx, 1, 2, linodego.NodeBalancerNodeCreateOptions{ + Label: "mock", + Address: "192.168.0.2:6443", + Mode: linodego.ModeAccept, + }). + After(getAddrs). + Return(nil, nil) + + _, err = reconciler.reconcileCreate(ctx, logger, &mScope) + Expect(err).NotTo(HaveOccurred()) + + Expect(*linodeMachine.Status.InstanceState).To(Equal(linodego.InstanceOffline)) + Expect(*linodeMachine.Spec.InstanceID).To(Equal(123)) + Expect(*linodeMachine.Spec.ProviderID).To(Equal("linode://123")) + Expect(linodeMachine.Status.Addresses).To(Equal([]clusterv1.MachineAddress{{ + Type: clusterv1.MachineInternalIP, + Address: "192.168.0.2", + }})) + + Expect(testLogs.String()).To(ContainSubstring("creating machine")) + Expect(testLogs.String()).To(ContainSubstring("Linode instance already exists")) + }) + }) +}) From 5f28d4e96aa8ca37d73b67392a9b712ad1a65806 Mon Sep 17 00:00:00 2001 From: Brian Mendoza Date: Mon, 8 Apr 2024 13:05:13 -0400 Subject: [PATCH 3/8] backoff on stale condition check --- controller/linodemachine_controller.go | 59 +++++++++++---- controller/linodemachine_controller_test.go | 79 +++++++++++++++++++-- util/reconciler/conditions.go | 22 ++++++ util/reconciler/defaults.go | 11 +++ 4 files changed, 151 insertions(+), 20 deletions(-) diff --git a/controller/linodemachine_controller.go b/controller/linodemachine_controller.go index d74fd3348..7647466ef 100644 --- a/controller/linodemachine_controller.go +++ b/controller/linodemachine_controller.go @@ -51,15 +51,16 @@ import ( const ( // default etcd disk size in MB - defaultEtcdDiskSize = 10240 - defaultResizeTimeoutSeconds = 5 + defaultEtcdDiskSize = 10240 + defaultResizeWaitSeconds = 5 // conditions for preflight instance creation + ConditionPreflightCreated clusterv1.ConditionType = "PreflightCreated" ConditionPreflightRootDiskResizing clusterv1.ConditionType = "PreflightRootDiskResizing" ConditionPreflightRootDiskResized clusterv1.ConditionType = "PreflightRootDiskResized" ConditionPreflightEtcdDiskCreated clusterv1.ConditionType = "PreflightEtcdDiskCreated" ConditionPreflightConfigured clusterv1.ConditionType = "PreflightConfigured" - ConditionPreflightBootStarted clusterv1.ConditionType = "PreflightBootStarted" + ConditionPreflightBootTriggered clusterv1.ConditionType = "PreflightBootTriggered" ConditionPreflightReady clusterv1.ConditionType = "PreflightReady" ) @@ -267,7 +268,6 @@ func (r *LinodeMachineReconciler) reconcileCreate( if err != nil { logger.Error(err, "Failed to list Linode machine instances") - // TODO: What terminal errors should we not requeue for, and just return an error? return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } @@ -290,10 +290,16 @@ func (r *LinodeMachineReconciler) reconcileCreate( if err != nil { logger.Error(err, "Failed to create Linode machine instance") - // TODO: What terminal errors should we not requeue for, and just return an error? + if reconciler.RecordDecayingCondition(machineScope.LinodeMachine, + ConditionPreflightCreated, string(cerrs.CreateMachineError), err.Error(), + reconciler.DefaultMachineControllerPreflightTimeout(r.ReconcileTimeout)) { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightCreated) machineScope.LinodeMachine.Spec.InstanceID = &linodeInstance.ID default: @@ -307,29 +313,44 @@ func (r *LinodeMachineReconciler) reconcileCreate( if err = r.configureDisksControlPlane(ctx, logger, machineScope, linodeInstance.ID); err != nil { logger.Error(err, "Failed to configure instance disks") - // TODO: What terminal errors should we not requeue for, and just return an error? + if reconciler.RecordDecayingCondition(machineScope.LinodeMachine, + ConditionPreflightConfigured, string(cerrs.CreateMachineError), err.Error(), + reconciler.DefaultMachineControllerPreflightTimeout(r.ReconcileTimeout)) { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightConfigured) } - if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightBootStarted) { + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightBootTriggered) { if err = machineScope.LinodeClient.BootInstance(ctx, linodeInstance.ID, 0); err != nil { logger.Error(err, "Failed to boot instance") - // TODO: What terminal errors should we not requeue for, and just return an error? + if reconciler.RecordDecayingCondition(machineScope.LinodeMachine, + ConditionPreflightBootTriggered, string(cerrs.CreateMachineError), err.Error(), + reconciler.DefaultMachineControllerPreflightTimeout(r.ReconcileTimeout)) { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } - conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightBootStarted) + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightBootTriggered) } if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightReady) { if err = services.AddNodeToNB(ctx, logger, machineScope); err != nil { logger.Error(err, "Failed to add instance to Node Balancer backend") - // TODO: What terminal errors should we not requeue for, and just return an error? + if reconciler.RecordDecayingCondition(machineScope.LinodeMachine, + ConditionPreflightReady, string(cerrs.CreateMachineError), err.Error(), + reconciler.DefaultMachineControllerPreflightTimeout(r.ReconcileTimeout)) { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerWaitForRunningDelay}, nil } @@ -361,11 +382,15 @@ func (r *LinodeMachineReconciler) configureDisksControlPlane( if err != nil || len(configs) == 0 { logger.Error(err, "Failed to list instance configs") + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResized, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) + return err } instanceConfig := configs[0] if instanceConfig.Devices.SDA == nil { + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResized, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, "root disk not yet ready") + return errors.New("root disk not yet ready") } @@ -377,12 +402,16 @@ func (r *LinodeMachineReconciler) configureDisksControlPlane( if err != nil { logger.Error(err, "Failed to get root disk for instance") + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) + return err } diskSize := rootDisk.Size - defaultEtcdDiskSize if err = machineScope.LinodeClient.ResizeInstanceDisk(ctx, linodeInstanceID, rootDiskID, diskSize); err != nil { logger.Error(err, "Failed to resize root disk") + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) + return err } @@ -390,14 +419,16 @@ func (r *LinodeMachineReconciler) configureDisksControlPlane( } // wait for the disk to resize - if _, err := machineScope.LinodeClient.WaitForInstanceDiskStatus(ctx, linodeInstanceID, rootDiskID, linodego.DiskReady, defaultResizeTimeoutSeconds); err != nil { - logger.Error(err, fmt.Sprintf("Failed to resize root disk within resize timeout of %d seconds", defaultResizeTimeoutSeconds)) + if _, err := machineScope.LinodeClient.WaitForInstanceDiskStatus(ctx, linodeInstanceID, rootDiskID, linodego.DiskReady, defaultResizeWaitSeconds); err != nil { + logger.Error(err, fmt.Sprintf("Failed to resize root disk within resize timeout of %d seconds", defaultResizeWaitSeconds)) + + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResized, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) return err } - conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResized) conditions.Delete(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResized) } if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated) { @@ -413,6 +444,8 @@ func (r *LinodeMachineReconciler) configureDisksControlPlane( ); err != nil { logger.Error(err, "Failed to create etcd disk") + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) + return err } diff --git a/controller/linodemachine_controller_test.go b/controller/linodemachine_controller_test.go index 397339cd7..8a57c2e4c 100644 --- a/controller/linodemachine_controller_test.go +++ b/controller/linodemachine_controller_test.go @@ -20,6 +20,7 @@ import ( "bytes" "errors" "net" + "time" "github.com/go-logr/logr" "github.com/linode/linodego" @@ -29,6 +30,7 @@ import ( "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/controller-runtime/pkg/log/zap" infrav1alpha1 "github.com/linode/cluster-api-provider-linode/api/v1alpha1" @@ -44,6 +46,7 @@ var _ = Describe("create", Label("machine", "create"), func() { var machine clusterv1.Machine var linodeMachine infrav1alpha1.LinodeMachine var secret corev1.Secret + var reconciler *LinodeMachineReconciler var mockCtrl *gomock.Controller var testLogs *bytes.Buffer @@ -66,9 +69,6 @@ var _ = Describe("create", Label("machine", "create"), func() { } recorder := record.NewFakeRecorder(10) - reconciler := &LinodeMachineReconciler{ - Recorder: recorder, - } BeforeEach(func(ctx SpecContext) { secret = corev1.Secret{ @@ -105,6 +105,9 @@ var _ = Describe("create", Label("machine", "create"), func() { Image: rutil.DefaultMachineControllerLinodeImage, }, } + reconciler = &LinodeMachineReconciler{ + Recorder: recorder, + } mockCtrl = gomock.NewController(GinkgoT()) testLogs = &bytes.Buffer{} logger = zap.New( @@ -161,6 +164,11 @@ var _ = Describe("create", Label("machine", "create"), func() { _, err := reconciler.reconcileCreate(ctx, logger, &mScope) Expect(err).NotTo(HaveOccurred()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightCreated)).To(BeTrue()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightConfigured)).To(BeTrue()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightBootTriggered)).To(BeTrue()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightReady)).To(BeTrue()) + Expect(*linodeMachine.Status.InstanceState).To(Equal(linodego.InstanceOffline)) Expect(*linodeMachine.Spec.InstanceID).To(Equal(123)) Expect(*linodeMachine.Spec.ProviderID).To(Equal("linode://123")) @@ -179,6 +187,50 @@ var _ = Describe("create", Label("machine", "create"), func() { Expect(testLogs.String()).NotTo(ContainSubstring("Failed to add instance to Node Balancer backend")) }) + Context("fails when a preflight condition is stale", func() { + It("can't create an instance in time", func(ctx SpecContext) { + mockLinodeClient := mock.NewMockLinodeMachineClient(mockCtrl) + listInst := mockLinodeClient.EXPECT(). + ListInstances(ctx, gomock.Any()). + Return([]linodego.Instance{}, nil) + getRegion := mockLinodeClient.EXPECT(). + GetRegion(ctx, gomock.Any()). + After(listInst). + Return(&linodego.Region{Capabilities: []string{"Metadata"}}, nil) + getImage := mockLinodeClient.EXPECT(). + GetImage(ctx, gomock.Any()). + After(getRegion). + Return(&linodego.Image{Capabilities: []string{"cloud-init"}}, nil) + mockLinodeClient.EXPECT(). + CreateInstance(ctx, gomock.Any()). + After(getImage). + DoAndReturn(func(_, _ any) (*linodego.Instance, error) { + time.Sleep(time.Microsecond) + return nil, errors.New("time is up") + }) + + mScope := scope.MachineScope{ + Client: k8sClient, + LinodeClient: mockLinodeClient, + Cluster: &cluster, + Machine: &machine, + LinodeCluster: &linodeCluster, + LinodeMachine: &linodeMachine, + } + + reconciler.ReconcileTimeout = time.Nanosecond + + res, err := reconciler.reconcileCreate(ctx, logger, &mScope) + Expect(res).NotTo(Equal(rutil.DefaultMachineControllerWaitForRunningDelay)) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("time is up")) + + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightCreated)).To(BeFalse()) + Expect(conditions.Get(&linodeMachine, ConditionPreflightCreated).Severity).To(Equal(clusterv1.ConditionSeverityError)) + Expect(conditions.Get(&linodeMachine, ConditionPreflightCreated).Message).To(ContainSubstring("time is up")) + }) + }) + Context("creates a control plane instance", func() { It("in a single call when disks aren't delayed", func(ctx SpecContext) { machine.Labels[clusterv1.MachineControlPlaneLabel] = "true" @@ -220,7 +272,7 @@ var _ = Describe("create", Label("machine", "create"), func() { After(getInstDisk). Return(nil) waitForInstDisk := mockLinodeClient.EXPECT(). - WaitForInstanceDiskStatus(ctx, 123, 100, linodego.DiskReady, defaultResizeTimeoutSeconds). + WaitForInstanceDiskStatus(ctx, 123, 100, linodego.DiskReady, defaultResizeWaitSeconds). After(resizeInstDisk). Return(nil, nil) createEtcdDisk := mockLinodeClient.EXPECT(). @@ -264,6 +316,11 @@ var _ = Describe("create", Label("machine", "create"), func() { _, err := reconciler.reconcileCreate(ctx, logger, &mScope) Expect(err).NotTo(HaveOccurred()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightCreated)).To(BeTrue()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightConfigured)).To(BeTrue()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightBootTriggered)).To(BeTrue()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightReady)).To(BeTrue()) + Expect(*linodeMachine.Status.InstanceState).To(Equal(linodego.InstanceOffline)) Expect(*linodeMachine.Spec.InstanceID).To(Equal(123)) Expect(*linodeMachine.Spec.ProviderID).To(Equal("linode://123")) @@ -324,7 +381,7 @@ var _ = Describe("create", Label("machine", "create"), func() { After(getInstDisk). Return(nil) mockLinodeClient.EXPECT(). - WaitForInstanceDiskStatus(ctx, 123, 100, linodego.DiskReady, defaultResizeTimeoutSeconds). + WaitForInstanceDiskStatus(ctx, 123, 100, linodego.DiskReady, defaultResizeWaitSeconds). After(resizeInstDisk). Return(nil, errors.New("Waiting for Instance 123 Disk 100 status ready: not yet")) @@ -338,9 +395,12 @@ var _ = Describe("create", Label("machine", "create"), func() { } res, err := reconciler.reconcileCreate(ctx, logger, &mScope) - Expect(res.RequeueAfter).To(Equal(rutil.DefaultMachineControllerWaitForBootstrapDelay)) + Expect(res.RequeueAfter).To(Equal(rutil.DefaultMachineControllerWaitForRunningDelay)) Expect(err).ToNot(HaveOccurred()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightCreated)).To(BeTrue()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightConfigured)).To(BeFalse()) + listInst = mockLinodeClient.EXPECT(). ListInstances(ctx, gomock.Any()). Return([]linodego.Instance{{ @@ -357,7 +417,7 @@ var _ = Describe("create", Label("machine", "create"), func() { }, }}, nil) waitForInstDisk := mockLinodeClient.EXPECT(). - WaitForInstanceDiskStatus(ctx, 123, 100, linodego.DiskReady, defaultResizeTimeoutSeconds). + WaitForInstanceDiskStatus(ctx, 123, 100, linodego.DiskReady, defaultResizeWaitSeconds). After(listInstConfs). Return(nil, nil) createEtcdDisk := mockLinodeClient.EXPECT(). @@ -388,6 +448,11 @@ var _ = Describe("create", Label("machine", "create"), func() { _, err = reconciler.reconcileCreate(ctx, logger, &mScope) Expect(err).NotTo(HaveOccurred()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightCreated)).To(BeTrue()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightConfigured)).To(BeTrue()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightBootTriggered)).To(BeTrue()) + Expect(rutil.ConditionTrue(&linodeMachine, ConditionPreflightReady)).To(BeTrue()) + Expect(*linodeMachine.Status.InstanceState).To(Equal(linodego.InstanceOffline)) Expect(*linodeMachine.Spec.InstanceID).To(Equal(123)) Expect(*linodeMachine.Spec.ProviderID).To(Equal("linode://123")) diff --git a/util/reconciler/conditions.go b/util/reconciler/conditions.go index 402ee32c5..c9c315889 100644 --- a/util/reconciler/conditions.go +++ b/util/reconciler/conditions.go @@ -1,6 +1,8 @@ package reconciler import ( + "time" + corev1 "k8s.io/api/core/v1" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util/conditions" @@ -18,3 +20,23 @@ func HasConditionStatus(from conditions.Getter, typ clusterv1.ConditionType, sta return cond.Status == status } + +func RecordDecayingCondition(to conditions.Setter, typ clusterv1.ConditionType, reason, message string, timeout time.Duration) bool { + conditions.MarkFalse(to, typ, reason, clusterv1.ConditionSeverityWarning, message) + + if HasStaleCondition(to, typ, timeout) { + conditions.MarkFalse(to, typ, reason, clusterv1.ConditionSeverityError, message) + return true + } + + return false +} + +func HasStaleCondition(from conditions.Getter, typ clusterv1.ConditionType, timeout time.Duration) bool { + cond := conditions.Get(from, typ) + if cond == nil { + return false + } + + return time.Now().After(cond.LastTransitionTime.Add(timeout)) +} diff --git a/util/reconciler/defaults.go b/util/reconciler/defaults.go index db9722be8..d7aa23ca5 100644 --- a/util/reconciler/defaults.go +++ b/util/reconciler/defaults.go @@ -32,6 +32,8 @@ const ( DefaultMachineControllerLinodeImage = "linode/ubuntu22.04" // DefaultMachineControllerWaitForRunningDelay is the default requeue delay if instance is not running. DefaultMachineControllerWaitForRunningDelay = 5 * time.Second + // DefaultMachineControllerWaitForPreflightTimeout is the default timeout during the preflight phase. + DefaultMachineControllerWaitForPreflightTimeout = 5 * time.Minute // DefaultMachineControllerWaitForRunningTimeout is the default timeout if instance is not running. DefaultMachineControllerWaitForRunningTimeout = 20 * time.Minute @@ -49,3 +51,12 @@ func DefaultedLoopTimeout(timeout time.Duration) time.Duration { return timeout } + +// DefaultMachineControllerPreflightTimeout will default the preflight machine timeout if it is zero-valued. +func DefaultMachineControllerPreflightTimeout(timeout time.Duration) time.Duration { + if timeout <= 0 { + return DefaultMachineControllerWaitForPreflightTimeout + } + + return timeout +} From aff9c031cb09768ab26d9cfe06d592f7f3014b2a Mon Sep 17 00:00:00 2001 From: Brian Mendoza Date: Mon, 8 Apr 2024 13:22:10 -0400 Subject: [PATCH 4/8] reduce complexity --- controller/linodemachine_controller.go | 117 ++++++++++++++----------- 1 file changed, 66 insertions(+), 51 deletions(-) diff --git a/controller/linodemachine_controller.go b/controller/linodemachine_controller.go index 7647466ef..acc92eb95 100644 --- a/controller/linodemachine_controller.go +++ b/controller/linodemachine_controller.go @@ -376,82 +376,97 @@ func (r *LinodeMachineReconciler) configureDisksControlPlane( return nil } - if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResized) { - // get the default instance config - configs, err := machineScope.LinodeClient.ListInstanceConfigs(ctx, linodeInstanceID, &linodego.ListOptions{}) - if err != nil || len(configs) == 0 { - logger.Error(err, "Failed to list instance configs") + if err := r.resizeRootDisk(ctx, logger, machineScope, linodeInstanceID); err != nil { + return err + } - conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResized, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated) { + // create the etcd disk + if _, err := machineScope.LinodeClient.CreateInstanceDisk( + ctx, + linodeInstanceID, + linodego.InstanceDiskCreateOptions{ + Label: "etcd-data", + Size: defaultEtcdDiskSize, + Filesystem: string(linodego.FilesystemExt4), + }, + ); err != nil { + logger.Error(err, "Failed to create etcd disk") + + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) return err } - instanceConfig := configs[0] - if instanceConfig.Devices.SDA == nil { - conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResized, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, "root disk not yet ready") + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated) + } - return errors.New("root disk not yet ready") - } + return nil +} - rootDiskID := instanceConfig.Devices.SDA.DiskID +func (r *LinodeMachineReconciler) resizeRootDisk( + ctx context.Context, + logger logr.Logger, + machineScope *scope.MachineScope, + linodeInstanceID int, +) error { + if reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResized) { + return nil + } - // carve out space for the etcd disk - if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) { - rootDisk, err := machineScope.LinodeClient.GetInstanceDisk(ctx, linodeInstanceID, rootDiskID) - if err != nil { - logger.Error(err, "Failed to get root disk for instance") + // get the default instance config + configs, err := machineScope.LinodeClient.ListInstanceConfigs(ctx, linodeInstanceID, &linodego.ListOptions{}) + if err != nil || len(configs) == 0 { + logger.Error(err, "Failed to list instance configs") - conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResized, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) - return err - } - diskSize := rootDisk.Size - defaultEtcdDiskSize - if err = machineScope.LinodeClient.ResizeInstanceDisk(ctx, linodeInstanceID, rootDiskID, diskSize); err != nil { - logger.Error(err, "Failed to resize root disk") + return err + } + instanceConfig := configs[0] - conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) + if instanceConfig.Devices.SDA == nil { + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResized, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, "root disk not yet ready") - return err - } + return errors.New("root disk not yet ready") + } - conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) - } + rootDiskID := instanceConfig.Devices.SDA.DiskID - // wait for the disk to resize - if _, err := machineScope.LinodeClient.WaitForInstanceDiskStatus(ctx, linodeInstanceID, rootDiskID, linodego.DiskReady, defaultResizeWaitSeconds); err != nil { - logger.Error(err, fmt.Sprintf("Failed to resize root disk within resize timeout of %d seconds", defaultResizeWaitSeconds)) + // carve out space for the etcd disk + if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) { + rootDisk, err := machineScope.LinodeClient.GetInstanceDisk(ctx, linodeInstanceID, rootDiskID) + if err != nil { + logger.Error(err, "Failed to get root disk for instance") - conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResized, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) return err } + diskSize := rootDisk.Size - defaultEtcdDiskSize + if err = machineScope.LinodeClient.ResizeInstanceDisk(ctx, linodeInstanceID, rootDiskID, diskSize); err != nil { + logger.Error(err, "Failed to resize root disk") - conditions.Delete(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) - conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResized) - } - - if !reconciler.ConditionTrue(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated) { - // create the etcd disk - if _, err := machineScope.LinodeClient.CreateInstanceDisk( - ctx, - linodeInstanceID, - linodego.InstanceDiskCreateOptions{ - Label: "etcd-data", - Size: defaultEtcdDiskSize, - Filesystem: string(linodego.FilesystemExt4), - }, - ); err != nil { - logger.Error(err, "Failed to create etcd disk") - - conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) return err } - conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightEtcdDiskCreated) + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) } + // wait for the disk to resize + if _, err := machineScope.LinodeClient.WaitForInstanceDiskStatus(ctx, linodeInstanceID, rootDiskID, linodego.DiskReady, defaultResizeWaitSeconds); err != nil { + logger.Error(err, fmt.Sprintf("Failed to resize root disk within resize timeout of %d seconds", defaultResizeWaitSeconds)) + + conditions.MarkFalse(machineScope.LinodeMachine, ConditionPreflightRootDiskResized, string(cerrs.CreateMachineError), clusterv1.ConditionSeverityWarning, err.Error()) + + return err + } + + conditions.Delete(machineScope.LinodeMachine, ConditionPreflightRootDiskResizing) + conditions.MarkTrue(machineScope.LinodeMachine, ConditionPreflightRootDiskResized) + return nil } From ac412c750255f8cd3b1af3e16023fccc5b772e94 Mon Sep 17 00:00:00 2001 From: Brian Mendoza Date: Mon, 8 Apr 2024 13:43:21 -0400 Subject: [PATCH 5/8] add nolint --- controller/linodemachine_controller.go | 1 + 1 file changed, 1 insertion(+) diff --git a/controller/linodemachine_controller.go b/controller/linodemachine_controller.go index acc92eb95..8739a941d 100644 --- a/controller/linodemachine_controller.go +++ b/controller/linodemachine_controller.go @@ -246,6 +246,7 @@ func (r *LinodeMachineReconciler) reconcile( return } +//nolint:cyclop // keep top-level preflight condition checks in the same function for readability func (r *LinodeMachineReconciler) reconcileCreate( ctx context.Context, logger logr.Logger, From 32c76ff4b415e357bd499206dbf4ccf20163ead9 Mon Sep 17 00:00:00 2001 From: Brian Mendoza Date: Mon, 8 Apr 2024 13:57:52 -0400 Subject: [PATCH 6/8] fix nilcheck --- controller/linodemachine_controller.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/controller/linodemachine_controller.go b/controller/linodemachine_controller.go index 8739a941d..7e5531c7e 100644 --- a/controller/linodemachine_controller.go +++ b/controller/linodemachine_controller.go @@ -225,7 +225,9 @@ func (r *LinodeMachineReconciler) reconcile( failureReason = cerrs.UpdateMachineError - logger = logger.WithValues("ID", *machineScope.LinodeMachine.Spec.InstanceID) + if machineScope.LinodeMachine.Spec.InstanceID != nil { + logger = logger.WithValues("ID", *machineScope.LinodeMachine.Spec.InstanceID) + } res, linodeInstance, err = r.reconcileUpdate(ctx, logger, machineScope) @@ -493,6 +495,7 @@ func (r *LinodeMachineReconciler) reconcileUpdate( // Create new machine machineScope.LinodeMachine.Spec.ProviderID = nil machineScope.LinodeMachine.Spec.InstanceID = nil + machineScope.LinodeMachine.Status.InstanceState = nil conditions.MarkFalse(machineScope.LinodeMachine, clusterv1.ReadyCondition, "missing", clusterv1.ConditionSeverityWarning, "instance not found") } @@ -563,6 +566,7 @@ func (r *LinodeMachineReconciler) reconcileDelete( machineScope.LinodeMachine.Spec.ProviderID = nil machineScope.LinodeMachine.Spec.InstanceID = nil + machineScope.LinodeMachine.Status.InstanceState = nil controllerutil.RemoveFinalizer(machineScope.LinodeMachine, infrav1alpha1.GroupVersion.String()) return nil From 9461fa431558d7648694e35fcdf2b14a6024f4e0 Mon Sep 17 00:00:00 2001 From: Brian Mendoza Date: Mon, 8 Apr 2024 14:30:38 -0400 Subject: [PATCH 7/8] trigger ci --- cloud/scope/common.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloud/scope/common.go b/cloud/scope/common.go index 23e42b600..8e17bdf13 100644 --- a/cloud/scope/common.go +++ b/cloud/scope/common.go @@ -14,9 +14,9 @@ import ( "github.com/linode/cluster-api-provider-linode/version" ) -type ClientOpt func(*linodego.Client) +type clientOption func(*linodego.Client) -func CreateLinodeClient(apiKey string, opts ...ClientOpt) (*linodego.Client, error) { +func CreateLinodeClient(apiKey string, opts ...clientOption) (*linodego.Client, error) { if apiKey == "" { return nil, errors.New("missing Linode API key") } From fb51ade29136e116fce6d7124ab08bf56a3087b8 Mon Sep 17 00:00:00 2001 From: Brian Mendoza Date: Mon, 8 Apr 2024 17:58:59 -0400 Subject: [PATCH 8/8] explicit disable retry in machine scope --- cloud/scope/common.go | 8 +------- cloud/scope/machine.go | 6 ++---- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/cloud/scope/common.go b/cloud/scope/common.go index 8e17bdf13..3e912f23e 100644 --- a/cloud/scope/common.go +++ b/cloud/scope/common.go @@ -14,9 +14,7 @@ import ( "github.com/linode/cluster-api-provider-linode/version" ) -type clientOption func(*linodego.Client) - -func CreateLinodeClient(apiKey string, opts ...clientOption) (*linodego.Client, error) { +func CreateLinodeClient(apiKey string) (*linodego.Client, error) { if apiKey == "" { return nil, errors.New("missing Linode API key") } @@ -30,10 +28,6 @@ func CreateLinodeClient(apiKey string, opts ...clientOption) (*linodego.Client, } linodeClient := linodego.NewClient(oauth2Client) - for _, opt := range opts { - opt(&linodeClient) - } - linodeClient.SetUserAgent(fmt.Sprintf("CAPL/%s", version.GetVersion())) return &linodeClient, nil diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 4d2566409..4b7ac818b 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -5,7 +5,6 @@ import ( "errors" "fmt" - "github.com/linode/linodego" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -82,12 +81,11 @@ func NewMachineScope(ctx context.Context, apiKey string, params MachineScopePara } apiKey = string(data) } - linodeClient, err := CreateLinodeClient(apiKey, func(linodeClient *linodego.Client) { - linodeClient.SetRetryCount(0) - }) + linodeClient, err := CreateLinodeClient(apiKey) if err != nil { return nil, fmt.Errorf("failed to create linode client: %w", err) } + linodeClient.SetRetryCount(0) helper, err := patch.NewHelper(params.LinodeMachine, params.Client) if err != nil {