Skip to content

Commit

Permalink
[fix] add more checks for retrying on transient error during machine …
Browse files Browse the repository at this point in the history
…creation (#432)

* add more checks for retrying on transient error during machine creation

* only have 429 errors wait a full minute
  • Loading branch information
AshleyDumaine authored Jul 31, 2024
1 parent 3119045 commit 062d08a
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 35 deletions.
27 changes: 18 additions & 9 deletions controller/linodemachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ import (

const (
linodeBusyCode = 400
linodeTooManyRequests = 429
defaultDiskFilesystem = string(linodego.FilesystemExt4)

// conditions for preflight instance creation
Expand Down Expand Up @@ -262,6 +261,16 @@ func (r *LinodeMachineReconciler) reconcile(
return
}

func retryIfTransient(err error) (ctrl.Result, error) {
if util.IsRetryableError(err) {
if linodego.ErrHasStatus(err, http.StatusTooManyRequests) {
return ctrl.Result{RequeueAfter: reconciler.DefaultLinodeTooManyRequestsErrorRetryDelay}, nil
}
return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerRetryDelay}, nil
}
return ctrl.Result{}, err
}

func (r *LinodeMachineReconciler) reconcileCreate(
ctx context.Context,
logger logr.Logger,
Expand Down Expand Up @@ -302,16 +311,15 @@ func (r *LinodeMachineReconciler) reconcileCreate(
createOpts, err := r.newCreateConfig(ctx, machineScope, tags, logger)
if err != nil {
logger.Error(err, "Failed to create Linode machine InstanceCreateOptions")
if util.IsTransientError(err) {
return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerRetryDelay}, nil
}

return ctrl.Result{}, err
return retryIfTransient(err)
}
linodeInstance, err = machineScope.LinodeClient.CreateInstance(ctx, *createOpts)
if err != nil {
if linodego.ErrHasStatus(err, linodeTooManyRequests) || linodego.ErrHasStatus(err, linodego.ErrorFromError) {
if util.IsRetryableError(err) {
logger.Error(err, "Failed to create Linode instance due to API error, requeing")
if linodego.ErrHasStatus(err, http.StatusTooManyRequests) {
return ctrl.Result{RequeueAfter: reconciler.DefaultLinodeTooManyRequestsErrorRetryDelay}, nil
}
return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerRetryDelay}, nil
}
logger.Error(err, "Failed to create Linode machine instance")
Expand Down Expand Up @@ -362,11 +370,12 @@ func (r *LinodeMachineReconciler) reconcileInstanceCreate(
instanceConfig, err := r.getDefaultInstanceConfig(ctx, machineScope, linodeInstance.ID)
if err != nil {
logger.Error(err, "Failed to get default instance configuration")
return ctrl.Result{}, err
return retryIfTransient(err)
}

if _, err := machineScope.LinodeClient.UpdateInstanceConfig(ctx, linodeInstance.ID, instanceConfig.ID, linodego.InstanceConfigUpdateOptions{Kernel: machineScope.LinodeMachine.Spec.Configuration.Kernel}); err != nil {
return ctrl.Result{}, err
logger.Error(err, "Failed to update default instance configuration")
return retryIfTransient(err)
}
}

Expand Down
113 changes: 110 additions & 3 deletions controller/linodemachine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"context"
"errors"
"net"
"net/http"
"time"

"github.com/go-logr/logr"
Expand Down Expand Up @@ -811,9 +812,10 @@ var _ = Describe("machine-lifecycle", Ordered, Label("machine", "machine-lifecyc
linodeMachine := &infrav1alpha2.LinodeMachine{
ObjectMeta: metadata,
Spec: infrav1alpha2.LinodeMachineSpec{
InstanceID: ptr.To(0),
Type: "g6-nanode-1",
Image: rutil.DefaultMachineControllerLinodeImage,
InstanceID: ptr.To(0),
Type: "g6-nanode-1",
Image: rutil.DefaultMachineControllerLinodeImage,
Configuration: &infrav1alpha2.InstanceConfiguration{Kernel: "test"},
},
}
machineKey := client.ObjectKeyFromObject(linodeMachine)
Expand Down Expand Up @@ -930,8 +932,113 @@ var _ = Describe("machine-lifecycle", Ordered, Label("machine", "machine-lifecyc
})),
),
),
Path(
Call("machine is not created because there were too many requests", func(ctx context.Context, mck Mock) {
listInst := mck.LinodeClient.EXPECT().
ListInstances(ctx, gomock.Any()).
Return([]linodego.Instance{}, nil)
mck.LinodeClient.EXPECT().
GetRegion(ctx, gomock.Any()).
After(listInst).
Return(&linodego.Region{Capabilities: []string{"Metadata"}}, nil)
}),
OneOf(
Path(Result("create requeues when failing to create instance config", func(ctx context.Context, mck Mock) {
mck.LinodeClient.EXPECT().
GetImage(ctx, gomock.Any()).
Return(nil, &linodego.Error{Code: http.StatusTooManyRequests})
res, err := reconciler.reconcile(ctx, mck.Logger(), mScope)
Expect(err).NotTo(HaveOccurred())
Expect(res.RequeueAfter).To(Equal(rutil.DefaultLinodeTooManyRequestsErrorRetryDelay))
Expect(mck.Logs()).To(ContainSubstring("Failed to create Linode machine InstanceCreateOptions"))
})),
Path(Result("create requeues when failing to create instance", func(ctx context.Context, mck Mock) {
getImage := mck.LinodeClient.EXPECT().
GetImage(ctx, gomock.Any()).
Return(&linodego.Image{Capabilities: []string{"cloud-init"}}, nil)
mck.LinodeClient.EXPECT().CreateInstance(gomock.Any(), gomock.Any()).
After(getImage).
Return(nil, &linodego.Error{Code: http.StatusTooManyRequests})
res, err := reconciler.reconcile(ctx, mck.Logger(), mScope)
Expect(err).NotTo(HaveOccurred())
Expect(res.RequeueAfter).To(Equal(rutil.DefaultLinodeTooManyRequestsErrorRetryDelay))
Expect(mck.Logs()).To(ContainSubstring("Failed to create Linode instance due to API error"))
})),
Path(Result("create requeues when failing to update instance config", func(ctx context.Context, mck Mock) {
getImage := mck.LinodeClient.EXPECT().
GetImage(ctx, gomock.Any()).
Return(&linodego.Image{Capabilities: []string{"cloud-init"}}, nil)
createInst := mck.LinodeClient.EXPECT().
CreateInstance(ctx, gomock.Any()).
After(getImage).
Return(&linodego.Instance{
ID: 123,
IPv4: []*net.IP{ptr.To(net.IPv4(192, 168, 0, 2))},
IPv6: "fd00::",
Status: linodego.InstanceOffline,
}, nil)
listInstConfigs := mck.LinodeClient.EXPECT().
ListInstanceConfigs(ctx, 123, gomock.Any()).
After(createInst).
Return([]linodego.InstanceConfig{{
Devices: &linodego.InstanceConfigDeviceMap{
SDA: &linodego.InstanceConfigDevice{DiskID: 100},
},
}}, nil)
mck.LinodeClient.EXPECT().
UpdateInstanceConfig(ctx, 123, 0, gomock.Any()).
After(listInstConfigs).
Return(nil, &linodego.Error{Code: http.StatusTooManyRequests})
res, err := reconciler.reconcile(ctx, mck.Logger(), mScope)
Expect(err).NotTo(HaveOccurred())
Expect(res.RequeueAfter).To(Equal(rutil.DefaultLinodeTooManyRequestsErrorRetryDelay))
Expect(mck.Logs()).To(ContainSubstring("Failed to update default instance configuration"))
})),
Path(Result("create requeues when failing to get instance config", func(ctx context.Context, mck Mock) {
getImage := mck.LinodeClient.EXPECT().
GetImage(ctx, gomock.Any()).
Return(&linodego.Image{Capabilities: []string{"cloud-init"}}, nil)
createInst := mck.LinodeClient.EXPECT().
CreateInstance(ctx, gomock.Any()).
After(getImage).
Return(&linodego.Instance{
ID: 123,
IPv4: []*net.IP{ptr.To(net.IPv4(192, 168, 0, 2))},
IPv6: "fd00::",
Status: linodego.InstanceOffline,
}, nil)
updateInstConfig := mck.LinodeClient.EXPECT().
UpdateInstanceConfig(ctx, 123, 0, gomock.Any()).
After(createInst).
Return(nil, nil).AnyTimes()
getAddrs := mck.LinodeClient.EXPECT().
GetInstanceIPAddresses(ctx, 123).
After(updateInstConfig).
Return(&linodego.InstanceIPAddressResponse{
IPv4: &linodego.InstanceIPv4Response{
Private: []*linodego.InstanceIP{{Address: "192.168.0.2"}},
Public: []*linodego.InstanceIP{{Address: "172.0.0.2"}},
},
IPv6: &linodego.InstanceIPv6Response{
SLAAC: &linodego.InstanceIP{
Address: "fd00::",
},
},
}, nil).AnyTimes()
mck.LinodeClient.EXPECT().
ListInstanceConfigs(ctx, 123, gomock.Any()).
After(getAddrs).
Return(nil, &linodego.Error{Code: http.StatusTooManyRequests})
res, err := reconciler.reconcile(ctx, mck.Logger(), mScope)
Expect(err).NotTo(HaveOccurred())
Expect(res.RequeueAfter).To(Equal(rutil.DefaultLinodeTooManyRequestsErrorRetryDelay))
Expect(mck.Logs()).To(ContainSubstring("Failed to get default instance configuration"))
})),
),
),
Path(
Call("machine is created", func(ctx context.Context, mck Mock) {
linodeMachine.Spec.Configuration = nil
}),
OneOf(
Path(Result("creates a worker machine without disks", func(ctx context.Context, mck Mock) {
Expand Down
17 changes: 5 additions & 12 deletions util/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,22 +34,15 @@ func UnwrapError(err error) error {
return err
}

// IsTransientError determines if the error is transient, meaning a controller that
// IsRetryableError determines if the error is retryable, meaning a controller that
// encounters this error should requeue reconciliation to try again later
func IsTransientError(err error) bool {
if linodego.ErrHasStatus(
func IsRetryableError(err error) bool {
return linodego.ErrHasStatus(
err,
http.StatusTooManyRequests,
http.StatusInternalServerError,
http.StatusBadGateway,
http.StatusGatewayTimeout,
http.StatusServiceUnavailable) {
return true
}

if errors.Is(err, http.ErrHandlerTimeout) || errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, io.ErrUnexpectedEOF) {
return true
}

return false
http.StatusServiceUnavailable,
linodego.ErrorFromError) || errors.Is(err, http.ErrHandlerTimeout) || errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, io.ErrUnexpectedEOF)
}
22 changes: 11 additions & 11 deletions util/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,39 +52,39 @@ func TestIgnoreLinodeAPIError(t *testing.T) {
}
}

func TestIsTransientError(t *testing.T) {
func TestIsRetryableError(t *testing.T) {
t.Parallel()
tests := []struct {
name string
err error
shouldRetry bool
name string
err error
want bool
}{{
name: "unexpected EOF",
err: io.ErrUnexpectedEOF,
shouldRetry: true,
name: "unexpected EOF",
err: io.ErrUnexpectedEOF,
want: true,
}, {
name: "not found Linode API error",
err: &linodego.Error{
Response: nil,
Code: http.StatusNotFound,
Message: "not found",
},
shouldRetry: false,
want: false,
}, {
name: "Rate limiting Linode API error",
err: &linodego.Error{
Response: nil,
Code: http.StatusTooManyRequests,
Message: "rate limited",
},
shouldRetry: true,
want: true,
}}
for _, tt := range tests {
testcase := tt
t.Run(testcase.name, func(t *testing.T) {
t.Parallel()
if testcase.shouldRetry != IsTransientError(testcase.err) {
t.Errorf("wanted %v, got %v", testcase.shouldRetry, IsTransientError(testcase.err))
if testcase.want != IsRetryableError(testcase.err) {
t.Errorf("wanted %v, got %v", testcase.want, IsRetryableError(testcase.err))
}
})
}
Expand Down
2 changes: 2 additions & 0 deletions util/reconciler/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ const (
DefaultMachineControllerWaitForRunningTimeout = 20 * time.Minute
// DefaultMachineControllerRetryDelay is the default requeue delay if there is an error.
DefaultMachineControllerRetryDelay = 10 * time.Second
// DefaultLinodeTooManyRequestsErrorRetryDelay is the default requeue delay if there is a Linode API error.
DefaultLinodeTooManyRequestsErrorRetryDelay = time.Minute

// DefaultVPCControllerReconcileDelay is the default requeue delay when a reconcile operation fails.
DefaultVPCControllerReconcileDelay = 5 * time.Second
Expand Down

0 comments on commit 062d08a

Please sign in to comment.