From fa5e2842229d81f8ba1a1ce1995af6e733cbec99 Mon Sep 17 00:00:00 2001 From: Sunny Date: Wed, 13 Dec 2023 14:25:52 +0000 Subject: [PATCH] Early stall condition detection after remediation Detect stall condition due to exhausted remediation retry right after remediating. This helps return from AtomicRelease.Reconcile() with proper stalled status condition and error. Without this, after remediation, a stalled condition detection required a new reconciliation, leaving the status of the object without any Reconciling or Stalled condition. Signed-off-by: Sunny --- internal/reconcile/atomic_release.go | 10 ++++- internal/reconcile/atomic_release_test.go | 54 +++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/internal/reconcile/atomic_release.go b/internal/reconcile/atomic_release.go index ccd917298..edca52009 100644 --- a/internal/reconcile/atomic_release.go +++ b/internal/reconcile/atomic_release.go @@ -262,10 +262,18 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error { "instructed to stop after running %s action reconciler %s", next.Type(), next.Name()), ) - if remediation := req.Object.GetActiveRemediation(); remediation == nil || !remediation.RetriesExhausted(req.Object) { + remediation := req.Object.GetActiveRemediation() + if remediation == nil || !remediation.RetriesExhausted(req.Object) { conditions.MarkReconciling(req.Object, meta.ProgressingWithRetryReason, conditions.GetMessage(req.Object, meta.ReadyCondition)) return ErrMustRequeue } + // Check if retries have exhausted after remediation for early + // stall condition detection. + if remediation != nil && remediation.RetriesExhausted(req.Object) { + conditions.MarkStalled(req.Object, "RetriesExceeded", "Failed to %s after %d attempt(s)", + req.Object.Status.LastAttemptedReleaseAction, req.Object.GetActiveRemediation().GetFailureCount(req.Object)) + return ErrExceededMaxRetries + } conditions.Delete(req.Object, meta.ReconcilingCondition) return nil diff --git a/internal/reconcile/atomic_release_test.go b/internal/reconcile/atomic_release_test.go index 0de1bdabb..5d0aee128 100644 --- a/internal/reconcile/atomic_release_test.go +++ b/internal/reconcile/atomic_release_test.go @@ -629,6 +629,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) { release.ObservedToSnapshot(release.ObserveRelease(releases[0])), } }, + wantErr: ErrExceededMaxRetries, }, { name: "install test failure with remediation", @@ -654,6 +655,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) { snap, } }, + wantErr: ErrExceededMaxRetries, }, { name: "install test failure with test ignore", @@ -759,6 +761,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) { release.ObservedToSnapshot(release.ObserveRelease(releases[0])), } }, + wantErr: ErrExceededMaxRetries, }, { name: "upgrade failure with uninstall remediation", @@ -799,6 +802,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) { release.ObservedToSnapshot(release.ObserveRelease(releases[0])), } }, + wantErr: ErrExceededMaxRetries, }, { name: "upgrade test failure with remediation", @@ -841,6 +845,7 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) { release.ObservedToSnapshot(release.ObserveRelease(releases[0])), } }, + wantErr: ErrExceededMaxRetries, }, { name: "upgrade test failure with test ignore", @@ -928,6 +933,55 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) { }, wantErr: ErrExceededMaxRetries, }, + { + name: "upgrade remediation results in exhausted retries", + releases: func(namespace string) []*helmrelease.Release { + return []*helmrelease.Release{ + testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: mockReleaseName, + Namespace: namespace, + Version: 1, + Chart: testutil.BuildChart(), + Status: helmrelease.StatusSuperseded, + }), + testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: mockReleaseName, + Namespace: namespace, + Version: 2, + Chart: testutil.BuildChart(), + Status: helmrelease.StatusSuperseded, + }), + testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: mockReleaseName, + Namespace: namespace, + Version: 3, + Chart: testutil.BuildChart(), + Status: helmrelease.StatusFailed, + }), + } + }, + spec: func(spec *v2.HelmReleaseSpec) { + spec.Upgrade = &v2.Upgrade{ + Remediation: &v2.UpgradeRemediation{ + Retries: 1, + }, + } + }, + status: func(namespace string, releases []*helmrelease.Release) v2.HelmReleaseStatus { + return v2.HelmReleaseStatus{ + History: v2.Snapshots{ + release.ObservedToSnapshot(release.ObserveRelease(releases[2])), + release.ObservedToSnapshot(release.ObserveRelease(releases[1])), + release.ObservedToSnapshot(release.ObserveRelease(releases[0])), + }, + LastAttemptedReleaseAction: v2.ReleaseActionUpgrade, + Failures: 2, + UpgradeFailures: 2, + } + }, + chart: testutil.BuildChart(), + wantErr: ErrExceededMaxRetries, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) {