From f65b95b75d1bb9cf270d4f43afbd151a91672a73 Mon Sep 17 00:00:00 2001 From: Chris Werner Rau Date: Thu, 11 Jul 2024 16:24:31 +0200 Subject: [PATCH 1/3] fix: remove digest check to never ignore helm errors Closes #1021 Signed-off-by: Chris Werner Rau --- internal/reconcile/uninstall.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/internal/reconcile/uninstall.go b/internal/reconcile/uninstall.go index 5391ba72b..7c0a66e1b 100644 --- a/internal/reconcile/uninstall.go +++ b/internal/reconcile/uninstall.go @@ -138,10 +138,7 @@ func (r *Uninstall) Reconcile(ctx context.Context, req *Request) error { // Handle any error. if err != nil { r.failure(req, logBuf, err) - if req.Object.Status.History.Latest().Digest == cur.Digest { - return err - } - return nil + return err } // Mark success. From ad0d38c19a31792f148f1506ecc2c49a46ddb205 Mon Sep 17 00:00:00 2001 From: Sunny Date: Wed, 18 Sep 2024 22:20:51 +0000 Subject: [PATCH 2/3] Add tests for uninstall error Signed-off-by: Sunny --- .../controller/helmrelease_controller_test.go | 64 +++++++++++++++++++ internal/reconcile/uninstall_test.go | 8 +++ 2 files changed, 72 insertions(+) diff --git a/internal/controller/helmrelease_controller_test.go b/internal/controller/helmrelease_controller_test.go index d56ea0c4c..9ee12a0eb 100644 --- a/internal/controller/helmrelease_controller_test.go +++ b/internal/controller/helmrelease_controller_test.go @@ -2628,6 +2628,70 @@ func TestHelmReleaseReconciler_reconcileUninstall(t *testing.T) { g.Expect(conditions.GetMessage(obj, meta.ReadyCondition)).To(ContainSubstring("no namespace provided")) g.Expect(obj.GetConditions()).To(HaveLen(1)) }) + + t.Run("error due to failing delete hook", func(t *testing.T) { + g := NewWithT(t) + + ns, err := testEnv.CreateNamespace(context.TODO(), "reconcile-uninstall") + g.Expect(err).ToNot(HaveOccurred()) + t.Cleanup(func() { + _ = testEnv.Delete(context.TODO(), ns) + }) + + rls := testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: "reconcile-uninstall", + Namespace: ns.Name, + Version: 1, + Chart: testutil.BuildChart(testutil.ChartWithFailingHook()), + Status: helmrelease.StatusDeployed, + }, testutil.ReleaseWithFailingHook()) + + obj := &v2.HelmRelease{ + ObjectMeta: metav1.ObjectMeta{ + Name: "reconcile-uninstall", + Namespace: ns.Name, + DeletionTimestamp: &metav1.Time{Time: time.Now()}, + }, + Spec: v2.HelmReleaseSpec{ + Uninstall: &v2.Uninstall{ + KeepHistory: true, + Timeout: &metav1.Duration{Duration: time.Millisecond}, + }, + }, + Status: v2.HelmReleaseStatus{ + StorageNamespace: ns.Name, + History: v2.Snapshots{ + release.ObservedToSnapshot(release.ObserveRelease(rls)), + }, + }, + } + + r := &HelmReleaseReconciler{ + Client: testEnv.Client, + GetClusterConfig: GetTestClusterConfig, + EventRecorder: record.NewFakeRecorder(32), + } + + // Store the Helm release mock in the test namespace. + getter, err := r.buildRESTClientGetter(context.TODO(), obj) + g.Expect(err).ToNot(HaveOccurred()) + + cfg, err := action.NewConfigFactory(getter, action.WithStorage(helmdriver.SecretsDriverName, obj.Status.StorageNamespace)) + g.Expect(err).ToNot(HaveOccurred()) + + store := helmstorage.Init(cfg.Driver) + g.Expect(store.Create(rls)).To(Succeed()) + + err = r.reconcileUninstall(context.TODO(), getter, obj) + g.Expect(err).To(HaveOccurred()) + + // Verify status of Helm release has not been updated. + g.Expect(obj.Status.StorageNamespace).ToNot(BeEmpty()) + + // Verify Helm release has not been uninstalled. + _, err = store.History(rls.Name) + g.Expect(err).ToNot(HaveOccurred()) + }) } func TestHelmReleaseReconciler_checkDependencies(t *testing.T) { diff --git a/internal/reconcile/uninstall_test.go b/internal/reconcile/uninstall_test.go index ae441934f..a0e19aa95 100644 --- a/internal/reconcile/uninstall_test.go +++ b/internal/reconcile/uninstall_test.go @@ -61,6 +61,10 @@ func TestUninstall_Reconcile(t *testing.T) { status func(releases []*helmrelease.Release) v2.HelmReleaseStatus // wantErr is the error that is expected to be returned. wantErr error + // wantErrString is the error string that is expected to be in the + // returned error. This is used for scenarios that return + // untyped/unwrapped error that can't be asserted for their value. + wantErrString string // expectedConditions are the conditions that are expected to be set on // the HelmRelease after running rollback. expectConditions []metav1.Condition @@ -150,6 +154,7 @@ func TestUninstall_Reconcile(t *testing.T) { } }, expectFailures: 1, + wantErrString: "timed out waiting", }, { name: "uninstall failure without storage update", @@ -244,6 +249,7 @@ func TestUninstall_Reconcile(t *testing.T) { } }, expectFailures: 1, + wantErrString: "Failed to purge the release", }, { name: "uninstall without current", @@ -482,6 +488,8 @@ func TestUninstall_Reconcile(t *testing.T) { }) if tt.wantErr != nil { g.Expect(errors.Is(got, tt.wantErr)).To(BeTrue()) + } else if tt.wantErrString != "" { + g.Expect(got.Error()).To(ContainSubstring(tt.wantErrString)) } else { g.Expect(got).ToNot(HaveOccurred()) } From 7fee60ed7fa802e1a146165df7b91babfd0cc791 Mon Sep 17 00:00:00 2001 From: Sunny Date: Wed, 18 Sep 2024 23:16:29 +0000 Subject: [PATCH 3/3] Add docs for handling failed uninstall Signed-off-by: Sunny --- docs/spec/v2/helmreleases.md | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/docs/spec/v2/helmreleases.md b/docs/spec/v2/helmreleases.md index cb7a828e1..8393fbf3d 100644 --- a/docs/spec/v2/helmreleases.md +++ b/docs/spec/v2/helmreleases.md @@ -1224,6 +1224,60 @@ Using `flux`: flux reconcile helmrelease --reset ``` +### Handling failed uninstall + +At times, a Helm uninstall may fail due to the resource deletion taking a long +time, resources getting stuck in deleting phase due to some resource delete +policy in the cluster or some failing delete hooks. Depending on the scenario, +this can be handled in a few different ways. + +For resources that take long to delete but are certain to get deleted without +any intervention, failed uninstall will be retried until they succeeds. The +HelmRelease object will remain in a failed state until the uninstall succeeds. +Once uninstall is successful, the HelmRelease object will get deleted. + +If resources get stuck at deletion due to some dependency on some other +resource or policy, the controller will keep retrying to delete the resources. +The HelmRelease object will remain in a failed state. Once the cause of resource +deletion issue is resolved by intervention, HelmRelease uninstallation will +succeed and the HelmRelease object will get deleted. In case the cause of the +deletion issue can't be resolved, the HelmRelease can be force deleted by +manually deleting the [Helm storage +secret](https://helm.sh/docs/topics/advanced/#storage-backends) from the +respective release namespace. When the controller retries uninstall and cannot +find the release, it assumes that the release has been deleted, Helm uninstall +succeeds and the HelmRelease object gets deleted. This leaves behind all the +release resources. They have to be manually deleted. + +If a chart with pre-delete hooks fail, the controller will re-run the hooks +until they succeed and unblock the uninstallation. The Helm uninstall error +will be present in the status of HelmRelease. This can be used to identify which +hook is failing. If the hook failure persists, to run uninstall without the +hooks, equivalent of running `helm uninstall --no-hooks`, update the HelmRelease +to set `.spec.uninstall.disableHooks` to `true`. + +```yaml +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +... +spec: + ... + uninstall: + disableHooks: true +``` + +In the next reconciliation, the controller will run Helm uninstall without the +hooks. On success, the HelmRelease will get deleted. Otherwise, check the status +of the HelmRelease for other failure that may be blocking the uninstall. + +In case of charts with post-delete hooks, since the hook runs after the deletion +of the resources and the Helm storage, the hook failure will result in an +initial uninstall failure. In the subsequent reconciliation to retry uninstall, +since the Helm storage for the release got deleted, uninstall will succeed and +the HelmRelease object will get deleted. + +Any leftover pre or post-delete hook resources have to be manually deleted. + ### Waiting for `Ready` When a change is applied, it is possible to wait for the HelmRelease to reach a