From 59c577a92467ebee69e61de5fad562f3025c9b86 Mon Sep 17 00:00:00 2001 From: Sunny Date: Mon, 5 Feb 2024 07:59:25 +0000 Subject: [PATCH] Remove stale Ready=False conditions values When the reconciliation begins, while fulfilling the prerequisites, Ready=False condition for various reasons are added on the object. On failure, this reason is persisted on the object. On a subsequent reconciliation, when the failure is recovered, the Ready=False condition is not updates until the atomic reconciliation reaches a conclusion. During this period if the atomic reconciliation enters a retry loop due to constant drift detection and correction, the stale Ready=False condition with incorrect reason persists on the object. The Ready=False message is also copied to Reconciling=True condition, resulting in an incorrect depiction of what's actually happening. For example, if previously the HelmRelease failed with dependency not ready error, on a subsequent reconciliation, even after going past the dependency check and returning from atomic reconciliation due to drift detection and correction loop scenario, the Ready=False condition continues to show the stale dependency not ready error. In order to show more accurate status, the Ready=False conditions added while fulfilling prerequisites can be removed once those checks have succeeded, updating Ready=False to Ready=Unknown with "reconciliation in progress" message. If the atomic reconciliation gets stuck in the drift detection and correction loop with this, the Ready and Reconciling conditons would show "reconciliation in progress". This should be a better indicator of what's going on. The events and logs can be checked to determine accurately what's causing the reconciliation to be progressing for ever. Signed-off-by: Sunny --- go.mod | 2 +- go.sum | 4 +- internal/controller/helmrelease_controller.go | 28 ++++ .../controller/helmrelease_controller_test.go | 122 ++++++++++++++++++ 4 files changed, 153 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index df999c53f..40990716b 100644 --- a/go.mod +++ b/go.mod @@ -21,7 +21,7 @@ require ( github.com/fluxcd/pkg/apis/event v0.7.0 github.com/fluxcd/pkg/apis/kustomize v1.3.0 github.com/fluxcd/pkg/apis/meta v1.3.0 - github.com/fluxcd/pkg/runtime v0.44.0 + github.com/fluxcd/pkg/runtime v0.44.1 github.com/fluxcd/pkg/ssa v0.36.0 github.com/fluxcd/pkg/testserver v0.5.0 github.com/fluxcd/source-controller/api v1.2.4 diff --git a/go.sum b/go.sum index 85e4f0557..b2418ac0b 100644 --- a/go.sum +++ b/go.sum @@ -104,8 +104,8 @@ github.com/fluxcd/pkg/apis/kustomize v1.3.0 h1:qvB46CfaOWcL1SyR2RiVWN/j7/035D0Ot github.com/fluxcd/pkg/apis/kustomize v1.3.0/go.mod h1:PCXf5kktTzNav0aH2Ns3jsowqwmA9xTcsrEOoPzx/K8= github.com/fluxcd/pkg/apis/meta v1.3.0 h1:KxeEc6olmSZvQ5pBONPE4IKxyoWQbqTJF1X6K5nIXpU= github.com/fluxcd/pkg/apis/meta v1.3.0/go.mod h1:3Ui8xFkoU4sYehqmscjpq7NjqH2YN1A2iX2okbO3/yA= -github.com/fluxcd/pkg/runtime v0.44.0 h1:0BEPSpcsYXOiswKG5TWkin8fhCDHb0nDdAtq/5VrCSI= -github.com/fluxcd/pkg/runtime v0.44.0/go.mod h1:s1AhSOTCEBPaTfz/GdBD/Ws66uOByIuNP4Znrq+is9M= +github.com/fluxcd/pkg/runtime v0.44.1 h1:XuPTcNIgn/NsoIo/A6qfPZaD9E7cbnJTDbeNw8O1SZQ= +github.com/fluxcd/pkg/runtime v0.44.1/go.mod h1:s1AhSOTCEBPaTfz/GdBD/Ws66uOByIuNP4Znrq+is9M= github.com/fluxcd/pkg/ssa v0.36.0 h1:h9FB6SrrdVlxNQtfG+Fb/Roe1e61EPgtmJ5ORlAxwkU= github.com/fluxcd/pkg/ssa v0.36.0/go.mod h1:FJj4xznwBvRM+9h02lGGC0CGYGucPeXO7P6NEPphbys= github.com/fluxcd/pkg/testserver v0.5.0 h1:n/Iskk0tXNt2AgIgjz9qeFK/VhEXGfqeazABXZmO2Es= diff --git a/internal/controller/helmrelease_controller.go b/internal/controller/helmrelease_controller.go index 653295783..10648ae67 100644 --- a/internal/controller/helmrelease_controller.go +++ b/internal/controller/helmrelease_controller.go @@ -247,6 +247,10 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe log.Info("all dependencies are ready") } + // Remove any stale corresponding Ready=False condition with Unknown. + if conditions.HasAnyReason(obj, meta.ReadyCondition, v2.DependencyNotReadyReason) { + conditions.MarkUnknown(obj, meta.ReadyCondition, meta.ProgressingReason, "reconciliation in progress") + } // Get the HelmChart object for the release. hc, err := r.getHelmChart(ctx, obj) @@ -266,6 +270,10 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe conditions.MarkFalse(obj, meta.ReadyCondition, v2.ArtifactFailedReason, msg) return ctrl.Result{}, err } + // Remove any stale corresponding Ready=False condition with Unknown. + if conditions.HasAnyReason(obj, meta.ReadyCondition, aclv1.AccessDeniedReason, v2.ArtifactFailedReason) { + conditions.MarkUnknown(obj, meta.ReadyCondition, meta.ProgressingReason, "reconciliation in progress") + } // Check if the HelmChart is ready. if ready, reason := isHelmChartReady(hc); !ready { @@ -276,6 +284,10 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe // the watcher should trigger a reconciliation. return jitter.JitteredRequeueInterval(ctrl.Result{RequeueAfter: obj.GetRequeueAfter()}), errWaitForChart } + // Remove any stale corresponding Ready=False condition with Unknown. + if conditions.HasAnyReason(obj, meta.ReadyCondition, "HelmChartNotReady") { + conditions.MarkUnknown(obj, meta.ReadyCondition, meta.ProgressingReason, "reconciliation in progress") + } // Compose values based from the spec and references. values, err := chartutil.ChartValuesFromReferences(ctx, r.Client, obj.Namespace, obj.GetValues(), obj.Spec.ValuesFrom...) @@ -283,6 +295,10 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe conditions.MarkFalse(obj, meta.ReadyCondition, "ValuesError", err.Error()) return ctrl.Result{}, err } + // Remove any stale corresponding Ready=False condition with Unknown. + if conditions.HasAnyReason(obj, meta.ReadyCondition, "ValuesError") { + conditions.MarkUnknown(obj, meta.ReadyCondition, meta.ProgressingReason, "reconciliation in progress") + } // Load chart from artifact. loadedChart, err := loader.SecureLoadChartFromURL(loader.NewRetryableHTTPClient(ctx, r.artifactFetchRetries), hc.GetArtifact().URL, hc.GetArtifact().Digest) @@ -297,6 +313,10 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe conditions.MarkFalse(obj, meta.ReadyCondition, v2.ArtifactFailedReason, fmt.Sprintf("Could not load chart: %s", err.Error())) return ctrl.Result{}, err } + // Remove any stale corresponding Ready=False condition with Unknown. + if conditions.HasAnyReason(obj, meta.ReadyCondition, v2.ArtifactFailedReason) { + conditions.MarkUnknown(obj, meta.ReadyCondition, meta.ProgressingReason, "reconciliation in progress") + } // Build the REST client getter. getter, err := r.buildRESTClientGetter(ctx, obj) @@ -304,6 +324,10 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe conditions.MarkFalse(obj, meta.ReadyCondition, "RESTClientError", err.Error()) return ctrl.Result{}, err } + // Remove any stale corresponding Ready=False condition with Unknown. + if conditions.HasAnyReason(obj, meta.ReadyCondition, "RESTClientError") { + conditions.MarkUnknown(obj, meta.ReadyCondition, meta.ProgressingReason, "reconciliation in progress") + } // Attempt to adopt "legacy" v2beta1 release state on a best-effort basis. // If this fails, the controller will fall back to performing an upgrade @@ -354,6 +378,10 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe conditions.MarkFalse(obj, meta.ReadyCondition, "FactoryError", err.Error()) return ctrl.Result{}, err } + // Remove any stale corresponding Ready=False condition with Unknown. + if conditions.HasAnyReason(obj, meta.ReadyCondition, "FactoryError") { + conditions.MarkUnknown(obj, meta.ReadyCondition, meta.ProgressingReason, "reconciliation in progress") + } // Off we go! if err = intreconcile.NewAtomicRelease(patchHelper, cfg, r.EventRecorder, r.FieldManager).Reconcile(ctx, &intreconcile.Request{ diff --git a/internal/controller/helmrelease_controller_test.go b/internal/controller/helmrelease_controller_test.go index 08bd933c1..5de31ad33 100644 --- a/internal/controller/helmrelease_controller_test.go +++ b/internal/controller/helmrelease_controller_test.go @@ -43,6 +43,7 @@ import ( "sigs.k8s.io/yaml" "github.com/fluxcd/pkg/apis/acl" + aclv1 "github.com/fluxcd/pkg/apis/acl" "github.com/fluxcd/pkg/apis/meta" "github.com/fluxcd/pkg/runtime/conditions" feathelper "github.com/fluxcd/pkg/runtime/features" @@ -762,6 +763,127 @@ func TestHelmReleaseReconciler_reconcileRelease(t *testing.T) { g.Expect(obj.Status.LastAttemptedConfigDigest).To(Equal("sha256:1dabc4e3cbbd6a0818bd460f3a6c9855bfe95d506c74726bc0f2edb0aecb1f4e")) g.Expect(obj.Status.LastAttemptedValuesChecksum).To(BeEmpty()) }) + + t.Run("error recovery updates ready condition", func(t *testing.T) { + g := NewWithT(t) + + // Create a test namespace for storing the Helm release mock. + ns, err := testEnv.CreateNamespace(context.TODO(), "error-recovery") + g.Expect(err).ToNot(HaveOccurred()) + t.Cleanup(func() { + _ = testEnv.Delete(context.TODO(), ns) + }) + + // Create HelmChart mock. + chartMock := testutil.BuildChart() + chartArtifact, err := testutil.SaveChartAsArtifact(chartMock, digest.SHA256, testServer.URL(), testServer.Root()) + g.Expect(err).ToNot(HaveOccurred()) + + chart := &sourcev1b2.HelmChart{ + ObjectMeta: metav1.ObjectMeta{ + Name: "error-recovery", + Namespace: ns.Name, + Generation: 1, + }, + Spec: sourcev1b2.HelmChartSpec{ + Chart: "testdata/test-helmrepo", + Version: "0.1.0", + SourceRef: sourcev1b2.LocalHelmChartSourceReference{ + Kind: sourcev1b2.HelmRepositoryKind, + Name: "error-recovery", + }, + }, + Status: sourcev1b2.HelmChartStatus{ + ObservedGeneration: 1, + Artifact: chartArtifact, + Conditions: []metav1.Condition{ + { + Type: meta.ReadyCondition, + Status: metav1.ConditionTrue, + }, + }, + }, + } + + // Create a test Helm release storage mock. + rls := testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: "error-recovery", + Namespace: ns.Name, + Version: 1, + Chart: chartMock, + Status: helmrelease.StatusDeployed, + }, testutil.ReleaseWithConfig(nil)) + + obj := &v2.HelmRelease{ + ObjectMeta: metav1.ObjectMeta{ + Name: "error-recovery", + Namespace: ns.Name, + }, + Spec: v2.HelmReleaseSpec{ + StorageNamespace: ns.Name, + }, + Status: v2.HelmReleaseStatus{ + HelmChart: chart.Namespace + "/" + chart.Name, + History: v2.Snapshots{ + release.ObservedToSnapshot(release.ObserveRelease(rls)), + }, + }, + } + + c := fake.NewClientBuilder(). + WithScheme(NewTestScheme()). + WithStatusSubresource(&v2.HelmRelease{}). + WithObjects(chart, obj). + Build() + + r := &HelmReleaseReconciler{ + Client: c, + GetClusterConfig: GetTestClusterConfig, + EventRecorder: record.NewFakeRecorder(32), + FieldManager: "test", + } + + // Store the Helm release mock in the test namespace. + getter, err := r.buildRESTClientGetter(context.TODO(), obj) + g.Expect(err).ToNot(HaveOccurred()) + + cfg, err := action.NewConfigFactory(getter, action.WithStorage(helmdriver.SecretsDriverName, obj.GetStorageNamespace())) + g.Expect(err).ToNot(HaveOccurred()) + + store := helmstorage.Init(cfg.Driver) + g.Expect(store.Create(rls)).To(Succeed()) + + sp := patch.NewSerialPatcher(obj, r.Client) + + // List of failure reasons to test. + prereqFailures := []string{ + v2.DependencyNotReadyReason, + aclv1.AccessDeniedReason, + v2.ArtifactFailedReason, + "HelmChartNotReady", + "ValuesError", + "RESTClientError", + "FactoryError", + } + + // Update ready condition for each failure, reconcile and check if the + // stale failure condition gets updated. + for _, failReason := range prereqFailures { + conditions.MarkFalse(obj, meta.ReadyCondition, failReason, "foo") + err := sp.Patch(context.TODO(), obj, + patch.WithOwnedConditions{Conditions: intreconcile.OwnedConditions}, + patch.WithFieldOwner(r.FieldManager), + ) + g.Expect(err).ToNot(HaveOccurred()) + + _, err = r.reconcileRelease(context.TODO(), sp, obj) + g.Expect(err).ToNot(HaveOccurred()) + + ready := conditions.Get(obj, meta.ReadyCondition) + g.Expect(ready.Status).To(Equal(metav1.ConditionUnknown)) + g.Expect(ready.Reason).To(Equal(meta.ProgressingReason)) + } + }) } func TestHelmReleaseReconciler_reconcileDelete(t *testing.T) {