Skip to content

Commit

Permalink
feat(verification): improve transient err handling
Browse files Browse the repository at this point in the history
This commit improves the handling of transient Kubernetes API errors
which may occur during the verification step of a Stage's current
Freight.

It does this by selectively returning errors which should be retried,
causing the reconciler to not leave the `Verifying` phase and allowing
it to retry the action it performed on the next reconciliation attempt.

To allow the reconciler to distinguish a creation error from a failed
attempt to retrieve results, closer inspection is performed on the
existence of an `AnalysisRun` reference in the `VerificationInfo`.

Signed-off-by: Hidde Beydals <[email protected]>
  • Loading branch information
hiddeco committed Mar 19, 2024
1 parent ffaad65 commit 01c4bbf
Show file tree
Hide file tree
Showing 8 changed files with 433 additions and 67 deletions.
6 changes: 6 additions & 0 deletions api/v1alpha1/stage_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,12 @@ type VerificationInfo struct {
AnalysisRun *AnalysisRunReference `json:"analysisRun,omitempty" protobuf:"bytes,3,opt,name=analysisRun"`
}

// HasAnalysisRun returns a bool indicating whether the VerificationInfo has an
// associated AnalysisRun.
func (v *VerificationInfo) HasAnalysisRun() bool {
return v != nil && v.AnalysisRun != nil
}

type VerificationInfoStack []VerificationInfo

// UpdateOrPush updates the VerificationInfo with the same ID as the provided
Expand Down
31 changes: 31 additions & 0 deletions api/v1alpha1/stage_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,37 @@ import (
"github.com/stretchr/testify/require"
)

func TestVerificationInfo_HasAnalysisRun(t *testing.T) {
testCases := []struct {
name string
info *VerificationInfo
expectedResult bool
}{
{
name: "VerificationInfo is nil",
info: nil,
expectedResult: false,
},
{
name: "AnalysisRun is nil",
info: &VerificationInfo{},
expectedResult: false,
},
{
name: "AnalysisRun is not nil",
info: &VerificationInfo{
AnalysisRun: &AnalysisRunReference{},
},
expectedResult: true,
},
}
for _, testCase := range testCases {
t.Run(testCase.name, func(t *testing.T) {
require.Equal(t, testCase.expectedResult, testCase.info.HasAnalysisRun())
})
}
}

func TestFreightReferenceStackEmpty(t *testing.T) {
testCases := []struct {
name string
Expand Down
28 changes: 23 additions & 5 deletions internal/controller/stages/stages.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ type reconciler struct {
startVerificationFn func(
context.Context,
*kargoapi.Stage,
) *kargoapi.VerificationInfo
) (*kargoapi.VerificationInfo, error)

abortVerificationFn func(
context.Context,
Expand All @@ -97,7 +97,7 @@ type reconciler struct {
getVerificationInfoFn func(
context.Context,
*kargoapi.Stage,
) *kargoapi.VerificationInfo
) (*kargoapi.VerificationInfo, error)

getAnalysisTemplateFn func(
context.Context,
Expand Down Expand Up @@ -755,14 +755,32 @@ func (r *reconciler) syncNormalStage(
// check if verification step is necessary and if yes execute
// step irrespective of phase
if status.Phase == kargoapi.StagePhaseVerifying || status.Phase == kargoapi.StagePhaseNotApplicable {
if status.CurrentFreight.VerificationInfo == nil {
if !status.CurrentFreight.VerificationInfo.HasAnalysisRun() {
if status.Health == nil || status.Health.Status == kargoapi.HealthStateHealthy {
log.Debug("starting verification")
status.CurrentFreight.VerificationInfo = r.startVerificationFn(ctx, stage)
var err error
if status.CurrentFreight.VerificationInfo, err = r.startVerificationFn(
ctx,
stage,
); err != nil && !status.CurrentFreight.VerificationInfo.HasAnalysisRun() {
status.CurrentFreight.VerificationHistory.UpdateOrPush(
*status.CurrentFreight.VerificationInfo,
)
return status, fmt.Errorf("error starting verification: %w", err)
}
}
} else {
log.Debug("checking verification results")
status.CurrentFreight.VerificationInfo = r.getVerificationInfoFn(ctx, stage)
var err error
if status.CurrentFreight.VerificationInfo, err = r.getVerificationInfoFn(
ctx,
stage,
); err != nil && status.CurrentFreight.VerificationInfo.HasAnalysisRun() {
status.CurrentFreight.VerificationHistory.UpdateOrPush(
*status.CurrentFreight.VerificationInfo,
)
return status, fmt.Errorf("error getting verification info: %w", err)
}

// Abort the verification if it's still running and the Stage has
// been marked to do so.
Expand Down
176 changes: 162 additions & 14 deletions internal/controller/stages/stages_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,15 +366,15 @@ func TestSyncNormalStage(t *testing.T) {
startVerificationFn: func(
context.Context,
*kargoapi.Stage,
) *kargoapi.VerificationInfo {
) (*kargoapi.VerificationInfo, error) {
return &kargoapi.VerificationInfo{
ID: "new-fake-id",
Phase: kargoapi.VerificationPhasePending,
Message: "Awaiting reconfirmation",
AnalysisRun: &kargoapi.AnalysisRunReference{
Name: "new-fake-analysis-run",
},
}
}, nil
},
},
assertions: func(
Expand Down Expand Up @@ -479,11 +479,11 @@ func TestSyncNormalStage(t *testing.T) {
startVerificationFn: func(
context.Context,
*kargoapi.Stage,
) *kargoapi.VerificationInfo {
) (*kargoapi.VerificationInfo, error) {
return &kargoapi.VerificationInfo{
Phase: kargoapi.VerificationPhaseError,
Message: "something went wrong",
}
}, nil
},
},
assertions: func(
Expand Down Expand Up @@ -519,6 +519,71 @@ func TestSyncNormalStage(t *testing.T) {
},
},

{
name: "retryable error starting verification",
stage: &kargoapi.Stage{
Spec: &kargoapi.StageSpec{
PromotionMechanisms: &kargoapi.PromotionMechanisms{},
Verification: &kargoapi.Verification{},
},
Status: kargoapi.StageStatus{
Phase: kargoapi.StagePhaseVerifying,
CurrentFreight: &kargoapi.FreightReference{},
},
},
reconciler: &reconciler{
hasNonTerminalPromotionsFn: noNonTerminalPromotionsFn,
checkHealthFn: func(
context.Context,
kargoapi.FreightReference,
[]kargoapi.ArgoCDAppUpdate,
) *kargoapi.Health {
return nil
},
startVerificationFn: func(
context.Context,
*kargoapi.Stage,
) (*kargoapi.VerificationInfo, error) {
return &kargoapi.VerificationInfo{
Phase: kargoapi.VerificationPhaseError,
Message: "something went wrong",
}, errors.New("retryable error")
},
},
assertions: func(
t *testing.T,
initialStatus kargoapi.StageStatus,
newStatus kargoapi.StageStatus,
err error,
) {
require.ErrorContains(t, err, "retryable error")
require.NotNil(t, newStatus.CurrentFreight)
require.Equal(t, kargoapi.StagePhaseVerifying, newStatus.Phase)

expectInfo := kargoapi.VerificationInfo{
Phase: kargoapi.VerificationPhaseError,
Message: "something went wrong",
}

require.Equal(
t,
&expectInfo,
newStatus.CurrentFreight.VerificationInfo,
)
require.Equal(
t,
kargoapi.VerificationInfoStack{expectInfo},
newStatus.CurrentFreight.VerificationHistory,
)

// Everything else should be returned unchanged
newStatus.CurrentFreight.VerificationInfo = nil
newStatus.CurrentFreight.VerificationHistory = nil
newStatus.Phase = initialStatus.Phase
require.Equal(t, initialStatus, newStatus)
},
},

{
name: "error checking verification result",
stage: &kargoapi.Stage{
Expand All @@ -529,7 +594,13 @@ func TestSyncNormalStage(t *testing.T) {
Status: kargoapi.StageStatus{
Phase: kargoapi.StagePhaseVerifying,
CurrentFreight: &kargoapi.FreightReference{
VerificationInfo: &kargoapi.VerificationInfo{},
VerificationInfo: &kargoapi.VerificationInfo{
Phase: kargoapi.VerificationPhasePending,
AnalysisRun: &kargoapi.AnalysisRunReference{
Name: "fake-analysis-run",
Namespace: "fake-namespace",
},
},
},
},
},
Expand All @@ -542,11 +613,11 @@ func TestSyncNormalStage(t *testing.T) {
) *kargoapi.Health {
return nil
},
getVerificationInfoFn: func(_ context.Context, _ *kargoapi.Stage) *kargoapi.VerificationInfo {
getVerificationInfoFn: func(_ context.Context, _ *kargoapi.Stage) (*kargoapi.VerificationInfo, error) {
return &kargoapi.VerificationInfo{
Phase: kargoapi.VerificationPhaseError,
Message: "something went wrong",
}
}, nil
},
},
assertions: func(
Expand Down Expand Up @@ -574,6 +645,77 @@ func TestSyncNormalStage(t *testing.T) {
},
},

{
name: "retryable error checking verification result",
stage: &kargoapi.Stage{
Spec: &kargoapi.StageSpec{
PromotionMechanisms: &kargoapi.PromotionMechanisms{},
Verification: &kargoapi.Verification{},
},
Status: kargoapi.StageStatus{
Phase: kargoapi.StagePhaseVerifying,
CurrentFreight: &kargoapi.FreightReference{
VerificationInfo: &kargoapi.VerificationInfo{
Phase: kargoapi.VerificationPhasePending,
AnalysisRun: &kargoapi.AnalysisRunReference{
Name: "fake-analysis-run",
Namespace: "fake-namespace",
},
},
},
},
},
reconciler: &reconciler{
hasNonTerminalPromotionsFn: noNonTerminalPromotionsFn,
checkHealthFn: func(
context.Context,
kargoapi.FreightReference,
[]kargoapi.ArgoCDAppUpdate,
) *kargoapi.Health {
return nil
},
getVerificationInfoFn: func(_ context.Context, _ *kargoapi.Stage) (*kargoapi.VerificationInfo, error) {
return &kargoapi.VerificationInfo{
Phase: kargoapi.VerificationPhaseError,
Message: "something went wrong",
AnalysisRun: &kargoapi.AnalysisRunReference{
Name: "fake-analysis-run",
Namespace: "fake-namespace",
},
}, errors.New("retryable error")
},
},
assertions: func(
t *testing.T,
initialStatus kargoapi.StageStatus,
newStatus kargoapi.StageStatus,
err error,
) {
require.ErrorContains(t, err, "retryable error")
require.NotNil(t, newStatus.CurrentFreight)
require.Equal(
t,
&kargoapi.VerificationInfo{
Phase: kargoapi.VerificationPhaseError,
Message: "something went wrong",
AnalysisRun: &kargoapi.AnalysisRunReference{
Name: "fake-analysis-run",
Namespace: "fake-namespace",
},
},
newStatus.CurrentFreight.VerificationInfo,
)
require.Len(t, newStatus.CurrentFreight.VerificationHistory, 1)

// Phase should not be changed to Steady
require.Equal(t, kargoapi.StagePhaseVerifying, newStatus.Phase)
// Everything else should be unchanged
newStatus.Phase = initialStatus.Phase
newStatus.CurrentFreight = initialStatus.CurrentFreight
require.Equal(t, initialStatus, newStatus)
},
},

{
name: "verification aborted",
stage: &kargoapi.Stage{
Expand Down Expand Up @@ -611,8 +753,8 @@ func TestSyncNormalStage(t *testing.T) {
getVerificationInfoFn: func(
_ context.Context,
s *kargoapi.Stage,
) *kargoapi.VerificationInfo {
return s.Status.CurrentFreight.VerificationInfo
) (*kargoapi.VerificationInfo, error) {
return s.Status.CurrentFreight.VerificationInfo, nil
},
abortVerificationFn: func(
context.Context,
Expand Down Expand Up @@ -683,10 +825,10 @@ func TestSyncNormalStage(t *testing.T) {
getVerificationInfoFn: func(
_ context.Context,
s *kargoapi.Stage,
) *kargoapi.VerificationInfo {
) (*kargoapi.VerificationInfo, error) {
i := s.Status.CurrentFreight.VerificationInfo.DeepCopy()
i.Phase = kargoapi.VerificationPhaseError
return i
return i, nil
},
abortVerificationFn: func(
context.Context,
Expand Down Expand Up @@ -1174,7 +1316,13 @@ func TestSyncNormalStage(t *testing.T) {
Phase: kargoapi.StagePhaseVerifying,
CurrentPromotion: &kargoapi.PromotionInfo{},
CurrentFreight: &kargoapi.FreightReference{
VerificationInfo: &kargoapi.VerificationInfo{},
VerificationInfo: &kargoapi.VerificationInfo{
Phase: kargoapi.VerificationPhasePending,
AnalysisRun: &kargoapi.AnalysisRunReference{
Name: "fake-analysis-run",
Namespace: "fake-namespace",
},
},
},
},
},
Expand All @@ -1192,15 +1340,15 @@ func TestSyncNormalStage(t *testing.T) {
getVerificationInfoFn: func(
context.Context,
*kargoapi.Stage,
) *kargoapi.VerificationInfo {
) (*kargoapi.VerificationInfo, error) {
return &kargoapi.VerificationInfo{
Phase: kargoapi.VerificationPhaseSuccessful,
AnalysisRun: &kargoapi.AnalysisRunReference{
Name: "fake-analysis-run",
Namespace: "fake-namespace",
Phase: string(rollouts.AnalysisPhaseSuccessful),
},
}
}, nil
},
verifyFreightInStageFn: func(context.Context, string, string, string) error {
return nil
Expand Down
Loading

0 comments on commit 01c4bbf

Please sign in to comment.