Skip to content

Commit

Permalink
Check for stale pipeline runs
Browse files Browse the repository at this point in the history
  • Loading branch information
stuartwdouglas committed Nov 8, 2023
1 parent faae7fa commit 132430a
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 9 deletions.
45 changes: 36 additions & 9 deletions controllers/component_dependency_update_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package controllers

import (
l "github.com/redhat-appstudio/build-service/pkg/logs"
"golang.org/x/exp/maps"
"golang.org/x/exp/slices"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -272,14 +273,7 @@ func (r *ComponentDependencyUpdateReconciler) handleCompletedBuild(ctx context.C
//update the list of successful updates
//if any
if len(alreadyProcessed) > 0 {
val := ""
for i := range alreadyProcessed {
if val != "" {
val = val + ","
}
val = val + i
}
pipelineRun.Annotations[AlreadyNudgedAnnotationName] = val
pipelineRun.Annotations[AlreadyNudgedAnnotationName] = strings.Join(maps.Keys(alreadyProcessed), ",")
}
r.EventRecorder.Event(updatedComponent, corev1.EventTypeWarning, ComponentNudgeFailedEventType, fmt.Sprintf("component update failed as a result of a build for %s, retry %d/%d", updatedComponent.Name, failureCount, MaxAttempts))

Expand All @@ -301,10 +295,43 @@ func (r *ComponentDependencyUpdateReconciler) handleCompletedBuild(ctx context.C
} else {
return reconcile.Result{RequeueAfter: retryTime * time.Duration(failureCount)}, nil
}
}

_, err = r.removePipelineFinalizer(ctx, pipelineRun)
if err != nil {
return ctrl.Result{}, err
}

return r.removePipelineFinalizer(ctx, pipelineRun)
// Now we need to look for 'stale' pipelines
// These are defined as pipelines that are younger than this one, that target the same component
// If there are two pushes very close together variances in pipeline run times may mean that the
// Older one finishes last. In this case we want to mark the older one as already nudged and remove
// The finalizer

pipelines := tektonapi.PipelineRunList{}
err = r.Client.List(ctx, &pipelines, client.InNamespace(pipelineRun.Namespace), client.MatchingLabels{PipelineRunTypeLabelName: PipelineRunBuildType, PipelineRunComponentLabelName: updatedComponent.Name, PacEventTypeLabelName: PacEventPushType})
if err != nil {
// I don't think we want to retry this, it should be really rare anyway
// And would require an even more complex label based state machine
log.Error(err, "failed to check for stale pipeline runs, this operation will not be retried", l.Audit, "true")
return ctrl.Result{}, nil
}
for i := range pipelines.Items {
pr := pipelines.Items[i]
if pr.Status.CompletionTime == nil && pr.CreationTimestamp.Before(&pipelineRun.CreationTimestamp) {
if pr.Annotations == nil {
pr.Annotations = map[string]string{}
}
pr.Annotations[NudgeProcessedAnnotationName] = "true"
_, err := r.removePipelineFinalizer(ctx, &pr)
if err != nil {
//TODO: Do we need a retry here on conflict?
log.Error(err, "failed to update stale pipeline run, this operation will not be retried", l.Audit, "true")
}
}

}
return ctrl.Result{}, nil
}

// removePipelineFinalizer will remove the finalizer, and add an annotation to indicate we are done with this pipeline run
Expand Down
46 changes: 46 additions & 0 deletions controllers/component_dependency_update_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,52 @@ var _ = Describe("Component nudge controller", func() {
return op1nudged && op2nudged
}, timeout, interval).WithTimeout(ensureTimeout).Should(BeTrue())
})

It("Test state pipeline not nudged", func() {
createBuildPipelineRun("stale-pipeline", UserNamespace, BaseComponent)
time.Sleep(time.Millisecond * 10)
createBuildPipelineRun("test-pipeline-1", UserNamespace, BaseComponent)
Eventually(func() bool {
pr := getPipelineRun("test-pipeline-1", UserNamespace)
return controllerutil.ContainsFinalizer(pr, NudgeFinalizer)
}, timeout, interval).WithTimeout(ensureTimeout).Should(BeTrue())
pr := getPipelineRun("test-pipeline-1", UserNamespace)
pr.Status.SetCondition(&apis.Condition{
Type: apis.ConditionSucceeded,
Status: "True",
LastTransitionTime: apis.VolatileTime{Inner: metav1.Time{Time: time.Now()}},
})
pr.Status.Results = []tektonapi.PipelineRunResult{
{Name: ImageDigestParamName, Value: tektonapi.ResultValue{Type: tektonapi.ParamTypeString, StringVal: "sha256:12345"}},
{Name: ImageUrl, Value: tektonapi.ResultValue{Type: tektonapi.ParamTypeString, StringVal: "quay.io.foo/bar:latest"}},
}
pr.Status.CompletionTime = &metav1.Time{Time: time.Now()}
Expect(k8sClient.Status().Update(ctx, pr)).Should(BeNil())
Eventually(func() bool {
events := v1.EventList{}
err := k8sClient.List(context.TODO(), &events)
Expect(err).ToNot(HaveOccurred())

//test that the state pipeline run was marked as nudged, even though it has not completed
stale := getPipelineRun("stale-pipeline", UserNamespace)
if stale.Annotations == nil || stale.Annotations[NudgeProcessedAnnotationName] != "true" || controllerutil.ContainsFinalizer(stale, NudgeFinalizer) {
return false
}
op1nudged := false
op2nudged := false
for _, i := range events.Items {
if i.Reason == ComponentNudgedEventType && strings.Contains(i.Message, "quay.io.foo/bar:latest@sha256:12345") {
if strings.Contains(i.Message, "operator1") {
op1nudged = true
}
if strings.Contains(i.Message, "operator2") {
op2nudged = true
}
}
}
return op1nudged && op2nudged
}, timeout, interval).WithTimeout(ensureTimeout).Should(BeTrue())
})
})

Context("Test nudge failure handling", func() {
Expand Down

0 comments on commit 132430a

Please sign in to comment.