[TEP-0050] Implement PipelineTask OnError

Part of [tektoncd#7165][tektoncd#7165]. In [TEP-0050][tep-0050], we proposed to add an `OnError` API field under `PipelineTask` to configure error handling strategy. This commits leverages `PipelineTask.OnError` API field introduced in the previous PR, implement the error handling strategy, update related docs and tests. /kind feature [tep-0050]: https://github.com/tektoncd/community/blob/main/teps/0050-ignore-task-failures.md [tektoncd#7165]: tektoncd#7165
QuanZhang-William · Nov 27, 2023 · 0a34d79 · 0a34d79
1 parent 23581c5
commit 0a34d79
Show file tree

Hide file tree

Showing 14 changed files with 401 additions and 17 deletions.
diff --git a/docs/pipelines.md b/docs/pipelines.md
@@ -658,8 +658,6 @@ tasks:
 
 > :seedling: **Specifying `onError` in `PipelineTasks` is an [alpha](additional-configs.md#alpha-features) feature.** The `enable-api-fields` feature flag must be set to `"alpha"` to specify `onError`  in a `PipelineTask`.
 
-> :seedling: This feature is in **Preview Only** mode and not yet supported/implemented.
-
 When a `PipelineTask` fails, the rest of the `PipelineTasks` are skipped and the `PipelineRun` is declared a failure. If you would like to
 ignore such `PipelineTask` failure and continue executing the rest of the `PipelineTasks`, you can specify `onError` for such a `PipelineTask`.
 

diff --git a/examples/v1/pipelineruns/alpha/ignore-task-error.yaml b/examples/v1/pipelineruns/alpha/ignore-task-error.yaml
@@ -0,0 +1,25 @@
+apiVersion: tekton.dev/v1
+kind: PipelineRun
+metadata:
+  generateName: pipelinerun-with-failing-task-
+spec:
+  pipelineSpec:
+    tasks:
+      - name: echo-continue
+        onError: continue
+        taskSpec:
+          steps:
+            - name: write
+              image: alpine
+              script: |
+                echo "this is a failing task"
+                exit 1
+      - name: echo
+        runAfter:
+          - echo-continue
+        taskSpec:
+          steps:
+            - name: write
+              image: alpine
+              script: |
+                echo "this is a success task"
diff --git a/pkg/apis/pipeline/v1/pipeline_types.go b/pkg/apis/pipeline/v1/pipeline_types.go
@@ -248,8 +248,6 @@ type PipelineTask struct {
 
 	// OnError defines the exiting behavior of a PipelineRun on error
 	// can be set to [ continue | stopAndFail ]
-	// Note: OnError is in preview mode and not yet supported
-	// TODO(#7165)
 	// +optional
 	OnError PipelineTaskOnErrorType `json:"onError,omitempty"`
 }

diff --git a/pkg/apis/pipeline/v1/pipelinerun_types.go b/pkg/apis/pipeline/v1/pipelinerun_types.go
@@ -410,6 +410,9 @@ const (
 	PipelineRunReasonCELEvaluationFailed PipelineRunReason = "CELEvaluationFailed"
 )
 
+// PipelineTaskOnErrorAnnotation is used to pass the failure strategy to TaskRun pods
+const PipelineTaskOnErrorAnnotation = "pipeline.tekton.dev/pipeline-task-on-error"
+
 func (t PipelineRunReason) String() string {
 	return string(t)
 }

diff --git a/pkg/apis/pipeline/v1/taskrun_types.go b/pkg/apis/pipeline/v1/taskrun_types.go
@@ -190,6 +190,8 @@ const (
 	TaskRunReasonStopSidecarFailed = "TaskRunStopSidecarFailed"
 	// TaskRunReasonInvalidParamValue indicates that the TaskRun Param input value is not allowed.
 	TaskRunReasonInvalidParamValue = "InvalidParamValue"
+	// TaskRunReasonFailureIgnored is the reason set when the Taskrun has failed and the failure is ignored for the owning PipelineRun
+	TaskRunReasonFailureIgnored TaskRunReason = "FailureIgnored"
 )
 
 func (t TaskRunReason) String() string {

diff --git a/pkg/apis/pipeline/v1beta1/pipeline_types.go b/pkg/apis/pipeline/v1beta1/pipeline_types.go
@@ -262,8 +262,6 @@ type PipelineTask struct {
 
 	// OnError defines the exiting behavior of a PipelineRun on error
 	// can be set to [ continue | stopAndFail ]
-	// Note: OnError is in preview mode and not yet supported
-	// TODO(#7165)
 	// +optional
 	OnError PipelineTaskOnErrorType `json:"onError,omitempty"`
 }

diff --git a/pkg/pod/status.go b/pkg/pod/status.go
@@ -132,7 +132,12 @@ func MakeTaskRunStatus(ctx context.Context, logger *zap.SugaredLogger, tr v1.Tas
 	complete := areStepsComplete(pod) || pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed
 
 	if complete {
-		updateCompletedTaskRunStatus(logger, trs, pod)
+		onError, ok := tr.Annotations[v1.PipelineTaskOnErrorAnnotation]
+		if ok {
+			updateCompletedTaskRunStatus(logger, trs, pod, v1.PipelineTaskOnErrorType(onError))
+		} else {
+			updateCompletedTaskRunStatus(logger, trs, pod, "")
+		}
 	} else {
 		updateIncompleteTaskRunStatus(trs, pod)
 	}
@@ -359,10 +364,14 @@ func extractExitCodeFromResults(results []result.RunResult) (*int32, error) {
 	return nil, nil //nolint:nilnil // would be more ergonomic to return a sentinel error
 }
 
-func updateCompletedTaskRunStatus(logger *zap.SugaredLogger, trs *v1.TaskRunStatus, pod *corev1.Pod) {
+func updateCompletedTaskRunStatus(logger *zap.SugaredLogger, trs *v1.TaskRunStatus, pod *corev1.Pod, onError v1.PipelineTaskOnErrorType) {
 	if DidTaskRunFail(pod) {
 		msg := getFailureMessage(logger, pod)
-		markStatusFailure(trs, v1.TaskRunReasonFailed.String(), msg)
+		if onError == v1.PipelineTaskContinue {
+			markStatusFailure(trs, v1.TaskRunReasonFailureIgnored.String(), msg)
+		} else {
+			markStatusFailure(trs, v1.TaskRunReasonFailed.String(), msg)
+		}
 	} else {
 		markStatusSuccess(trs)
 	}

diff --git a/pkg/pod/status_test.go b/pkg/pod/status_test.go
@@ -1293,6 +1293,75 @@ func TestMakeTaskRunStatus(t *testing.T) {
 	}
 }
 
+func TestMakeRunStatus_OnError(t *testing.T) {
+	for _, c := range []struct {
+		name      string
+		podStatus corev1.PodStatus
+		onError   v1.PipelineTaskOnErrorType
+		want      v1.TaskRunStatus
+	}{{
+		name: "onError: continue",
+		podStatus: corev1.PodStatus{
+			Phase:   corev1.PodFailed,
+			Message: "boom",
+		},
+		onError: v1.PipelineTaskContinue,
+		want: v1.TaskRunStatus{
+			Status: statusFailure(string(v1.TaskRunReasonFailureIgnored), "boom"),
+		},
+	}, {
+		name: "onError: stopAndFail",
+		podStatus: corev1.PodStatus{
+			Phase:   corev1.PodFailed,
+			Message: "boom",
+		},
+		onError: v1.PipelineTaskStopAndFail,
+		want: v1.TaskRunStatus{
+			Status: statusFailure(string(v1.TaskRunReasonFailed), "boom"),
+		},
+	}, {
+		name: "stand alone TaskRun",
+		podStatus: corev1.PodStatus{
+			Phase:   corev1.PodFailed,
+			Message: "boom",
+		},
+		want: v1.TaskRunStatus{
+			Status: statusFailure(string(v1.TaskRunReasonFailed), "boom"),
+		},
+	}} {
+		t.Run(c.name, func(t *testing.T) {
+			pod := corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "pod",
+					Namespace: "foo",
+				},
+				Status: c.podStatus,
+			}
+			tr := v1.TaskRun{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "task-run",
+					Namespace: "foo",
+				},
+			}
+			if c.onError != "" {
+				tr.Annotations = map[string]string{}
+				tr.Annotations[v1.PipelineTaskOnErrorAnnotation] = string(c.onError)
+			}
+
+			logger, _ := logging.NewLogger("", "status")
+			kubeclient := fakek8s.NewSimpleClientset()
+			got, err := MakeTaskRunStatus(context.Background(), logger, tr, &pod, kubeclient, &v1.TaskSpec{})
+			if err != nil {
+				t.Errorf("Unexpected err in MakeTaskRunResult: %s", err)
+			}
+
+			if d := cmp.Diff(c.want.Status, got.Status, ignoreVolatileTime); d != "" {
+				t.Errorf("Diff %s", diff.PrintWantGot(d))
+			}
+		})
+	}
+}
+
 func TestMakeTaskRunStatusAlpha(t *testing.T) {
 	for _, c := range []struct {
 		desc      string

diff --git a/pkg/reconciler/pipelinerun/pipelinerun.go b/pkg/reconciler/pipelinerun/pipelinerun.go
@@ -920,6 +920,9 @@ func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, para
 	if spanContext, err := getMarshalledSpanFromContext(ctx); err == nil {
 		tr.Annotations[TaskRunSpanContextAnnotation] = spanContext
 	}
+	if rpt.PipelineTask.OnError == v1.PipelineTaskContinue {
+		tr.Annotations[v1.PipelineTaskOnErrorAnnotation] = string(v1.PipelineTaskContinue)
+	}
 
 	if rpt.PipelineTask.Timeout != nil {
 		tr.Spec.Timeout = rpt.PipelineTask.Timeout

diff --git a/pkg/reconciler/pipelinerun/pipelinerun_test.go b/pkg/reconciler/pipelinerun/pipelinerun_test.go
@@ -1060,6 +1060,76 @@ spec:
 	}
 }
 
+// TestPipelineTaskErrorIsIgnored tests that a resource dependent PipelineTask with onError:continue is skipped
+// if the parent PipelineTask fails to produce the result
+func TestPipelineTaskErrorIsIgnored(t *testing.T) {
+	prs := []*v1.PipelineRun{parse.MustParseV1PipelineRun(t, `
+metadata:
+  name: test-pipeline-missing-results
+  namespace: foo
+spec:
+  serviceAccountName: test-sa-0
+  pipelineSpec:
+    tasks:
+    - name: task1
+      taskSpec:
+        results:
+        - name: result1
+          type: string
+        steps:
+        - name: failing-step
+          onError: continue
+          image: busybox
+          script: exit 1; echo -n 123 | tee $(results.result1.path)'
+    - name: task2
+      onError: continue
+      params:
+      - name: param1
+        value: $(tasks.task1.results.result1)
+      taskSpec:
+        params:
+        - name: param1
+          type: string
+        steps:
+        - name: foo
+          image: busybox
+          script: 'echo $(params.param1)'
+`)}
+	trs := []*v1.TaskRun{mustParseTaskRunWithObjectMeta(t,
+		taskRunObjectMeta("test-pipeline-missing-results-task1", "foo",
+			"test-pipeline-missing-results", "test-pipeline", "task1", true),
+		`
+spec:
+  serviceAccountName: test-sa
+  timeout: 1h0m0s
+status:
+  conditions:
+  - status: "True"
+    type: Succeeded
+`)}
+	cms := []*corev1.ConfigMap{withEnabledAlphaAPIFields(newFeatureFlagsConfigMap())}
+
+	d := test.Data{
+		PipelineRuns: prs,
+		TaskRuns:     trs,
+		ConfigMaps:   cms,
+	}
+	prt := newPipelineRunTest(t, d)
+	defer prt.Cancel()
+
+	reconciledRun, _ := prt.reconcileRun("foo", "test-pipeline-missing-results", []string{}, false)
+	cond := reconciledRun.Status.Conditions[0]
+	if cond.Status != corev1.ConditionTrue {
+		t.Fatalf("expected PipelineRun status to be True but got: %s", cond.Status)
+	}
+	if len(reconciledRun.Status.SkippedTasks) != 1 {
+		t.Fatalf("expected 1 skipped Task but got %v", len(reconciledRun.Status.SkippedTasks))
+	}
+	if reconciledRun.Status.SkippedTasks[0].Reason != v1.MissingResultsSkip {
+		t.Fatalf("expected 1 skipped Task with reason %s, but got %v", v1.MissingResultsSkip, reconciledRun.Status.SkippedTasks[0].Reason)
+	}
+}
+
 func TestMissingResultWhenStepErrorIsIgnored(t *testing.T) {
 	prs := []*v1.PipelineRun{parse.MustParseV1PipelineRun(t, `
 metadata:

diff --git a/pkg/reconciler/pipelinerun/resources/pipelinerunresolution.go b/pkg/reconciler/pipelinerun/resources/pipelinerunresolution.go
@@ -392,7 +392,9 @@ func (t *ResolvedPipelineTask) skipBecauseResultReferencesAreMissing(facts *Pipe
 		resolvedResultRefs, pt, err := ResolveResultRefs(facts.State, PipelineRunState{t})
 		rpt := facts.State.ToMap()[pt]
 		if rpt != nil {
-			if err != nil && (t.IsFinalTask(facts) || rpt.Skip(facts).SkippingReason == v1.WhenExpressionsSkip) {
+			if err != nil &&
+				(t.PipelineTask.OnError == v1.PipelineTaskContinue ||
+					(t.IsFinalTask(facts) || rpt.Skip(facts).SkippingReason == v1.WhenExpressionsSkip)) {
 				return true
 			}
 		}

diff --git a/pkg/reconciler/pipelinerun/resources/pipelinerunstate.go b/pkg/reconciler/pipelinerun/resources/pipelinerunstate.go
@@ -88,6 +88,8 @@ type pipelineRunStatusCount struct {
 	Succeeded int
 	// failed tasks count
 	Failed int
+	// failed but ignored tasks count
+	IgnoredFailed int
 	// cancelled tasks count
 	Cancelled int
 	// number of tasks which are still pending, have not executed
@@ -278,11 +280,11 @@ func (state PipelineRunState) getNextTasks(candidateTasks sets.String) []*Resolv
 }
 
 // IsStopping returns true if the PipelineRun won't be scheduling any new Task because
-// at least one task already failed or was cancelled in the specified dag
+// at least one task already failed (with onError: stopAndFail) or was cancelled in the specified dag
 func (facts *PipelineRunFacts) IsStopping() bool {
 	for _, t := range facts.State {
 		if facts.isDAGTask(t.PipelineTask.Name) {
-			if t.isFailure() {
+			if t.isFailure() && t.PipelineTask.OnError != v1.PipelineTaskContinue {
 				return true
 			}
 		}
@@ -417,7 +419,8 @@ func (facts *PipelineRunFacts) GetPipelineConditionStatus(ctx context.Context, p
 	// get the count of successful tasks, failed tasks, cancelled tasks, skipped task, and incomplete tasks
 	s := facts.getPipelineTasksCount()
 	// completed task is a collection of successful, failed, cancelled tasks (skipped tasks are reported separately)
-	cmTasks := s.Succeeded + s.Failed + s.Cancelled
+	cmTasks := s.Succeeded + s.Failed + s.Cancelled + s.IgnoredFailed
+	totalFailedTasks := s.Failed + s.IgnoredFailed
 
 	// The completion reason is set from the TaskRun completion reason
 	// by default, set it to ReasonRunning
@@ -427,8 +430,14 @@ func (facts *PipelineRunFacts) GetPipelineConditionStatus(ctx context.Context, p
 	if s.Incomplete == 0 {
 		status := corev1.ConditionTrue
 		reason := v1.PipelineRunReasonSuccessful.String()
-		message := fmt.Sprintf("Tasks Completed: %d (Failed: %d, Cancelled %d), Skipped: %d",
-			cmTasks, s.Failed, s.Cancelled, s.Skipped)
+		var message string
+		if s.IgnoredFailed > 0 {
+			message = fmt.Sprintf("Tasks Completed: %d (Failed: %d (Ignored: %d), Cancelled %d), Skipped: %d",
+				cmTasks, totalFailedTasks, s.IgnoredFailed, s.Cancelled, s.Skipped)
+		} else {
+			message = fmt.Sprintf("Tasks Completed: %d (Failed: %d, Cancelled %d), Skipped: %d",
+				cmTasks, totalFailedTasks, s.Cancelled, s.Skipped)
+		}
 		// Set reason to ReasonCompleted - At least one is skipped
 		if s.Skipped > 0 {
 			reason = v1.PipelineRunReasonCompleted.String()
@@ -604,6 +613,7 @@ func (facts *PipelineRunFacts) getPipelineTasksCount() pipelineRunStatusCount {
 		Cancelled:           0,
 		Incomplete:          0,
 		SkippedDueToTimeout: 0,
+		IgnoredFailed:       0,
 	}
 	for _, t := range facts.State {
 		switch {
@@ -616,9 +626,13 @@ func (facts *PipelineRunFacts) getPipelineTasksCount() pipelineRunStatusCount {
 		// increment cancelled counter since the task is cancelled
 		case t.isCancelled():
 			s.Cancelled++
-		// increment failure counter since the task has failed
+		// increment failure counter based on Task OnError type since the task has failed
 		case t.isFailure():
-			s.Failed++
+			if t.PipelineTask.OnError == v1.PipelineTaskContinue {
+				s.IgnoredFailed++
+			} else {
+				s.Failed++
+			}
 		// increment skipped and skipped due to timeout counters since the task was skipped due to the pipeline, tasks, or finally timeout being reached before the task was launched
 		case t.Skip(facts).SkippingReason == v1.PipelineTimedOutSkip ||
 			t.Skip(facts).SkippingReason == v1.TasksTimedOutSkip ||