feat: add ability to include reason in count metrics

Adds a configuration flag that includes the `reason` of a TaskRun's or PipelineRun's status as a label on its count metric. This allows for more fine-grained monitoring and alerting of run failures.

Signed-off-by: Marcus Noble <[email protected]>
minhoryang · Oct 12, 2023 · d26a45c · d26a45c
1 parent e950a82
commit d26a45c
Show file tree

Hide file tree

Showing 9 changed files with 223 additions and 20 deletions.
diff --git a/config/config-observability.yaml b/config/config-observability.yaml
@@ -58,3 +58,4 @@ data:
     metrics.taskrun.duration-type: "histogram"
     metrics.pipelinerun.level: "pipeline"
     metrics.pipelinerun.duration-type: "histogram"
+    metrics.count.enable-reason: "false"
diff --git a/docs/metrics.md b/docs/metrics.md
@@ -15,10 +15,10 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
 |-----------------------------------------------------------------------------------------| ----------- | ----------- | ----------- |
 | `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]`         | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `namespace`=&lt;pipelinerun-namespace&gt; | experimental |
 | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt;| experimental |
-| `tekton_pipelines_controller_pipelinerun_count`                                         | Counter | `status`=&lt;status&gt; | experimental |
+| `tekton_pipelines_controller_pipelinerun_count`                                         | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
 | `tekton_pipelines_controller_running_pipelineruns_count`                                | Gauge | | experimental |
 | `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]`             | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
-| `tekton_pipelines_controller_taskrun_count`                                             | Counter | `status`=&lt;status&gt; | experimental |
+| `tekton_pipelines_controller_taskrun_count`                                             | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
 | `tekton_pipelines_controller_running_taskruns_count`                                    | Gauge | | experimental |
 | `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count`                 | Gauge | | experimental |
 | `tekton_pipelines_controller_running_taskruns_throttled_by_node_count`                  | Gauge | | experimental |
@@ -40,6 +40,7 @@ A sample config-map has been provided as [config-observability](./../config/conf
     metrics.taskrun.duration-type: "histogram"
     metrics.pipelinerun.level: "pipeline"
     metrics.pipelinerun.duration-type: "histogram"
+    metrics.count.enable-reason: "false"
 ```
 
 Following values are available in the configmap:
@@ -56,6 +57,7 @@ Following values are available in the configmap:
 | metrics.taskrun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and  `tekton_pipelines_controller_taskrun_duration_seconds` is of type gauge or lastvalue |
 | metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram |
 | metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue |
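+| metrics.count.enable-reason | `false` | Controls whether the `reason` label is included on the `tekton_pipelines_controller_pipelinerun_count` and `tekton_pipelines_controller_taskrun_count` metrics; `false` (the default) omits the label |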
 
 Histogram value isn't available when pipelinerun or taskrun labels are selected. The Lastvalue or Gauge will be provided. Histogram would serve no purpose because it would generate a single bar. TaskRun and PipelineRun level metrics aren't recommended because they lead to an unbounded cardinality which degrades the observability database.
 

diff --git a/pkg/apis/config/metrics.go b/pkg/apis/config/metrics.go
@@ -36,6 +36,9 @@ const (
 	// metrics to use for aggregating duration for pipelinerun
 	metricsDurationPipelinerunType = "metrics.pipelinerun.duration-type"
 
+	// countWithReasonKey sets if the reason label should be included on count metrics
+	countWithReasonKey = "metrics.count.enable-reason"
+
 	// DefaultTaskrunLevel determines to what level to aggregate metrics
 	// when it isn't specified in configmap
 	DefaultTaskrunLevel = TaskrunLevelAtTask
@@ -92,6 +95,7 @@ type Metrics struct {
 	PipelinerunLevel        string
 	DurationTaskrunType     string
 	DurationPipelinerunType string
+	CountWithReason         bool
 }
 
 // GetMetricsConfigName returns the name of the configmap containing all
@@ -113,7 +117,8 @@ func (cfg *Metrics) Equals(other *Metrics) bool {
 	return other.TaskrunLevel == cfg.TaskrunLevel &&
 		other.PipelinerunLevel == cfg.PipelinerunLevel &&
 		other.DurationTaskrunType == cfg.DurationTaskrunType &&
-		other.DurationPipelinerunType == cfg.DurationPipelinerunType
+		other.DurationPipelinerunType == cfg.DurationPipelinerunType &&
+		other.CountWithReason == cfg.CountWithReason
 }
 
 // newMetricsFromMap returns a Config given a map corresponding to a ConfigMap
@@ -123,6 +128,7 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
 		PipelinerunLevel:        DefaultPipelinerunLevel,
 		DurationTaskrunType:     DefaultDurationTaskrunType,
 		DurationPipelinerunType: DefaultDurationPipelinerunType,
+		CountWithReason:         false,
 	}
 
 	if taskrunLevel, ok := cfgMap[metricsTaskrunLevelKey]; ok {
@@ -138,6 +144,11 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
 	if durationPipelinerun, ok := cfgMap[metricsDurationPipelinerunType]; ok {
 		tc.DurationPipelinerunType = durationPipelinerun
 	}
+
+	if countWithReason, ok := cfgMap[countWithReasonKey]; ok && countWithReason != "false" {
+		tc.CountWithReason = true
+	}
+
 	return &tc, nil
 }
 

diff --git a/pkg/apis/config/metrics_test.go b/pkg/apis/config/metrics_test.go
@@ -38,6 +38,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
 				PipelinerunLevel:        config.PipelinerunLevelAtPipelinerun,
 				DurationTaskrunType:     config.DurationPipelinerunTypeHistogram,
 				DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
+				CountWithReason:         false,
 			},
 			fileName: config.GetMetricsConfigName(),
 		},
@@ -47,9 +48,20 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
 				PipelinerunLevel:        config.PipelinerunLevelAtNS,
 				DurationTaskrunType:     config.DurationTaskrunTypeHistogram,
 				DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
+				CountWithReason:         false,
 			},
 			fileName: "config-observability-namespacelevel",
 		},
+		{
+			expectedConfig: &config.Metrics{
+				TaskrunLevel:            config.TaskrunLevelAtNS,
+				PipelinerunLevel:        config.PipelinerunLevelAtNS,
+				DurationTaskrunType:     config.DurationTaskrunTypeHistogram,
+				DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
+				CountWithReason:         true,
+			},
+			fileName: "config-observability-reason",
+		},
 	}
 
 	for _, tc := range testCases {
@@ -64,6 +76,7 @@ func TestNewMetricsFromEmptyConfigMap(t *testing.T) {
 		PipelinerunLevel:        config.PipelinerunLevelAtPipeline,
 		DurationTaskrunType:     config.DurationPipelinerunTypeHistogram,
 		DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
+		CountWithReason:         false,
 	}
 	verifyConfigFileWithExpectedMetricsConfig(t, MetricsConfigEmptyName, expectedConfig)
 }

diff --git a/pkg/apis/config/testdata/config-observability-reason.yaml b/pkg/apis/config/testdata/config-observability-reason.yaml
@@ -0,0 +1,31 @@
+# Copyright 2019 The Tekton Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: config-observability
+  namespace: tekton-pipelines
+  labels:
+    app.kubernetes.io/instance: default
+    app.kubernetes.io/part-of: tekton-pipelines
+data:
+  metrics.backend-destination: prometheus
+  metrics.stackdriver-project-id: "<your stackdriver project id>"
+  metrics.allow-stackdriver-custom-metrics: "false"
+  metrics.taskrun.level: "namespace"
+  metrics.taskrun.duration-type: "histogram"
+  metrics.pipelinerun.level: "namespace"
+  metrics.pipelinerun.duration-type: "lastvalue"
+  metrics.count.enable-reason: "true"
diff --git a/pkg/pipelinerunmetrics/metrics.go b/pkg/pipelinerunmetrics/metrics.go
@@ -43,6 +43,7 @@ var (
 	pipelineTag    = tag.MustNewKey("pipeline")
 	namespaceTag   = tag.MustNewKey("namespace")
 	statusTag      = tag.MustNewKey("status")
+	reasonTag      = tag.MustNewKey("reason")
 
 	prDuration = stats.Float64(
 		"pipelinerun_duration_seconds",
@@ -160,11 +161,15 @@ func viewRegister(cfg *config.Metrics) error {
 		TagKeys:     append([]tag.Key{statusTag, namespaceTag}, prunTag...),
 	}
 
+	prCountViewTags := []tag.Key{statusTag}
+	if cfg.CountWithReason {
+		prCountViewTags = append(prCountViewTags, reasonTag)
+	}
 	prCountView = &view.View{
 		Description: prCount.Description(),
 		Measure:     prCount,
 		Aggregation: view.Count(),
-		TagKeys:     []tag.Key{statusTag},
+		TagKeys:     prCountViewTags,
 	}
 	runningPRsCountView = &view.View{
 		Description: runningPRsCount.Description(),
@@ -253,13 +258,15 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
 		}
 	}
 
+	cond := pr.Status.GetCondition(apis.ConditionSucceeded)
 	status := "success"
-	if cond := pr.Status.GetCondition(apis.ConditionSucceeded); cond.Status == corev1.ConditionFalse {
+	if cond.Status == corev1.ConditionFalse {
 		status = "failed"
 		if cond.Reason == v1.PipelineRunReasonCancelled.String() {
 			status = "cancelled"
 		}
 	}
+	reason := cond.Reason
 
 	pipelineName := "anonymous"
 	if pr.Spec.PipelineRef != nil && pr.Spec.PipelineRef.Name != "" {
@@ -268,7 +275,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
 	ctx, err := tag.New(
 		context.Background(),
 		append([]tag.Mutator{tag.Insert(namespaceTag, pr.Namespace),
-			tag.Insert(statusTag, status)}, r.insertTag(pipelineName, pr.Name)...)...)
+			tag.Insert(statusTag, status), tag.Insert(reasonTag, reason)}, r.insertTag(pipelineName, pr.Name)...)...)
 	if err != nil {
 		return err
 	}

diff --git a/pkg/pipelinerunmetrics/metrics_test.go b/pkg/pipelinerunmetrics/metrics_test.go
@@ -42,14 +42,15 @@ var (
 	completionTime = metav1.NewTime(startTime.Time.Add(time.Minute))
 )
 
-func getConfigContext() context.Context {
+func getConfigContext(countWithReason bool) context.Context {
 	ctx := context.Background()
 	cfg := &config.Config{
 		Metrics: &config.Metrics{
 			TaskrunLevel:            config.TaskrunLevelAtTaskrun,
 			PipelinerunLevel:        config.PipelinerunLevelAtPipelinerun,
 			DurationTaskrunType:     config.DefaultDurationTaskrunType,
 			DurationPipelinerunType: config.DefaultDurationPipelinerunType,
+			CountWithReason:         countWithReason,
 		},
 	}
 	return config.ToContext(ctx, cfg)
@@ -71,7 +72,7 @@ func TestMetricsOnStore(t *testing.T) {
 	defer log.Sync()
 	logger := log.Sugar()
 
-	ctx := getConfigContext()
+	ctx := getConfigContext(false)
 	metrics, err := NewRecorder(ctx)
 	if err != nil {
 		t.Fatalf("NewRecorder: %v", err)
@@ -117,6 +118,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
 		expectedDuration     float64
 		expectedCount        int64
 		beforeCondition      *apis.Condition
+		countWithReason      bool
 	}{{
 		name: "for succeeded pipeline",
 		pipelineRun: &v1.PipelineRun{
@@ -149,6 +151,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
 		expectedDuration: 60,
 		expectedCount:    1,
 		beforeCondition:  nil,
+		countWithReason:  false,
 	}, {
 		name: "for succeeded pipeline different condition",
 		pipelineRun: &v1.PipelineRun{
@@ -184,6 +187,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
 			Type:   apis.ConditionReady,
 			Status: corev1.ConditionUnknown,
 		},
+		countWithReason: false,
 	}, {
 		name: "for succeeded pipeline recount",
 		pipelineRun: &v1.PipelineRun{
@@ -212,6 +216,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
 			Type:   apis.ConditionSucceeded,
 			Status: corev1.ConditionTrue,
 		},
+		countWithReason: false,
 	}, {
 		name: "for cancelled pipeline",
 		pipelineRun: &v1.PipelineRun{
@@ -245,6 +250,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
 		expectedDuration: 60,
 		expectedCount:    1,
 		beforeCondition:  nil,
+		countWithReason:  false,
 	}, {
 		name: "for failed pipeline",
 		pipelineRun: &v1.PipelineRun{
@@ -277,6 +283,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
 		expectedDuration: 60,
 		expectedCount:    1,
 		beforeCondition:  nil,
+		countWithReason:  false,
 	}, {
 		name: "for pipeline without start or completion time",
 		pipelineRun: &v1.PipelineRun{
@@ -306,11 +313,82 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
 		expectedDuration: 0,
 		expectedCount:    1,
 		beforeCondition:  nil,
+		countWithReason:  false,
+	}, {
+		name: "for failed pipeline with reason",
+		pipelineRun: &v1.PipelineRun{
+			ObjectMeta: metav1.ObjectMeta{Name: "pipelinerun-1", Namespace: "ns"},
+			Spec: v1.PipelineRunSpec{
+				PipelineRef: &v1.PipelineRef{Name: "pipeline-1"},
+			},
+			Status: v1.PipelineRunStatus{
+				Status: duckv1.Status{
+					Conditions: duckv1.Conditions{{
+						Type:   apis.ConditionSucceeded,
+						Status: corev1.ConditionFalse,
+						Reason: "Failed",
+					}},
+				},
+				PipelineRunStatusFields: v1.PipelineRunStatusFields{
+					StartTime:      &startTime,
+					CompletionTime: &completionTime,
+				},
+			},
+		},
+		expectedDurationTags: map[string]string{
+			"pipeline":    "pipeline-1",
+			"pipelinerun": "pipelinerun-1",
+			"namespace":   "ns",
+			"status":      "failed",
+		},
+		expectedCountTags: map[string]string{
+			"status": "failed",
+			"reason": "Failed",
+		},
+		expectedDuration: 60,
+		expectedCount:    1,
+		beforeCondition:  nil,
+		countWithReason:  true,
+	}, {
+		name: "for cancelled pipeline with reason",
+		pipelineRun: &v1.PipelineRun{
+			ObjectMeta: metav1.ObjectMeta{Name: "pipelinerun-1", Namespace: "ns"},
+			Spec: v1.PipelineRunSpec{
+				PipelineRef: &v1.PipelineRef{Name: "pipeline-1"},
+			},
+			Status: v1.PipelineRunStatus{
+				Status: duckv1.Status{
+					Conditions: duckv1.Conditions{{
+						Type:   apis.ConditionSucceeded,
+						Status: corev1.ConditionFalse,
+						Reason: ReasonCancelled.String(),
+					}},
+				},
+				PipelineRunStatusFields: v1.PipelineRunStatusFields{
+					StartTime:      &startTime,
+					CompletionTime: &completionTime,
+				},
+			},
+		},
+		expectedDurationTags: map[string]string{
+			"pipeline":    "pipeline-1",
+			"pipelinerun": "pipelinerun-1",
+			"namespace":   "ns",
+			"status":      "cancelled",
+		},
+		expectedCountTags: map[string]string{
+			"status": "cancelled",
+			"reason": ReasonCancelled.String(),
+		},
+		expectedDuration: 60,
+		expectedCount:    1,
+		beforeCondition:  nil,
+		countWithReason:  true,
 	}} {
 		t.Run(test.name, func(t *testing.T) {
 			unregisterMetrics()
 
-			ctx := getConfigContext()
+			ctx := getConfigContext(test.countWithReason)
 			metrics, err := NewRecorder(ctx)
 			if err != nil {
 				t.Fatalf("NewRecorder: %v", err)
@@ -363,7 +441,7 @@ func TestRecordRunningPipelineRunsCount(t *testing.T) {
 		}
 	}
 
-	ctx = getConfigContext()
+	ctx = getConfigContext(false)
 	metrics, err := NewRecorder(ctx)
 	if err != nil {
 		t.Fatalf("NewRecorder: %v", err)
@@ -443,7 +521,7 @@ func TestRecordRunningPipelineRunsResolutionWaitCounts(t *testing.T) {
 			}
 		}
 
-		ctx = getConfigContext()
+		ctx = getConfigContext(false)
 		metrics, err := NewRecorder(ctx)
 		if err != nil {
 			t.Fatalf("NewRecorder: %v", err)