Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add ability to include reason in count metrics #7060

Merged
merged 1 commit into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/config-observability.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,4 @@ data:
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "pipeline"
metrics.pipelinerun.duration-type: "histogram"
metrics.count.enable-reason: "false"
6 changes: 4 additions & 2 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
|-----------------------------------------------------------------------------------------| ----------- | ----------- | ----------- |
| `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `namespace`=&lt;pipelinerun-namespace&gt; | experimental |
| `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt;| experimental |
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | experimental |
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
Expand All @@ -40,6 +40,7 @@ A sample config-map has been provided as [config-observability](./../config/conf
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "pipeline"
metrics.pipelinerun.duration-type: "histogram"
metrics.count.enable-reason: "false"
```

Following values are available in the configmap:
Expand All @@ -56,6 +57,7 @@ Following values are available in the configmap:
| metrics.taskrun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` are of type gauge or lastvalue |
| metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram |
| metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue |
| metrics.count.enable-reason | `false` | Controls whether the `reason` label is included on count metrics |

Histogram values aren't available when pipelinerun or taskrun labels are selected; a LastValue (Gauge) is provided instead. A histogram would serve no purpose at that level because it would generate a single bar. TaskRun- and PipelineRun-level metrics aren't recommended because they lead to unbounded cardinality, which degrades the observability database.

Expand Down
13 changes: 12 additions & 1 deletion pkg/apis/config/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ const (
// metrics to use for aggregating duration for pipelinerun
metricsDurationPipelinerunType = "metrics.pipelinerun.duration-type"

// countWithReasonKey is the configmap key that controls whether the reason label is included on count metrics
countWithReasonKey = "metrics.count.enable-reason"

// DefaultTaskrunLevel determines to what level to aggregate metrics
// when it isn't specified in configmap
DefaultTaskrunLevel = TaskrunLevelAtTask
Expand Down Expand Up @@ -92,6 +95,7 @@ type Metrics struct {
PipelinerunLevel string
DurationTaskrunType string
DurationPipelinerunType string
CountWithReason bool
}

// GetMetricsConfigName returns the name of the configmap containing all
Expand All @@ -113,7 +117,8 @@ func (cfg *Metrics) Equals(other *Metrics) bool {
return other.TaskrunLevel == cfg.TaskrunLevel &&
other.PipelinerunLevel == cfg.PipelinerunLevel &&
other.DurationTaskrunType == cfg.DurationTaskrunType &&
other.DurationPipelinerunType == cfg.DurationPipelinerunType
other.DurationPipelinerunType == cfg.DurationPipelinerunType &&
other.CountWithReason == cfg.CountWithReason
}

// newMetricsFromMap returns a Config given a map corresponding to a ConfigMap
Expand All @@ -123,6 +128,7 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
PipelinerunLevel: DefaultPipelinerunLevel,
DurationTaskrunType: DefaultDurationTaskrunType,
DurationPipelinerunType: DefaultDurationPipelinerunType,
CountWithReason: false,
}

if taskrunLevel, ok := cfgMap[metricsTaskrunLevelKey]; ok {
Expand All @@ -138,6 +144,11 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
if durationPipelinerun, ok := cfgMap[metricsDurationPipelinerunType]; ok {
tc.DurationPipelinerunType = durationPipelinerun
}

if countWithReason, ok := cfgMap[countWithReasonKey]; ok && countWithReason != "false" {
tc.CountWithReason = true
}

return &tc, nil
}

Expand Down
13 changes: 13 additions & 0 deletions pkg/apis/config/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
PipelinerunLevel: config.PipelinerunLevelAtPipelinerun,
DurationTaskrunType: config.DurationPipelinerunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
CountWithReason: false,
},
fileName: config.GetMetricsConfigName(),
},
Expand All @@ -47,9 +48,20 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
PipelinerunLevel: config.PipelinerunLevelAtNS,
DurationTaskrunType: config.DurationTaskrunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
CountWithReason: false,
},
fileName: "config-observability-namespacelevel",
},
{
expectedConfig: &config.Metrics{
TaskrunLevel: config.TaskrunLevelAtNS,
PipelinerunLevel: config.PipelinerunLevelAtNS,
DurationTaskrunType: config.DurationTaskrunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
CountWithReason: true,
},
fileName: "config-observability-reason",
},
}

for _, tc := range testCases {
Expand All @@ -64,6 +76,7 @@ func TestNewMetricsFromEmptyConfigMap(t *testing.T) {
PipelinerunLevel: config.PipelinerunLevelAtPipeline,
DurationTaskrunType: config.DurationPipelinerunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
CountWithReason: false,
}
verifyConfigFileWithExpectedMetricsConfig(t, MetricsConfigEmptyName, expectedConfig)
}
Expand Down
31 changes: 31 additions & 0 deletions pkg/apis/config/testdata/config-observability-reason.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2019 The Tekton Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: v1
kind: ConfigMap
metadata:
name: config-observability
namespace: tekton-pipelines
labels:
app.kubernetes.io/instance: default
app.kubernetes.io/part-of: tekton-pipelines
data:
metrics.backend-destination: prometheus
metrics.stackdriver-project-id: "<your stackdriver project id>"
metrics.allow-stackdriver-custom-metrics: "false"
metrics.taskrun.level: "namespace"
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "namespace"
metrics.pipelinerun.duration-type: "lastvalue"
metrics.count.enable-reason: "true"
13 changes: 10 additions & 3 deletions pkg/pipelinerunmetrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ var (
pipelineTag = tag.MustNewKey("pipeline")
namespaceTag = tag.MustNewKey("namespace")
statusTag = tag.MustNewKey("status")
reasonTag = tag.MustNewKey("reason")

prDuration = stats.Float64(
"pipelinerun_duration_seconds",
Expand Down Expand Up @@ -160,11 +161,15 @@ func viewRegister(cfg *config.Metrics) error {
TagKeys: append([]tag.Key{statusTag, namespaceTag}, prunTag...),
}

prCountViewTags := []tag.Key{statusTag}
if cfg.CountWithReason {
prCountViewTags = append(prCountViewTags, reasonTag)
}
prCountView = &view.View{
Description: prCount.Description(),
Measure: prCount,
Aggregation: view.Count(),
TagKeys: []tag.Key{statusTag},
TagKeys: prCountViewTags,
}
runningPRsCountView = &view.View{
Description: runningPRsCount.Description(),
Expand Down Expand Up @@ -253,13 +258,15 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
}
}

cond := pr.Status.GetCondition(apis.ConditionSucceeded)
status := "success"
if cond := pr.Status.GetCondition(apis.ConditionSucceeded); cond.Status == corev1.ConditionFalse {
if cond.Status == corev1.ConditionFalse {
status = "failed"
if cond.Reason == v1.PipelineRunReasonCancelled.String() {
status = "cancelled"
}
}
reason := cond.Reason

pipelineName := "anonymous"
if pr.Spec.PipelineRef != nil && pr.Spec.PipelineRef.Name != "" {
Expand All @@ -268,7 +275,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
ctx, err := tag.New(
context.Background(),
append([]tag.Mutator{tag.Insert(namespaceTag, pr.Namespace),
tag.Insert(statusTag, status)}, r.insertTag(pipelineName, pr.Name)...)...)
tag.Insert(statusTag, status), tag.Insert(reasonTag, reason)}, r.insertTag(pipelineName, pr.Name)...)...)
if err != nil {
return err
}
Expand Down
88 changes: 83 additions & 5 deletions pkg/pipelinerunmetrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,15 @@ var (
completionTime = metav1.NewTime(startTime.Time.Add(time.Minute))
)

func getConfigContext() context.Context {
func getConfigContext(countWithReason bool) context.Context {
ctx := context.Background()
cfg := &config.Config{
Metrics: &config.Metrics{
TaskrunLevel: config.TaskrunLevelAtTaskrun,
PipelinerunLevel: config.PipelinerunLevelAtPipelinerun,
DurationTaskrunType: config.DefaultDurationTaskrunType,
DurationPipelinerunType: config.DefaultDurationPipelinerunType,
CountWithReason: countWithReason,
},
}
return config.ToContext(ctx, cfg)
Expand All @@ -71,7 +72,7 @@ func TestMetricsOnStore(t *testing.T) {
defer log.Sync()
logger := log.Sugar()

ctx := getConfigContext()
ctx := getConfigContext(false)
metrics, err := NewRecorder(ctx)
if err != nil {
t.Fatalf("NewRecorder: %v", err)
Expand Down Expand Up @@ -117,6 +118,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
expectedDuration float64
expectedCount int64
beforeCondition *apis.Condition
countWithReason bool
}{{
name: "for succeeded pipeline",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -149,6 +151,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
expectedDuration: 60,
expectedCount: 1,
beforeCondition: nil,
countWithReason: false,
}, {
name: "for succeeded pipeline different condition",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -184,6 +187,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
Type: apis.ConditionReady,
Status: corev1.ConditionUnknown,
},
countWithReason: false,
}, {
name: "for succeeded pipeline recount",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -212,6 +216,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
Type: apis.ConditionSucceeded,
Status: corev1.ConditionTrue,
},
countWithReason: false,
}, {
name: "for cancelled pipeline",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -245,6 +250,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
expectedDuration: 60,
expectedCount: 1,
beforeCondition: nil,
countWithReason: false,
}, {
name: "for failed pipeline",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -277,6 +283,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
expectedDuration: 60,
expectedCount: 1,
beforeCondition: nil,
countWithReason: false,
}, {
name: "for pipeline without start or completion time",
pipelineRun: &v1.PipelineRun{
Expand Down Expand Up @@ -306,11 +313,82 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
expectedDuration: 0,
expectedCount: 1,
beforeCondition: nil,
countWithReason: false,
}, {
name: "for failed pipeline with reason",
pipelineRun: &v1.PipelineRun{
ObjectMeta: metav1.ObjectMeta{Name: "pipelinerun-1", Namespace: "ns"},
Spec: v1.PipelineRunSpec{
PipelineRef: &v1.PipelineRef{Name: "pipeline-1"},
},
Status: v1.PipelineRunStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionFalse,
Reason: "Failed",
}},
},
PipelineRunStatusFields: v1.PipelineRunStatusFields{
StartTime: &startTime,
CompletionTime: &completionTime,
},
},
},
expectedDurationTags: map[string]string{
"pipeline": "pipeline-1",
"pipelinerun": "pipelinerun-1",
"namespace": "ns",
"status": "failed",
},
expectedCountTags: map[string]string{
"status": "failed",
"reason": "Failed",
},
expectedDuration: 60,
expectedCount: 1,
beforeCondition: nil,
countWithReason: true,
}, {
name: "for cancelled pipeline with reason",
pipelineRun: &v1.PipelineRun{
ObjectMeta: metav1.ObjectMeta{Name: "pipelinerun-1", Namespace: "ns"},
Spec: v1.PipelineRunSpec{
PipelineRef: &v1.PipelineRef{Name: "pipeline-1"},
},
Status: v1.PipelineRunStatus{
Status: duckv1.Status{
Conditions: duckv1.Conditions{{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionFalse,
Reason: ReasonCancelled.String(),
}},
},
PipelineRunStatusFields: v1.PipelineRunStatusFields{
StartTime: &startTime,
CompletionTime: &completionTime,
},
},
},
expectedDurationTags: map[string]string{
"pipeline": "pipeline-1",
"pipelinerun": "pipelinerun-1",
"namespace": "ns",
"status": "cancelled",
},
expectedCountTags: map[string]string{
"status": "cancelled",
"reason": ReasonCancelled.String(),
},
expectedDuration: 60,
expectedCount: 1,
beforeCondition: nil,
countWithReason: true,
}} {
t.Run(test.name, func(t *testing.T) {
unregisterMetrics()

ctx := getConfigContext()
ctx := getConfigContext(test.countWithReason)
metrics, err := NewRecorder(ctx)
if err != nil {
t.Fatalf("NewRecorder: %v", err)
Expand Down Expand Up @@ -363,7 +441,7 @@ func TestRecordRunningPipelineRunsCount(t *testing.T) {
}
}

ctx = getConfigContext()
ctx = getConfigContext(false)
metrics, err := NewRecorder(ctx)
if err != nil {
t.Fatalf("NewRecorder: %v", err)
Expand Down Expand Up @@ -443,7 +521,7 @@ func TestRecordRunningPipelineRunsResolutionWaitCounts(t *testing.T) {
}
}

ctx = getConfigContext()
ctx = getConfigContext(false)
metrics, err := NewRecorder(ctx)
if err != nil {
t.Fatalf("NewRecorder: %v", err)
Expand Down
Loading
Loading