From 587cb2bc6b7783695601c51325023737b197306f Mon Sep 17 00:00:00 2001 From: Nan Yu Date: Wed, 4 Oct 2023 22:43:15 +0000 Subject: [PATCH] Update otelcontribcol image from 0.54.0 to 0.86.0 (#916) The currently used version 0.54.0 has two fixable vulnerabilities, and fixing them requires updating jaeger to 1.47.0. The opentelemetry-collector-contrib versions earlier than 0.85.0 use jaeger 1.41.0 or older. Hence, this commit updates the image to the latest version, which unfortunately introduces breaking changes. This commit includes the following changes: - removed `--feature-gates=-exporter.googlecloud.OTLPDirect` because the feature gate "exporter.googlecloud.OTLPDirect" is stable and cannot be disabled. - removed retry_on_failure because it was from the googlecloud exporter. The exporter itself handles retries, and retrying can cause issues. (#57233) - added `--feature-gates=-pkg.translator.prometheus.NormalizeName` (the leading `-` disables the gate) to turn off metric name normalization, because with the gate enabled the prometheus translator appends `_ratio` to gauge metrics. link: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 - updated the expected error message because otel-collector no longer has the message `failed to export time series to GCM`. See the old log[1] and the new log[2]. [1] old log: ``` 2023-10-04T06:27:49.166Z error exporterhelper/queued_retry.go:149 Exporting failed.
Try enabling retry_on_failure config option to retry on retryable errors {"kind": "exporter", "data_type": "metrics", "name": "googlecloud/kubernetes", "error": "failed to export time series to GCM: rpc error: code = InvalidArgument desc = One or more TimeSeries could not be written: Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_name], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_name], [configsync_sync_namespace], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_namespace], [commit], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_kind], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_kind], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_name], [configsync_sync_namespace], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_name], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_name], [configsync_sync_namespace]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_name], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_kind], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_namespace], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_name], [configsync_sync_namespace], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_namespace], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_kind], [configsync_sync_name], [commit]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_name], [configsync_sync_namespace], [commit]; Unrecognized metric labels: [configsync_sync_name], [configsync_sync_namespace], [configsync_sync_kind]; 
Unrecognized metric labels: [configsync_sync_name], [configsync_sync_kind], [configsync_sync_namespace]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_name], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_namespace], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_name], [commit], [configsync_sync_namespace], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_name], [configsync_sync_kind], [commit]; Unrecognized metric labels: [configsync_sync_name], [configsync_sync_namespace], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_name], [configsync_sync_namespace], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_namespace], [configsync_sync_name]; Unrecognized metric labels: [commit], [configsync_sync_name], [configsync_sync_namespace], [configsync_sync_kind]\nerror details: name = Unknown desc = total_point_count:25 errors:{status:{code:3} point_count:25}"} ``` [2] new log: ``` 2023-10-04T06:36:52.554Z warn batchprocessor@v0.86.0/batch_processor.go:258 Sender failed {"kind": "processor", "name": "batch", "pipeline": "metrics/kubernetes", "error": "rpc error: code = InvalidArgument desc = One or more TimeSeries could not be written: Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_namespace], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_kind], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_name], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_name], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_name], [configsync_sync_namespace]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_namespace], [configsync_sync_name]; Unrecognized 
metric labels: [configsync_sync_namespace], [configsync_sync_kind], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_namespace], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_name], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_kind], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_namespace], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_namespace], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_name], [configsync_sync_namespace]; Unrecognized metric labels: [configsync_sync_kind], [configsync_sync_namespace], [commit], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_name], [configsync_sync_kind], [configsync_sync_namespace]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_name], [configsync_sync_kind]; Unrecognized metric labels: [configsync_sync_kind], [commit], [configsync_sync_namespace], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_kind], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_name], [configsync_sync_kind], [configsync_sync_namespace]; Unrecognized metric labels: [commit], [configsync_sync_name], [configsync_sync_kind], [configsync_sync_namespace]; Unrecognized metric labels: [configsync_sync_kind], [commit], [configsync_sync_namespace], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_name], [configsync_sync_kind], [configsync_sync_namespace]; Unrecognized metric labels: [configsync_sync_namespace], [configsync_sync_kind], [commit], [configsync_sync_name]; Unrecognized metric labels: [configsync_sync_name], [configsync_sync_kind], [commit], [configsync_sync_namespace]; Unrecognized metric labels: [configsync_sync_namespace], 
[configsync_sync_name], [configsync_sync_kind]\nerror details: name = Unknown desc = total_point_count:25 errors:{status:{code:3} point_count:25}"} ``` --- e2e/testcases/otel_collector_test.go | 2 +- .../otel-collector/otel-cm-monarch-rejected-labels.yaml | 4 ---- manifests/templates/otel-collector.yaml | 5 ++++- manifests/templates/reconciler-manager-configmap.yaml | 2 +- manifests/templates/reconciler-manager.yaml | 8 ++++---- manifests/third_party/resourcegroup-manifest.yaml | 2 +- pkg/metrics/otel.go | 4 ---- pkg/reconcilermanager/controllers/otel_controller_test.go | 2 +- 8 files changed, 12 insertions(+), 17 deletions(-) diff --git a/e2e/testcases/otel_collector_test.go b/e2e/testcases/otel_collector_test.go index 47d91e97de..c30f933335 100644 --- a/e2e/testcases/otel_collector_test.go +++ b/e2e/testcases/otel_collector_test.go @@ -47,7 +47,7 @@ import ( const ( DefaultMonitorKSA = "default" MonitorGSA = "e2e-test-metric-writer" - GCMExportErrorCaption = "failed to export time series to GCM" + GCMExportErrorCaption = "One or more TimeSeries could not be written" GCMMetricPrefix = "custom.googleapis.com/opencensus/config_sync" ) diff --git a/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml b/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml index d32d7525a1..1a559cd65e 100644 --- a/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml +++ b/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml @@ -51,8 +51,6 @@ data: - prefix: "host.name" - prefix: "k8s.deployment.name" - prefix: "k8s.node.name" - retry_on_failure: - enabled: false sending_queue: enabled: false googlecloud/kubernetes: @@ -72,8 +70,6 @@ data: # will not break this ingestion pipeline create_service_timeseries: true service_resource_labels: false - retry_on_failure: - enabled: false sending_queue: enabled: false processors: diff --git a/manifests/templates/otel-collector.yaml b/manifests/templates/otel-collector.yaml index 
89e9533931..6f27ee896e 100644 --- a/manifests/templates/otel-collector.yaml +++ b/manifests/templates/otel-collector.yaml @@ -130,11 +130,14 @@ spec: spec: containers: - name: otel-collector - image: gcr.io/config-management-release/otelcontribcol:v0.54.0-gke.1 + image: gcr.io/config-management-release/otelcontribcol:v0.86.0-gke.1 command: - /otelcontribcol args: - "--config=/conf/otel-collector-config.yaml" + # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 + # Add the feature gate to enable metric suffix trimming. + - "--feature-gates=-pkg.translator.prometheus.NormalizeName" resources: limits: cpu: 1 diff --git a/manifests/templates/reconciler-manager-configmap.yaml b/manifests/templates/reconciler-manager-configmap.yaml index e7e869e736..514ecff62c 100644 --- a/manifests/templates/reconciler-manager-configmap.yaml +++ b/manifests/templates/reconciler-manager-configmap.yaml @@ -181,7 +181,7 @@ data: cpu: "50m" memory: "200Mi" - name: otel-agent - image: gcr.io/config-management-release/otelcontribcol:v0.54.0-gke.1 + image: gcr.io/config-management-release/otelcontribcol:v0.86.0-gke.1 command: - /otelcontribcol args: diff --git a/manifests/templates/reconciler-manager.yaml b/manifests/templates/reconciler-manager.yaml index fbb74636cf..35827e0164 100644 --- a/manifests/templates/reconciler-manager.yaml +++ b/manifests/templates/reconciler-manager.yaml @@ -60,14 +60,14 @@ spec: name: reconciler-manager optional: true # Currently nothing mandatory in the ConfigMap - name: otel-agent - image: gcr.io/config-management-release/otelcontribcol:v0.54.0-gke.1 + image: gcr.io/config-management-release/otelcontribcol:v0.86.0-gke.1 command: - /otelcontribcol args: - "--config=/conf/otel-agent-config.yaml" - # TODO: Remove this feature gate when opentelemetry semantic conventions are used - # in the collector code. 
- - "--feature-gates=-exporter.googlecloud.OTLPDirect" + # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 + # Add the feature gate to enable metric suffix trimming. + - "--feature-gates=-pkg.translator.prometheus.NormalizeName" resources: limits: cpu: 1 diff --git a/manifests/third_party/resourcegroup-manifest.yaml b/manifests/third_party/resourcegroup-manifest.yaml index 8543b3f928..fd3a29630f 100644 --- a/manifests/third_party/resourcegroup-manifest.yaml +++ b/manifests/third_party/resourcegroup-manifest.yaml @@ -555,7 +555,7 @@ spec: fieldPath: metadata.labels['configsync.gke.io/deployment-name'] - name: OTEL_RESOURCE_ATTRIBUTES value: k8s.pod.name=$(KUBE_POD_NAME),k8s.pod.namespace=$(KUBE_POD_NAMESPACE),k8s.pod.uid=$(KUBE_POD_UID),k8s.pod.ip=$(KUBE_POD_IP),k8s.node.name=$(KUBE_NODE_NAME),k8s.deployment.name=$(KUBE_DEPLOYMENT_NAME) - image: gcr.io/config-management-release/otelcontribcol:v0.54.0-gke.1 + image: gcr.io/config-management-release/otelcontribcol:v0.86.0-gke.1 name: otel-agent ports: - containerPort: 55678 diff --git a/pkg/metrics/otel.go b/pkg/metrics/otel.go index 6877c194cd..155c961f23 100644 --- a/pkg/metrics/otel.go +++ b/pkg/metrics/otel.go @@ -67,8 +67,6 @@ exporters: - prefix: "host.name" - prefix: "k8s.deployment.name" - prefix: "k8s.node.name" - retry_on_failure: - enabled: false sending_queue: enabled: false googlecloud/kubernetes: @@ -88,8 +86,6 @@ exporters: # will not break this ingestion pipeline create_service_timeseries: true service_resource_labels: false - retry_on_failure: - enabled: false sending_queue: enabled: false processors: diff --git a/pkg/reconcilermanager/controllers/otel_controller_test.go b/pkg/reconcilermanager/controllers/otel_controller_test.go index 056c524137..26ddfee82a 100644 --- a/pkg/reconcilermanager/controllers/otel_controller_test.go +++ 
b/pkg/reconcilermanager/controllers/otel_controller_test.go @@ -46,7 +46,7 @@ const ( // otel-collector ConfigMap. // See `CollectorConfigGooglecloud` in `pkg/metrics/otel.go` // Used by TestOtelReconcilerGooglecloud. - depAnnotationGooglecloud = "de02e5c1da70cff63a1cfb565141899a" + depAnnotationGooglecloud = "017f802612f7bbbac7a90fc8d64ce746" // depAnnotationGooglecloud is the expected hash of the custom // otel-collector ConfigMap test artifact. // Used by TestOtelReconcilerCustom.