From a6fdb9e120aefa7b26286eb60d0bddff6c58b7f2 Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Fri, 16 Aug 2024 19:32:47 +0000 Subject: [PATCH 1/8] Update otelcontribcol to 0.106.0-gke.2 This change updates otelcontribcol to the latest release and adapts the manifests to the breaking changes from 0.104.0 and 0.106.0. Breaking change in 0.104.0 https://github.com/open-telemetry/opentelemetry-collector-contrib/releases/tag/v0.104.0 Breaking change in 0.106.0 https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 - Localhost is now the default bind host, while otel-agent and otel-collector require 0.0.0.0, so the `component.UseLocalHostAsDefaultHost` feature gate has been disabled. - The format of the environment variable references was updated to meet the new syntax requirements (`${VAR}` instead of `$VAR`). The otel-agent ConfigMap was split between the reconciler and controllers, ensuring that sync-related labels are only applied to reconcilers. - A `no_op_label` has been added to ensure that the aggregation in the metricstransform processor filters on all metric labels. This is a temporary workaround until a permanent fix is implemented upstream. 
--- Makefile | 2 +- manifests/base/kustomization.yaml | 1 + manifests/otel-agent-cm.yaml | 20 +----- manifests/otel-agent-reconciler-cm.yaml | 69 +++++++++++++++++++ manifests/templates/otel-collector.yaml | 1 + .../reconciler-manager-configmap.yaml | 9 +-- manifests/templates/reconciler-manager.yaml | 1 + .../templates/resourcegroup-manifest.yaml | 1 + pkg/metrics/otel.go | 20 ++++-- .../controllers/otel_controller_test.go | 2 +- 10 files changed, 96 insertions(+), 30 deletions(-) create mode 100644 manifests/otel-agent-reconciler-cm.yaml diff --git a/Makefile b/Makefile index f506b4c4ed..b3395e79b0 100644 --- a/Makefile +++ b/Makefile @@ -89,7 +89,7 @@ COSIGN := $(BIN_DIR)/cosign GIT_SYNC_VERSION := v4.3.0-gke.4__linux_amd64 GIT_SYNC_IMAGE_NAME := gcr.io/config-management-release/git-sync:$(GIT_SYNC_VERSION) -OTELCONTRIBCOL_VERSION := v0.103.0-gke.6 +OTELCONTRIBCOL_VERSION := v0.106.0-gke.2 OTELCONTRIBCOL_IMAGE_NAME := gcr.io/config-management-release/otelcontribcol:$(OTELCONTRIBCOL_VERSION) # Directory used for staging Docker contexts. diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index d467cc33c8..553cd9a61a 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -23,6 +23,7 @@ resources: - ../ns-reconciler-base-cluster-role.yaml - ../root-reconciler-base-cluster-role.yaml - ../otel-agent-cm.yaml +- ../otel-agent-reconciler-cm.yaml - ../reconciler-manager-service-account.yaml - ../reposync-crd.yaml - ../rootsync-crd.yaml diff --git a/manifests/otel-agent-cm.yaml b/manifests/otel-agent-cm.yaml index db8093e3ae..b1173ffe6b 100644 --- a/manifests/otel-agent-cm.yaml +++ b/manifests/otel-agent-cm.yaml @@ -32,24 +32,6 @@ data: tls: insecure: true processors: - # Attributes processor adds custom configsync metric labels to applicable - # metrics to identify the sync object used to configure this deployment. 
- # - # Note: configsync.sync.generation is explicitly excluded here, because it - # is high cardinality. So we don't want to send it as a label, only as a - # resource attribute. That way it's only propagated to Prometheus, and not - # Monarch or Cloud Monitoring, which ignore custom resource attributes. - attributes: - actions: - - key: configsync.sync.kind - action: upsert - value: $CONFIGSYNC_SYNC_KIND - - key: configsync.sync.name - action: upsert - value: $CONFIGSYNC_SYNC_NAME - - key: configsync.sync.namespace - action: upsert - value: $CONFIGSYNC_SYNC_NAMESPACE batch: # Populate resource attributes from OTEL_RESOURCE_ATTRIBUTES env var and # the GCE metadata service, if available. @@ -62,7 +44,7 @@ data: pipelines: metrics: receivers: [opencensus] - processors: [batch, resourcedetection, attributes] + processors: [batch, resourcedetection] exporters: [opencensus] telemetry: logs: diff --git a/manifests/otel-agent-reconciler-cm.yaml b/manifests/otel-agent-reconciler-cm.yaml new file mode 100644 index 0000000000..d8b437e85c --- /dev/null +++ b/manifests/otel-agent-reconciler-cm.yaml @@ -0,0 +1,69 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-agent-reconciler + namespace: config-management-system + labels: + app: opentelemetry + component: otel-agent + configmanagement.gke.io/system: "true" + configmanagement.gke.io/arch: "csmr" +data: + otel-agent-reconciler-config.yaml: | + receivers: + opencensus: + exporters: + opencensus: + endpoint: otel-collector.config-management-monitoring:55678 + tls: + insecure: true + processors: + # Attributes processor adds custom configsync metric labels to applicable + # metrics to identify the sync object used to configure this deployment. + # + # Note: configsync.sync.generation is explicitly excluded here, because it + # is high cardinality. So we don't want to send it as a label, only as a + # resource attribute. That way it's only propagated to Prometheus, and not + # Monarch or Cloud Monitoring, which ignore custom resource attributes. + attributes: + actions: + - key: configsync.sync.kind + action: upsert + value: ${CONFIGSYNC_SYNC_KIND} + - key: configsync.sync.name + action: upsert + value: ${CONFIGSYNC_SYNC_NAME} + - key: configsync.sync.namespace + action: upsert + value: ${CONFIGSYNC_SYNC_NAMESPACE} + batch: + # Populate resource attributes from OTEL_RESOURCE_ATTRIBUTES env var and + # the GCE metadata service, if available. 
+ resourcedetection: + detectors: [env, gcp] + extensions: + health_check: + service: + extensions: [health_check] + pipelines: + metrics: + receivers: [opencensus] + processors: [batch, resourcedetection, attributes] + exporters: [opencensus] + telemetry: + logs: + level: "INFO" diff --git a/manifests/templates/otel-collector.yaml b/manifests/templates/otel-collector.yaml index dfdcf6a830..933fcbe84f 100644 --- a/manifests/templates/otel-collector.yaml +++ b/manifests/templates/otel-collector.yaml @@ -101,6 +101,7 @@ spec: # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 # Add the feature gate to enable metric suffix trimming. - "--feature-gates=-pkg.translator.prometheus.NormalizeName" + - "--feature-gates=-component.UseLocalHostAsDefaultHost" resources: limits: cpu: 1 diff --git a/manifests/templates/reconciler-manager-configmap.yaml b/manifests/templates/reconciler-manager-configmap.yaml index eb7589fe73..7ae8da2982 100644 --- a/manifests/templates/reconciler-manager-configmap.yaml +++ b/manifests/templates/reconciler-manager-configmap.yaml @@ -175,10 +175,11 @@ data: command: - /otelcontribcol args: - - "--config=/conf/otel-agent-config.yaml" + - "--config=/conf/otel-agent-reconciler-config.yaml" # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 # Add the feature gate to enable metric suffix trimming. - "--feature-gates=-pkg.translator.prometheus.NormalizeName" + - "--feature-gates=-component.UseLocalHostAsDefaultHost" securityContext: allowPrivilegeEscalation: false readOnlyRootFilesystem: true @@ -191,7 +192,7 @@ data: - containerPort: 8888 # Metrics. 
protocol: TCP volumeMounts: - - name: otel-agent-config-vol + - name: otel-agent-config-reconciler-vol mountPath: /conf readinessProbe: httpGet: @@ -282,9 +283,9 @@ data: secret: secretName: git-creds defaultMode: 288 - - name: otel-agent-config-vol + - name: otel-agent-config-reconciler-vol configMap: - name: otel-agent + name: otel-agent-reconciler defaultMode: 420 - name: service-account emptyDir: {} diff --git a/manifests/templates/reconciler-manager.yaml b/manifests/templates/reconciler-manager.yaml index d280ae2401..352eaab34b 100644 --- a/manifests/templates/reconciler-manager.yaml +++ b/manifests/templates/reconciler-manager.yaml @@ -71,6 +71,7 @@ spec: # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 # Add the feature gate to enable metric suffix trimming. - "--feature-gates=-pkg.translator.prometheus.NormalizeName" + - "--feature-gates=-component.UseLocalHostAsDefaultHost" resources: limits: cpu: 1 diff --git a/manifests/templates/resourcegroup-manifest.yaml b/manifests/templates/resourcegroup-manifest.yaml index c4c34301e7..67e074f38c 100644 --- a/manifests/templates/resourcegroup-manifest.yaml +++ b/manifests/templates/resourcegroup-manifest.yaml @@ -232,6 +232,7 @@ spec: - args: - --config=/conf/otel-agent-config.yaml - --feature-gates=-pkg.translator.prometheus.NormalizeName + - --feature-gates=-component.UseLocalHostAsDefaultHost command: - /otelcontribcol env: diff --git a/pkg/metrics/otel.go b/pkg/metrics/otel.go index a3808457e7..5b75eb06ca 100644 --- a/pkg/metrics/otel.go +++ b/pkg/metrics/otel.go @@ -184,7 +184,9 @@ processors: new_name: current_declared_resources operations: - action: aggregate_labels - label_set: [] + # Using a no_op_label to get around issue in the upstream + # https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 + label_set: [no_op_label] aggregation_type: 
max - include: kcc_resource_count action: update @@ -255,14 +257,18 @@ processors: new_name: resource_conflicts_count operations: - action: aggregate_labels - label_set: [] + # Using a no_op_label to get around issue in the upstream + # https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 + label_set: [no_op_label] aggregation_type: max - include: internal_errors_total action: update new_name: internal_errors_count operations: - action: aggregate_labels - label_set: [] + # Using a no_op_label to get around issue in the upstream + # https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 + label_set: [no_op_label] aggregation_type: max - include: remediate_duration_seconds action: update @@ -322,13 +328,17 @@ processors: action: update operations: - action: aggregate_labels - label_set: [] + # Using a no_op_label to get around issue in the upstream + # https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 + label_set: [no_op_label] aggregation_type: max - include: kustomize_build_latency action: update operations: - action: aggregate_labels - label_set: [] + # Using a no_op_label to get around issue in the upstream + # https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 + label_set: [no_op_label] aggregation_type: max extensions: health_check: diff --git a/pkg/reconcilermanager/controllers/otel_controller_test.go b/pkg/reconcilermanager/controllers/otel_controller_test.go index 649035a7ed..0d52054bf0 100644 --- a/pkg/reconcilermanager/controllers/otel_controller_test.go +++ b/pkg/reconcilermanager/controllers/otel_controller_test.go @@ -49,7 +49,7 @@ const ( // otel-collector ConfigMap. // See `CollectorConfigGooglecloud` in `pkg/metrics/otel.go` // Used by TestOtelReconcilerGooglecloud. 
- depAnnotationGooglecloud = "c2f6078a9afe1f32721173e9e15bbab5" + depAnnotationGooglecloud = "bfa02552b80a227256e825c807254b40" // depAnnotationGooglecloud is the expected hash of the custom // otel-collector ConfigMap test artifact. // Used by TestOtelReconcilerCustom. From 317fa6fe1d8b68777656a52a8d00f1a6a075cfad Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Thu, 12 Dec 2024 20:35:20 +0000 Subject: [PATCH 2/8] Change otelcontribcol to v0.115.0-gke.0 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b3395e79b0..0eb0a3e665 100644 --- a/Makefile +++ b/Makefile @@ -89,7 +89,7 @@ COSIGN := $(BIN_DIR)/cosign GIT_SYNC_VERSION := v4.3.0-gke.4__linux_amd64 GIT_SYNC_IMAGE_NAME := gcr.io/config-management-release/git-sync:$(GIT_SYNC_VERSION) -OTELCONTRIBCOL_VERSION := v0.106.0-gke.2 +OTELCONTRIBCOL_VERSION := v0.115.0-gke.0 OTELCONTRIBCOL_IMAGE_NAME := gcr.io/config-management-release/otelcontribcol:$(OTELCONTRIBCOL_VERSION) # Directory used for staging Docker contexts. From 240e3ccc085fa51a20353e487d6686900025c981 Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Thu, 12 Dec 2024 20:36:52 +0000 Subject: [PATCH 3/8] Remove deprecated feature gate UseLocalHostAsDefaultHost Also specifying host 0.0.0.0 instead of the new default localhost. 
--- e2e/testdata/otel-collector/otel-cm-full-gcm.yaml | 1 + .../otel-collector/otel-cm-kustomize-rejected-labels.yaml | 1 + .../otel-collector/otel-cm-monarch-rejected-labels.yaml | 1 + manifests/otel-agent-cm.yaml | 1 + manifests/otel-agent-reconciler-cm.yaml | 1 + manifests/templates/otel-collector.yaml | 5 ++++- manifests/templates/reconciler-manager-configmap.yaml | 1 - manifests/templates/reconciler-manager.yaml | 1 - manifests/templates/resourcegroup-manifest.yaml | 2 +- pkg/metrics/otel.go | 1 + 10 files changed, 11 insertions(+), 4 deletions(-) diff --git a/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml b/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml index b7e46267e2..79bc3f9726 100644 --- a/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml +++ b/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml @@ -17,6 +17,7 @@ data: otel-collector-config.yaml: |- receivers: opencensus: + endpoint: 0.0.0.0:55678 exporters: prometheus: endpoint: :8675 diff --git a/e2e/testdata/otel-collector/otel-cm-kustomize-rejected-labels.yaml b/e2e/testdata/otel-collector/otel-cm-kustomize-rejected-labels.yaml index 1a71b3f4dc..116688bb49 100644 --- a/e2e/testdata/otel-collector/otel-cm-kustomize-rejected-labels.yaml +++ b/e2e/testdata/otel-collector/otel-cm-kustomize-rejected-labels.yaml @@ -21,6 +21,7 @@ data: otel-collector-config.yaml: |- receivers: opencensus: + endpoint: 0.0.0.0:55678 exporters: prometheus: endpoint: :8675 diff --git a/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml b/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml index 1a559cd65e..58e19dafa5 100644 --- a/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml +++ b/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml @@ -21,6 +21,7 @@ data: otel-collector-config.yaml: |- receivers: opencensus: + endpoint: 0.0.0.0:55678 exporters: prometheus: endpoint: :8675 diff --git a/manifests/otel-agent-cm.yaml b/manifests/otel-agent-cm.yaml index 
b1173ffe6b..94f952de79 100644 --- a/manifests/otel-agent-cm.yaml +++ b/manifests/otel-agent-cm.yaml @@ -26,6 +26,7 @@ data: otel-agent-config.yaml: | receivers: opencensus: + endpoint: 0.0.0.0:55678 exporters: opencensus: endpoint: otel-collector.config-management-monitoring:55678 diff --git a/manifests/otel-agent-reconciler-cm.yaml b/manifests/otel-agent-reconciler-cm.yaml index d8b437e85c..8655e67bde 100644 --- a/manifests/otel-agent-reconciler-cm.yaml +++ b/manifests/otel-agent-reconciler-cm.yaml @@ -26,6 +26,7 @@ data: otel-agent-reconciler-config.yaml: | receivers: opencensus: + endpoint: 0.0.0.0:55678 exporters: opencensus: endpoint: otel-collector.config-management-monitoring:55678 diff --git a/manifests/templates/otel-collector.yaml b/manifests/templates/otel-collector.yaml index 933fcbe84f..2e4c98d1bf 100644 --- a/manifests/templates/otel-collector.yaml +++ b/manifests/templates/otel-collector.yaml @@ -26,6 +26,7 @@ data: otel-collector-config.yaml: | receivers: opencensus: + endpoint: 0.0.0.0:55678 exporters: prometheus: endpoint: :8675 @@ -37,6 +38,9 @@ data: extensions: health_check: service: + telemetry: + metrics: + address: 0.0.0.0:55678 extensions: [health_check] pipelines: metrics: @@ -101,7 +105,6 @@ spec: # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 # Add the feature gate to enable metric suffix trimming. 
- "--feature-gates=-pkg.translator.prometheus.NormalizeName" - - "--feature-gates=-component.UseLocalHostAsDefaultHost" resources: limits: cpu: 1 diff --git a/manifests/templates/reconciler-manager-configmap.yaml b/manifests/templates/reconciler-manager-configmap.yaml index 7ae8da2982..4895e7ec37 100644 --- a/manifests/templates/reconciler-manager-configmap.yaml +++ b/manifests/templates/reconciler-manager-configmap.yaml @@ -179,7 +179,6 @@ data: # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 # Add the feature gate to enable metric suffix trimming. - "--feature-gates=-pkg.translator.prometheus.NormalizeName" - - "--feature-gates=-component.UseLocalHostAsDefaultHost" securityContext: allowPrivilegeEscalation: false readOnlyRootFilesystem: true diff --git a/manifests/templates/reconciler-manager.yaml b/manifests/templates/reconciler-manager.yaml index 352eaab34b..d280ae2401 100644 --- a/manifests/templates/reconciler-manager.yaml +++ b/manifests/templates/reconciler-manager.yaml @@ -71,7 +71,6 @@ spec: # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 # Add the feature gate to enable metric suffix trimming. 
- "--feature-gates=-pkg.translator.prometheus.NormalizeName" - - "--feature-gates=-component.UseLocalHostAsDefaultHost" resources: limits: cpu: 1 diff --git a/manifests/templates/resourcegroup-manifest.yaml b/manifests/templates/resourcegroup-manifest.yaml index 67e074f38c..2c619d2820 100644 --- a/manifests/templates/resourcegroup-manifest.yaml +++ b/manifests/templates/resourcegroup-manifest.yaml @@ -155,6 +155,7 @@ data: otel-agent-config.yaml: | receivers: opencensus: + endpoint: 0.0.0.0:55678 exporters: opencensus: endpoint: otel-collector.config-management-monitoring:55678 @@ -232,7 +233,6 @@ spec: - args: - --config=/conf/otel-agent-config.yaml - --feature-gates=-pkg.translator.prometheus.NormalizeName - - --feature-gates=-component.UseLocalHostAsDefaultHost command: - /otelcontribcol env: diff --git a/pkg/metrics/otel.go b/pkg/metrics/otel.go index 5b75eb06ca..bb62360c69 100644 --- a/pkg/metrics/otel.go +++ b/pkg/metrics/otel.go @@ -34,6 +34,7 @@ const ( // the googlecloud exporter. CollectorConfigGooglecloud = `receivers: opencensus: + endpoint: 0.0.0.0:55678 exporters: prometheus: endpoint: :8675 From d58128b55137b90623c5375ebe1743ef146f04e2 Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Mon, 16 Dec 2024 22:31:44 +0000 Subject: [PATCH 4/8] Explicitly bind to endpoint 0.0.0.0 For all receivers and health_check. Adding networkpolicy for ingress on essential ports. 
--- .../otel-collector/otel-cm-full-gcm.yaml | 2 +- manifests/otel-agent-cm.yaml | 1 + manifests/otel-agent-reconciler-cm.yaml | 1 + manifests/templates/otel-collector.yaml | 29 ++++++++++++++++--- manifests/templates/reconciler-manager.yaml | 20 +++++++++++++ pkg/metrics/otel.go | 3 +- 6 files changed, 50 insertions(+), 6 deletions(-) diff --git a/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml b/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml index 79bc3f9726..6f305a4898 100644 --- a/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml +++ b/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml @@ -20,7 +20,7 @@ data: endpoint: 0.0.0.0:55678 exporters: prometheus: - endpoint: :8675 + endpoint: 0.0.0.0:8675 namespace: config_sync resource_to_telemetry_conversion: enabled: true diff --git a/manifests/otel-agent-cm.yaml b/manifests/otel-agent-cm.yaml index 94f952de79..344ccdcab6 100644 --- a/manifests/otel-agent-cm.yaml +++ b/manifests/otel-agent-cm.yaml @@ -40,6 +40,7 @@ data: detectors: [env, gcp] extensions: health_check: + endpoint: "0.0.0.0:13133" service: extensions: [health_check] pipelines: diff --git a/manifests/otel-agent-reconciler-cm.yaml b/manifests/otel-agent-reconciler-cm.yaml index 8655e67bde..8e2638ff3e 100644 --- a/manifests/otel-agent-reconciler-cm.yaml +++ b/manifests/otel-agent-reconciler-cm.yaml @@ -58,6 +58,7 @@ data: detectors: [env, gcp] extensions: health_check: + endpoint: 0.0.0.0:13133 service: extensions: [health_check] pipelines: diff --git a/manifests/templates/otel-collector.yaml b/manifests/templates/otel-collector.yaml index 2e4c98d1bf..0bf5fe2ba3 100644 --- a/manifests/templates/otel-collector.yaml +++ b/manifests/templates/otel-collector.yaml @@ -29,7 +29,7 @@ data: endpoint: 0.0.0.0:55678 exporters: prometheus: - endpoint: :8675 + endpoint: 0.0.0.0:8675 namespace: config_sync resource_to_telemetry_conversion: enabled: true @@ -37,10 +37,8 @@ data: batch: extensions: health_check: + endpoint: "0.0.0.0:13133" service: - telemetry: - 
metrics: - address: 0.0.0.0:55678 extensions: [health_check] pipelines: metrics: @@ -71,6 +69,29 @@ spec: - name: metrics # Prometheus exporter metrics. port: 8675 --- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-port-ingress + namespace: config-management-monitoring +spec: + podSelector: + matchLabels: + app: opentelemetry + component: otel-collector + ingress: + - from: + - namespaceSelector: {} + ports: + - protocol: TCP + port: 13133 + - protocol: TCP + port: 55678 + - protocol: TCP + port: 8675 + - protocol: TCP + port: 8888 +--- apiVersion: apps/v1 kind: Deployment metadata: diff --git a/manifests/templates/reconciler-manager.yaml b/manifests/templates/reconciler-manager.yaml index d280ae2401..a184fadd0e 100644 --- a/manifests/templates/reconciler-manager.yaml +++ b/manifests/templates/reconciler-manager.yaml @@ -150,3 +150,23 @@ spec: runAsNonRoot: true seccompProfile: type: RuntimeDefault +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-port-ingress + namespace: config-management-system +spec: + podSelector: + matchLabels: {} + ingress: + - from: + - namespaceSelector: {} + ports: + - protocol: TCP + port: 13133 + - protocol: TCP + port: 55678 + - protocol: TCP + port: 8888 + diff --git a/pkg/metrics/otel.go b/pkg/metrics/otel.go index bb62360c69..f1a0ff3391 100644 --- a/pkg/metrics/otel.go +++ b/pkg/metrics/otel.go @@ -37,7 +37,7 @@ const ( endpoint: 0.0.0.0:55678 exporters: prometheus: - endpoint: :8675 + endpoint: 0.0.0.0:8675 namespace: config_sync resource_to_telemetry_conversion: enabled: true @@ -343,6 +343,7 @@ processors: aggregation_type: max extensions: health_check: + endpoint: "0.0.0.0:13133" service: extensions: [health_check] pipelines: From a12bfe6fd90793732cd4123f234ec6af6e794b25 Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Mon, 16 Dec 2024 22:57:19 +0000 Subject: [PATCH 5/8] Apply change to resource-group-manifest --- manifests/otel-agent-cm.yaml | 2 +- 
.../templates/resourcegroup-manifest.yaml | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/manifests/otel-agent-cm.yaml b/manifests/otel-agent-cm.yaml index 344ccdcab6..f1d2261bc2 100644 --- a/manifests/otel-agent-cm.yaml +++ b/manifests/otel-agent-cm.yaml @@ -40,7 +40,7 @@ data: detectors: [env, gcp] extensions: health_check: - endpoint: "0.0.0.0:13133" + endpoint: 0.0.0.0:13133 service: extensions: [health_check] pipelines: diff --git a/manifests/templates/resourcegroup-manifest.yaml b/manifests/templates/resourcegroup-manifest.yaml index 2c619d2820..b044fcadd9 100644 --- a/manifests/templates/resourcegroup-manifest.yaml +++ b/manifests/templates/resourcegroup-manifest.yaml @@ -169,6 +169,7 @@ data: detectors: [env, gcp] extensions: health_check: + endpoint: 0.0.0.0:13133 service: extensions: [health_check] pipelines: @@ -302,3 +303,23 @@ spec: runAsNonRoot: true seccompProfile: type: RuntimeDefault +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-port-ingress + namespace: resource-group-system +spec: + podSelector: + matchLabels: {} + ingress: + - from: + - namespaceSelector: {} + ports: + - protocol: TCP + port: 13133 + - protocol: TCP + port: 55678 + - protocol: TCP + port: 8888 + From e21b59abf222530a41e96132f9f0f00619fe00d7 Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Tue, 17 Dec 2024 00:27:00 +0000 Subject: [PATCH 6/8] Only expose necessary ingress - port 13133: health_check for readiness check. Apply to otel-agent in reconciler-manager and reconcilers; Otel-collector. - port 55678: opencensus receiver for otel-collector to receive metric from other pods Fix test ConfigMap. 
--- e2e/testdata/otel-collector/otel-cm-full-gcm.yaml | 1 + .../otel-collector/otel-cm-kustomize-rejected-labels.yaml | 3 ++- .../otel-collector/otel-cm-monarch-rejected-labels.yaml | 3 ++- manifests/templates/otel-collector.yaml | 6 +----- manifests/templates/reconciler-manager.yaml | 5 ----- manifests/templates/resourcegroup-manifest.yaml | 4 ---- test/kustomization/expected.yaml | 3 +++ 7 files changed, 9 insertions(+), 16 deletions(-) diff --git a/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml b/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml index 6f305a4898..3c4efab44e 100644 --- a/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml +++ b/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml @@ -304,6 +304,7 @@ data: aggregation_type: max extensions: health_check: + endpoint: 0.0.0.0:13133 service: extensions: [health_check] pipelines: diff --git a/e2e/testdata/otel-collector/otel-cm-kustomize-rejected-labels.yaml b/e2e/testdata/otel-collector/otel-cm-kustomize-rejected-labels.yaml index 116688bb49..2f4f7b62d2 100644 --- a/e2e/testdata/otel-collector/otel-cm-kustomize-rejected-labels.yaml +++ b/e2e/testdata/otel-collector/otel-cm-kustomize-rejected-labels.yaml @@ -24,7 +24,7 @@ data: endpoint: 0.0.0.0:55678 exporters: prometheus: - endpoint: :8675 + endpoint: 0.0.0.0:8675 namespace: config_sync resource_to_telemetry_conversion: enabled: true @@ -259,6 +259,7 @@ data: aggregation_type: max extensions: health_check: + endpoint: 0.0.0.0:13133 service: extensions: [health_check] pipelines: diff --git a/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml b/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml index 58e19dafa5..63157e3569 100644 --- a/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml +++ b/e2e/testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml @@ -24,7 +24,7 @@ data: endpoint: 0.0.0.0:55678 exporters: prometheus: - endpoint: :8675 + endpoint: 0.0.0.0:8675 namespace: config_sync 
resource_to_telemetry_conversion: enabled: true @@ -168,6 +168,7 @@ data: new_name: no_ssl_verify_count extensions: health_check: + endpoint: 0.0.0.0:13133 service: extensions: [health_check] pipelines: diff --git a/manifests/templates/otel-collector.yaml b/manifests/templates/otel-collector.yaml index 0bf5fe2ba3..c79aab3ac1 100644 --- a/manifests/templates/otel-collector.yaml +++ b/manifests/templates/otel-collector.yaml @@ -37,7 +37,7 @@ data: batch: extensions: health_check: - endpoint: "0.0.0.0:13133" + endpoint: 0.0.0.0:13133 service: extensions: [health_check] pipelines: @@ -87,10 +87,6 @@ spec: port: 13133 - protocol: TCP port: 55678 - - protocol: TCP - port: 8675 - - protocol: TCP - port: 8888 --- apiVersion: apps/v1 kind: Deployment diff --git a/manifests/templates/reconciler-manager.yaml b/manifests/templates/reconciler-manager.yaml index a184fadd0e..d12543382b 100644 --- a/manifests/templates/reconciler-manager.yaml +++ b/manifests/templates/reconciler-manager.yaml @@ -165,8 +165,3 @@ spec: ports: - protocol: TCP port: 13133 - - protocol: TCP - port: 55678 - - protocol: TCP - port: 8888 - diff --git a/manifests/templates/resourcegroup-manifest.yaml b/manifests/templates/resourcegroup-manifest.yaml index b044fcadd9..ac09a118fa 100644 --- a/manifests/templates/resourcegroup-manifest.yaml +++ b/manifests/templates/resourcegroup-manifest.yaml @@ -318,8 +318,4 @@ spec: ports: - protocol: TCP port: 13133 - - protocol: TCP - port: 55678 - - protocol: TCP - port: 8888 diff --git a/test/kustomization/expected.yaml b/test/kustomization/expected.yaml index b8f4cbe27d..df1f6203eb 100644 --- a/test/kustomization/expected.yaml +++ b/test/kustomization/expected.yaml @@ -5628,6 +5628,7 @@ data: batch: extensions: health_check: + endpoint: 0.0.0.0:13133 service: extensions: [health_check] pipelines: @@ -5681,6 +5682,7 @@ data: detectors: [env, gcp] extensions: health_check: + endpoint: 0.0.0.0:13133 service: extensions: [health_check] pipelines: @@ -6003,6 +6005,7 @@ 
data: detectors: [env, gcp] extensions: health_check: + endpoint: 0.0.0.0:13133 service: extensions: [health_check] pipelines: From d98f4b4e70b785b1f6eb0e200e61c2b0f1a53fa4 Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Tue, 17 Dec 2024 01:09:05 +0000 Subject: [PATCH 7/8] Cleanup Only keep networkpolicy for otel-collector --- manifests/templates/otel-collector.yaml | 5 +++-- .../templates/reconciler-manager-configmap.yaml | 2 ++ manifests/templates/reconciler-manager.yaml | 16 +--------------- manifests/templates/resourcegroup-manifest.yaml | 16 +--------------- 4 files changed, 7 insertions(+), 32 deletions(-) diff --git a/manifests/templates/otel-collector.yaml b/manifests/templates/otel-collector.yaml index c79aab3ac1..4feddb86af 100644 --- a/manifests/templates/otel-collector.yaml +++ b/manifests/templates/otel-collector.yaml @@ -68,6 +68,8 @@ spec: port: 8888 - name: metrics # Prometheus exporter metrics. port: 8675 + - name: health-check + port: 13133 --- apiVersion: networking.k8s.io/v1 kind: NetworkPolicy @@ -83,8 +85,6 @@ spec: - from: - namespaceSelector: {} ports: - - protocol: TCP - port: 13133 - protocol: TCP port: 55678 --- @@ -133,6 +133,7 @@ spec: - containerPort: 55678 # Default endpoint for OpenCensus receiver. - containerPort: 8888 # Default endpoint for querying metrics. - containerPort: 8675 # Prometheus exporter metrics. + - containerPort: 13133 # Health check securityContext: allowPrivilegeEscalation: false readOnlyRootFilesystem: true diff --git a/manifests/templates/reconciler-manager-configmap.yaml b/manifests/templates/reconciler-manager-configmap.yaml index 4895e7ec37..92882fa746 100644 --- a/manifests/templates/reconciler-manager-configmap.yaml +++ b/manifests/templates/reconciler-manager-configmap.yaml @@ -190,6 +190,8 @@ data: protocol: TCP - containerPort: 8888 # Metrics. 
protocol: TCP + - containerPort: 13133 # Health check + protocol: TCP volumeMounts: - name: otel-agent-config-reconciler-vol mountPath: /conf diff --git a/manifests/templates/reconciler-manager.yaml b/manifests/templates/reconciler-manager.yaml index d12543382b..ecde8f5c75 100644 --- a/manifests/templates/reconciler-manager.yaml +++ b/manifests/templates/reconciler-manager.yaml @@ -81,6 +81,7 @@ spec: ports: - containerPort: 55678 # Default OpenCensus receiver port. - containerPort: 8888 # Metrics. + - containerPort: 13133 # Health check securityContext: allowPrivilegeEscalation: false readOnlyRootFilesystem: true @@ -150,18 +151,3 @@ spec: runAsNonRoot: true seccompProfile: type: RuntimeDefault ---- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: allow-port-ingress - namespace: config-management-system -spec: - podSelector: - matchLabels: {} - ingress: - - from: - - namespaceSelector: {} - ports: - - protocol: TCP - port: 13133 diff --git a/manifests/templates/resourcegroup-manifest.yaml b/manifests/templates/resourcegroup-manifest.yaml index ac09a118fa..49c95f66ba 100644 --- a/manifests/templates/resourcegroup-manifest.yaml +++ b/manifests/templates/resourcegroup-manifest.yaml @@ -274,6 +274,7 @@ spec: ports: - containerPort: 55678 - containerPort: 8888 + - containerPort: 13133 readinessProbe: httpGet: path: / @@ -303,19 +304,4 @@ spec: runAsNonRoot: true seccompProfile: type: RuntimeDefault ---- -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - name: allow-port-ingress - namespace: resource-group-system -spec: - podSelector: - matchLabels: {} - ingress: - - from: - - namespaceSelector: {} - ports: - - protocol: TCP - port: 13133 From 111f6bb1e26442427573b3bc810995a23ba8517c Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Tue, 17 Dec 2024 01:25:43 +0000 Subject: [PATCH 8/8] Update expected.yaml --- .../controllers/otel_controller_test.go | 2 +- test/kustomization/expected.yaml | 85 +++++++++++++++++-- 2 files changed, 
77 insertions(+), 10 deletions(-) diff --git a/pkg/reconcilermanager/controllers/otel_controller_test.go b/pkg/reconcilermanager/controllers/otel_controller_test.go index 0d52054bf0..8b785c56ec 100644 --- a/pkg/reconcilermanager/controllers/otel_controller_test.go +++ b/pkg/reconcilermanager/controllers/otel_controller_test.go @@ -49,7 +49,7 @@ const ( // otel-collector ConfigMap. // See `CollectorConfigGooglecloud` in `pkg/metrics/otel.go` // Used by TestOtelReconcilerGooglecloud. - depAnnotationGooglecloud = "bfa02552b80a227256e825c807254b40" + depAnnotationGooglecloud = "5e22170ef10a382f587d3d7595fdaebc" // depAnnotationGooglecloud is the expected hash of the custom // otel-collector ConfigMap test artifact. // Used by TestOtelReconcilerCustom. diff --git a/test/kustomization/expected.yaml b/test/kustomization/expected.yaml index df1f6203eb..d7932ce205 100644 --- a/test/kustomization/expected.yaml +++ b/test/kustomization/expected.yaml @@ -5618,9 +5618,10 @@ data: otel-collector-config.yaml: | receivers: opencensus: + endpoint: 0.0.0.0:55678 exporters: prometheus: - endpoint: :8675 + endpoint: 0.0.0.0:8675 namespace: config_sync resource_to_telemetry_conversion: enabled: true @@ -5651,6 +5652,47 @@ data: otel-agent-config.yaml: | receivers: opencensus: + endpoint: 0.0.0.0:55678 + exporters: + opencensus: + endpoint: otel-collector.config-management-monitoring:55678 + tls: + insecure: true + processors: + batch: + # Populate resource attributes from OTEL_RESOURCE_ATTRIBUTES env var and + # the GCE metadata service, if available. 
+ resourcedetection: + detectors: [env, gcp] + extensions: + health_check: + endpoint: 0.0.0.0:13133 + service: + extensions: [health_check] + pipelines: + metrics: + receivers: [opencensus] + processors: [batch, resourcedetection] + exporters: [opencensus] + telemetry: + logs: + level: "INFO" +kind: ConfigMap +metadata: + labels: + app: opentelemetry + component: otel-agent + configmanagement.gke.io/arch: csmr + configmanagement.gke.io/system: "true" + name: otel-agent + namespace: config-management-system +--- +apiVersion: v1 +data: + otel-agent-reconciler-config.yaml: | + receivers: + opencensus: + endpoint: 0.0.0.0:55678 exporters: opencensus: endpoint: otel-collector.config-management-monitoring:55678 @@ -5668,13 +5710,13 @@ data: actions: - key: configsync.sync.kind action: upsert - value: $CONFIGSYNC_SYNC_KIND + value: ${CONFIGSYNC_SYNC_KIND} - key: configsync.sync.name action: upsert - value: $CONFIGSYNC_SYNC_NAME + value: ${CONFIGSYNC_SYNC_NAME} - key: configsync.sync.namespace action: upsert - value: $CONFIGSYNC_SYNC_NAMESPACE + value: ${CONFIGSYNC_SYNC_NAMESPACE} batch: # Populate resource attributes from OTEL_RESOURCE_ATTRIBUTES env var and # the GCE metadata service, if available. @@ -5700,7 +5742,7 @@ metadata: component: otel-agent configmanagement.gke.io/arch: csmr configmanagement.gke.io/system: "true" - name: otel-agent + name: otel-agent-reconciler namespace: config-management-system --- apiVersion: v1 @@ -5859,7 +5901,7 @@ data: command: - /otelcontribcol args: - - "--config=/conf/otel-agent-config.yaml" + - "--config=/conf/otel-agent-reconciler-config.yaml" # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 # Add the feature gate to enable metric suffix trimming. - "--feature-gates=-pkg.translator.prometheus.NormalizeName" @@ -5874,8 +5916,10 @@ data: protocol: TCP - containerPort: 8888 # Metrics. 
protocol: TCP + - containerPort: 13133 # Health check + protocol: TCP volumeMounts: - - name: otel-agent-config-vol + - name: otel-agent-config-reconciler-vol mountPath: /conf readinessProbe: httpGet: @@ -5966,9 +6010,9 @@ data: secret: secretName: git-creds defaultMode: 288 - - name: otel-agent-config-vol + - name: otel-agent-config-reconciler-vol configMap: - name: otel-agent + name: otel-agent-reconciler defaultMode: 420 - name: service-account emptyDir: {} @@ -5992,6 +6036,7 @@ data: otel-agent-config.yaml: | receivers: opencensus: + endpoint: 0.0.0.0:55678 exporters: opencensus: endpoint: otel-collector.config-management-monitoring:55678 @@ -6052,6 +6097,8 @@ spec: port: 8888 - name: metrics port: 8675 + - name: health-check + port: 13133 selector: app: opentelemetry component: otel-collector @@ -6124,6 +6171,7 @@ spec: - containerPort: 55678 - containerPort: 8888 - containerPort: 8675 + - containerPort: 13133 readinessProbe: httpGet: path: / @@ -6335,6 +6383,7 @@ spec: ports: - containerPort: 55678 - containerPort: 8888 + - containerPort: 13133 readinessProbe: httpGet: path: / @@ -6458,6 +6507,7 @@ spec: ports: - containerPort: 55678 - containerPort: 8888 + - containerPort: 13133 readinessProbe: httpGet: path: / @@ -6488,6 +6538,23 @@ spec: name: resource-group-otel-agent name: otel-agent-config-vol --- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-port-ingress + namespace: config-management-monitoring +spec: + ingress: + - from: + - namespaceSelector: {} + ports: + - port: 55678 + protocol: TCP + podSelector: + matchLabels: + app: opentelemetry + component: otel-collector +--- apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration metadata: