From 48136d5a8c97f77ee0ebff0f05dd0cd571cdbee6 Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Tue, 28 May 2024 19:18:17 +0000 Subject: [PATCH] [metric] #2 Cleanup declared_resources metric name The metric name was changed to `declared_resources_total` using the metricstransform processor without this being communicated in the documentation or release notes, and the intention behind the change was unclear. This change reverts the rename and adds an e2e test covering all the [available metrics](http://cloud/kubernetes-engine/enterprise/config-sync/docs/how-to/monitoring-config-sync) in GCM. The check skips the counter metrics for now because they have no data until an error condition has occurred. --- e2e/testcases/otel_collector_test.go | 177 +++++++--- .../otel-collector/otel-cm-full-gcm.yaml | 326 ++++++++++++++++++ go.mod | 2 +- pkg/metrics/otel.go | 6 +- .../controllers/otel_controller_test.go | 2 +- 5 files changed, 463 insertions(+), 50 deletions(-) create mode 100644 e2e/testdata/otel-collector/otel-cm-full-gcm.yaml diff --git a/e2e/testcases/otel_collector_test.go b/e2e/testcases/otel_collector_test.go index 563e72e20f..39d228d169 100644 --- a/e2e/testcases/otel_collector_test.go +++ b/e2e/testcases/otel_collector_test.go @@ -24,9 +24,8 @@ import ( monitoringv2 "cloud.google.com/go/monitoring/apiv3/v2" "cloud.google.com/go/monitoring/apiv3/v2/monitoringpb" "github.com/golang/protobuf/ptypes/timestamp" + "go.uber.org/multierr" "google.golang.org/api/iterator" - "google.golang.org/genproto/googleapis/api/metric" - "google.golang.org/genproto/googleapis/api/monitoredres" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" @@ -42,7 +41,8 @@ import ( "kpt.dev/configsync/pkg/core" "kpt.dev/configsync/pkg/kinds" "kpt.dev/configsync/pkg/metrics" - ocmetrics "kpt.dev/configsync/pkg/metrics" + csmetrics "kpt.dev/configsync/pkg/metrics" + rgmetrics "kpt.dev/configsync/pkg/resourcegroup/controllers/metrics" "kpt.dev/configsync/pkg/testing/fake" ) @@ 
-54,12 +54,46 @@ const ( GCMMetricPrefix = "custom.googleapis.com/opencensus/config_sync" ) +var DefaultGCMMetricTypes = []string{ + csmetrics.APICallDurationName, + csmetrics.ReconcilerErrorsName, + csmetrics.PipelineErrorName, // name reused in resource group controller + csmetrics.ReconcileDurationName, + csmetrics.LastSyncName, + csmetrics.DeclaredResourcesName, + csmetrics.ApplyOperationsName, + csmetrics.ApplyDurationName, + //csmetrics.InternalErrorsName, + rgmetrics.RGReconcileDurationName, + rgmetrics.ResourceCountName, + rgmetrics.ReadyResourceCountName, + rgmetrics.KCCResourceCountName, + rgmetrics.ClusterScopedResourceCountName, +} + var GCMMetricTypes = []string{ - ocmetrics.ReconcilerErrors.Name(), - ocmetrics.PipelineError.Name(), - ocmetrics.ReconcileDuration.Name(), - ocmetrics.ParserDuration.Name(), - ocmetrics.InternalErrors.Name(), + csmetrics.APICallDurationName, + csmetrics.ReconcilerErrorsName, + csmetrics.PipelineErrorName, // name reused in resource group controller + csmetrics.ReconcileDurationName, + csmetrics.ParserDurationName, + csmetrics.LastSyncName, + csmetrics.DeclaredResourcesName, + csmetrics.ApplyOperationsName, + csmetrics.ApplyDurationName, + //csmetrics.ResourceFightsName, + csmetrics.RemediateDurationName, + csmetrics.LastApplyName, + //csmetrics.ResourceConflictsName, + //csmetrics.InternalErrorsName, + rgmetrics.RGReconcileDurationName, + rgmetrics.ResourceGroupTotalName, + rgmetrics.ResourceCountName, + rgmetrics.ReadyResourceCountName, + rgmetrics.KCCResourceCountName, + rgmetrics.NamespaceCountName, + rgmetrics.ClusterScopedResourceCountName, + rgmetrics.CRDCountName, } // TestOtelCollectorDeployment validates that metrics reporting works for @@ -80,22 +114,22 @@ func TestOtelCollectorDeployment(t *testing.T) { ) nt.T.Cleanup(func() { if t.Failed() { - nt.PodLogs("config-management-monitoring", ocmetrics.OtelCollectorName, "", false) + nt.PodLogs("config-management-monitoring", csmetrics.OtelCollectorName, "", false) } 
}) setupMetricsServiceAccount(nt) nt.T.Cleanup(func() { - nt.MustKubectl("delete", "cm", ocmetrics.OtelCollectorCustomCM, "-n", configmanagement.MonitoringNamespace, "--ignore-not-found") + nt.MustKubectl("delete", "cm", csmetrics.OtelCollectorCustomCM, "-n", configmanagement.MonitoringNamespace, "--ignore-not-found") nt.T.Log("Restart otel-collector pod to reset the ConfigMap and log") - nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false) - if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { + nomostest.DeletePodByLabel(nt, "app", csmetrics.OpenTelemetry, false) + if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { nt.T.Errorf("otel-collector pod failed to come up after a restart: %v", err) } }) nt.T.Log("Restart otel-collector pod to refresh the ConfigMap, log and IAM") - nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false) - if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { + nomostest.DeletePodByLabel(nt, "app", csmetrics.OpenTelemetry, false) + if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { nt.T.Fatal(err) } @@ -117,19 +151,20 @@ func TestOtelCollectorDeployment(t *testing.T) { } // retry for 2 minutes until metric is accessible from GCM _, err = retry.Retry(120*time.Second, func() error { - for _, metricType := range GCMMetricTypes { + var err error + for _, metricType := range DefaultGCMMetricTypes { descriptor := fmt.Sprintf("%s/%s", GCMMetricPrefix, metricType) it := listMetricInGCM(ctx, nt, client, startTime, descriptor) - return validateMetricInGCM(nt, it, descriptor, nt.ClusterName) + err = multierr.Append(err, validateMetricInGCM(nt, it, descriptor, 
nt.ClusterName)) } - return nil + return err }) if err != nil { nt.T.Fatal(err) } nt.T.Log("Checking the otel-collector log contains no failure...") - err = validateDeploymentLogHasNoFailure(nt, ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace, MetricExportErrorCaption) + err = validateDeploymentLogHasNoFailure(nt, csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace, MetricExportErrorCaption) if err != nil { nt.T.Fatal(err) } @@ -141,24 +176,24 @@ func TestOtelCollectorDeployment(t *testing.T) { nt.T.Log("Apply custom otel-collector ConfigMap that could cause duplicate time series error") nt.MustKubectl("apply", "-f", "../testdata/otel-collector/otel-cm-monarch-rejected-labels.yaml") nt.T.Log("Restart otel-collector pod to refresh the ConfigMap and log") - nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false) - if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { + nomostest.DeletePodByLabel(nt, "app", csmetrics.OpenTelemetry, false) + if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { nt.T.Fatal(err) } nt.T.Log("Checking the otel-collector log contains failure...") _, err = retry.Retry(60*time.Second, func() error { - return validateDeploymentLogHasFailure(nt, ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace, MetricExportErrorCaption) + return validateDeploymentLogHasFailure(nt, csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace, MetricExportErrorCaption) }) if err != nil { nt.T.Fatal(err) } nt.T.Log("Remove otel-collector ConfigMap that creates duplicated time series error") - nt.MustKubectl("delete", "cm", ocmetrics.OtelCollectorCustomCM, "-n", configmanagement.MonitoringNamespace, "--ignore-not-found") + nt.MustKubectl("delete", "cm", csmetrics.OtelCollectorCustomCM, "-n", 
configmanagement.MonitoringNamespace, "--ignore-not-found") nt.T.Log("Restart otel-collector pod to refresh the ConfigMap, log and IAM") - nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false) - if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { + nomostest.DeletePodByLabel(nt, "app", csmetrics.OpenTelemetry, false) + if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { nt.T.Fatal(err) } @@ -179,32 +214,84 @@ func TestOtelCollectorDeployment(t *testing.T) { // retry for 2 minutes until metric is accessible from GCM _, err = retry.Retry(120*time.Second, func() error { - for _, metricType := range GCMMetricTypes { + var err error + for _, metricType := range DefaultGCMMetricTypes { descriptor := fmt.Sprintf("%s/%s", GCMMetricPrefix, metricType) it := listMetricInGCM(ctx, nt, client, startTime, descriptor) - return validateMetricInGCM(nt, it, descriptor, nt.ClusterName) + err = multierr.Append(err, validateMetricInGCM(nt, it, descriptor, nt.ClusterName)) } - return nil + return err }) if err != nil { nt.T.Fatal(err) } nt.T.Log("Checking the otel-collector log contains no failure...") - err = validateDeploymentLogHasNoFailure(nt, ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace, MetricExportErrorCaption) + err = validateDeploymentLogHasNoFailure(nt, csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace, MetricExportErrorCaption) if err != nil { nt.T.Fatal(err) } nt.T.Log("Apply custom otel-collector ConfigMap that could cause Monarch label rejected error") nt.MustKubectl("apply", "-f", "../testdata/otel-collector/otel-cm-kustomize-rejected-labels.yaml") - if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { + if err := 
nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { nt.T.Fatal(err) } nt.T.Log("Checking the otel-collector log contains failure...") _, err = retry.Retry(60*time.Second, func() error { - return validateDeploymentLogHasFailure(nt, ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace, UnrecognizedLabelErrorCaption) + return validateDeploymentLogHasFailure(nt, csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace, UnrecognizedLabelErrorCaption) + }) + if err != nil { + nt.T.Fatal(err) + } +} + +func TestGCMMetrics(t *testing.T) { + nt := nomostest.New(t, + nomostesting.Reconciliation1, + ntopts.RequireGKE(t), + ntopts.Unstructured, + ) + nt.T.Cleanup(func() { + if t.Failed() { + nt.PodLogs("config-management-monitoring", csmetrics.OtelCollectorName, "", false) + } + }) + setupMetricsServiceAccount(nt) + nt.T.Cleanup(func() { + nt.MustKubectl("delete", "cm", csmetrics.OtelCollectorCustomCM, "-n", configmanagement.MonitoringNamespace, "--ignore-not-found") + nt.T.Log("Restart otel-collector pod to reset the ConfigMap and log") + nomostest.DeletePodByLabel(nt, "app", csmetrics.OpenTelemetry, false) + if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { + nt.T.Errorf("otel-collector pod failed to come up after a restart: %v", err) + } + }) + + nt.T.Log("Apply custom otel-collector ConfigMap that exports full metric list to GCM") + nt.MustKubectl("apply", "-f", "../testdata/otel-collector/otel-cm-full-gcm.yaml") + nt.T.Log("Restart otel-collector pod to refresh the ConfigMap and log") + nomostest.DeletePodByLabel(nt, "app", csmetrics.OpenTelemetry, false) + if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { + nt.T.Fatal(err) + } + + startTime := time.Now().UTC() + + nt.T.Log("Watch for 
full list of metrics in GCM, timeout 2 minutes") + ctx := nt.Context + client, err := createGCMClient(ctx) + if err != nil { + nt.T.Fatal(err) + } + _, err = retry.Retry(60*time.Second, func() error { + var err error + for _, metricType := range GCMMetricTypes { + descriptor := fmt.Sprintf("%s/%s", GCMMetricPrefix, metricType) + it := listMetricInGCM(ctx, nt, client, startTime, descriptor) + err = multierr.Append(err, validateMetricInGCM(nt, it, descriptor, nt.ClusterName)) + } + return err }) if err != nil { nt.T.Fatal(err) @@ -227,8 +314,8 @@ func TestOtelCollectorGCMLabelAggregation(t *testing.T) { setupMetricsServiceAccount(nt) nt.T.Log("Restarting the otel-collector pod to refresh the service account") - nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false) - if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { + nomostest.DeletePodByLabel(nt, "app", csmetrics.OpenTelemetry, false) + if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), csmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { nt.T.Fatal(err) } @@ -244,10 +331,9 @@ func TestOtelCollectorGCMLabelAggregation(t *testing.T) { // The following metrics are sent to GCM and aggregated to remove the "commit" label. var metricsWithCommitLabel = []string{ - ocmetrics.LastSync.Name(), - ocmetrics.DeclaredResources.Name(), - ocmetrics.ApplyDuration.Name(), - // LastApply also has commit but is filtered by filter/cloudmonitoring. 
+ csmetrics.LastSyncName, + csmetrics.DeclaredResourcesName, + csmetrics.ApplyDurationName, } nt.T.Log("Watch for metrics in GCM, timeout 2 minutes") @@ -258,13 +344,14 @@ func TestOtelCollectorGCMLabelAggregation(t *testing.T) { } // retry for 2 minutes until metric is accessible from GCM _, err = retry.Retry(120*time.Second, func() error { + var err error for _, metricType := range metricsWithCommitLabel { descriptor := fmt.Sprintf("%s/%s", GCMMetricPrefix, metricType) it := listMetricInGCM(ctx, nt, client, startTime, descriptor) - return validateMetricInGCM(nt, it, descriptor, nt.ClusterName, - metricDoesNotHaveLabel(metrics.KeyCommit.Name())) + err = multierr.Append(err, validateMetricInGCM(nt, it, descriptor, nt.ClusterName, + metricDoesNotHaveLabel(metrics.KeyCommit.Name()))) } - return nil + return err }) if err != nil { nt.T.Fatal(err) @@ -370,7 +457,7 @@ func listMetricInGCM(ctx context.Context, nt *nomostest.NT, client *monitoringv2 endTime := time.Now().UTC() req := &monitoringpb.ListTimeSeriesRequest{ Name: "projects/" + *e2e.GCPProject, - Filter: `metric.type="` + metricType + `" AND resource.labels.cluster_name="` + nt.ClusterName + `"`, + Filter: `metric.type="` + metricType + `" AND resource.labels.cluster_name="` + nt.ClusterName + `" AND resource.type="k8s_container"`, Interval: &monitoringpb.TimeInterval{ StartTime: &timestamp.Timestamp{ Seconds: startTime.Unix(), @@ -379,16 +466,16 @@ func listMetricInGCM(ctx context.Context, nt *nomostest.NT, client *monitoringv2 Seconds: endTime.Unix(), }, }, - View: monitoringpb.ListTimeSeriesRequest_HEADERS, + View: monitoringpb.ListTimeSeriesRequest_FULL, } return client.ListTimeSeries(ctx, req) } -type metricValidatorFunc func(*metric.Metric, *monitoredres.MonitoredResource) error +type metricValidatorFunc func(series *monitoringpb.TimeSeries) error func metricDoesNotHaveLabel(label string) metricValidatorFunc { - return func(_ *metric.Metric, r *monitoredres.MonitoredResource) error { - labels := r.GetLabels() 
+ return func(series *monitoringpb.TimeSeries) error { + labels := series.GetResource().GetLabels() if value, found := labels[label]; found { return fmt.Errorf("expected metric to not have label, but found %s=%s", label, value) } @@ -415,7 +502,7 @@ func validateMetricInGCM(nt *nomostest.NT, it *monitoringv2.TimeSeriesIterator, labels := resource.GetLabels() if labels["cluster_name"] == clusterName { for _, valFn := range valFns { - if err := valFn(metric, resource); err != nil { + if err := valFn(resp); err != nil { return fmt.Errorf("GCM metric %s failed validation (cluster_name=%s): %w", metricType, nt.ClusterName, err) } } diff --git a/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml b/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml new file mode 100644 index 0000000000..f07a5fc06e --- /dev/null +++ b/e2e/testdata/otel-collector/otel-cm-full-gcm.yaml @@ -0,0 +1,326 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +data: + otel-collector-config.yaml: |- + receivers: + opencensus: + exporters: + prometheus: + endpoint: :8675 + namespace: config_sync + resource_to_telemetry_conversion: + enabled: true + googlecloud: + metric: + prefix: "custom.googleapis.com/opencensus/config_sync/" + # The exporter would always fail at sending metric descriptor. 
Skipping + # creation of metric descriptors until the error from upstream is resolved + # The metric streaming data is not affected + # https://github.com/GoogleCloudPlatform/opentelemetry-operations-go/issues/529 + skip_create_descriptor: true + # resource_filters looks for metric resource attributes by prefix and converts + # them into custom metric labels, so they become visible and can be accessed + # under the GroupBy dropdown list in Cloud Monitoring + resource_filters: + - prefix: "cloud.account.id" + - prefix: "cloud.availability.zone" + - prefix: "cloud.platform" + - prefix: "cloud.provider" + - prefix: "k8s.pod.ip" + - prefix: "k8s.pod.namespace" + - prefix: "k8s.pod.uid" + - prefix: "k8s.container.name" + - prefix: "host.id" + - prefix: "host.name" + - prefix: "k8s.deployment.name" + - prefix: "k8s.node.name" + sending_queue: + enabled: false + googlecloud/kubernetes: + metric: + prefix: "kubernetes.io/internal/addons/config_sync/" + # skip_create_descriptor: Metrics that start with 'kubernetes.io/' already + # have descriptors defined internally. Skip sending duplicated metric + # descriptors here to prevent errors or conflicts. + skip_create_descriptor: true + # instrumentation_library_labels: Otel Collector by default attaches + # 'instrumentation_version' and 'instrumentation_source' labels that are + # not specified in our Cloud Monarch definitions, thus skipping them here + instrumentation_library_labels: false + # create_service_timeseries: This is a recommended configuration for + # 'service metrics' that start with the 'kubernetes.io/' prefix. It uses + # CreateTimeSeries API and has its own quotas, so that custom metric write + # will not break this ingestion pipeline + create_service_timeseries: true + service_resource_labels: false + sending_queue: + enabled: false + processors: + batch: + # resourcedetection: This processor is needed to correctly mirror resource + # labels from OpenCensus to OpenTelemetry. 
We also want to keep this same + # processor in Otel Agent configuration as the resource labels are added from + # there + resourcedetection: + detectors: [env, gcp] + # Aggregate some metrics sent to Cloud Monitoring to remove high-cardinality labels (e.g. "commit") + metricstransform/cloudmonitoring: + transforms: + - include: last_sync_timestamp + action: update + operations: + - action: aggregate_labels + label_set: + - configsync.sync.kind + - configsync.sync.name + - configsync.sync.namespace + - status + aggregation_type: max + - include: declared_resources + action: update + operations: + - action: aggregate_labels + label_set: + - configsync.sync.kind + - configsync.sync.name + - configsync.sync.namespace + aggregation_type: max + - include: apply_duration_seconds + action: update + operations: + - action: aggregate_labels + label_set: + - configsync.sync.kind + - configsync.sync.name + - configsync.sync.namespace + - status + aggregation_type: max + # We do not test Kustomize metrics in GCM pipeline as they are excluded + # in production + filter/cloudmonitoring: + metrics: + exclude: + match_type: regexp + metric_names: + - kustomize.* + filter/kubernetes: + metrics: + include: + match_type: regexp + metric_names: + - kustomize.* + - api_duration_seconds + - reconciler_errors + - pipeline_error_observed + - reconcile_duration_seconds + - rg_reconcile_duration_seconds + - parser_duration_seconds + - declared_resources + - apply_operations_total + - apply_duration_seconds + - resource_fights_total + - remediate_duration_seconds + - resource_conflicts_total + - internal_errors_total + - kcc_resource_count + - last_sync_timestamp + # Transform the metrics so that their names and labels are aligned with definition in go/config-sync-monarch-metrics + metricstransform/kubernetes: + transforms: + - include: api_duration_seconds + action: update + operations: + - action: aggregate_labels + # label_set is the allowlist of labels to keep after aggregation + 
label_set: [status, operation] + aggregation_type: max + - include: declared_resources + action: update + new_name: current_declared_resources + operations: + - action: aggregate_labels + label_set: [] + aggregation_type: max + - include: kcc_resource_count + action: update + operations: + - action: aggregate_labels + label_set: [resourcegroup] + aggregation_type: max + - include: reconciler_errors + action: update + new_name: last_reconciler_errors + operations: + - action: aggregate_labels + label_set: [component, errorclass] + aggregation_type: max + - include: reconcile_duration_seconds + action: update + operations: + - action: aggregate_labels + label_set: [status] + aggregation_type: max + - include: rg_reconcile_duration_seconds + action: update + operations: + - action: aggregate_labels + label_set: [stallreason] + aggregation_type: max + - include: last_sync_timestamp + action: update + operations: + - action: aggregate_labels + label_set: [status] + aggregation_type: max + - include: parser_duration_seconds + action: update + operations: + - action: aggregate_labels + label_set: [status, source, trigger] + aggregation_type: max + - include: pipeline_error_observed + action: update + new_name: last_pipeline_error_observed + operations: + - action: aggregate_labels + label_set: [name, component, reconciler] + aggregation_type: max + - include: apply_operations_total + action: update + new_name: apply_operations_count + operations: + - action: aggregate_labels + label_set: [controller, operation, status] + aggregation_type: max + - include: apply_duration_seconds + action: update + operations: + - action: aggregate_labels + label_set: [status] + aggregation_type: max + - include: resource_fights_total + action: update + new_name: resource_fights_count + operations: + - action: aggregate_labels + label_set: [name, component, reconciler] + aggregation_type: max + - include: resource_conflicts_total + action: update + new_name: resource_conflicts_count + 
operations: + - action: aggregate_labels + label_set: [] + aggregation_type: max + - include: internal_errors_total + action: update + new_name: internal_errors_count + operations: + - action: aggregate_labels + label_set: [] + aggregation_type: max + - include: remediate_duration_seconds + action: update + operations: + - action: aggregate_labels + label_set: [status] + aggregation_type: max + - include: kustomize_field_count + action: update + operations: + - action: aggregate_labels + label_set: [field_name] + aggregation_type: max + - include: kustomize_deprecating_field_count + action: update + operations: + - action: aggregate_labels + label_set: [deprecating_field] + aggregation_type: max + - include: kustomize_simplification_adoption_count + action: update + operations: + - action: aggregate_labels + label_set: [simplification_field] + aggregation_type: max + - include: kustomize_builtin_transformers + action: update + operations: + - action: aggregate_labels + label_set: [k8s_metadata_transformer] + aggregation_type: max + - include: kustomize_helm_inflator_count + action: update + operations: + - action: aggregate_labels + label_set: [helm_inflator] + aggregation_type: max + - include: kustomize_base_count + action: update + operations: + - action: aggregate_labels + label_set: [base_source] + aggregation_type: max + - include: kustomize_patch_count + action: update + operations: + - action: aggregate_labels + label_set: [patch_field] + aggregation_type: max + - include: kustomize_ordered_top_tier_metrics + action: update + operations: + - action: aggregate_labels + label_set: [top_tier_field] + aggregation_type: max + - include: kustomize_resource_count + action: update + operations: + - action: aggregate_labels + label_set: [] + aggregation_type: max + - include: kustomize_build_latency + action: update + operations: + - action: aggregate_labels + label_set: [] + aggregation_type: max + extensions: + health_check: + service: + extensions: [health_check] 
+ pipelines: + metrics/cloudmonitoring: + receivers: [opencensus] + processors: [batch, filter/cloudmonitoring, metricstransform/cloudmonitoring, resourcedetection] + exporters: [googlecloud] + metrics/prometheus: + receivers: [opencensus] + processors: [batch] + exporters: [prometheus] + metrics/kubernetes: + receivers: [opencensus] + processors: [batch, filter/kubernetes, metricstransform/kubernetes, resourcedetection] + exporters: [googlecloud/kubernetes] +kind: ConfigMap +metadata: + labels: + app: opentelemetry + component: otel-collector + configmanagement.gke.io/arch: csmr + configmanagement.gke.io/system: "true" + name: otel-collector-custom + namespace: config-management-monitoring diff --git a/go.mod b/go.mod index 8b670a878b..2aad867d79 100644 --- a/go.mod +++ b/go.mod @@ -38,7 +38,6 @@ require ( golang.org/x/net v0.24.0 golang.org/x/oauth2 v0.10.0 google.golang.org/api v0.126.0 - google.golang.org/genproto/googleapis/api v0.0.0-20230726155614-23370e0ffb3e gopkg.in/yaml.v3 v3.0.1 k8s.io/api v0.28.9 k8s.io/apiextensions-apiserver v0.28.9 @@ -145,6 +144,7 @@ require ( gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20230726155614-23370e0ffb3e // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d // indirect google.golang.org/grpc v1.58.3 // indirect google.golang.org/protobuf v1.33.0 // indirect diff --git a/pkg/metrics/otel.go b/pkg/metrics/otel.go index 656b9e058e..d09d0633b2 100644 --- a/pkg/metrics/otel.go +++ b/pkg/metrics/otel.go @@ -96,7 +96,9 @@ processors: filter/cloudmonitoring: metrics: include: - match_type: regexp + # Use strict match type to ensure metrics like 'kustomize_resource_count' + # is excluded + match_type: strict metric_names: - reconciler_errors - apply_duration_seconds @@ -129,7 +131,6 @@ processors: aggregation_type: max - 
include: declared_resources action: update - new_name: current_declared_resources operations: - action: aggregate_labels label_set: @@ -180,7 +181,6 @@ processors: aggregation_type: max - include: declared_resources action: update - new_name: current_declared_resources operations: - action: aggregate_labels label_set: [] diff --git a/pkg/reconcilermanager/controllers/otel_controller_test.go b/pkg/reconcilermanager/controllers/otel_controller_test.go index 662a43f51a..b9746ca786 100644 --- a/pkg/reconcilermanager/controllers/otel_controller_test.go +++ b/pkg/reconcilermanager/controllers/otel_controller_test.go @@ -47,7 +47,7 @@ const ( // otel-collector ConfigMap. // See `CollectorConfigGooglecloud` in `pkg/metrics/otel.go` // Used by TestOtelReconcilerGooglecloud. - depAnnotationGooglecloud = "4e13c2af229e38d6c85eb7c818ad181f" + depAnnotationGooglecloud = "9ede45bd37596a48556f6dbba3e9fdb3" // depAnnotationGooglecloud is the expected hash of the custom // otel-collector ConfigMap test artifact. // Used by TestOtelReconcilerCustom.