From 32a26a455f8515c2dc4620a02b49b4b96c58975b Mon Sep 17 00:00:00 2001
From: Catherine Fang
Date: Wed, 10 Jan 2024 09:36:54 -0500
Subject: [PATCH] Fix OOM issue and "http2: stream closed" issue by returning empty ListCustomMetrics

Add a --support-list-custom-metrics flag (default: false) and plumb it
into StackdriverProvider. When the flag is false, ListAllMetrics returns
an empty list immediately, without taking the provider lock or consulting
the metrics cache.

Also bump the image TAG from v0.13.1 to v0.14.0.

---
 custom-metrics-stackdriver-adapter/Makefile   |  2 +-
 custom-metrics-stackdriver-adapter/adapter.go |  6 +++++-
 .../pkg/adapter/provider/provider.go          | 11 +++++++++--
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/custom-metrics-stackdriver-adapter/Makefile b/custom-metrics-stackdriver-adapter/Makefile
index f8318ab66..8a0373d2b 100644
--- a/custom-metrics-stackdriver-adapter/Makefile
+++ b/custom-metrics-stackdriver-adapter/Makefile
@@ -3,7 +3,7 @@ GOOS?=linux
 OUT_DIR?=build
 PACKAGE=github.com/GoogleCloudPlatform/k8s-stackdriver/custom-metrics-stackdriver-adapter
 PREFIX?=staging-k8s.gcr.io
-TAG = v0.13.1
+TAG = v0.14.0
 PKG := $(shell find pkg/* -type f)
 
 .PHONY: build docker push test clean
diff --git a/custom-metrics-stackdriver-adapter/adapter.go b/custom-metrics-stackdriver-adapter/adapter.go
index e72f70db6..e9afd9ac5 100644
--- a/custom-metrics-stackdriver-adapter/adapter.go
+++ b/custom-metrics-stackdriver-adapter/adapter.go
@@ -110,7 +110,7 @@ func (sa *StackdriverAdapter) makeProviderOrDie(o *stackdriverAdapterServerOptio
 	conf.GenericConfig.EnableMetrics = true
 
 	translator := translator.NewTranslator(stackdriverService, gceConf, rateInterval, alignmentPeriod, mapper, o.UseNewResourceModel, o.EnableDistributionSupport)
-	return adapter.NewStackdriverProvider(client, mapper, gceConf, stackdriverService, translator, rateInterval, o.UseNewResourceModel, o.FallbackForContainerMetrics), translator
+	return adapter.NewStackdriverProvider(client, mapper, gceConf, stackdriverService, translator, rateInterval, o.UseNewResourceModel, o.FallbackForContainerMetrics, o.SupportListCustomMetrics), translator
 }
 
 func (sa *StackdriverAdapter) withCoreMetrics(translator *translator.Translator) error {
@@ -154,6 +154,7 @@ func main() {
 		FallbackForContainerMetrics: false,
 		EnableCoreMetricsAPI:        false,
 		EnableDistributionSupport:   false,
+		SupportListCustomMetrics:    false,
 	}
 
 	flags.BoolVar(&serverOptions.UseNewResourceModel, "use-new-resource-model", serverOptions.UseNewResourceModel,
@@ -166,6 +167,8 @@ func main() {
 		"If true, fallbacks to k8s_container resource when given metric is not present on k8s_pod. At most one container with given metric is allowed for each pod.")
 	flags.BoolVar(&serverOptions.EnableCoreMetricsAPI, "enable-core-metrics-api", serverOptions.EnableCoreMetricsAPI,
 		"Experimental, do not use. Whether to enable Core Metrics API.")
+	flags.BoolVar(&serverOptions.SupportListCustomMetrics, "support-list-custom-metrics", serverOptions.SupportListCustomMetrics,
+		"Whether to support listing custom metrics. If false, ListAllMetrics returns an empty list, which reduces memory usage significantly.")
 	flags.StringVar(&serverOptions.MetricsAddress, "metrics-address", "",
 		"Endpoint with port on which Prometheus metrics server should be enabled. Example: localhost:8080. If there is no flag, Prometheus metric server is disabled and monitoring metrics are not collected.")
 	flags.StringVar(&serverOptions.StackdriverEndpoint, "stackdriver-endpoint", "",
@@ -175,6 +178,7 @@ func main() {
 
 	flags.Parse(os.Args)
 
+	klog.Info("serverOptions: ", serverOptions)
 	if !serverOptions.UseNewResourceModel && serverOptions.FallbackForContainerMetrics {
 		klog.Fatalf("Container metrics work only with new resource model")
 	}
diff --git a/custom-metrics-stackdriver-adapter/pkg/adapter/provider/provider.go b/custom-metrics-stackdriver-adapter/pkg/adapter/provider/provider.go
index c3fcf52e3..f54157f45 100644
--- a/custom-metrics-stackdriver-adapter/pkg/adapter/provider/provider.go
+++ b/custom-metrics-stackdriver-adapter/pkg/adapter/provider/provider.go
@@ -59,10 +59,11 @@ type StackdriverProvider struct {
 	metricsCacheSet             bool
 	metricsCache                []provider.CustomMetricInfo
 	fallbackForContainerMetrics bool
+	supportListCustomMetrics    bool
 }
 
 // NewStackdriverProvider creates a StackdriverProvider
-func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.RESTMapper, gceConf *config.GceConfig, stackdriverService *stackdriver.Service, translator *translator.Translator, rateInterval time.Duration, useNewResourceModel bool, fallbackForContainerMetrics bool) provider.MetricsProvider {
+func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.RESTMapper, gceConf *config.GceConfig, stackdriverService *stackdriver.Service, translator *translator.Translator, rateInterval time.Duration, useNewResourceModel bool, fallbackForContainerMetrics bool, supportListCustomMetrics bool) provider.MetricsProvider {
 	return &StackdriverProvider{
 		kubeClient:                  kubeClient,
 		stackdriverService:          stackdriverService,
@@ -71,6 +72,7 @@ func NewStackdriverProvider(kubeClient *corev1.CoreV1Client, mapper apimeta.REST
 		translator:                  translator,
 		useNewResourceModel:         useNewResourceModel,
 		fallbackForContainerMetrics: fallbackForContainerMetrics,
+		supportListCustomMetrics:    supportListCustomMetrics,
 	}
 }
 
@@ -309,8 +311,13 @@ func (p *StackdriverProvider) getNamespacedMetricBySelector(groupResource schema
 }
 
 // ListAllMetrics returns all custom metrics available from Stackdriver.
-// List only pod metrics
 func (p *StackdriverProvider) ListAllMetrics() []provider.CustomMetricInfo {
+	// Listing is opt-in: skipping it reduces memory usage significantly, and HPA does not call ListAllMetrics.
+	if !p.supportListCustomMetrics {
+		return []provider.CustomMetricInfo{}
+	}
+
+	// List only pod metrics
 	p.mu.Lock()
 	defer p.mu.Unlock()
 	if !p.metricsCacheSet {