From 5e7ffdebd9b34d3e085aa6d6525f146bb30a5232 Mon Sep 17 00:00:00 2001
From: Krisztian Litkey <krisztian.litkey@intel.com>
Date: Tue, 12 Nov 2024 00:12:08 +0200
Subject: [PATCH] policy: rework policy/backend metrics interface.

Simplify the policy-backend metrics collection interface,
reducing it to a single GetMetrics() call and a returned
Metrics interface which simply implements the collector-
like Describe() and Collect() interfaces. Update policy
implementations accordingly.

Signed-off-by: Krisztian Litkey <krisztian.litkey@intel.com>
---
 cmd/plugins/balloons/policy/metrics.go        | 31 +++++++--------
 .../template/policy/template-policy.go        | 26 ++++++-------
 .../policy/topology-aware-policy.go           | 39 +++++++++++--------
 pkg/resmgr/policy/metrics.go                  | 16 +-------
 pkg/resmgr/policy/policy.go                   | 14 +++----
 5 files changed, 57 insertions(+), 69 deletions(-)

diff --git a/cmd/plugins/balloons/policy/metrics.go b/cmd/plugins/balloons/policy/metrics.go
index 3a5868d09..566be8dee 100644
--- a/cmd/plugins/balloons/policy/metrics.go
+++ b/cmd/plugins/balloons/policy/metrics.go
@@ -91,14 +91,7 @@ type BalloonMetrics struct {
 	ContainerReqMilliCpus int
 }
 
-// DescribeMetrics generates policy-specific prometheus metrics data
-// descriptors.
-func (p *balloons) DescribeMetrics() []*prometheus.Desc {
-	return descriptors
-}
-
-// PollMetrics provides policy metrics for monitoring.
-func (p *balloons) PollMetrics() policy.Metrics {
+func (p *balloons) GetMetrics() policy.Metrics {
 	policyMetrics := &Metrics{}
 	policyMetrics.Balloons = make([]*BalloonMetrics, len(p.balloons))
 	for index, bln := range p.balloons {
@@ -150,16 +143,19 @@ func (p *balloons) PollMetrics() policy.Metrics {
 	return policyMetrics
 }
 
-// CollectMetrics generates prometheus metrics from cached/polled
-// policy-specific metrics data.
-func (p *balloons) CollectMetrics(m policy.Metrics) ([]prometheus.Metric, error) {
-	metrics, ok := m.(*Metrics)
-	if !ok {
-		return nil, balloonsError("type mismatch in balloons metrics")
+func (m *Metrics) Describe(ch chan<- *prometheus.Desc) {
+	for _, d := range descriptors {
+		ch <- d
+	}
+}
+
+func (m *Metrics) Collect(ch chan<- prometheus.Metric) {
+	if m == nil {
+		return
 	}
-	promMetrics := make([]prometheus.Metric, len(metrics.Balloons))
-	for index, bm := range metrics.Balloons {
-		promMetrics[index] = prometheus.MustNewConstMetric(
+
+	for _, bm := range m.Balloons {
+		ch <- prometheus.MustNewConstMetric(
 			descriptors[balloonsDesc],
 			prometheus.GaugeValue,
 			float64(bm.Cpus.Size()),
@@ -185,5 +181,4 @@ func (p *balloons) CollectMetrics(m policy.Metrics) ([]prometheus.Metric, error)
 			bm.ContainerNames,
 			strconv.Itoa(bm.ContainerReqMilliCpus))
 	}
-	return promMetrics, nil
 }
diff --git a/cmd/plugins/template/policy/template-policy.go b/cmd/plugins/template/policy/template-policy.go
index bc74873ce..79464fbeb 100644
--- a/cmd/plugins/template/policy/template-policy.go
+++ b/cmd/plugins/template/policy/template-policy.go
@@ -116,19 +116,9 @@ func (p *policy) HandleEvent(e *events.Policy) (bool, error) {
 	return true, nil
 }
 
-// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
-func (p *policy) DescribeMetrics() []*prometheus.Desc {
-	return nil
-}
-
-// PollMetrics provides policy metrics for monitoring.
-func (p *policy) PollMetrics() policyapi.Metrics {
-	return nil
-}
-
-// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
-func (p *policy) CollectMetrics(policyapi.Metrics) ([]prometheus.Metric, error) {
-	return nil, nil
+// GetMetrics returns the policy-specific metrics collector.
+func (p *policy) GetMetrics() policyapi.Metrics {
+	return &NoMetrics{}
 }
 
 // GetTopologyZones returns the policy/pool data for 'topology zone' CRDs.
@@ -145,3 +135,13 @@ func (p *policy) ExportResourceData(c cache.Container) map[string]string {
 func (p *policy) initialize() error {
 	return nil
 }
+
+type NoMetrics struct{}
+
+func (*NoMetrics) Describe(chan<- *prometheus.Desc) {
+	return
+}
+
+func (*NoMetrics) Collect(chan<- prometheus.Metric) {
+	return
+}
diff --git a/cmd/plugins/topology-aware/policy/topology-aware-policy.go b/cmd/plugins/topology-aware/policy/topology-aware-policy.go
index c0b07ce14..6a3d4b16b 100644
--- a/cmd/plugins/topology-aware/policy/topology-aware-policy.go
+++ b/cmd/plugins/topology-aware/policy/topology-aware-policy.go
@@ -19,9 +19,8 @@ import (
 	"fmt"
 
 	"github.com/containers/nri-plugins/pkg/utils/cpuset"
-	"k8s.io/apimachinery/pkg/api/resource"
-
 	"github.com/prometheus/client_golang/prometheus"
+	"k8s.io/apimachinery/pkg/api/resource"
 
 	cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware"
 	"github.com/containers/nri-plugins/pkg/cpuallocator"
@@ -68,6 +67,7 @@ type policy struct {
 	cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy
 	memAllocator *libmem.Allocator
 	coldstartOff bool // coldstart forced off (have movable PMEM zones)
+	metrics      *TopologyAwareMetrics
 }
 
 var opt = &cfgapi.Config{}
@@ -115,6 +115,8 @@ func (p *policy) Setup(opts *policyapi.BackendOptions) error {
 		return policyError("failed to initialize %s policy: %w", PolicyName, err)
 	}
 
+	p.metrics = p.NewTopologyAwareMetrics()
+
 	log.Info("***** default CPU priority is %s", defaultPrio)
 
 	return nil
@@ -306,21 +308,6 @@ func (p *policy) HandleEvent(e *events.Policy) (bool, error) {
 	return false, nil
 }
 
-// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
-func (p *policy) DescribeMetrics() []*prometheus.Desc {
-	return nil
-}
-
-// PollMetrics provides policy metrics for monitoring.
-func (p *policy) PollMetrics() policyapi.Metrics {
-	return nil
-}
-
-// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
-func (p *policy) CollectMetrics(policyapi.Metrics) ([]prometheus.Metric, error) {
-	return nil, nil
-}
-
 // GetTopologyZones returns the policy/pool data for 'topology zone' CRDs.
 func (p *policy) GetTopologyZones() []*policyapi.TopologyZone {
 	zones := []*policyapi.TopologyZone{}
@@ -435,6 +422,24 @@ func (p *policy) ExportResourceData(c cache.Container) map[string]string {
 	return data
 }
 
+func (p *policy) GetMetrics() policyapi.Metrics {
+	return p.metrics
+}
+
+func (p *policy) NewTopologyAwareMetrics() *TopologyAwareMetrics {
+	return &TopologyAwareMetrics{}
+}
+
+type TopologyAwareMetrics struct{}
+
+func (*TopologyAwareMetrics) Describe(ch chan<- *prometheus.Desc) {
+	return
+}
+
+func (*TopologyAwareMetrics) Collect(ch chan<- prometheus.Metric) {
+	return
+}
+
 // reallocateResources reallocates the given containers using the given pool hints
 func (p *policy) reallocateResources(containers []cache.Container, pools map[string]string) error {
 	errs := []error{}
diff --git a/pkg/resmgr/policy/metrics.go b/pkg/resmgr/policy/metrics.go
index 19eb09169..12ff57d3e 100644
--- a/pkg/resmgr/policy/metrics.go
+++ b/pkg/resmgr/policy/metrics.go
@@ -35,21 +35,9 @@ func (c *PolicyCollector) register() error {
 }
 
 func (c *PolicyCollector) Describe(ch chan<- *prometheus.Desc) {
-	for _, d := range c.policy.active.DescribeMetrics() {
-		ch <- d
-	}
+	c.policy.active.GetMetrics().Describe(ch)
 }
 
 func (c *PolicyCollector) Collect(ch chan<- prometheus.Metric) {
-	polled := c.policy.active.PollMetrics()
-
-	collected, err := c.policy.active.CollectMetrics(polled)
-	if err != nil {
-		log.Error("failed to collect metrics: %v", err)
-		return
-	}
-
-	for _, m := range collected {
-		ch <- m
-	}
+	c.policy.active.GetMetrics().Collect(ch)
 }
diff --git a/pkg/resmgr/policy/policy.go b/pkg/resmgr/policy/policy.go
index 2ecdcb29a..5b7901e4f 100644
--- a/pkg/resmgr/policy/policy.go
+++ b/pkg/resmgr/policy/policy.go
@@ -115,12 +115,8 @@ type Backend interface {
 	HandleEvent(*events.Policy) (bool, error)
 	// ExportResourceData provides resource data to export for the container.
 	ExportResourceData(cache.Container) map[string]string
-	// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
-	DescribeMetrics() []*prometheus.Desc
-	// PollMetrics provides policy metrics for monitoring.
-	PollMetrics() Metrics
-	// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
-	CollectMetrics(Metrics) ([]prometheus.Metric, error)
+	// GetMetrics returns the policy-specific metrics collector.
+	GetMetrics() Metrics
 	// GetTopologyZones returns the policy/pool data for 'topology zone' CRDs.
 	GetTopologyZones() []*TopologyZone
 }
@@ -151,7 +147,11 @@ type Policy interface {
 	GetTopologyZones() []*TopologyZone
 }
 
-type Metrics interface{}
+// Metrics is the interface we expect policy-specific metrics to implement.
+type Metrics interface {
+	Describe(chan<- *prometheus.Desc)
+	Collect(chan<- prometheus.Metric)
+}
 
 // Node resource topology resource and attribute names.
 // XXX TODO(klihub): We'll probably need to add similar unified consts