Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[stacked 1/5] metrics: cleanup metrics registration, collection and gathering. #403

Merged
merged 6 commits into from
Nov 20, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion config/crd/bases/config.nri_balloonspolicies.yaml
Original file line number Diff line number Diff line change
@@ -320,11 +320,35 @@ spec:
to expose Prometheus metrics among other things.
example: :8891
type: string
metrics:
default:
enabled:
- policy
description: Metrics defines which metrics to collect.
properties:
enabled:
description: Enabled enables collection for metrics matched
by glob patterns.
example:
- '*'
items:
type: string
type: array
polled:
description: Polled forces polled collection for metrics matched
by glob patterns.
example:
- computationally-expensive-metrics
items:
type: string
type: array
type: object
prometheusExport:
description: PrometheusExport enables exporting /metrics for Prometheus.
type: boolean
reportPeriod:
description: ReportPeriod is the interval between reporting aggregated
default: 30s
description: ReportPeriod is the interval between collecting polled
metrics.
format: duration
type: string
26 changes: 25 additions & 1 deletion config/crd/bases/config.nri_templatepolicies.yaml
Original file line number Diff line number Diff line change
@@ -92,11 +92,35 @@ spec:
to expose Prometheus metrics among other things.
example: :8891
type: string
metrics:
default:
enabled:
- policy
description: Metrics defines which metrics to collect.
properties:
enabled:
description: Enabled enables collection for metrics matched
by glob patterns.
example:
- '*'
items:
type: string
type: array
polled:
description: Polled forces polled collection for metrics matched
by glob patterns.
example:
- computationally-expensive-metrics
items:
type: string
type: array
type: object
prometheusExport:
description: PrometheusExport enables exporting /metrics for Prometheus.
type: boolean
reportPeriod:
description: ReportPeriod is the interval between reporting aggregated
default: 30s
description: ReportPeriod is the interval between collecting polled
metrics.
format: duration
type: string
26 changes: 25 additions & 1 deletion config/crd/bases/config.nri_topologyawarepolicies.yaml
Original file line number Diff line number Diff line change
@@ -119,11 +119,35 @@ spec:
to expose Prometheus metrics among other things.
example: :8891
type: string
metrics:
default:
enabled:
- policy
description: Metrics defines which metrics to collect.
properties:
enabled:
description: Enabled enables collection for metrics matched
by glob patterns.
example:
- '*'
items:
type: string
type: array
polled:
description: Polled forces polled collection for metrics matched
by glob patterns.
example:
- computationally-expensive-metrics
items:
type: string
type: array
type: object
prometheusExport:
description: PrometheusExport enables exporting /metrics for Prometheus.
type: boolean
reportPeriod:
description: ReportPeriod is the interval between reporting aggregated
default: 30s
description: ReportPeriod is the interval between collecting polled
metrics.
format: duration
type: string
26 changes: 25 additions & 1 deletion deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml
Original file line number Diff line number Diff line change
@@ -320,11 +320,35 @@ spec:
to expose Prometheus metrics among other things.
example: :8891
type: string
metrics:
default:
enabled:
- policy
description: Metrics defines which metrics to collect.
properties:
enabled:
description: Enabled enables collection for metrics matched
by glob patterns.
example:
- '*'
items:
type: string
type: array
polled:
description: Polled forces polled collection for metrics matched
by glob patterns.
example:
- computationally-expensive-metrics
items:
type: string
type: array
type: object
prometheusExport:
description: PrometheusExport enables exporting /metrics for Prometheus.
type: boolean
reportPeriod:
description: ReportPeriod is the interval between reporting aggregated
default: 30s
description: ReportPeriod is the interval between collecting polled
metrics.
format: duration
type: string
26 changes: 25 additions & 1 deletion deployment/helm/template/crds/config.nri_templatepolicies.yaml
Original file line number Diff line number Diff line change
@@ -92,11 +92,35 @@ spec:
to expose Prometheus metrics among other things.
example: :8891
type: string
metrics:
default:
enabled:
- policy
description: Metrics defines which metrics to collect.
properties:
enabled:
description: Enabled enables collection for metrics matched
by glob patterns.
example:
- '*'
items:
type: string
type: array
polled:
description: Polled forces polled collection for metrics matched
by glob patterns.
example:
- computationally-expensive-metrics
items:
type: string
type: array
type: object
prometheusExport:
description: PrometheusExport enables exporting /metrics for Prometheus.
type: boolean
reportPeriod:
description: ReportPeriod is the interval between reporting aggregated
default: 30s
description: ReportPeriod is the interval between collecting polled
metrics.
format: duration
type: string
Original file line number Diff line number Diff line change
@@ -119,11 +119,35 @@ spec:
to expose Prometheus metrics among other things.
example: :8891
type: string
metrics:
default:
enabled:
- policy
description: Metrics defines which metrics to collect.
properties:
enabled:
description: Enabled enables collection for metrics matched
by glob patterns.
example:
- '*'
items:
type: string
type: array
polled:
description: Polled forces polled collection for metrics matched
by glob patterns.
example:
- computationally-expensive-metrics
items:
type: string
type: array
type: object
prometheusExport:
description: PrometheusExport enables exporting /metrics for Prometheus.
type: boolean
reportPeriod:
description: ReportPeriod is the interval between reporting aggregated
default: 30s
description: ReportPeriod is the interval between collecting polled
metrics.
format: duration
type: string
10 changes: 5 additions & 5 deletions docs/resource-policy/developers-guide/architecture.md
Original file line number Diff line number Diff line change
@@ -162,11 +162,11 @@ for post-policy enforcement of decisions.

### [Metrics Collector](tree:/pkg/metrics/)

The metrics collector gathers a set of runtime metrics about the containers
running on the node. NRI-RP can be configured to periodically evaluate this
collected data to determine how optimal the current assignment of container
resources is and to attempt a rebalancing/reallocation if it is deemed
both possible and necessary.
The metrics collector gathers a set of runtime metrics about system resources,
containers running on the node, and policy-specific resource assignments and
expose these as Prometheus metrics. This data can be externally evaluated and
used to trigger rebalancing of resources if the NRI-RP implementation provides
a (policy-specific) external interface for this.

### [Policy Implementations](tree:/cmd/plugins)

9 changes: 6 additions & 3 deletions docs/resource-policy/policy/balloons.md
Original file line number Diff line number Diff line change
@@ -264,13 +264,13 @@ Balloons policy parameters:
- `prometheusExport`: if set to True, balloons with their CPUs
and assigned containers are readable through `/metrics` from the
httpEndpoint.
- `reportPeriod`: `/metrics` aggregation interval.
- `reportPeriod`: `/metrics` aggregation interval for polled metrics.

### Example

Example configuration that runs all pods in balloons of 1-4
CPUs. Instrumentation enables reading CPUs and containers in balloons
from `http://localhost:8891/metrics`.
from `http://$localhost_or_pod_IP:8891/metrics`.

```yaml
apiVersion: config.nri/v1alpha1
@@ -413,9 +413,12 @@ nri-resource-policy global config:
instrumentation:
# The balloons policy exports containers running in each balloon,
# and cpusets of balloons. Accessible in command line:
# curl --silent http://localhost:8891/metrics
# curl --silent http://$localhost_or_pod_IP:8891/metrics
HTTPEndpoint: :8891
PrometheusExport: true
metrics:
enabled: # use '*' instead for all available metrics
- policy
logger:
Debug: policy
```
8 changes: 1 addition & 7 deletions go.mod
Original file line number Diff line number Diff line change
@@ -3,7 +3,6 @@ module github.com/containers/nri-plugins
go 1.22

require (
contrib.go.opencensus.io/exporter/prometheus v0.4.2
github.com/containerd/nri v0.6.0
github.com/containerd/otelttrpc v0.0.0-20240305015340-ea5083fda723
github.com/containerd/ttrpc v1.2.3-0.20231030150553-baadfd8e7956
@@ -17,10 +16,8 @@ require (
github.com/pelletier/go-toml/v2 v2.1.0
github.com/prometheus/client_golang v1.17.0
github.com/prometheus/client_model v0.5.0
github.com/prometheus/common v0.44.0
github.com/sirupsen/logrus v1.9.3
github.com/stretchr/testify v1.8.4
go.opencensus.io v0.24.0
go.opentelemetry.io/otel v1.19.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0
@@ -44,8 +41,6 @@ require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful/v3 v3.9.0 // indirect
github.com/evanphx/json-patch v5.6.0+incompatible // indirect
github.com/go-kit/log v0.2.1 // indirect
github.com/go-logfmt/logfmt v0.6.0 // indirect
github.com/go-logr/logr v1.4.1 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
@@ -54,7 +49,6 @@ require (
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
github.com/godbus/dbus/v5 v5.0.4 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
@@ -73,8 +67,8 @@ require (
github.com/opencontainers/runtime-spec v1.1.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/common v0.44.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/prometheus/statsd_exporter v0.24.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0 // indirect
go.opentelemetry.io/otel/metric v1.19.0 // indirect
90 changes: 0 additions & 90 deletions go.sum

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion pkg/apis/config/v1alpha1/instrumentation/config.go
Original file line number Diff line number Diff line change
@@ -15,10 +15,12 @@
package instrumentation

import (
"github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/metrics"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Config provides runtime configuration for instrumentation.
// +k8s:deepcopy-gen=true
type Config struct {
// SamplingRatePerMillion is the number of samples to collect per million spans.
// +optional
@@ -33,9 +35,10 @@ type Config struct {
// +optional
// +kubebuilder:example="otlp-http://localhost:4318"
TracingCollector string `json:"tracingCollector,omitempty"`
// ReportPeriod is the interval between reporting aggregated metrics.
// ReportPeriod is the interval between collecting polled metrics.
// +optional
// +kubebuilder:validation:Format="duration"
// +kubebuilder:default="30s"
ReportPeriod metav1.Duration `json:"reportPeriod,omitempty"`
// HTTPEndpoint is the address our HTTP server listens on. This endpoint is used
// to expose Prometheus metrics among other things.
@@ -45,4 +48,7 @@ type Config struct {
// PrometheusExport enables exporting /metrics for Prometheus.
// +optional
PrometheusExport bool `json:"prometheusExport,omitempty"`
// Metrics defines which metrics to collect.
// +kubebuilder:default={"enabled": {"policy"}}
Metrics *metrics.Config `json:"metrics,omitempty"`
}
44 changes: 44 additions & 0 deletions pkg/apis/config/v1alpha1/instrumentation/zz_generated.deepcopy.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020 Intel Corporation. All Rights Reserved.
// Copyright The NRI Plugins Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -14,21 +14,15 @@

package metrics

import (
model "github.com/prometheus/client_model/go"
)

// Gather is our prometheus.Gatherer interface for proxying metrics.
func (m *Metrics) Gather() ([]*model.MetricFamily, error) {
m.Lock()
pend := m.pend
m.Unlock()

if pend == nil {
log.Debug("no data to proxy to prometheus...")
} else {
log.Debug("proxying data to prometheus...")
}

return pend, nil
// Config provides runtime configuration for metrics collection.
// +k8s:deepcopy-gen=true
fmuyassarov marked this conversation as resolved.
Show resolved Hide resolved
type Config struct {
// Enabled enables collection for metrics matched by glob patterns.
// +optional
// +kubebuilder:example={"*"}
Enabled []string `json:"enabled,omitempty"`
// Polled forces polled collection for metrics matched by glob patterns.
// +optional
// +kubebuilder:example={"computationally-expensive-metrics"}
Polled []string `json:"polled,omitempty"`
}
46 changes: 46 additions & 0 deletions pkg/apis/config/v1alpha1/metrics/zz_generated.deepcopy.go
8 changes: 4 additions & 4 deletions pkg/apis/config/v1alpha1/zz_generated.deepcopy.go
22 changes: 8 additions & 14 deletions pkg/cgroupstats/collector.go
Original file line number Diff line number Diff line change
@@ -25,7 +25,6 @@ import (

"github.com/containers/nri-plugins/pkg/cgroups"
logger "github.com/containers/nri-plugins/pkg/log"
"github.com/containers/nri-plugins/pkg/metrics"
"github.com/prometheus/client_golang/prometheus"
)

@@ -42,7 +41,7 @@ const (

var descriptors = [numDescriptors]*prometheus.Desc{
numaStatsDesc: prometheus.NewDesc(
"cgroup_numa_stats",
"numa_stats",
"NUMA statistics for a given container and pod.",
[]string{
// cgroup path
@@ -54,22 +53,22 @@ var descriptors = [numDescriptors]*prometheus.Desc{
}, nil,
),
memoryUsageDesc: prometheus.NewDesc(
"cgroup_memory_usage",
"memory_usage",
"Memory usage statistics for a given container and pod.",
[]string{
"container_id",
"type",
}, nil,
),
memoryMigrateDesc: prometheus.NewDesc(
"cgroup_memory_migrate",
"memory_migrate",
"Memory migrate status for a given container and pod.",
[]string{
"container_id",
}, nil,
),
cpuAcctUsageDesc: prometheus.NewDesc(
"cgroup_cpu_acct",
"cpu_acct",
"CPU accounting for a given container and pod.",
[]string{
"container_id",
@@ -79,7 +78,7 @@ var descriptors = [numDescriptors]*prometheus.Desc{
}, nil,
),
hugeTlbUsageDesc: prometheus.NewDesc(
"cgroup_hugetlb_usage",
"hugetlb_usage",
"Hugepages usage for a given container and pod.",
[]string{
"container_id",
@@ -88,7 +87,7 @@ var descriptors = [numDescriptors]*prometheus.Desc{
}, nil,
),
blkioDeviceUsageDesc: prometheus.NewDesc(
"cgroup_blkio_device_usage",
"blkio_device_usage",
"Blkio Device bytes usage for a given container and pod.",
[]string{
"container_id",
@@ -114,8 +113,8 @@ type collector struct {
}

// NewCollector creates new Prometheus collector
func NewCollector() (prometheus.Collector, error) {
return &collector{}, nil
func NewCollector() prometheus.Collector {
return &collector{}
}

// Describe implements prometheus.Collector interface
@@ -405,9 +404,4 @@ func (c collector) Collect(ch chan<- prometheus.Metric) {
func init() {
flag.StringVar(&cgroupRoot, "cgroup-path", cgroupRoot,
"Path to cgroup filesystem mountpoint")

err := metrics.RegisterCollector("cgroupstats", NewCollector)
if err != nil {
log.Error("failed register cgroupstats collector: %v", err)
}
}
39 changes: 39 additions & 0 deletions pkg/cgroupstats/register/collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright The NRI Plugins Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collectors

import (
"github.com/containers/nri-plugins/pkg/cgroupstats"
logger "github.com/containers/nri-plugins/pkg/log"
"github.com/containers/nri-plugins/pkg/metrics"
)

var (
log = logger.Get("collector")
)

func init() {
err := metrics.Register(
"stats",
cgroupstats.NewCollector(),
metrics.WithGroup("cgroup"),
metrics.WithCollectorOptions(
metrics.WithoutNamespace(),
),
)
if err != nil {
log.Error("failed to register cgroup/stats collector: %v", err)
}
}
14 changes: 4 additions & 10 deletions pkg/instrumentation/instrumentation.go
Original file line number Diff line number Diff line change
@@ -18,8 +18,6 @@ import (
"fmt"
"sync"

"github.com/prometheus/client_golang/prometheus"

cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/instrumentation"
"github.com/containers/nri-plugins/pkg/http"
"github.com/containers/nri-plugins/pkg/instrumentation/metrics"
@@ -52,11 +50,6 @@ var (
Attribute = tracing.Attribute
)

// RegisterGatherer registers a prometheus metrics gatherer.
func RegisterGatherer(g prometheus.Gatherer) {
metrics.RegisterGatherer(g)
}

// HTTPServer returns our HTTP server.
func HTTPServer() *http.Server {
return srv
@@ -94,7 +87,7 @@ func Restart() error {

err := start()
if err != nil {
log.Error("failed to start tracing: %v", err)
log.Error("failed to start instrumentation: %v", err)
}

return err
@@ -122,9 +115,10 @@ func start() error {

if err := metrics.Start(
srv.GetMux(),
metrics.WithNamespace("nri"),
metrics.WithExporterDisabled(!cfg.PrometheusExport),
metrics.WithServiceName(ServiceName),
metrics.WithPeriod(cfg.ReportPeriod.Duration),
metrics.WithReportPeriod(cfg.ReportPeriod.Duration),
metrics.WithMetrics(cfg.Metrics),
); err != nil {
return fmt.Errorf("failed to start metrics: %v", err)
}
142 changes: 135 additions & 7 deletions pkg/instrumentation/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -15,14 +15,142 @@
package metrics

import (
oc "github.com/containers/nri-plugins/pkg/instrumentation/metrics/opencensus"
"fmt"
"slices"
"time"

"github.com/prometheus/client_golang/prometheus/promhttp"

"github.com/containers/nri-plugins/pkg/http"
logger "github.com/containers/nri-plugins/pkg/log"
"github.com/containers/nri-plugins/pkg/metrics"

config "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/metrics"
)

type (
Option func() error
)

var (
RegisterGatherer = oc.RegisterGatherer
WithExporterDisabled = oc.WithExporterDisabled
WithPeriod = oc.WithPeriod
WithServiceName = oc.WithServiceName
Start = oc.Start
Stop = oc.Stop
disabled bool
namespace string
enabled []string
polled []string
reportPeriod time.Duration
mux *http.ServeMux
gatherer *metrics.Gatherer
log = logger.Get("metrics")
)

func WithExporterDisabled(v bool) Option {
return func() error {
disabled = v
return nil
}
}

func WithNamespace(v string) Option {
return func() error {
namespace = v
return nil
}
}

func WithReportPeriod(v time.Duration) Option {
return func() error {
reportPeriod = v
return nil
}
}

func WithMetrics(cfg *config.Config) Option {
return func() error {
if cfg != nil {
enabled = slices.Clone(cfg.Enabled)
polled = slices.Clone(cfg.Polled)
} else {
enabled = nil
polled = nil
}
return nil
}
}

func Start(m *http.ServeMux, options ...Option) error {
Stop()

for _, opt := range options {
if err := opt(); err != nil {
return err
}
}

if m == nil {
log.Info("no mux provided, metrics exporter disabled")
return nil
}

if disabled {
log.Info("metrics exporter disabled")
return nil
}

log.Info("starting metrics exporter...")

g, err := metrics.NewGatherer(
metrics.WithNamespace("nri"),
metrics.WithPollInterval(reportPeriod),
metrics.WithMetrics(enabled, polled),
)
if err != nil {
return fmt.Errorf("failed to create metrics gatherer: %v", err)
}

gatherer = g

handlerOpts := promhttp.HandlerOpts{
ErrorLog: log,
ErrorHandling: promhttp.ContinueOnError,
}
m.Handle("/metrics", promhttp.HandlerFor(g, handlerOpts))

mux = m

return nil
}

func Stop() {
if mux == nil {
return
}

mux.Unregister("/metrics")
mux = nil
gatherer.Stop()
gatherer = nil
}

func Block() *MetricsBlock {
return newMetricsBlock(gatherer)
}

type MetricsBlock struct {
g *metrics.Gatherer
}

func newMetricsBlock(g *metrics.Gatherer) *MetricsBlock {
if g == nil {
return nil
}
g.Block()
return &MetricsBlock{g: g}
}

func (b *MetricsBlock) Done() {
if b == nil || b.g == nil {
return
}
b.g.Unblock()
b.g = nil
}
164 changes: 0 additions & 164 deletions pkg/instrumentation/metrics/opencensus/metrics.go

This file was deleted.

7 changes: 7 additions & 0 deletions pkg/log/log.go
Original file line number Diff line number Diff line change
@@ -77,6 +77,9 @@ type Logger interface {
// Fatal formats and emits an error message and os.Exit()'s with status 1.
Fatal(format string, args ...interface{})

// Println to mimic minimal stdlin log.Logger interface.
Println(v ...any)

// DebugBlock formats and emits a multiline debug message.
DebugBlock(prefix string, format string, args ...interface{})
// InfoBlock formats and emits a multiline information message.
@@ -410,6 +413,10 @@ func (l logger) Panic(format string, args ...interface{}) {
panic(msg)
}

func (l logger) Println(a ...any) {
l.Info("%s", fmt.Sprintln(a...))
}

func (l logger) DebugBlock(prefix string, format string, args ...interface{}) {
if l.DebugEnabled() {
l.block(LevelDebug, prefix, format, args...)
67 changes: 67 additions & 0 deletions pkg/metrics/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright The NRI Plugins Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metrics

// The metrics package provides a simple framework for collecting and
// exporting metrics. It is implemented as a set of simple wrappers around
// prometheus types. These help enforce metrics namespacing, allow metrics
// grouping, provide dynamic runtime configurability, and allow for periodic
// collection of computationally expensive metrics which would be too costly
// to calculate each time they are externally requested.
//
// Simple Usage
//
//package main
//
//import (
// "log"
// "net/http"
// "os"
//
// "github.com/containers/nri-plugins/pkg/metrics"
// "github.com/prometheus/client_golang/prometheus/collectors"
// "github.com/prometheus/client_golang/prometheus/promhttp"
//)
//
//func main() {
// metrics.MustRegister(
// "build",
// collectors.NewBuildInfoCollector(),
// metrics.WithGroup("group1"),
// )
// metrics.MustRegister(
// "golang",
// collectors.NewGoCollector(),
// metrics.WithGroup("group1"),
// )
// metrics.MustRegister(
// "process",
// collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
// metrics.WithGroup("group2"),
// )
//
// enabled = []string{"*"}
// if len(os.Args) > 1 {
// enabled = os.Args[1:]
// }
//
// g, err := metrics.NewGatherer(metrics.WithMetrics(enabled, nil))
// if err != nil {
// log.Fatal(err)
// }
//
// http.Handle("/metrics", promhttp.HandlerFor(g, promhttp.HandlerOpts{}))
// log.Fatal(http.ListenAndServe(":8891", nil))
//}
702 changes: 674 additions & 28 deletions pkg/metrics/metrics.go

Large diffs are not rendered by default.

428 changes: 428 additions & 0 deletions pkg/metrics/metrics_test.go

Large diffs are not rendered by default.

6 changes: 0 additions & 6 deletions pkg/metrics/register/register_metrics.go

This file was deleted.

22 changes: 0 additions & 22 deletions pkg/resmgr/events.go
Original file line number Diff line number Diff line change
@@ -17,42 +17,21 @@ package resmgr
import (
logger "github.com/containers/nri-plugins/pkg/log"
"github.com/containers/nri-plugins/pkg/resmgr/cache"
"github.com/containers/nri-plugins/pkg/resmgr/metrics"
)

// Our logger instance for events.
var evtlog = logger.NewLogger("events")

// setupEventProcessing sets up event and metrics processing.
func (m *resmgr) setupEventProcessing() error {
var err error

m.events = make(chan interface{}, 8)
m.stop = make(chan interface{})
options := metrics.Options{
PollInterval: opt.MetricsTimer,
}
if m.metrics, err = metrics.NewMetrics(options); err != nil {
return resmgrError("failed to create metrics (pre)processor: %v", err)
}

return nil
}

func (m *resmgr) startMetricsProcessing() error {
if err := m.metrics.Start(); err != nil {
return resmgrError("failed to start metrics (pre)processor: %v", err)
}

return nil
}

// startEventProcessing starts event and metrics processing.
func (m *resmgr) startEventProcessing() error {
if err := m.startMetricsProcessing(); err != nil {
return resmgrError("failed to start metrics (pre)processor: %v", err)
}

stop := m.stop
go func() {
for {
@@ -73,7 +52,6 @@ func (m *resmgr) startEventProcessing() error {
func (m *resmgr) stopEventProcessing() {
if m.stop != nil {
close(m.stop)
m.metrics.Stop()
m.stop = nil
}
}
22 changes: 9 additions & 13 deletions pkg/resmgr/flags.go
Original file line number Diff line number Diff line change
@@ -29,18 +29,13 @@ const (

// Options captures our command line parameters.
type options struct {
HostRoot string
StateDir string
PidFile string
ResctrlPath string
FallbackConfig string
ForceConfig string
ForceConfigSignal string
MetricsTimer time.Duration
RebalanceTimer time.Duration
NriPluginName string
NriPluginIdx string
NriSocket string
HostRoot string
StateDir string
PidFile string
MetricsTimer time.Duration
NriPluginName string
NriPluginIdx string
NriSocket string
}

// ResourceManager command line options.
@@ -60,7 +55,8 @@ func init() {
flag.StringVar(&opt.PidFile, "pid-file", pidfile.GetPath(),
"PID file to write daemon PID to")
flag.DurationVar(&opt.MetricsTimer, "metrics-interval", 0,
"Interval for polling/gathering runtime metrics data. Use 'disable' for disabling.")
"Obsolete way to set interval for polling/gathering runtime metrics data.\n"+
"Use the instrumentation section of the CR-based configuration interface instead.")
flag.StringVar(&opt.StateDir, "state-dir", "/var/lib/nri-resource-policy",
"Permanent storage directory path for the resource manager to store its state in.")
}
149 changes: 0 additions & 149 deletions pkg/resmgr/metrics/metrics.go

This file was deleted.

25 changes: 25 additions & 0 deletions pkg/resmgr/nri.go
Original file line number Diff line number Diff line change
@@ -19,6 +19,7 @@ import (
"fmt"
"os"

"github.com/containers/nri-plugins/pkg/instrumentation/metrics"
"github.com/containers/nri-plugins/pkg/instrumentation/tracing"
logger "github.com/containers/nri-plugins/pkg/log"
"github.com/containers/nri-plugins/pkg/resmgr/cache"
@@ -256,6 +257,9 @@ func (p *nriPlugin) Synchronize(ctx context.Context, pods []*api.PodSandbox, con
p.dump(out, event, updates, retErr)
}()

b := metrics.Block()
defer b.Done()

m := p.resmgr

allocated, released, err := p.syncWithNRI(pods, containers)
@@ -294,6 +298,8 @@ func (p *nriPlugin) RunPodSandbox(ctx context.Context, pod *api.PodSandbox) (ret
m := p.resmgr
m.Lock()
defer m.Unlock()
b := metrics.Block()
defer b.Done()

m.cache.InsertPod(pod)
return nil
@@ -313,6 +319,10 @@ func (p *nriPlugin) StopPodSandbox(ctx context.Context, podSandbox *api.PodSandb

m := p.resmgr

// TODO(klihub): shouldn't we m.Lock()/defer m.Unlock() here?
b := metrics.Block()
defer b.Done()

released := []cache.Container{}
pod, _ := m.cache.LookupPod(podSandbox.GetId())

@@ -366,6 +376,8 @@ func (p *nriPlugin) RemovePodSandbox(ctx context.Context, podSandbox *api.PodSan

m.Lock()
defer m.Unlock()
b := metrics.Block()
defer b.Done()

m.cache.DeletePod(podSandbox.GetId())
return nil
@@ -391,6 +403,8 @@ func (p *nriPlugin) CreateContainer(ctx context.Context, podSandbox *api.PodSand
m := p.resmgr
m.Lock()
defer m.Unlock()
b := metrics.Block()
defer b.Done()

c, err := m.cache.InsertContainer(container)
if err != nil {
@@ -458,6 +472,8 @@ func (p *nriPlugin) StartContainer(ctx context.Context, pod *api.PodSandbox, con
m := p.resmgr
m.Lock()
defer m.Unlock()
b := metrics.Block()
defer b.Done()

c, ok := m.cache.LookupContainer(container.Id)
if !ok {
@@ -504,6 +520,8 @@ func (p *nriPlugin) UpdateContainer(ctx context.Context, pod *api.PodSandbox, co
m := p.resmgr
m.Lock()
defer m.Unlock()
b := metrics.Block()
defer b.Done()

c, ok := m.cache.LookupContainer(container.Id)
if !ok {
@@ -566,6 +584,8 @@ func (p *nriPlugin) StopContainer(ctx context.Context, pod *api.PodSandbox, cont
m := p.resmgr
m.Lock()
defer m.Unlock()
b := metrics.Block()
defer b.Done()

c, ok := m.cache.LookupContainer(container.Id)
if !ok {
@@ -604,6 +624,8 @@ func (p *nriPlugin) RemoveContainer(ctx context.Context, pod *api.PodSandbox, co
m := p.resmgr
m.Lock()
defer m.Unlock()
b := metrics.Block()
defer b.Done()

m.cache.DeleteContainer(container.Id)
return nil
@@ -612,6 +634,9 @@ func (p *nriPlugin) RemoveContainer(ctx context.Context, pod *api.PodSandbox, co
func (p *nriPlugin) updateContainers() (retErr error) {
// Notes: must be called with p.resmgr lock held.

b := metrics.Block()
defer b.Done()

updates := p.getPendingUpdates(nil)

event := UpdateContainers
55 changes: 55 additions & 0 deletions pkg/resmgr/policy/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Copyright The NRI Plugins Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package policy

import (
"github.com/prometheus/client_golang/prometheus"

"github.com/containers/nri-plugins/pkg/metrics"
)

type PolicyCollector struct {
policy *policy
}

func (p *policy) newPolicyCollector() *PolicyCollector {
return &PolicyCollector{
policy: p,
}
}

func (c *PolicyCollector) register() error {
return metrics.Register(c.policy.ActivePolicy(), c, metrics.WithGroup("policy"))
}

func (c *PolicyCollector) Describe(ch chan<- *prometheus.Desc) {
for _, d := range c.policy.active.DescribeMetrics() {
ch <- d
}
}

func (c *PolicyCollector) Collect(ch chan<- prometheus.Metric) {
polled := c.policy.active.PollMetrics()

collected, err := c.policy.active.CollectMetrics(polled)
if err != nil {
log.Error("failed to collect metrics: %v", err)
return
}

for _, m := range collected {
ch <- m
}
}
56 changes: 22 additions & 34 deletions pkg/resmgr/policy/policy.go
Original file line number Diff line number Diff line change
@@ -147,12 +147,6 @@ type Policy interface {
HandleEvent(*events.Policy) (bool, error)
// ExportResourceData exports/updates resource data for the container.
ExportResourceData(cache.Container)
// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
DescribeMetrics() []*prometheus.Desc
// PollMetrics provides policy metrics for monitoring.
PollMetrics() Metrics
// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
CollectMetrics(Metrics) ([]prometheus.Metric, error)
// GetTopologyZones returns the policy/pool data for 'topology zone' CRDs.
GetTopologyZones() []*TopologyZone
}
@@ -204,11 +198,12 @@ type ZoneAttribute struct {

// Policy instance/state.
type policy struct {
options Options // policy options
cache cache.Cache // system state cache
active Backend // our active backend
system system.System // system/HW/topology info
sendEvent SendEventFn // function to send event up to the resource manager
options Options // policy options
cache cache.Cache // system state cache
active Backend // our active backend
system system.System // system/HW/topology info
sendEvent SendEventFn // function to send event up to the resource manager
pcollect *PolicyCollector // policy metrics collector
}

// backend is a registered Backend.
@@ -225,11 +220,25 @@ var log logger.Logger = logger.NewLogger("policy")
func NewPolicy(backend Backend, cache cache.Cache, o *Options) (Policy, error) {
log.Info("creating '%s' policy...", backend.Name())

return &policy{
p := &policy{
cache: cache,
options: *o,
active: backend,
}, nil
}

sys, err := system.DiscoverSystem()
if err != nil {
return nil, policyError("failed to discover system topology: %v", err)
}
p.system = sys

pcollect := p.newPolicyCollector()
if err := pcollect.register(); err != nil {
return nil, policyError("failed to register policy collector: %v", err)
}
p.pcollect = pcollect

return p, nil
}

func (p *policy) ActivePolicy() string {
@@ -241,12 +250,6 @@ func (p *policy) ActivePolicy() string {

// Start starts up policy, preparing it for serving requests.
func (p *policy) Start(cfg interface{}) error {
sys, err := system.DiscoverSystem()
if err != nil {
return policyError("failed to discover system topology: %v", err)
}
p.system = sys

log.Info("activating '%s' policy...", p.active.Name())

if err := p.active.Setup(&BackendOptions{
@@ -315,21 +318,6 @@ func (p *policy) ExportResourceData(c cache.Container) {
p.cache.WriteFile(c.GetID(), ExportedResources, 0644, buf.Bytes())
}

// PollMetrics provides policy metrics for monitoring.
func (p *policy) PollMetrics() Metrics {
return p.active.PollMetrics()
}

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (p *policy) DescribeMetrics() []*prometheus.Desc {
return p.active.DescribeMetrics()
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *policy) CollectMetrics(m Metrics) ([]prometheus.Metric, error) {
return p.active.CollectMetrics(m)
}

// GetTopologyZones returns the policy/pool data for 'topology zone' CRDs.
func (p *policy) GetTopologyZones() []*TopologyZone {
return p.active.GetTopologyZones()
62 changes: 0 additions & 62 deletions pkg/resmgr/policycollector/collector.go

This file was deleted.

34 changes: 13 additions & 21 deletions pkg/resmgr/resource-manager.go
Original file line number Diff line number Diff line change
@@ -27,16 +27,13 @@ import (
"github.com/containers/nri-plugins/pkg/pidfile"
"github.com/containers/nri-plugins/pkg/resmgr/cache"
"github.com/containers/nri-plugins/pkg/resmgr/control"
"github.com/containers/nri-plugins/pkg/resmgr/metrics"
"github.com/containers/nri-plugins/pkg/resmgr/policy"
"github.com/containers/nri-plugins/pkg/sysfs"
"github.com/containers/nri-plugins/pkg/topology"
goresctrlpath "github.com/intel/goresctrl/pkg/path"
"sigs.k8s.io/yaml"

cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1"

policyCollector "github.com/containers/nri-plugins/pkg/resmgr/policycollector"
)

// ResourceManager is the interface we expose for controlling the CRI resource manager.
@@ -59,7 +56,6 @@ type resmgr struct {
cache cache.Cache // cached state
policy policy.Policy // resource manager policy
control control.Control // policy controllers/enforcement
metrics *metrics.Metrics // metrics collector/pre-processor
events chan interface{} // channel for delivering events
stop chan interface{} // channel for signalling shutdown to goroutines
nri *nriPlugin // NRI plugins, if we're running as such
@@ -84,6 +80,12 @@ func NewResourceManager(backend policy.Backend, agt *agent.Agent) (ResourceManag
goresctrlpath.SetPrefix(opt.HostRoot)
}

if opt.MetricsTimer != 0 {
log.Warn("WARNING: obsolete metrics-interval flag given, ignoring...")
log.Warn("WARNING: use the CR-based configuration interface instead")
log.Warn("WARNING: this flag will be removed in a future release")
}

m := &resmgr{
agent: agt,
}
@@ -103,10 +105,6 @@ func NewResourceManager(backend policy.Backend, agt *agent.Agent) (ResourceManag
return nil, err
}

if err := m.registerPolicyMetricsCollector(); err != nil {
return nil, err
}

if err := m.setupEventProcessing(); err != nil {
return nil, err
}
@@ -174,12 +172,15 @@ func (m *resmgr) start(cfg cfgapi.ResmgrConfig) error {

mCfg := cfg.CommonConfig()
logger.Configure(&mCfg.Log)
instrumentation.Reconfigure(&mCfg.Instrumentation)

if err := m.policy.Start(m.cfg.PolicyConfig()); err != nil {
return err
}

if err := instrumentation.Reconfigure(&mCfg.Instrumentation); err != nil {
return err
}

if err := m.nri.start(); err != nil {
return err
}
@@ -280,23 +281,14 @@ func (m *resmgr) updateTopologyZones() {
}
}

// registerPolicyMetricsCollector registers policy metrics collector·
func (m *resmgr) registerPolicyMetricsCollector() error {
pc := &policyCollector.PolicyCollector{}
pc.SetPolicy(m.policy)
if pc.HasPolicySpecificMetrics() {
return pc.RegisterPolicyMetricsCollector()
}
log.Info("%s policy has no policy-specific metrics.", m.policy.ActivePolicy())
return nil
}

func (m *resmgr) reconfigure(cfg cfgapi.ResmgrConfig) error {
apply := func(cfg cfgapi.ResmgrConfig) error {
mCfg := cfg.CommonConfig()

logger.Configure(&mCfg.Log)
instrumentation.Reconfigure(&mCfg.Instrumentation)
if err := instrumentation.Reconfigure(&mCfg.Instrumentation); err != nil {
return err
}
m.control.StartStopControllers(&mCfg.Control)

err := m.policy.Reconfigure(cfg.PolicyConfig())