diff --git a/cmd/plugins/topology-aware/policy/metrics.go b/cmd/plugins/topology-aware/policy/metrics.go
new file mode 100644
index 000000000..78bf76f96
--- /dev/null
+++ b/cmd/plugins/topology-aware/policy/metrics.go
@@ -0,0 +1,329 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package topologyaware
+
+import (
+	"slices"
+	"strings"
+
+	libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
+	policyapi "github.com/containers/nri-plugins/pkg/resmgr/policy"
+	"github.com/containers/nri-plugins/pkg/utils/cpuset"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+type TopologyAwareMetrics struct {
+	ZoneNames  []string
+	Zones      map[string]*Zone
+	Metrics    Metrics
+	registered bool
+}
+
+type Zone struct {
+	Name                 string
+	Cpus                 cpuset.CPUSet
+	Mems                 libmem.NodeMask
+	SharedPool           cpuset.CPUSet
+	SharedAssigned       int
+	SharedAvailable      int
+	MemCapacity          int64
+	MemAssigned          int64
+	MemAvailable         int64
+	ContainerCount       int
+	SharedContainerCount int
+}
+
+type Metrics struct {
+	zone                 *prometheus.GaugeVec
+	cpuSharedCapacity    *prometheus.GaugeVec
+	cpuSharedAssigned    *prometheus.GaugeVec
+	cpuSharedAvailable   *prometheus.GaugeVec
+	memCapacity          *prometheus.GaugeVec
+	memAssigned          *prometheus.GaugeVec
+	memAvailable         *prometheus.GaugeVec
+	containerCount       *prometheus.GaugeVec
+	sharedContainerCount *prometheus.GaugeVec
+}
+
+func NewTopologyAwareMetrics() *TopologyAwareMetrics {
+	m := &TopologyAwareMetrics{
+		Zones: make(map[string]*Zone),
+		Metrics: Metrics{
+			zone: prometheus.NewGaugeVec(
+				prometheus.GaugeOpts{
+					Name: "topologyaware_policy_zone_cpu_capacity",
+					Help: "A topology zone of CPUs.",
+				},
+				[]string{
+					"zone",
+					"cpus",
+					"mems",
+				},
+			),
+			cpuSharedCapacity: prometheus.NewGaugeVec(
+				prometheus.GaugeOpts{
+					Name: "topologyaware_policy_zone_cpu_shared_capacity",
+					Help: "Capacity of shared CPU pool of a topology zone.",
+				},
+				[]string{
+					"zone",
+					"cpus",
+				},
+			),
+			cpuSharedAssigned: prometheus.NewGaugeVec(
+				prometheus.GaugeOpts{
+					Name: "topologyaware_policy_zone_cpu_shared_assigned",
+					Help: "Assigned amount of shared CPU pool of a topology zone.",
+				},
+				[]string{
+					"zone",
+					"cpus",
+				},
+			),
+			cpuSharedAvailable: prometheus.NewGaugeVec(
+				prometheus.GaugeOpts{
+					Name: "topologyaware_policy_zone_cpu_shared_available",
+					Help: "Available amount of shared CPU pool of a topology zone.",
+				},
+				[]string{
+					"zone",
+					"cpus",
+				},
+			),
+			memCapacity: prometheus.NewGaugeVec(
+				prometheus.GaugeOpts{
+					Name: "topologyaware_zone_mem_capacity",
+					Help: "Memory capacity of a topology zone.",
+				},
+				[]string{
+					"zone",
+					"mems",
+				},
+			),
+			memAssigned: prometheus.NewGaugeVec(
+				prometheus.GaugeOpts{
+					Name: "topologyaware_zone_mem_assigned",
+					Help: "Amount of assigned memory of a topology zone.",
+				},
+				[]string{
+					"zone",
+					"mems",
+				},
+			),
+			memAvailable: prometheus.NewGaugeVec(
+				prometheus.GaugeOpts{
+					Name: "topologyaware_zone_mem_available",
+					Help: "Amount of available memory of a topology zone.",
+				},
+				[]string{
+					"zone",
+					"mems",
+				},
+			),
+			containerCount: prometheus.NewGaugeVec(
+				prometheus.GaugeOpts{
+					Name: "topologyaware_zone_container_count",
+					Help: "Number of containers assigned to a topology zone.",
+				},
+				[]string{
+					"zone",
+				},
+			),
+			sharedContainerCount: prometheus.NewGaugeVec(
+				prometheus.GaugeOpts{
+					Name: "topologyaware_zone_shared_container_count",
+					Help: "Number of containers in the shared CPU pool of a topology zone.",
+				},
+				[]string{
+					"zone",
+				},
+			),
+		},
+	}
+
+	return m
+}
+
+// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
+func (p *policy) DescribeMetrics() []*prometheus.Desc {
+	if p.metrics == nil {
+		p.metrics = NewTopologyAwareMetrics()
+	}
+
+	m := p.metrics
+
+	ch := make(chan *prometheus.Desc)
+	go func(ch chan *prometheus.Desc) {
+		m.Metrics.zone.Describe(ch)
+		m.Metrics.cpuSharedCapacity.Describe(ch)
+		m.Metrics.cpuSharedAssigned.Describe(ch)
+		m.Metrics.cpuSharedAvailable.Describe(ch)
+		m.Metrics.memCapacity.Describe(ch)
+		m.Metrics.memAssigned.Describe(ch)
+		m.Metrics.memAvailable.Describe(ch)
+		m.Metrics.containerCount.Describe(ch)
+		m.Metrics.sharedContainerCount.Describe(ch)
+		close(ch)
+	}(ch)
+
+	descriptors := []*prometheus.Desc{}
+	for d := range ch {
+		log.Info(" described metric %s", d.String())
+		descriptors = append(descriptors, d)
+	}
+
+	return descriptors
+}
+
+// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
+func (p *policy) CollectMetrics(pm policyapi.Metrics) ([]prometheus.Metric, error) {
+	m := p.metrics
+
+	ch := make(chan prometheus.Metric)
+	go func(ch chan<- prometheus.Metric) {
+		m.Metrics.zone.Collect(ch)
+		m.Metrics.cpuSharedCapacity.Collect(ch)
+		m.Metrics.cpuSharedAssigned.Collect(ch)
+		m.Metrics.cpuSharedAvailable.Collect(ch)
+		m.Metrics.memCapacity.Collect(ch)
+		m.Metrics.memAssigned.Collect(ch)
+		m.Metrics.memAvailable.Collect(ch)
+		m.Metrics.containerCount.Collect(ch)
+		m.Metrics.sharedContainerCount.Collect(ch)
+		close(ch)
+	}(ch)
+
+	metrics := []prometheus.Metric{}
+	for m := range ch {
+		log.Info(" collected metric %s", m.Desc().String())
+		metrics = append(metrics, m)
+	}
+
+	return metrics, nil
+}
+
+// PollMetrics provides policy metrics for monitoring.
+func (p *policy) PollMetrics() policyapi.Metrics {
+	if p.metrics == nil {
+		p.metrics = NewTopologyAwareMetrics()
+	}
+
+	m := p.metrics
+
+	for _, pool := range p.pools {
+		var (
+			name       = pool.Name()
+			zone       = m.Zones[name]
+			free       = pool.FreeSupply().(*supply)
+			mems       = libmem.NewNodeMask(pool.GetMemset(memoryAll).Members()...)
+			sharedPool = free.SharableCPUs().Union(free.ReservedCPUs())
+			containers = 0
+			sharedctrs = 0
+		)
+
+		if zone == nil {
+			var (
+				capa = pool.GetSupply().(*supply)
+				cpus = capa.ReservedCPUs().Union(capa.IsolatedCPUs()).Union(capa.SharableCPUs())
+			)
+			zone = &Zone{
+				Name:        name,
+				Cpus:        cpus,
+				Mems:        mems,
+				MemCapacity: p.memAllocator.ZoneCapacity(mems),
+			}
+
+			m.Zones[name] = zone
+			m.ZoneNames = append(m.ZoneNames, name)
+
+			m.Metrics.zone.WithLabelValues(
+				zone.Name,
+				zone.Cpus.String(),
+				zone.Mems.String(),
+			).Set(float64(zone.Cpus.Size()))
+
+			m.Metrics.memCapacity.WithLabelValues(
+				zone.Name,
+				zone.Mems.String(),
+			).Set(float64(zone.MemCapacity))
+
+			log.Info(" created metrics for zone %s", name)
+		}
+
+		log.Debug("polling metrics for zone %s...", name)
+
+		for _, g := range p.allocations.grants {
+			if g.GetCPUNode().Name() == pool.Name() {
+				containers++
+				if g.ReservedPortion() != 0 || g.CPUPortion() != 0 {
+					sharedctrs++
+				}
+			}
+		}
+
+		zone.SharedPool = sharedPool
+		zone.SharedAssigned = free.GrantedReserved() + free.GrantedShared()
+		zone.SharedAvailable = free.AllocatableSharedCPU()
+		zone.MemAssigned = p.memAllocator.ZoneUsage(mems)
+		zone.MemAvailable = p.memAllocator.ZoneAvailable(mems)
+		zone.ContainerCount = containers
+		zone.SharedContainerCount = sharedctrs
+
+		m.Metrics.cpuSharedCapacity.WithLabelValues(
+			zone.Name,
+			zone.SharedPool.String(),
+		).Set(float64(zone.SharedPool.Size()))
+
+		m.Metrics.cpuSharedAssigned.WithLabelValues(
+			zone.Name,
+			zone.SharedPool.String(),
+		).Set(float64(zone.SharedAssigned) / 1000.0)
+
+		m.Metrics.cpuSharedAvailable.WithLabelValues(
+			zone.Name,
+			zone.SharedPool.String(),
+		).Set(float64(zone.SharedAvailable) / 1000.0)
+
+		m.Metrics.memAssigned.WithLabelValues(
+			zone.Name,
+			zone.Mems.MemsetString(),
+		).Set(float64(zone.MemAssigned))
+
+		m.Metrics.memAvailable.WithLabelValues(
+			zone.Name,
+			zone.Mems.MemsetString(),
+		).Set(float64(zone.MemAvailable))
+
+		m.Metrics.containerCount.WithLabelValues(
+			zone.Name,
+		).Set(float64(zone.ContainerCount))
+
+		m.Metrics.sharedContainerCount.WithLabelValues(
+			zone.Name,
+		).Set(float64(zone.SharedContainerCount))
+	}
+
+	if p.metrics == nil {
+		slices.SortFunc(m.ZoneNames, func(a, b string) int {
+			poolA, poolB := p.nodes[a], p.nodes[b]
+			if diff := poolA.RootDistance() - poolB.RootDistance(); diff != 0 {
+				return diff
+			}
+			return strings.Compare(a, b)
+		})
+		p.metrics = m
+	}
+
+	return p.metrics
+}
diff --git a/cmd/plugins/topology-aware/policy/topology-aware-policy.go b/cmd/plugins/topology-aware/policy/topology-aware-policy.go
index 97da8328d..c83c3d76d 100644
--- a/cmd/plugins/topology-aware/policy/topology-aware-policy.go
+++ b/cmd/plugins/topology-aware/policy/topology-aware-policy.go
@@ -21,8 +21,6 @@ import (
 	"github.com/containers/nri-plugins/pkg/utils/cpuset"
 	"k8s.io/apimachinery/pkg/api/resource"
 
-	"github.com/prometheus/client_golang/prometheus"
-
 	cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware"
 	"github.com/containers/nri-plugins/pkg/cpuallocator"
 	"github.com/containers/nri-plugins/pkg/resmgr/cache"
@@ -68,6 +66,7 @@ type policy struct {
 	cpuAllocator  cpuallocator.CPUAllocator // CPU allocator used by the policy
 	memAllocator  *libmem.Allocator
 	coldstartOff  bool                      // coldstart forced off (have movable PMEM zones)
+	metrics       *TopologyAwareMetrics
 }
 
 var opt = &cfgapi.Config{}
@@ -306,21 +305,6 @@ func (p *policy) HandleEvent(e *events.Policy) (bool, error) {
 	return false, nil
 }
 
-// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
-func (p *policy) DescribeMetrics() []*prometheus.Desc {
-	return nil
-}
-
-// PollMetrics provides policy metrics for monitoring.
-func (p *policy) PollMetrics() policyapi.Metrics {
-	return nil
-}
-
-// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
-func (p *policy) CollectMetrics(policyapi.Metrics) ([]prometheus.Metric, error) {
-	return nil, nil
-}
-
 // GetTopologyZones returns the policy/pool data for 'topology zone' CRDs.
 func (p *policy) GetTopologyZones() []*policyapi.TopologyZone {
 	zones := []*policyapi.TopologyZone{}
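
Not part of the diff above: the following is a minimal, self-contained sketch of the prometheus.GaugeVec pattern the new metrics.go builds on, namely setting labeled gauge values during polling and then draining descriptors and samples through a channel, as DescribeMetrics and CollectMetrics do. The metric name "example_zone_cpu_shared_available" and the zone label value are made up for illustration only.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// One gauge vector labeled by zone, standing in for the per-zone
	// gauges created by NewTopologyAwareMetrics().
	gauge := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "example_zone_cpu_shared_available",
			Help: "Available shared CPU of a topology zone (illustrative).",
		},
		[]string{"zone"},
	)

	// PollMetrics-style update: a milli-CPU quantity scaled to whole CPUs.
	gauge.WithLabelValues("example zone").Set(float64(2500) / 1000.0)

	// DescribeMetrics-style draining of descriptors through a channel.
	descCh := make(chan *prometheus.Desc)
	go func() {
		gauge.Describe(descCh)
		close(descCh)
	}()
	for d := range descCh {
		fmt.Println("described:", d.String())
	}

	// CollectMetrics-style draining of metric samples through a channel.
	metricCh := make(chan prometheus.Metric)
	go func() {
		gauge.Collect(metricCh)
		close(metricCh)
	}()
	for m := range metricCh {
		fmt.Println("collected:", m.Desc().String())
	}
}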