Skip to content

Commit

Permalink
topology-aware: implement metrics collection.
Browse files Browse the repository at this point in the history
Add initial implementation for per-zone Prometheus metrics.
For each zone (policy pool) expose
  - zone name, cpuset, and memset
  - shared pool cpuset, and its allocation
  - zone memory capacity, and its allocation
  - number of containers in the zone
  - number of containers using the shared pool

Signed-off-by: Krisztian Litkey <[email protected]>
  • Loading branch information
klihub committed Nov 1, 2024
1 parent 395da2f commit 629ea74
Show file tree
Hide file tree
Showing 2 changed files with 261 additions and 17 deletions.
261 changes: 261 additions & 0 deletions cmd/plugins/topology-aware/policy/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
// Copyright The NRI Plugins Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
"slices"
"strings"

libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
policyapi "github.com/containers/nri-plugins/pkg/resmgr/policy"
"github.com/containers/nri-plugins/pkg/utils/cpuset"
"github.com/prometheus/client_golang/prometheus"
)

// Indices into the descriptors slice, one per per-zone metric. The values
// are consecutive integers starting at 0 (iota), so they double as the keys
// of the keyed slice literal that defines descriptors.
const (
// zoneDesc indexes the "topologyaware_zone" descriptor.
zoneDesc = iota
// zoneSharedPool indexes the "topologyaware_zone_shared_pool" descriptor.
zoneSharedPool
// zoneSharedAssigned indexes the "topologyaware_zone_shared_assigned" descriptor.
zoneSharedAssigned
// zoneMemCapacity indexes the "topologyaware_zone_mem_capacity" descriptor.
zoneMemCapacity
// zoneMemAssigned indexes the "topologyaware_zone_mem_assigned" descriptor.
zoneMemAssigned
// zoneNumContainers indexes the "topologyaware_zone_containers" descriptor.
zoneNumContainers
// zoneSharedContainers indexes the "topologyaware_zone_shared_containers" descriptor.
zoneSharedContainers
)

var (
// descriptors holds the prometheus descriptors of all per-zone metrics,
// keyed by the zone* index constants. DescribeMetrics returns this slice
// and ZoneMetrics.Collect looks descriptors up in it by index. The label
// values passed by Collect must match the variable labels listed here, in
// the same order.
descriptors = []*prometheus.Desc{
// Zone identity; Collect reports the number of CPUs in the zone as value.
zoneDesc: prometheus.NewDesc(
"topologyaware_zone",
"A topology zone of CPUs.",
[]string{
"zone",
"cpus",
"mems",
},
nil,
),
// Shared pool of the zone; Collect reports the shared pool size as value.
zoneSharedPool: prometheus.NewDesc(
"topologyaware_zone_shared_pool",
"Zone shared pool CPUs.",
[]string{
"zone",
"cpus",
},
nil,
),
// Shared pool allocation; Collect reports SharedAssigned divided by 1000.
zoneSharedAssigned: prometheus.NewDesc(
"topologyaware_zone_shared_assigned",
"Zone shared pool capacity assigned to containers.",
[]string{
"zone",
"cpus",
},
nil,
),
// Memory capacity of the zone.
zoneMemCapacity: prometheus.NewDesc(
"topologyaware_zone_mem_capacity",
"Zone memory capacity.",
[]string{
"zone",
"mems",
},
nil,
),
// Memory of the zone assigned to containers.
zoneMemAssigned: prometheus.NewDesc(
"topologyaware_zone_mem_assigned",
"Zone memory capacity assigned to containers.",
[]string{
"zone",
"mems",
},
nil,
),
// Number of containers in the zone.
zoneNumContainers: prometheus.NewDesc(
"topologyaware_zone_containers",
"Number of containers assigned to zone.",
[]string{
"zone",
},
nil,
),
// Number of containers using the shared pool of the zone.
zoneSharedContainers: prometheus.NewDesc(
"topologyaware_zone_shared_containers",
"Number of containers assigned to shared pool.",
[]string{
"zone",
},
nil,
),
}
)

// ZoneMetrics is the polled metrics data of a single zone (policy pool).
// PollMetrics fills it in and Collect turns it into prometheus metrics.
type ZoneMetrics struct {
// Name is the name of the pool this data was polled from.
Name string
// Cpus is the union of the reserved, isolated and sharable CPUs of the
// pool's supply.
Cpus cpuset.CPUSet
// Mems is the node mask built from the members of the pool's memset.
Mems libmem.NodeMask
// SharedPool is the union of the sharable and reserved CPUs of the
// pool's free supply.
SharedPool cpuset.CPUSet
// SharedAssigned is the sum of granted reserved and granted shared
// capacity of the pool's free supply. Collect divides it by 1000 before
// exporting it -- presumably it is in milli-CPUs; TODO confirm.
SharedAssigned int
// MemCapacity is the memory allocator's zone capacity for Mems.
MemCapacity int64
// MemAssigned is the memory allocator's zone usage for Mems.
MemAssigned int64
// Containers lists the IDs of grants whose CPU node is this pool.
Containers []string
// SharedContainers is the subset of Containers whose grant has a
// non-zero reserved or CPU portion.
SharedContainers []string
}

// TopologyAwareMetrics is the policy-specific metrics data returned by
// PollMetrics and consumed by CollectMetrics.
type TopologyAwareMetrics struct {
// Zones lists the zone names in collection order. PollMetrics sorts it
// by root distance of the corresponding node, then by name.
Zones []string
// Metrics maps each zone name to the data polled for that zone.
Metrics map[string]*ZoneMetrics
}

// Collect converts the polled zone data into constant prometheus gauges, one
// for each entry in descriptors. It returns nil for a nil receiver.
func (zm *ZoneMetrics) Collect() []prometheus.Metric {
	if zm == nil {
		return nil
	}

	// Label values which are shared by several of the metrics below.
	var (
		zone   = zm.Name
		memset = zm.Mems.MemsetString()
		shared = zm.SharedPool.String()
	)

	// gauge creates a constant gauge for the descriptor at the given index.
	// The labels must match the variable labels of that descriptor.
	gauge := func(idx int, value float64, labels ...string) prometheus.Metric {
		return prometheus.MustNewConstMetric(
			descriptors[idx],
			prometheus.GaugeValue,
			value,
			labels...,
		)
	}

	collected := []prometheus.Metric{
		gauge(zoneDesc, float64(zm.Cpus.Size()), zone, zm.Cpus.String(), memset),
		gauge(zoneSharedPool, float64(zm.SharedPool.Size()), zone, shared),
		gauge(zoneSharedAssigned, float64(zm.SharedAssigned)/1000.0, zone, shared),
		gauge(zoneMemCapacity, float64(zm.MemCapacity), zone, memset),
		gauge(zoneMemAssigned, float64(zm.MemAssigned), zone, memset),
		gauge(zoneNumContainers, float64(len(zm.Containers)), zone),
		gauge(zoneSharedContainers, float64(len(zm.SharedContainers)), zone),
	}

	log.Debug("collected zone %s metrics...", zone)

	return collected
}

// DescribeMetrics returns the prometheus descriptors of the per-zone metrics
// of this policy.
func (p *policy) DescribeMetrics() []*prometheus.Desc {
	count := len(descriptors)
	log.Debug("has %d metrics descriptors", count)

	return descriptors
}

// PollMetrics provides policy metrics for monitoring.
//
// It takes a snapshot of every pool of the policy: the CPUs and memory nodes
// of the pool, its shared CPU pool and how much of that is granted, the zone
// memory capacity and usage, and the IDs of the containers granted resources
// from the pool. The returned *TopologyAwareMetrics lists zone names sorted
// by root distance of the corresponding node first and by name second.
func (p *policy) PollMetrics() policyapi.Metrics {
	m := &TopologyAwareMetrics{
		Zones:   make([]string, 0, len(p.pools)),
		Metrics: make(map[string]*ZoneMetrics, len(p.pools)),
	}

	// Group grants by the name of their CPU node in a single pass instead of
	// rescanning all grants once for every pool.
	var (
		containers = make(map[string][]string, len(p.pools))
		sharedctrs = make(map[string][]string, len(p.pools))
	)
	for id, g := range p.allocations.grants {
		name := g.GetCPUNode().Name()
		containers[name] = append(containers[name], id)
		// A grant with a reserved or shared CPU portion uses the shared pool.
		if g.ReservedPortion() != 0 || g.CPUPortion() != 0 {
			sharedctrs[name] = append(sharedctrs[name], id)
		}
	}

	for _, pool := range p.pools {
		name := pool.Name()
		m.Zones = append(m.Zones, name)

		var (
			capa       = pool.GetSupply().(*supply)
			free       = pool.FreeSupply().(*supply)
			cpus       = capa.ReservedCPUs().Union(capa.IsolatedCPUs()).Union(capa.SharableCPUs())
			mems       = libmem.NewNodeMask(pool.GetMemset(memoryAll).Members()...)
			sharedPool = free.SharableCPUs().Union(free.ReservedCPUs())
		)

		m.Metrics[name] = &ZoneMetrics{
			Name:             name,
			Cpus:             cpus,
			Mems:             mems,
			SharedPool:       sharedPool,
			SharedAssigned:   free.GrantedReserved() + free.GrantedShared(),
			MemCapacity:      p.memAllocator.ZoneCapacity(mems),
			MemAssigned:      p.memAllocator.ZoneUsage(mems),
			Containers:       containers[name],
			SharedContainers: sharedctrs[name],
		}

		log.Debug("polled zone %s for metrics...", name)
	}

	// Order zones by distance from the root, breaking ties by name, so that
	// metrics are collected in a stable order.
	slices.SortFunc(m.Zones, func(a, b string) int {
		poolA, poolB := p.nodes[a], p.nodes[b]
		if diff := poolA.RootDistance() - poolB.RootDistance(); diff != 0 {
			return diff
		}
		return strings.Compare(a, b)
	})

	return m
}

// CollectMetrics turns previously polled policy metrics data into prometheus
// metrics. The data must be a *TopologyAwareMetrics, otherwise an error is
// returned. Zones are collected in the order given by the Zones field.
func (p *policy) CollectMetrics(pm policyapi.Metrics) ([]prometheus.Metric, error) {
	polled, isOurs := pm.(*TopologyAwareMetrics)
	if !isOurs {
		return nil, policyError("unexpected policy metrics type %T", pm)
	}

	var result []prometheus.Metric
	for i := range polled.Zones {
		// Collect is nil-safe, so a zone without polled data adds nothing.
		zone := polled.Metrics[polled.Zones[i]]
		result = append(result, zone.Collect()...)
	}

	return result, nil
}
17 changes: 0 additions & 17 deletions cmd/plugins/topology-aware/policy/topology-aware-policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ import (
"github.com/containers/nri-plugins/pkg/utils/cpuset"
"k8s.io/apimachinery/pkg/api/resource"

"github.com/prometheus/client_golang/prometheus"

cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware"
"github.com/containers/nri-plugins/pkg/cpuallocator"
"github.com/containers/nri-plugins/pkg/resmgr/cache"
Expand Down Expand Up @@ -304,21 +302,6 @@ func (p *policy) HandleEvent(e *events.Policy) (bool, error) {
return false, nil
}

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
// This implementation is a stub: it returns nil, so it describes no metrics.
func (p *policy) DescribeMetrics() []*prometheus.Desc {
return nil
}

// PollMetrics provides policy metrics for monitoring.
// This implementation is a stub: it returns nil, so it polls no metrics data.
func (p *policy) PollMetrics() policyapi.Metrics {
return nil
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
// This implementation is a stub: it ignores its argument and returns no
// metrics and no error.
func (p *policy) CollectMetrics(policyapi.Metrics) ([]prometheus.Metric, error) {
return nil, nil
}

// GetTopologyZones returns the policy/pool data for 'topology zone' CRDs.
func (p *policy) GetTopologyZones() []*policyapi.TopologyZone {
zones := []*policyapi.TopologyZone{}
Expand Down

0 comments on commit 629ea74

Please sign in to comment.