Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

topology-aware: implement metrics collection. #392

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
299 changes: 299 additions & 0 deletions cmd/plugins/topology-aware/policy/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
// Copyright The NRI Plugins Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
"slices"
"strings"

libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
policyapi "github.com/containers/nri-plugins/pkg/resmgr/policy"
"github.com/containers/nri-plugins/pkg/utils/cpuset"
"github.com/prometheus/client_golang/prometheus"
)

const (
descZone = iota
descZoneCpuSharedCapacity
descZoneCpuSharedAssigned
descZoneCpuSharedAvailable
descZoneMemCapacity
descZoneMemAssigned
descZoneMemAvailable
descZoneContainerCount
descZoneSharedContainerCount
)

var (
descriptors = []*prometheus.Desc{
descZone: prometheus.NewDesc(
"topologyaware_policy_zone_cpu_capacity",
"A topology zone of CPUs.",
[]string{
"zone",
"cpus",
"mems",
},
nil,
),
descZoneCpuSharedCapacity: prometheus.NewDesc(
"topologyaware_policy_zone_cpu_shared_capacity",
"Capacity of shared CPU pool of a topology zone.",
[]string{
"zone",
"cpus",
},
nil,
),
descZoneCpuSharedAssigned: prometheus.NewDesc(
"topologyaware_policy_zone_cpu_shared_assigned",
"Assigned amount of shared CPU pool of a topology zone.",
[]string{
"zone",
"cpus",
},
nil,
),
descZoneCpuSharedAvailable: prometheus.NewDesc(
"topologyaware_policy_zone_cpu_shared_available",
"Available amount of shared CPU pool of a topology zone.",
[]string{
"zone",
"cpus",
},
nil,
),
descZoneMemCapacity: prometheus.NewDesc(
"topologyaware_zone_mem_capacity",
"Memory capacity of a topology zone.",
[]string{
"zone",
"mems",
},
nil,
),
descZoneMemAssigned: prometheus.NewDesc(
"topologyaware_zone_mem_assigned",
"Amount of assigned memory of a topology zone.",
[]string{
"zone",
"mems",
},
nil,
),
descZoneMemAvailable: prometheus.NewDesc(
"topologyaware_zone_mem_available",
"Amount of available memory of a topology zone.",
[]string{
"zone",
"mems",
},
nil,
),
descZoneContainerCount: prometheus.NewDesc(
"topologyaware_zone_container_count",
"Number of containers assigned to a topology zone.",
[]string{
"zone",
},
nil,
),
descZoneSharedContainerCount: prometheus.NewDesc(
"topologyaware_zone_shared_container_count",
"Number of containers in the shared CPU pool of a topology zone.",
[]string{
"zone",
},
nil,
),
}
)

type ZoneMetrics struct {
Name string
Cpus cpuset.CPUSet
Mems libmem.NodeMask
SharedPool cpuset.CPUSet
SharedAssigned int
SharedAvailable int
MemCapacity int64
MemAssigned int64
MemAvailable int64
Containers int
SharedContainers int
}

type TopologyAwareMetrics struct {
Zones []string
Metrics map[string]*ZoneMetrics
}

func (zm *ZoneMetrics) Collect() []prometheus.Metric {
if zm == nil {
return nil
}

var metrics []prometheus.Metric

metrics = append(metrics,
prometheus.MustNewConstMetric(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the use of "throwaway" const metric? I'd think about initializing a list of persistent gauges and using the Set method to update the value.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because I mindlessly copied the pattern of implementation from balloons without understanding what I am actually doing... Let me try to read up on this a bit more and fix it.

descriptors[descZone],
prometheus.GaugeValue,
float64(zm.Cpus.Size()),
zm.Name,
zm.Cpus.String(),
zm.Mems.MemsetString(),
),
prometheus.MustNewConstMetric(
descriptors[descZoneCpuSharedCapacity],
prometheus.GaugeValue,
float64(zm.SharedPool.Size()),
zm.Name,
zm.SharedPool.String(),
),
prometheus.MustNewConstMetric(
descriptors[descZoneCpuSharedAssigned],
prometheus.GaugeValue,
float64(zm.SharedAssigned)/1000.0,
zm.Name,
zm.SharedPool.String(),
),
prometheus.MustNewConstMetric(
descriptors[descZoneCpuSharedAvailable],
prometheus.GaugeValue,
float64(zm.SharedAvailable)/1000.0,
zm.Name,
zm.SharedPool.String(),
),
prometheus.MustNewConstMetric(
descriptors[descZoneMemCapacity],
prometheus.GaugeValue,
float64(zm.MemCapacity),
zm.Name,
zm.Mems.MemsetString(),
),
prometheus.MustNewConstMetric(
descriptors[descZoneMemAssigned],
prometheus.GaugeValue,
float64(zm.MemAssigned),
zm.Name,
zm.Mems.MemsetString(),
),
prometheus.MustNewConstMetric(
descriptors[descZoneMemAvailable],
prometheus.GaugeValue,
float64(zm.MemAvailable),
zm.Name,
zm.Mems.MemsetString(),
),
prometheus.MustNewConstMetric(
descriptors[descZoneContainerCount],
prometheus.GaugeValue,
float64(zm.Containers),
zm.Name,
),
prometheus.MustNewConstMetric(
descriptors[descZoneSharedContainerCount],
prometheus.GaugeValue,
float64(zm.SharedContainers),
zm.Name,
),
)

log.Debug("collected zone %s metrics...", zm.Name)

return metrics
}

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (p *policy) DescribeMetrics() []*prometheus.Desc {
log.Debug("has %d metrics descriptors", len(descriptors))
return descriptors
}

// PollMetrics provides policy metrics for monitoring.
func (p *policy) PollMetrics() policyapi.Metrics {
m := &TopologyAwareMetrics{
Zones: make([]string, 0, len(p.pools)),
Metrics: make(map[string]*ZoneMetrics),
}

for _, pool := range p.pools {
m.Zones = append(m.Zones, pool.Name())

var (
capa = pool.GetSupply().(*supply)
free = pool.FreeSupply().(*supply)
cpus = capa.ReservedCPUs().Union(capa.IsolatedCPUs()).Union(capa.SharableCPUs())
mems = libmem.NewNodeMask(pool.GetMemset(memoryAll).Members()...)
sharedPool = free.SharableCPUs().Union(free.ReservedCPUs())
containers []string
sharedctrs []string
)

for id, g := range p.allocations.grants {
if g.GetCPUNode().Name() == pool.Name() {
containers = append(containers, id)
if g.ReservedPortion() != 0 || g.CPUPortion() != 0 {
sharedctrs = append(sharedctrs, id)
}
}
}

zone := &ZoneMetrics{
Name: pool.Name(),
Cpus: cpus,
Mems: mems,
SharedPool: sharedPool,
SharedAssigned: free.GrantedReserved() + free.GrantedShared(),
SharedAvailable: free.AllocatableSharedCPU(),
MemCapacity: p.memAllocator.ZoneCapacity(mems),
MemAssigned: p.memAllocator.ZoneUsage(mems),
MemAvailable: p.memAllocator.ZoneAvailable(mems),
Containers: len(containers),
SharedContainers: len(sharedctrs),
}

m.Metrics[zone.Name] = zone

log.Debug("polled zone %s for metrics...", pool.Name())
}

slices.SortFunc(m.Zones, func(a, b string) int {
poolA, poolB := p.nodes[a], p.nodes[b]
if diff := poolA.RootDistance() - poolB.RootDistance(); diff != 0 {
return diff
}
return strings.Compare(a, b)
})

return m
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *policy) CollectMetrics(pm policyapi.Metrics) ([]prometheus.Metric, error) {
m, ok := pm.(*TopologyAwareMetrics)
if !ok {
return nil, policyError("unexpected policy metrics type %T", pm)
}

var collected []prometheus.Metric

for _, name := range m.Zones {
collected = append(collected, m.Metrics[name].Collect()...)
}

return collected, nil
}
17 changes: 0 additions & 17 deletions cmd/plugins/topology-aware/policy/topology-aware-policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ import (
"github.com/containers/nri-plugins/pkg/utils/cpuset"
"k8s.io/apimachinery/pkg/api/resource"

"github.com/prometheus/client_golang/prometheus"

cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware"
"github.com/containers/nri-plugins/pkg/cpuallocator"
"github.com/containers/nri-plugins/pkg/resmgr/cache"
Expand Down Expand Up @@ -304,21 +302,6 @@ func (p *policy) HandleEvent(e *events.Policy) (bool, error) {
return false, nil
}

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (p *policy) DescribeMetrics() []*prometheus.Desc {
return nil
}

// PollMetrics provides policy metrics for monitoring.
func (p *policy) PollMetrics() policyapi.Metrics {
return nil
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *policy) CollectMetrics(policyapi.Metrics) ([]prometheus.Metric, error) {
return nil, nil
}

// GetTopologyZones returns the policy/pool data for 'topology zone' CRDs.
func (p *policy) GetTopologyZones() []*policyapi.TopologyZone {
zones := []*policyapi.TopologyZone{}
Expand Down
27 changes: 27 additions & 0 deletions pkg/resmgr/lib/memory/zones.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ func (a *Allocator) ZoneFree(zone NodeMask) int64 {
return a.zoneFree(zone & a.masks.nodes.hasMemory)
}

// ZoneAvailable returns the amount of available memory in the zone.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The meaning of "available" is not very clear to me. Maybe this public method comment could help understanding why ZoneAvailable != ZoneFree (where ZoneFree = ZoneCapacity - ZoneUsage). Can it be that ZoneAvailable at t=0 is smaller than ZoneUsage at t=1?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ZoneFree returns the amount of free capacity in this zone, taking into account allocations in this zone and any only this zone and its subzones. ZoneAvailable returns ZoneFree capped to the minimum free capacity of any of the (larger super-)zones this zone is a genuine subzone of. IOW, it returns the amount of memory allocatable from this zone.

func (a *Allocator) ZoneAvailable(zone NodeMask) int64 {
return a.zoneAvailable(zone & a.masks.nodes.hasMemory)
}

// ZoneNumUsers returns the number of requests assigned to the zone.
func (a *Allocator) ZoneNumUsers(zone NodeMask) int {
if z, ok := a.zones[zone]; ok {
Expand Down Expand Up @@ -154,6 +159,28 @@ func (a *Allocator) zoneFree(zone NodeMask) int64 {
return a.zoneCapacity(zone) - a.zoneUsage(zone)
}

func (a *Allocator) zoneAvailable(zone NodeMask) int64 {
available := a.zoneFree(zone)
if available <= 0 {
return 0
}

// Cap available amount to the smallest available free in any of our ancestors.
for _, z := range a.zones {
if z.nodes != zone && (z.nodes&zone) == zone {
if free := a.zoneFree(z.nodes); free < available {
available = free
}
}
if available <= 0 {
available = 0
break
}
}

return available
}

func (a *Allocator) zoneAssign(zone NodeMask, req *Request) {
z, ok := a.zones[zone]
if !ok {
Expand Down