-
Notifications
You must be signed in to change notification settings - Fork 25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
topology-aware: implement metrics collection. #392
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,299 @@ | ||
// Copyright The NRI Plugins Authors. All Rights Reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package topologyaware | ||
|
||
import ( | ||
"slices" | ||
"strings" | ||
|
||
libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory" | ||
policyapi "github.com/containers/nri-plugins/pkg/resmgr/policy" | ||
"github.com/containers/nri-plugins/pkg/utils/cpuset" | ||
"github.com/prometheus/client_golang/prometheus" | ||
) | ||
|
||
// Indices of the zone metric descriptors within the descriptors slice.
const (
	// descZone indexes the descriptor of the zone CPU capacity metric.
	descZone = iota
	// descZoneCpuSharedCapacity indexes the shared CPU pool capacity descriptor.
	descZoneCpuSharedCapacity
	// descZoneCpuSharedAssigned indexes the shared CPU pool assigned amount descriptor.
	descZoneCpuSharedAssigned
	// descZoneCpuSharedAvailable indexes the shared CPU pool available amount descriptor.
	descZoneCpuSharedAvailable
	// descZoneMemCapacity indexes the memory capacity descriptor.
	descZoneMemCapacity
	// descZoneMemAssigned indexes the assigned memory descriptor.
	descZoneMemAssigned
	// descZoneMemAvailable indexes the available memory descriptor.
	descZoneMemAvailable
	// descZoneContainerCount indexes the container count descriptor.
	descZoneContainerCount
	// descZoneSharedContainerCount indexes the shared pool container count descriptor.
	descZoneSharedContainerCount
)
|
||
var ( | ||
descriptors = []*prometheus.Desc{ | ||
descZone: prometheus.NewDesc( | ||
"topologyaware_policy_zone_cpu_capacity", | ||
"A topology zone of CPUs.", | ||
[]string{ | ||
"zone", | ||
"cpus", | ||
"mems", | ||
}, | ||
nil, | ||
), | ||
descZoneCpuSharedCapacity: prometheus.NewDesc( | ||
"topologyaware_policy_zone_cpu_shared_capacity", | ||
"Capacity of shared CPU pool of a topology zone.", | ||
[]string{ | ||
"zone", | ||
"cpus", | ||
}, | ||
nil, | ||
), | ||
descZoneCpuSharedAssigned: prometheus.NewDesc( | ||
"topologyaware_policy_zone_cpu_shared_assigned", | ||
"Assigned amount of shared CPU pool of a topology zone.", | ||
[]string{ | ||
"zone", | ||
"cpus", | ||
}, | ||
nil, | ||
), | ||
descZoneCpuSharedAvailable: prometheus.NewDesc( | ||
"topologyaware_policy_zone_cpu_shared_available", | ||
"Available amount of shared CPU pool of a topology zone.", | ||
[]string{ | ||
"zone", | ||
"cpus", | ||
}, | ||
nil, | ||
), | ||
descZoneMemCapacity: prometheus.NewDesc( | ||
"topologyaware_zone_mem_capacity", | ||
"Memory capacity of a topology zone.", | ||
[]string{ | ||
"zone", | ||
"mems", | ||
}, | ||
nil, | ||
), | ||
descZoneMemAssigned: prometheus.NewDesc( | ||
"topologyaware_zone_mem_assigned", | ||
"Amount of assigned memory of a topology zone.", | ||
[]string{ | ||
"zone", | ||
"mems", | ||
}, | ||
nil, | ||
), | ||
descZoneMemAvailable: prometheus.NewDesc( | ||
"topologyaware_zone_mem_available", | ||
"Amount of available memory of a topology zone.", | ||
[]string{ | ||
"zone", | ||
"mems", | ||
}, | ||
nil, | ||
), | ||
descZoneContainerCount: prometheus.NewDesc( | ||
"topologyaware_zone_container_count", | ||
"Number of containers assigned to a topology zone.", | ||
[]string{ | ||
"zone", | ||
}, | ||
nil, | ||
), | ||
descZoneSharedContainerCount: prometheus.NewDesc( | ||
"topologyaware_zone_shared_container_count", | ||
"Number of containers in the shared CPU pool of a topology zone.", | ||
[]string{ | ||
"zone", | ||
}, | ||
nil, | ||
), | ||
} | ||
) | ||
|
||
type ZoneMetrics struct { | ||
Name string | ||
Cpus cpuset.CPUSet | ||
Mems libmem.NodeMask | ||
SharedPool cpuset.CPUSet | ||
SharedAssigned int | ||
SharedAvailable int | ||
MemCapacity int64 | ||
MemAssigned int64 | ||
MemAvailable int64 | ||
Containers int | ||
SharedContainers int | ||
} | ||
|
||
type TopologyAwareMetrics struct { | ||
Zones []string | ||
Metrics map[string]*ZoneMetrics | ||
} | ||
|
||
func (zm *ZoneMetrics) Collect() []prometheus.Metric { | ||
if zm == nil { | ||
return nil | ||
} | ||
|
||
var metrics []prometheus.Metric | ||
|
||
metrics = append(metrics, | ||
prometheus.MustNewConstMetric( | ||
descriptors[descZone], | ||
prometheus.GaugeValue, | ||
float64(zm.Cpus.Size()), | ||
zm.Name, | ||
zm.Cpus.String(), | ||
zm.Mems.MemsetString(), | ||
), | ||
prometheus.MustNewConstMetric( | ||
descriptors[descZoneCpuSharedCapacity], | ||
prometheus.GaugeValue, | ||
float64(zm.SharedPool.Size()), | ||
zm.Name, | ||
zm.SharedPool.String(), | ||
), | ||
prometheus.MustNewConstMetric( | ||
descriptors[descZoneCpuSharedAssigned], | ||
prometheus.GaugeValue, | ||
float64(zm.SharedAssigned)/1000.0, | ||
zm.Name, | ||
zm.SharedPool.String(), | ||
), | ||
prometheus.MustNewConstMetric( | ||
descriptors[descZoneCpuSharedAvailable], | ||
prometheus.GaugeValue, | ||
float64(zm.SharedAvailable)/1000.0, | ||
zm.Name, | ||
zm.SharedPool.String(), | ||
), | ||
prometheus.MustNewConstMetric( | ||
descriptors[descZoneMemCapacity], | ||
prometheus.GaugeValue, | ||
float64(zm.MemCapacity), | ||
zm.Name, | ||
zm.Mems.MemsetString(), | ||
), | ||
prometheus.MustNewConstMetric( | ||
descriptors[descZoneMemAssigned], | ||
prometheus.GaugeValue, | ||
float64(zm.MemAssigned), | ||
zm.Name, | ||
zm.Mems.MemsetString(), | ||
), | ||
prometheus.MustNewConstMetric( | ||
descriptors[descZoneMemAvailable], | ||
prometheus.GaugeValue, | ||
float64(zm.MemAvailable), | ||
zm.Name, | ||
zm.Mems.MemsetString(), | ||
), | ||
prometheus.MustNewConstMetric( | ||
descriptors[descZoneContainerCount], | ||
prometheus.GaugeValue, | ||
float64(zm.Containers), | ||
zm.Name, | ||
), | ||
prometheus.MustNewConstMetric( | ||
descriptors[descZoneSharedContainerCount], | ||
prometheus.GaugeValue, | ||
float64(zm.SharedContainers), | ||
zm.Name, | ||
), | ||
) | ||
|
||
log.Debug("collected zone %s metrics...", zm.Name) | ||
|
||
return metrics | ||
} | ||
|
||
// DescribeMetrics generates policy-specific prometheus metrics data descriptors. | ||
func (p *policy) DescribeMetrics() []*prometheus.Desc { | ||
log.Debug("has %d metrics descriptors", len(descriptors)) | ||
return descriptors | ||
} | ||
|
||
// PollMetrics provides policy metrics for monitoring. | ||
func (p *policy) PollMetrics() policyapi.Metrics { | ||
m := &TopologyAwareMetrics{ | ||
Zones: make([]string, 0, len(p.pools)), | ||
Metrics: make(map[string]*ZoneMetrics), | ||
} | ||
|
||
for _, pool := range p.pools { | ||
m.Zones = append(m.Zones, pool.Name()) | ||
|
||
var ( | ||
capa = pool.GetSupply().(*supply) | ||
free = pool.FreeSupply().(*supply) | ||
cpus = capa.ReservedCPUs().Union(capa.IsolatedCPUs()).Union(capa.SharableCPUs()) | ||
mems = libmem.NewNodeMask(pool.GetMemset(memoryAll).Members()...) | ||
sharedPool = free.SharableCPUs().Union(free.ReservedCPUs()) | ||
containers []string | ||
sharedctrs []string | ||
) | ||
|
||
for id, g := range p.allocations.grants { | ||
if g.GetCPUNode().Name() == pool.Name() { | ||
containers = append(containers, id) | ||
if g.ReservedPortion() != 0 || g.CPUPortion() != 0 { | ||
sharedctrs = append(sharedctrs, id) | ||
} | ||
} | ||
} | ||
|
||
zone := &ZoneMetrics{ | ||
Name: pool.Name(), | ||
Cpus: cpus, | ||
Mems: mems, | ||
SharedPool: sharedPool, | ||
SharedAssigned: free.GrantedReserved() + free.GrantedShared(), | ||
SharedAvailable: free.AllocatableSharedCPU(), | ||
MemCapacity: p.memAllocator.ZoneCapacity(mems), | ||
MemAssigned: p.memAllocator.ZoneUsage(mems), | ||
MemAvailable: p.memAllocator.ZoneAvailable(mems), | ||
Containers: len(containers), | ||
SharedContainers: len(sharedctrs), | ||
} | ||
|
||
m.Metrics[zone.Name] = zone | ||
|
||
log.Debug("polled zone %s for metrics...", pool.Name()) | ||
} | ||
|
||
slices.SortFunc(m.Zones, func(a, b string) int { | ||
poolA, poolB := p.nodes[a], p.nodes[b] | ||
if diff := poolA.RootDistance() - poolB.RootDistance(); diff != 0 { | ||
return diff | ||
} | ||
return strings.Compare(a, b) | ||
}) | ||
|
||
return m | ||
} | ||
|
||
// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data. | ||
func (p *policy) CollectMetrics(pm policyapi.Metrics) ([]prometheus.Metric, error) { | ||
m, ok := pm.(*TopologyAwareMetrics) | ||
if !ok { | ||
return nil, policyError("unexpected policy metrics type %T", pm) | ||
} | ||
|
||
var collected []prometheus.Metric | ||
|
||
for _, name := range m.Zones { | ||
collected = append(collected, m.Metrics[name].Collect()...) | ||
} | ||
|
||
return collected, nil | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,6 +45,11 @@ func (a *Allocator) ZoneFree(zone NodeMask) int64 { | |
return a.zoneFree(zone & a.masks.nodes.hasMemory) | ||
} | ||
|
||
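// ZoneAvailable returns the amount of memory allocatable from the zone: the free memory of the zone, capped to the smallest free amount of any zone it is a proper subzone of, and never less than zero. | ||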
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The meaning of "available" is not very clear to me. Maybe this public method comment could help in understanding why ZoneAvailable != ZoneFree (where ZoneFree = ZoneCapacity - ZoneUsage). Can it be that ZoneAvailable at t=0 is smaller than ZoneUsage at t=1? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ZoneFree returns the amount of free capacity in this zone, taking into account allocations in this zone and its subzones only. ZoneAvailable returns ZoneFree capped to the minimum free capacity of any of the (larger super-)zones this zone is a genuine subzone of. IOW, it returns the amount of memory allocatable from this zone. |
||
func (a *Allocator) ZoneAvailable(zone NodeMask) int64 { | ||
return a.zoneAvailable(zone & a.masks.nodes.hasMemory) | ||
} | ||
|
||
// ZoneNumUsers returns the number of requests assigned to the zone. | ||
func (a *Allocator) ZoneNumUsers(zone NodeMask) int { | ||
if z, ok := a.zones[zone]; ok { | ||
|
@@ -154,6 +159,28 @@ func (a *Allocator) zoneFree(zone NodeMask) int64 { | |
return a.zoneCapacity(zone) - a.zoneUsage(zone) | ||
} | ||
|
||
func (a *Allocator) zoneAvailable(zone NodeMask) int64 { | ||
available := a.zoneFree(zone) | ||
if available <= 0 { | ||
return 0 | ||
} | ||
|
||
// Cap available amount to the smallest available free in any of our ancestors. | ||
for _, z := range a.zones { | ||
if z.nodes != zone && (z.nodes&zone) == zone { | ||
if free := a.zoneFree(z.nodes); free < available { | ||
available = free | ||
} | ||
} | ||
if available <= 0 { | ||
available = 0 | ||
break | ||
} | ||
} | ||
|
||
return available | ||
} | ||
|
||
func (a *Allocator) zoneAssign(zone NodeMask, req *Request) { | ||
z, ok := a.zones[zone] | ||
if !ok { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why the use of "throwaway" const metric? I'd think about initializing a list of persistent gauges and using the Set method to update the value.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because I mindlessly copied the pattern of implementation from balloons without understanding what I am actually doing... Let me try to read up on this a bit more and fix it.