From 244f501332f1f9ad53a3128926ef26e0d9d26058 Mon Sep 17 00:00:00 2001 From: Aravindh K Date: Tue, 19 Nov 2024 04:56:54 +0530 Subject: [PATCH] Adding Fleet Active GameServerSet Percentage Metrics (#4021) * Add metric for agones_fleets_active_gameserverset_percentage calculation * lint fixed --------- Co-authored-by: Mengye (Max) Gong <8364575+gongmax@users.noreply.github.com> Co-authored-by: Vicente Ferrara <47219931+vicentefb@users.noreply.github.com> --- build/grafana/dashboard-gameservers.yaml | 129 +++++++++++++++++++++-- pkg/metrics/controller.go | 55 ++++++++++ pkg/metrics/controller_metrics.go | 11 +- 3 files changed, 188 insertions(+), 7 deletions(-) diff --git a/build/grafana/dashboard-gameservers.yaml b/build/grafana/dashboard-gameservers.yaml index 89e66f36c0..94909a08d8 100644 --- a/build/grafana/dashboard-gameservers.yaml +++ b/build/grafana/dashboard-gameservers.yaml @@ -48,10 +48,89 @@ data: "links": [], "panels": [ { + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 0 + }, "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "targets": [ + { + "expr": "sum(agones_fleet_rollout_percent{name=~\"$fleet\", namespace=~\"$namespace\", type=\"current_replicas\"}) / sum(agones_fleet_rollout_percent{name=~\"$fleet\", namespace=~\"$namespace\", type=\"desired_replicas\"}) * 100", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "options": { + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": true + }, + "orientation": "horizontal", + "textMode": "value", + "colorMode": "value", + "valueMappings": [] + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "dark-orange", + "value": 20 + }, + { + "color": "dark-green", + "value": 50 + } + ] + }, + "unit": "percent", + "links": [] + }, + "overrides": [] + }, + "legend": { + "show": false + }, + "timeShift": null, + "nullPointMode": "null", + "options": { + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "tooltip": { + "shared": false, + "value_type": "individual" + }, + "type": "stat", + "title": "Fleet RollOut Percentage" + }, + { + "aliasColors": {}, + "breakPoint": "50%", + "cacheTimeout": null, + "combine": { + "label": "Others", + "threshold": 0 + }, "fieldConfig": { "defaults": { "color": { @@ -72,8 +151,8 @@ data: }, "gridPos": { "h": 6, - "w": 7, - "x": 0, + "w": 5, + "x": 5, "y": 0 }, "id": 4, @@ -224,8 +303,8 @@ data: }, "gridPos": { "h": 6, - "w": 17, - "x": 7, + "w": 14, + "x": 10, "y": 0 }, "id": 2, @@ -401,8 +480,46 @@ data: "refId": "A" } ], - "title": "GameServers count per type", - "type": "timeseries" + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GameServer count overview", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "datasource": { diff --git a/pkg/metrics/controller.go b/pkg/metrics/controller.go index 887e105ae0..7b3c7421d1 100644 --- a/pkg/metrics/controller.go +++ b/pkg/metrics/controller.go @@ -28,6 +28,7 @@ import ( "agones.dev/agones/pkg/client/informers/externalversions" listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" autoscalinglisterv1 "agones.dev/agones/pkg/client/listers/autoscaling/v1" + fleetsv1 "agones.dev/agones/pkg/fleets" "agones.dev/agones/pkg/util/runtime" lru "github.com/hashicorp/golang-lru" "github.com/pkg/errors" @@ -35,6 +36,7 @@ import ( "go.opencensus.io/stats" "go.opencensus.io/tag" corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/wait" @@ -77,6 +79,7 @@ type Controller struct { gameServerSynced cache.InformerSynced fleetSynced cache.InformerSynced fleetLister listerv1.FleetLister + gameServerSetLister listerv1.GameServerSetLister fasSynced cache.InformerSynced fasLister autoscalinglisterv1.FleetAutoscalerLister lock sync.Mutex @@ -103,6 +106,8 @@ func NewController( fasInformer := fas.Informer() node := kubeInformerFactory.Core().V1().Nodes() + gameServerSets := agonesInformerFactory.Agones().V1().GameServerSets() + // GameServerStateLastChange Contains the time when the GameServer // changed its state last time // on delete and state change remove GameServerName key @@ -117,6 +122,7 @@ func NewController( gameServerSynced: gsInformer.HasSynced, fleetSynced: fInformer.HasSynced, fleetLister: fleets.Lister(), + gameServerSetLister: gameServerSets.Lister(), fasSynced: fasInformer.HasSynced, fasLister: fas.Lister(), gsCount: GameServerCount{}, @@ -240,6 +246,8 @@ func (c *Controller) recordFleetChanges(obj interface{}) { c.recordFleetReplicas(f.Name, f.Namespace, f.Status.Replicas, f.Status.AllocatedReplicas, f.Status.ReadyReplicas, f.Spec.Replicas, f.Status.ReservedReplicas) + c.recordFleetRolloutPercentage(f) + if runtime.FeatureEnabled(runtime.FeatureCountsAndLists) { if f.Status.Counters != nil { c.recordCounters(f.Name, f.Namespace, f.Status.Counters) @@ -250,6 +258,53 @@ func (c *Controller) recordFleetChanges(obj interface{}) { } } +func (c *Controller) recordFleetRolloutPercentage(fleet *agonesv1.Fleet) { + list, err := fleetsv1.ListGameServerSetsByFleetOwner(c.gameServerSetLister, fleet) + if err != nil { + c.logger.Errorf("Error listing GameServerSets for fleet %s in namespace %s: %v", fleet.Name, fleet.Namespace, err.Error()) + return + } + + active, _ := c.filterGameServerSetByActive(fleet, list) + + if active == nil { + fleetName := fleet.ObjectMeta.Namespace + "/" + fleet.ObjectMeta.Name + c.logger.Debugf("Could not find active GameServerSet %s", fleetName) + active = fleet.GameServerSet() + } + + currentReplicas := active.Status.Replicas + desiredReplicas := fleet.Spec.Replicas + + ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleet.Name), tag.Upsert(keyNamespace, fleet.GetNamespace())) + + // Record current replicas count + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "current_replicas")}, + fleetRolloutPercentStats.M(int64(currentReplicas))) + + // Record desired replicas count + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired_replicas")}, + fleetRolloutPercentStats.M(int64(desiredReplicas))) +} + +// filterGameServerSetByActive returns the active GameServerSet (or nil if it +// doesn't exist) and then the rest of the GameServerSets that are controlled +// by this Fleet +func (c *Controller) filterGameServerSetByActive(fleet *agonesv1.Fleet, list []*agonesv1.GameServerSet) (*agonesv1.GameServerSet, []*agonesv1.GameServerSet) { + var active *agonesv1.GameServerSet + var rest []*agonesv1.GameServerSet + + for _, gsSet := range list { + if apiequality.Semantic.DeepEqual(gsSet.Spec.Template, fleet.Spec.Template) { + active = gsSet + } else { + rest = append(rest, gsSet) + } + } + + return active, rest +} + func (c *Controller) recordFleetDeletion(obj interface{}) { _, ok := obj.(*agonesv1.Fleet) if !ok { diff --git a/pkg/metrics/controller_metrics.go b/pkg/metrics/controller_metrics.go index 4aee96c741..22bf46defc 100644 --- a/pkg/metrics/controller_metrics.go +++ b/pkg/metrics/controller_metrics.go @@ -21,6 +21,7 @@ import ( ) const ( + fleetRolloutPercent = "fleet_rollout_percent" fleetReplicaCountName = "fleets_replicas_count" fleetAutoscalerBufferLimitName = "fleet_autoscalers_buffer_limits" fleetAutoscalterBufferSizeName = "fleet_autoscalers_buffer_size" @@ -44,9 +45,10 @@ var ( fleetAutoscalerViews = []string{fleetAutoscalerBufferLimitName, fleetAutoscalterBufferSizeName, fleetAutoscalerCurrentReplicaCountName, fleetAutoscalersDesiredReplicaCountName, fleetAutoscalersAbleToScaleName, fleetAutoscalersLimitedName} // fleetViews are metric views associated with Fleets - fleetViews = append([]string{fleetReplicaCountName, gameServersCountName, gameServersTotalName, gameServersPlayerConnectedTotalName, gameServersPlayerCapacityTotalName, gameServerStateDurationName, fleetCountersName, fleetListsName}, fleetAutoscalerViews...) + fleetViews = append([]string{fleetRolloutPercent, fleetReplicaCountName, gameServersCountName, gameServersTotalName, gameServersPlayerConnectedTotalName, gameServersPlayerCapacityTotalName, gameServerStateDurationName, fleetCountersName, fleetListsName}, fleetAutoscalerViews...) stateDurationSeconds = []float64{0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384} + fleetRolloutPercentStats = stats.Int64("fleets/rollout_percent", "The current fleet rollout percentage", "1") fleetsReplicasCountStats = stats.Int64("fleets/replicas_count", "The count of replicas per fleet", "1") fasBufferLimitsCountStats = stats.Int64("fas/buffer_limits", "The buffer limits of autoscalers", "1") fasBufferSizeStats = stats.Int64("fas/buffer_size", "The buffer size value of autoscalers", "1") @@ -65,6 +67,13 @@ var ( gsStateDurationSec = stats.Float64("gameservers_state/duration", "The duration of gameservers to be in a particular state", stats.UnitSeconds) stateViews = []*view.View{ + { + Name: fleetRolloutPercent, + Measure: fleetRolloutPercentStats, + Description: "Measures the current progress of fleet rollout", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyName, keyType, keyNamespace}, + }, { Name: fleetReplicaCountName, Measure: fleetsReplicasCountStats,