From 0d1583eab9a3ca040438538bbdc467568714c245 Mon Sep 17 00:00:00 2001 From: Iaroslav Ciupin Date: Wed, 14 Aug 2024 22:37:37 +0300 Subject: [PATCH] Unexpectedly deleted pod metrics * Count when we see unexpectedly terminated pods Signed-off-by: Andrew Dye --- .../controller/nodes/task/k8s/plugin_manager.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/flytepropeller/pkg/controller/nodes/task/k8s/plugin_manager.go b/flytepropeller/pkg/controller/nodes/task/k8s/plugin_manager.go index 42d3ad9b85..c9c9167146 100644 --- a/flytepropeller/pkg/controller/nodes/task/k8s/plugin_manager.go +++ b/flytepropeller/pkg/controller/nodes/task/k8s/plugin_manager.go @@ -5,6 +5,7 @@ import ( "fmt" "time" + "github.com/prometheus/client_golang/prometheus" "golang.org/x/time/rate" v1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -64,6 +65,7 @@ type PluginMetrics struct { GetCacheHit labeled.StopWatch GetAPILatency labeled.StopWatch ResourceDeleted labeled.Counter + TaskPodErrors *prometheus.CounterVec } func newPluginMetrics(s promutils.Scope) PluginMetrics { @@ -77,6 +79,8 @@ func newPluginMetrics(s promutils.Scope) PluginMetrics { time.Millisecond, s), ResourceDeleted: labeled.NewCounter("pods_deleted", "Counts how many times CheckTaskStatus is"+ " called with a deleted resource.", s), + TaskPodErrors: s.MustNewCounterVec("task_pod_errors", "Counts how many times task pods failed in given phase with given code", + "phase", "error_code"), } } @@ -350,14 +354,19 @@ func (e PluginManager) Handle(ctx context.Context, tCtx pluginsCore.TaskExecutio return transition, err } + phaseInfo := transition.Info() + if phaseInfo.Err() != nil { + e.metrics.TaskPodErrors.WithLabelValues(phaseInfo.Phase().String(), phaseInfo.Err().GetCode()).Inc() + } + // Add events since last update - version := transition.Info().Version() + version := phaseInfo.Version() lastEventUpdate := pluginState.LastEventUpdate if e.eventWatcher != nil && o != nil { nsName := k8stypes.NamespacedName{Namespace: o.GetNamespace(), Name: o.GetName()} recentEvents := e.eventWatcher.List(nsName, lastEventUpdate) if len(recentEvents) > 0 { - taskInfo := transition.Info().Info() + taskInfo := phaseInfo.Info() taskInfo.AdditionalReasons = make([]pluginsCore.ReasonInfo, 0, len(recentEvents)) for _, event := range recentEvents { taskInfo.AdditionalReasons = append(taskInfo.AdditionalReasons, @@ -373,9 +382,9 @@ func (e PluginManager) Handle(ctx context.Context, tCtx pluginsCore.TaskExecutio newPluginState := PluginState{ Phase: pluginPhase, K8sPluginState: k8s.PluginState{ - Phase: transition.Info().Phase(), + Phase: phaseInfo.Phase(), PhaseVersion: version, - Reason: transition.Info().Reason(), + Reason: phaseInfo.Reason(), }, LastEventUpdate: lastEventUpdate, }