From f8d49923f6ac1e0bb31e125abafa0549b519f771 Mon Sep 17 00:00:00 2001 From: Paul Dittamo <37558497+pvditt@users.noreply.github.com> Date: Thu, 4 Apr 2024 05:40:23 -0700 Subject: [PATCH] [House keeping] include container statuses for all container exit errors (#5161) * include container statuses for all container exit errors Signed-off-by: Paul Dittamo * add unit test Signed-off-by: Paul Dittamo --------- Signed-off-by: Paul Dittamo --- .../pluginmachinery/flytek8s/pod_helper.go | 9 ++++++- .../flytek8s/pod_helper_test.go | 24 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go index 7f010f6b7c..e25b3a4994 100644 --- a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go +++ b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go @@ -851,9 +851,11 @@ func DemystifyFailure(status v1.PodStatus, info pluginsCore.TaskInfo) (pluginsCo // } // } // + + var isSystemError bool // In some versions of GKE the reason can also be "Terminated" if code == "Shutdown" || code == "Terminated" { - return pluginsCore.PhaseInfoSystemRetryableFailure(Interrupted, message, &info), nil + isSystemError = true } // @@ -887,6 +889,11 @@ func DemystifyFailure(status v1.PodStatus, info pluginsCore.TaskInfo) (pluginsCo } } } + + if isSystemError { + return pluginsCore.PhaseInfoSystemRetryableFailure(Interrupted, message, &info), nil + } + return pluginsCore.PhaseInfoRetryableFailure(code, message, &info), nil } diff --git a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go index 08b6c5b9d0..7869ed400f 100644 --- a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go +++ b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go @@ -1631,25 +1631,49 @@ func TestDemystifyFailure(t *testing.T) { }) t.Run("GKE kubelet graceful node shutdown", func(t *testing.T) { + containerReason := "some reason" phaseInfo, err := DemystifyFailure(v1.PodStatus{ Message: "Pod Node is in progress of shutting down, not admitting any new pods", Reason: "Shutdown", + ContainerStatuses: []v1.ContainerStatus{ + { + LastTerminationState: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + Reason: containerReason, + ExitCode: SIGKILL, + }, + }, + }, + }, }, pluginsCore.TaskInfo{}) assert.Nil(t, err) assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase()) assert.Equal(t, "Interrupted", phaseInfo.Err().Code) assert.Equal(t, core.ExecutionError_SYSTEM, phaseInfo.Err().Kind) + assert.Contains(t, phaseInfo.Err().Message, containerReason) }) t.Run("GKE kubelet graceful node shutdown", func(t *testing.T) { + containerReason := "some reason" phaseInfo, err := DemystifyFailure(v1.PodStatus{ Message: "Foobar", Reason: "Terminated", + ContainerStatuses: []v1.ContainerStatus{ + { + LastTerminationState: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + Reason: containerReason, + ExitCode: SIGKILL, + }, + }, + }, + }, }, pluginsCore.TaskInfo{}) assert.Nil(t, err) assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase()) assert.Equal(t, "Interrupted", phaseInfo.Err().Code) assert.Equal(t, core.ExecutionError_SYSTEM, phaseInfo.Err().Kind) + assert.Contains(t, phaseInfo.Err().Message, containerReason) }) }