From 940f5313eaa195abea44532678694487d2bbdcb1 Mon Sep 17 00:00:00 2001 From: Aditya Thebe Date: Mon, 5 Aug 2024 11:59:18 +0545 Subject: [PATCH] fix: pod restart detection * we don't care about the restarts if the container has been running for over 4hrs --- pkg/health/health_pod.go | 51 ++++++++++++++++++--------------------- pkg/health/health_test.go | 9 ++++--- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index a88ad9b..f6e7afe 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -85,21 +85,6 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { health = msg.Health status = msg.Status messages = append(messages, msg.Message) - } else if containerStatus.RestartCount > 2 && containerStatus.LastTerminationState.Terminated != nil { - lastRestarted := containerStatus.LastTerminationState.Terminated.FinishedAt.Time - if time.Since(lastRestarted) < time.Minute*30 { - return &HealthStatus{ - Health: HealthUnhealthy, - Status: HealthStatusCode(containerStatus.LastTerminationState.Terminated.Reason), - Message: strings.Join(messages, ", "), - }, nil - } else if time.Since(lastRestarted) < time.Hour*8 { - return &HealthStatus{ - Health: HealthWarning, - Status: HealthStatusCode(containerStatus.LastTerminationState.Terminated.Reason), - Message: strings.Join(messages, ", "), - }, nil - } } } @@ -176,29 +161,39 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { switch pod.Spec.RestartPolicy { case corev1.RestartPolicyAlways: if isReady { - health := HealthHealthy - message := pod.Status.Message + h := &HealthStatus{ + Health: HealthHealthy, + Ready: true, + Status: HealthStatusRunning, + Message: pod.Status.Message, + } // A ready pod can be in a warning state if it has been in a restart loop. // i.e. the container completes successfully, but the pod keeps restarting. for _, s := range pod.Status.ContainerStatuses { - if s.LastTerminationState.Terminated != nil { + possiblyInRestartLoop := s.RestartCount > 2 && + s.LastTerminationState.Terminated != nil && + time.Since(s.State.Running.StartedAt.Time) < time.Hour*4 + + if possiblyInRestartLoop { lastTerminatedTime := s.LastTerminationState.Terminated.FinishedAt.Time - if !lastTerminatedTime.IsZero() && pod.Status.StartTime.Sub(lastTerminatedTime) < time.Hour { - health = HealthWarning - message = fmt.Sprintf("%s has restarted %d time(s)", s.Name, pod.Status.ContainerStatuses[0].RestartCount) + h.Message = fmt.Sprintf("%s has restarted %d time(s)", s.Name, pod.Status.ContainerStatuses[0].RestartCount) + + if s.LastTerminationState.Terminated.Reason != "Completed" { + h.Status = HealthStatusCode(s.LastTerminationState.Terminated.Reason) } - break + if time.Since(lastTerminatedTime) < time.Minute*30 { + h.Health = HealthUnhealthy + h.Ready = false + } else if time.Since(lastTerminatedTime) < time.Hour*8 { + h.Health = HealthWarning + h.Ready = false + } } } - return &HealthStatus{ - Health: health, - Ready: true, - Status: HealthStatusRunning, - Message: message, - }, nil + return h, nil } // if it's not ready, check to see if any container terminated, if so, it's degraded diff --git a/pkg/health/health_test.go b/pkg/health/health_test.go index 8f1eca8..0c971ea 100644 --- a/pkg/health/health_test.go +++ b/pkg/health/health_test.go @@ -172,19 +172,22 @@ func TestPod(t *testing.T) { assertAppHealth(t, "./testdata/pod-not-ready-but-container-ready.yaml", health.HealthStatusRunning, health.HealthWarning, false) // Restart Loop - assertAppHealth(t, "./testdata/pod-ready-container-terminated.yaml", health.HealthStatusRunning, health.HealthWarning, true) + assertAppHealth(t, "./testdata/pod-ready-container-terminated.yaml", health.HealthStatusRunning, health.HealthHealthy, true) + assertAppHealthWithOverwrite(t, "./testdata/pod-ready-container-terminated.yaml", map[string]string{ - "2024-07-18T12:03:16Z": "2024-07-18T12:05:16Z", - }, health.HealthStatusRunning, health.HealthWarning, true) + "2024-07-18T12:03:06Z": time.Now().Add(-time.Minute * 50).UTC().Format("2006-01-02T15:04:05Z"), // container last terminated + }, health.HealthStatusRunning, health.HealthWarning, false) // Less than 30 minutes assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{ "2024-07-17T14:29:51Z": time.Now().UTC().Add(-time.Minute).Format("2006-01-02T15:04:05Z"), + "2024-07-17T14:29:52Z": time.Now().UTC().Add(-time.Minute).Format("2006-01-02T15:04:05Z"), // start time }, "OOMKilled", health.HealthUnhealthy, false) // Less than 8 hours assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{ "2024-07-17T14:29:51Z": time.Now().UTC().Add(-time.Hour).Format("2006-01-02T15:04:05Z"), + "2024-07-17T14:29:52Z": time.Now().UTC().Add(-time.Hour).Format("2006-01-02T15:04:05Z"), // start time }, "OOMKilled", health.HealthWarning, false) // More than 8 hours