Skip to content

Commit

Permalink
fix: pod restart detection
Browse files Browse the repository at this point in the history
* we don't care about restarts if the container has been running for
  over 4 hours
  • Loading branch information
adityathebe committed Aug 5, 2024
1 parent c4d27a6 commit 940f531
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 31 deletions.
51 changes: 23 additions & 28 deletions pkg/health/health_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,21 +85,6 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
health = msg.Health
status = msg.Status
messages = append(messages, msg.Message)
} else if containerStatus.RestartCount > 2 && containerStatus.LastTerminationState.Terminated != nil {
lastRestarted := containerStatus.LastTerminationState.Terminated.FinishedAt.Time
if time.Since(lastRestarted) < time.Minute*30 {
return &HealthStatus{
Health: HealthUnhealthy,
Status: HealthStatusCode(containerStatus.LastTerminationState.Terminated.Reason),
Message: strings.Join(messages, ", "),
}, nil
} else if time.Since(lastRestarted) < time.Hour*8 {
return &HealthStatus{
Health: HealthWarning,
Status: HealthStatusCode(containerStatus.LastTerminationState.Terminated.Reason),
Message: strings.Join(messages, ", "),
}, nil
}
}
}

Expand Down Expand Up @@ -176,29 +161,39 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
switch pod.Spec.RestartPolicy {
case corev1.RestartPolicyAlways:
if isReady {
health := HealthHealthy
message := pod.Status.Message
h := &HealthStatus{
Health: HealthHealthy,
Ready: true,
Status: HealthStatusRunning,
Message: pod.Status.Message,
}

// A ready pod can be in a warning state if it has been in a restart loop.
// i.e. the container completes successfully, but the pod keeps restarting.
for _, s := range pod.Status.ContainerStatuses {
if s.LastTerminationState.Terminated != nil {
possiblyInRestartLoop := s.RestartCount > 2 &&
s.LastTerminationState.Terminated != nil &&
time.Since(s.State.Running.StartedAt.Time) < time.Hour*4

if possiblyInRestartLoop {
lastTerminatedTime := s.LastTerminationState.Terminated.FinishedAt.Time
if !lastTerminatedTime.IsZero() && pod.Status.StartTime.Sub(lastTerminatedTime) < time.Hour {
health = HealthWarning
message = fmt.Sprintf("%s has restarted %d time(s)", s.Name, pod.Status.ContainerStatuses[0].RestartCount)
h.Message = fmt.Sprintf("%s has restarted %d time(s)", s.Name, pod.Status.ContainerStatuses[0].RestartCount)

if s.LastTerminationState.Terminated.Reason != "Completed" {
h.Status = HealthStatusCode(s.LastTerminationState.Terminated.Reason)
}

break
if time.Since(lastTerminatedTime) < time.Minute*30 {
h.Health = HealthUnhealthy
h.Ready = false
} else if time.Since(lastTerminatedTime) < time.Hour*8 {
h.Health = HealthWarning
h.Ready = false
}
}
}

return &HealthStatus{
Health: health,
Ready: true,
Status: HealthStatusRunning,
Message: message,
}, nil
return h, nil
}

// if it's not ready, check to see if any container terminated, if so, it's degraded
Expand Down
9 changes: 6 additions & 3 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,19 +172,22 @@ func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/pod-not-ready-but-container-ready.yaml", health.HealthStatusRunning, health.HealthWarning, false)

// Restart Loop
assertAppHealth(t, "./testdata/pod-ready-container-terminated.yaml", health.HealthStatusRunning, health.HealthWarning, true)
assertAppHealth(t, "./testdata/pod-ready-container-terminated.yaml", health.HealthStatusRunning, health.HealthHealthy, true)

assertAppHealthWithOverwrite(t, "./testdata/pod-ready-container-terminated.yaml", map[string]string{
"2024-07-18T12:03:16Z": "2024-07-18T12:05:16Z",
}, health.HealthStatusRunning, health.HealthWarning, true)
"2024-07-18T12:03:06Z": time.Now().Add(-time.Minute * 50).UTC().Format("2006-01-02T15:04:05Z"), // container last terminated
}, health.HealthStatusRunning, health.HealthWarning, false)

// Less than 30 minutes
assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{
"2024-07-17T14:29:51Z": time.Now().UTC().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
"2024-07-17T14:29:52Z": time.Now().UTC().Add(-time.Minute).Format("2006-01-02T15:04:05Z"), // start time
}, "OOMKilled", health.HealthUnhealthy, false)

// Less than 8 hours
assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{
"2024-07-17T14:29:51Z": time.Now().UTC().Add(-time.Hour).Format("2006-01-02T15:04:05Z"),
"2024-07-17T14:29:52Z": time.Now().UTC().Add(-time.Hour).Format("2006-01-02T15:04:05Z"), // start time
}, "OOMKilled", health.HealthWarning, false)

// More than 8 hours
Expand Down

0 comments on commit 940f531

Please sign in to comment.