Skip to content

Commit

Permalink
feat: pod stuck in "terminating" for > 15m (#61)
Browse files Browse the repository at this point in the history
* feat: pod stuck in "terminating" for > 15m

* chore: on pod delete test set the deletion timestamp to 1m ago

* chore: address review comments
  • Loading branch information
adityathebe authored Jul 1, 2024
1 parent 0850096 commit 68bb6ec
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 23 deletions.
27 changes: 15 additions & 12 deletions pkg/health/health_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package health
import (
"fmt"
"strings"
"time"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
Expand All @@ -27,19 +28,21 @@ func getPodHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
isReady := IsPodReady(pod)
if pod.ObjectMeta.DeletionTimestamp != nil && !pod.ObjectMeta.DeletionTimestamp.IsZero() {
if isReady {
return &HealthStatus{
Status: HealthStatusTerminating,
Ready: false,
Health: HealthHealthy,
}, nil
} else {
return &HealthStatus{
Status: HealthStatusTerminating,
Ready: false,
Health: HealthUnhealthy,
}, nil
status := HealthUnknown
message := ""

terminatingFor := time.Since(pod.ObjectMeta.DeletionTimestamp.Time)
if terminatingFor >= time.Minute*15 {
status = HealthWarning
message = fmt.Sprintf("stuck in 'Terminating' for %s", terminatingFor)
}

return &HealthStatus{
Status: HealthStatusTerminating,
Ready: false,
Health: status,
Message: message,
}, nil
}

if pod.Status.Reason == "Evicted" {
Expand Down
35 changes: 30 additions & 5 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,47 @@ package health_test
import (
"os"
"testing"
"time"

"github.com/flanksource/is-healthy/pkg/health"
"github.com/flanksource/is-healthy/pkg/lua"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"sigs.k8s.io/yaml"
)

// assertAppHealth computes the health of the resource stored at yamlPath via
// getHealthStatus and asserts that the result is non-nil and that its Health,
// Ready and Status fields equal expectedHealth, expectedReady and
// expectedStatus respectively.
//
// NOTE(review): the two consecutive `health :=` assignments below look like the
// removed and added sides of the same diff line (two-argument vs.
// three-argument getHealthStatus call) — confirm which one belongs to the
// revision being read; only one of them can be present in compilable code.
func assertAppHealth(t *testing.T, yamlPath string, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool) {
health := getHealthStatus(yamlPath, t)
health := getHealthStatus(yamlPath, t, nil)
assert.NotNil(t, health)
assert.Equal(t, expectedHealth, health.Health)
assert.Equal(t, expectedReady, health.Ready)
assert.Equal(t, expectedStatus, health.Status)
}

func getHealthStatus(yamlPath string, t *testing.T) *health.HealthStatus {
// assertAppHealthWithOverwrite computes the health of the resource stored at
// yamlPath, forwarding overwrites to getHealthStatus so selected fields of the
// loaded object can be replaced before evaluation (the visible caller uses the
// "deletionTimestamp" key), and asserts that the resulting Health, Ready and
// Status fields equal expectedHealth, expectedReady and expectedStatus.
//
// The nil check uses require rather than assert: the field comparisons below
// dereference the result, so the test must stop immediately on a nil status
// instead of panicking with a nil pointer dereference.
func assertAppHealthWithOverwrite(t *testing.T, yamlPath string, overwrites map[string]any, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool) {
	// Attribute failures to the calling test line rather than to this helper.
	t.Helper()

	// Named status (not health) so the imported health package is not shadowed.
	status := getHealthStatus(yamlPath, t, overwrites)
	require.NotNil(t, status)

	assert.Equal(t, expectedHealth, status.Health)
	assert.Equal(t, expectedReady, status.Ready)
	assert.Equal(t, expectedStatus, status.Status)
}

func getHealthStatus(yamlPath string, t *testing.T, overwrites map[string]any) *health.HealthStatus {
yamlBytes, err := os.ReadFile(yamlPath)
require.NoError(t, err)
var obj unstructured.Unstructured
err = yaml.Unmarshal(yamlBytes, &obj)
require.NoError(t, err)

for k, v := range overwrites {
switch k {
case "deletionTimestamp":
obj.SetDeletionTimestamp(v.(*v1.Time))
}
}

health, err := health.GetResourceHealth(&obj, lua.ResourceHealthOverrides{})
require.NoError(t, err)
return health
Expand Down Expand Up @@ -126,6 +144,10 @@ func TestHPA(t *testing.T) {
}

func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/pod-terminating.yaml", health.HealthStatusTerminating, health.HealthWarning, false)
status := getHealthStatus("./testdata/pod-terminating.yaml", t, nil)
assert.Contains(t, status.Message, "stuck in 'Terminating' for")

assertAppHealth(t, "./testdata/pod-pending.yaml", health.HealthStatusPending, health.HealthUnknown, false)
assertAppHealth(t, "./testdata/pod-running-not-ready.yaml", health.HealthStatusStarting, health.HealthUnknown, false)
assertAppHealth(t, "./testdata/pod-crashloop.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)
Expand All @@ -136,8 +158,11 @@ func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/pod-running-restart-onfailure.yaml", health.HealthStatusRunning, health.HealthUnhealthy, false)
assertAppHealth(t, "./testdata/pod-failed.yaml", health.HealthStatusError, health.HealthUnhealthy, true)
assertAppHealth(t, "./testdata/pod-succeeded.yaml", health.HealthStatusCompleted, health.HealthHealthy, true)
assertAppHealth(t, "./testdata/pod-deletion.yaml", health.HealthStatusTerminating, health.HealthUnhealthy, false)
assertAppHealth(t, "./testdata/pod-init-container-fail.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)

assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]any{
"deletionTimestamp": &v1.Time{Time: time.Now().Add(-time.Minute)},
}, health.HealthStatusTerminating, health.HealthUnknown, false)
}

// func TestAPIService(t *testing.T) {
Expand Down Expand Up @@ -206,13 +231,13 @@ func TestFluxResources(t *testing.T) {
assertAppHealth(t, "./testdata/flux-kustomization-healthy.yaml", "Succeeded", health.HealthHealthy, true)
assertAppHealth(t, "./testdata/flux-kustomization-unhealthy.yaml", "Progressing", health.HealthUnknown, false)
assertAppHealth(t, "./testdata/flux-kustomization-failed.yaml", "BuildFailed", health.HealthUnhealthy, false)
status := getHealthStatus("./testdata/flux-kustomization-failed.yaml", t)
status := getHealthStatus("./testdata/flux-kustomization-failed.yaml", t, nil)
assert.Contains(t, status.Message, "err='accumulating resources from 'kubernetes_resource_ingress_fail.yaml'")

assertAppHealth(t, "./testdata/flux-helmrelease-healthy.yaml", "ReconciliationSucceeded", health.HealthHealthy, true)
assertAppHealth(t, "./testdata/flux-helmrelease-unhealthy.yaml", "UpgradeFailed", health.HealthUnhealthy, true)
assertAppHealth(t, "./testdata/flux-helmrelease-upgradefailed.yaml", "UpgradeFailed", health.HealthUnhealthy, true)
helmreleaseStatus := getHealthStatus("./testdata/flux-helmrelease-upgradefailed.yaml", t)
helmreleaseStatus := getHealthStatus("./testdata/flux-helmrelease-upgradefailed.yaml", t, nil)
assert.Contains(t, helmreleaseStatus.Message, "Helm upgrade failed for release mission-control-agent/prod-kubernetes-bundle with chart [email protected]: YAML parse error on mission-control-kubernetes/templates/topology.yaml: error converting YAML to JSON: yaml: line 171: did not find expected '-' indicator")
assert.Equal(t, helmreleaseStatus.Status, health.HealthStatusUpgradeFailed)

Expand Down
9 changes: 3 additions & 6 deletions pkg/health/testdata/pod-deletion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,18 +44,15 @@ spec:
secretName: default-token-f9jvj
status:
conditions:
- lastProbeTime: null
lastTransitionTime: 2018-12-02T10:16:04Z
- lastTransitionTime: 2018-12-02T10:16:04Z
status: "True"
type: Initialized
- lastProbeTime: null
lastTransitionTime: 2018-12-02T10:16:04Z
- lastTransitionTime: 2018-12-02T10:16:04Z
message: 'containers with unready status: [main]'
reason: ContainersNotReady
status: "False"
type: Ready
- lastProbeTime: null
lastTransitionTime: 2018-12-02T10:16:04Z
- lastTransitionTime: 2018-12-02T10:16:04Z
status: "True"
type: PodScheduled
containerStatuses:
Expand Down
111 changes: 111 additions & 0 deletions pkg/health/testdata/pod-terminating.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
apiVersion: v1
kind: Pod
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"v1","kind":"Pod","metadata":{"annotations":{},"finalizers":["example.com/test-finalizer"],"name":"test-pod","namespace":"default"},"spec":{"containers":[{"command":["sh","-c","while true; do echo hello; sleep 10;done"],"image":"busybox","name":"test-container"}]}}
creationTimestamp: "2024-07-01T05:51:36Z"
deletionGracePeriodSeconds: 0
deletionTimestamp: "2024-07-01T06:52:22Z"
finalizers:
- example.com/test-finalizer
name: test-pod
namespace: default
resourceVersion: "58029548"
uid: 4bb10d70-5481-41e9-bf05-43b740bf6ffa
spec:
containers:
- command:
- sh
- -c
- while true; do echo hello; sleep 10;done
image: busybox
imagePullPolicy: Always
name: test-container
resources: {}
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: kube-api-access-784np
readOnly: true
dnsPolicy: ClusterFirst
enableServiceLinks: true
nodeName: esr
preemptionPolicy: PreemptLowerPriority
priority: 0
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
serviceAccount: default
serviceAccountName: default
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoExecute
key: node.kubernetes.io/not-ready
operator: Exists
tolerationSeconds: 300
- effect: NoExecute
key: node.kubernetes.io/unreachable
operator: Exists
tolerationSeconds: 300
volumes:
- name: kube-api-access-784np
projected:
defaultMode: 420
sources:
- serviceAccountToken:
expirationSeconds: 3607
path: token
- configMap:
items:
- key: ca.crt
path: ca.crt
name: kube-root-ca.crt
- downwardAPI:
items:
- fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
path: namespace
status:
conditions:
- lastTransitionTime: "2024-07-01T08:51:36Z"
status: "True"
type: Initialized
- lastTransitionTime: "2024-07-01T08:52:53Z"
message: 'containers with unready status: [test-container]'
reason: ContainersNotReady
status: "False"
type: Ready
- lastTransitionTime: "2024-07-01T08:52:53Z"
message: 'containers with unready status: [test-container]'
reason: ContainersNotReady
status: "False"
type: ContainersReady
- lastTransitionTime: "2024-07-01T08:51:36Z"
status: "True"
type: PodScheduled
containerStatuses:
- containerID: containerd://06962418f541510abda8a61803dd03cd27cc1b309402006420d8a5e8069569ce
image: docker.io/library/busybox:latest
imageID: docker.io/library/busybox@sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7
lastState: {}
name: test-container
ready: false
restartCount: 0
started: false
state:
terminated:
containerID: containerd://06962418f541510abda8a61803dd03cd27cc1b309402006420d8a5e8069569ce
exitCode: 137
finishedAt: "2024-07-01T08:52:52Z"
reason: Error
startedAt: "2024-07-01T08:52:10Z"
hostIP: 10.99.99.9
phase: Running
podIP: 10.42.1.123
podIPs:
- ip: 10.42.1.123
qosClass: BestEffort
startTime: "2024-07-01T08:51:36Z"

0 comments on commit 68bb6ec

Please sign in to comment.