From 68bb6ec865450b83245b81dfb5fa0fb6e048679d Mon Sep 17 00:00:00 2001 From: Aditya Thebe Date: Mon, 1 Jul 2024 18:48:58 +0545 Subject: [PATCH] feat: pod stuck in "terminating" for > 15m (#61) * feat: pod stuck in "terminating" for > 15m * chore: on pod delete test set the deletion timestamp to 1m ago * chore: address review comments --- pkg/health/health_pod.go | 27 +++--- pkg/health/health_test.go | 35 ++++++- pkg/health/testdata/pod-deletion.yaml | 9 +- pkg/health/testdata/pod-terminating.yaml | 111 +++++++++++++++++++++++ 4 files changed, 159 insertions(+), 23 deletions(-) create mode 100644 pkg/health/testdata/pod-terminating.yaml diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index 7dedbb5..bfbed85 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -3,6 +3,7 @@ package health import ( "fmt" "strings" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -27,19 +28,21 @@ func getPodHealth(obj *unstructured.Unstructured) (*HealthStatus, error) { func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { isReady := IsPodReady(pod) if pod.ObjectMeta.DeletionTimestamp != nil && !pod.ObjectMeta.DeletionTimestamp.IsZero() { - if isReady { - return &HealthStatus{ - Status: HealthStatusTerminating, - Ready: false, - Health: HealthHealthy, - }, nil - } else { - return &HealthStatus{ - Status: HealthStatusTerminating, - Ready: false, - Health: HealthUnhealthy, - }, nil + status := HealthUnknown + message := "" + + terminatingFor := time.Since(pod.ObjectMeta.DeletionTimestamp.Time) + if terminatingFor >= time.Minute*15 { + status = HealthWarning + message = fmt.Sprintf("stuck in 'Terminating' for %s", terminatingFor) } + + return &HealthStatus{ + Status: HealthStatusTerminating, + Ready: false, + Health: status, + Message: message, + }, nil } if pod.Status.Reason == "Evicted" { diff --git a/pkg/health/health_test.go b/pkg/health/health_test.go index b24cc5e..1970f8d 100644 --- a/pkg/health/health_test.go +++ b/pkg/health/health_test.go @@ -7,29 +7,47 @@ package health_test import ( "os" "testing" + "time" "github.com/flanksource/is-healthy/pkg/health" "github.com/flanksource/is-healthy/pkg/lua" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "sigs.k8s.io/yaml" ) func assertAppHealth(t *testing.T, yamlPath string, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool) { - health := getHealthStatus(yamlPath, t) + health := getHealthStatus(yamlPath, t, nil) assert.NotNil(t, health) assert.Equal(t, expectedHealth, health.Health) assert.Equal(t, expectedReady, health.Ready) assert.Equal(t, expectedStatus, health.Status) } -func getHealthStatus(yamlPath string, t *testing.T) *health.HealthStatus { +func assertAppHealthWithOverwrite(t *testing.T, yamlPath string, overwrites map[string]any, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool) { + health := getHealthStatus(yamlPath, t, overwrites) + assert.NotNil(t, health) + assert.Equal(t, expectedHealth, health.Health) + assert.Equal(t, expectedReady, health.Ready) + assert.Equal(t, expectedStatus, health.Status) +} + +func getHealthStatus(yamlPath string, t *testing.T, overwrites map[string]any) *health.HealthStatus { yamlBytes, err := os.ReadFile(yamlPath) require.NoError(t, err) var obj unstructured.Unstructured err = yaml.Unmarshal(yamlBytes, &obj) require.NoError(t, err) + + for k, v := range overwrites { + switch k { + case "deletionTimestamp": + obj.SetDeletionTimestamp(v.(*v1.Time)) + } + } + health, err := health.GetResourceHealth(&obj, lua.ResourceHealthOverrides{}) require.NoError(t, err) return health @@ -126,6 +144,10 @@ func TestHPA(t *testing.T) { } func TestPod(t *testing.T) { + assertAppHealth(t, "./testdata/pod-terminating.yaml", health.HealthStatusTerminating, health.HealthWarning, false) + status := getHealthStatus("./testdata/pod-terminating.yaml", t, nil) + assert.Contains(t, status.Message, "stuck in 'Terminating' for") + assertAppHealth(t, "./testdata/pod-pending.yaml", health.HealthStatusPending, health.HealthUnknown, false) assertAppHealth(t, "./testdata/pod-running-not-ready.yaml", health.HealthStatusStarting, health.HealthUnknown, false) assertAppHealth(t, "./testdata/pod-crashloop.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false) @@ -136,8 +158,11 @@ func TestPod(t *testing.T) { assertAppHealth(t, "./testdata/pod-running-restart-onfailure.yaml", health.HealthStatusRunning, health.HealthUnhealthy, false) assertAppHealth(t, "./testdata/pod-failed.yaml", health.HealthStatusError, health.HealthUnhealthy, true) assertAppHealth(t, "./testdata/pod-succeeded.yaml", health.HealthStatusCompleted, health.HealthHealthy, true) - assertAppHealth(t, "./testdata/pod-deletion.yaml", health.HealthStatusTerminating, health.HealthUnhealthy, false) assertAppHealth(t, "./testdata/pod-init-container-fail.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false) + + assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]any{ + "deletionTimestamp": &v1.Time{Time: time.Now().Add(-time.Minute)}, + }, health.HealthStatusTerminating, health.HealthUnknown, false) } // func TestAPIService(t *testing.T) { @@ -206,13 +231,13 @@ func TestFluxResources(t *testing.T) { assertAppHealth(t, "./testdata/flux-kustomization-healthy.yaml", "Succeeded", health.HealthHealthy, true) assertAppHealth(t, "./testdata/flux-kustomization-unhealthy.yaml", "Progressing", health.HealthUnknown, false) assertAppHealth(t, "./testdata/flux-kustomization-failed.yaml", "BuildFailed", health.HealthUnhealthy, false) - status := getHealthStatus("./testdata/flux-kustomization-failed.yaml", t) + status := getHealthStatus("./testdata/flux-kustomization-failed.yaml", t, nil) assert.Contains(t, status.Message, "err='accumulating resources from 'kubernetes_resource_ingress_fail.yaml'") assertAppHealth(t, "./testdata/flux-helmrelease-healthy.yaml", "ReconciliationSucceeded", health.HealthHealthy, true) assertAppHealth(t, "./testdata/flux-helmrelease-unhealthy.yaml", "UpgradeFailed", health.HealthUnhealthy, true) assertAppHealth(t, "./testdata/flux-helmrelease-upgradefailed.yaml", "UpgradeFailed", health.HealthUnhealthy, true) - helmreleaseStatus := getHealthStatus("./testdata/flux-helmrelease-upgradefailed.yaml", t) + helmreleaseStatus := getHealthStatus("./testdata/flux-helmrelease-upgradefailed.yaml", t, nil) assert.Contains(t, helmreleaseStatus.Message, "Helm upgrade failed for release mission-control-agent/prod-kubernetes-bundle with chart mission-control-kubernetes@0.1.29: YAML parse error on mission-control-kubernetes/templates/topology.yaml: error converting YAML to JSON: yaml: line 171: did not find expected '-' indicator") assert.Equal(t, helmreleaseStatus.Status, health.HealthStatusUpgradeFailed) diff --git a/pkg/health/testdata/pod-deletion.yaml b/pkg/health/testdata/pod-deletion.yaml index 9bc86bd..efa4ad9 100644 --- a/pkg/health/testdata/pod-deletion.yaml +++ b/pkg/health/testdata/pod-deletion.yaml @@ -44,18 +44,15 @@ spec: secretName: default-token-f9jvj status: conditions: - - lastProbeTime: null - lastTransitionTime: 2018-12-02T10:16:04Z + - lastTransitionTime: 2018-12-02T10:16:04Z status: "True" type: Initialized - - lastProbeTime: null - lastTransitionTime: 2018-12-02T10:16:04Z + - lastTransitionTime: 2018-12-02T10:16:04Z message: 'containers with unready status: [main]' reason: ContainersNotReady status: "False" type: Ready - - lastProbeTime: null - lastTransitionTime: 2018-12-02T10:16:04Z + - lastTransitionTime: 2018-12-02T10:16:04Z status: "True" type: PodScheduled containerStatuses: diff --git a/pkg/health/testdata/pod-terminating.yaml b/pkg/health/testdata/pod-terminating.yaml new file mode 100644 index 0000000..807138c --- /dev/null +++ b/pkg/health/testdata/pod-terminating.yaml @@ -0,0 +1,111 @@ +apiVersion: v1 +kind: Pod +metadata: + annotations: + kubectl.kubernetes.io/last-applied-configuration: | + {"apiVersion":"v1","kind":"Pod","metadata":{"annotations":{},"finalizers":["example.com/test-finalizer"],"name":"test-pod","namespace":"default"},"spec":{"containers":[{"command":["sh","-c","while true; do echo hello; sleep 10;done"],"image":"busybox","name":"test-container"}]}} + creationTimestamp: "2024-07-01T05:51:36Z" + deletionGracePeriodSeconds: 0 + deletionTimestamp: "2024-07-01T06:52:22Z" + finalizers: + - example.com/test-finalizer + name: test-pod + namespace: default + resourceVersion: "58029548" + uid: 4bb10d70-5481-41e9-bf05-43b740bf6ffa +spec: + containers: + - command: + - sh + - -c + - while true; do echo hello; sleep 10;done + image: busybox + imagePullPolicy: Always + name: test-container + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-784np + readOnly: true + dnsPolicy: ClusterFirst + enableServiceLinks: true + nodeName: esr + preemptionPolicy: PreemptLowerPriority + priority: 0 + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + serviceAccount: default + serviceAccountName: default + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - name: kube-api-access-784np + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + expirationSeconds: 3607 + path: token + - configMap: + items: + - key: ca.crt + path: ca.crt + name: kube-root-ca.crt + - downwardAPI: + items: + - fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + path: namespace +status: + conditions: + - lastTransitionTime: "2024-07-01T08:51:36Z" + status: "True" + type: Initialized + - lastTransitionTime: "2024-07-01T08:52:53Z" + message: 'containers with unready status: [test-container]' + reason: ContainersNotReady + status: "False" + type: Ready + - lastTransitionTime: "2024-07-01T08:52:53Z" + message: 'containers with unready status: [test-container]' + reason: ContainersNotReady + status: "False" + type: ContainersReady + - lastTransitionTime: "2024-07-01T08:51:36Z" + status: "True" + type: PodScheduled + containerStatuses: + - containerID: containerd://06962418f541510abda8a61803dd03cd27cc1b309402006420d8a5e8069569ce + image: docker.io/library/busybox:latest + imageID: docker.io/library/busybox@sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7 + lastState: {} + name: test-container + ready: false + restartCount: 0 + started: false + state: + terminated: + containerID: containerd://06962418f541510abda8a61803dd03cd27cc1b309402006420d8a5e8069569ce + exitCode: 137 + finishedAt: "2024-07-01T08:52:52Z" + reason: Error + startedAt: "2024-07-01T08:52:10Z" + hostIP: 10.99.99.9 + phase: Running + podIP: 10.42.1.123 + podIPs: + - ip: 10.42.1.123 + qosClass: BestEffort + startTime: "2024-07-01T08:51:36Z"