diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index bfbed85..84eca13 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -84,6 +84,21 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { health = msg.Health status = msg.Status messages = append(messages, msg.Message) + } else if containerStatus.RestartCount > 2 && containerStatus.LastTerminationState.Terminated != nil { + lastRestarted := containerStatus.LastTerminationState.Terminated.FinishedAt.Time + if time.Since(lastRestarted) < time.Minute*30 { + return &HealthStatus{ + Health: HealthUnhealthy, + Status: HealthStatusCode(containerStatus.LastTerminationState.Terminated.Reason), + Message: strings.Join(messages, ", "), + }, nil + } else if time.Since(lastRestarted) < time.Hour*8 { + return &HealthStatus{ + Health: HealthWarning, + Status: HealthStatusCode(containerStatus.LastTerminationState.Terminated.Reason), + Message: strings.Join(messages, ", "), + }, nil + } } } diff --git a/pkg/health/health_test.go b/pkg/health/health_test.go index 1970f8d..63fdbe9 100644 --- a/pkg/health/health_test.go +++ b/pkg/health/health_test.go @@ -6,6 +6,7 @@ package health_test import ( "os" + "strings" "testing" "time" @@ -13,7 +14,6 @@ import ( "github.com/flanksource/is-healthy/pkg/lua" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "sigs.k8s.io/yaml" ) @@ -26,7 +26,7 @@ func assertAppHealth(t *testing.T, yamlPath string, expectedStatus health.Health assert.Equal(t, expectedStatus, health.Status) } -func assertAppHealthWithOverwrite(t *testing.T, yamlPath string, overwrites map[string]any, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool) { +func assertAppHealthWithOverwrite(t *testing.T, yamlPath string, overwrites map[string]string, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool) { health := getHealthStatus(yamlPath, t, overwrites) assert.NotNil(t, health) assert.Equal(t, expectedHealth, health.Health) @@ -34,20 +34,19 @@ func assertAppHealthWithOverwrite(t *testing.T, yamlPath string, overwrites map[ assert.Equal(t, expectedStatus, health.Status) } -func getHealthStatus(yamlPath string, t *testing.T, overwrites map[string]any) *health.HealthStatus { +func getHealthStatus(yamlPath string, t *testing.T, overwrites map[string]string) *health.HealthStatus { yamlBytes, err := os.ReadFile(yamlPath) require.NoError(t, err) - var obj unstructured.Unstructured - err = yaml.Unmarshal(yamlBytes, &obj) - require.NoError(t, err) + // Basic, search & replace overwrite for k, v := range overwrites { - switch k { - case "deletionTimestamp": - obj.SetDeletionTimestamp(v.(*v1.Time)) - } + yamlBytes = []byte(strings.ReplaceAll(string(yamlBytes), k, v)) } + var obj unstructured.Unstructured + err = yaml.Unmarshal(yamlBytes, &obj) + require.NoError(t, err) + health, err := health.GetResourceHealth(&obj, lua.ResourceHealthOverrides{}) require.NoError(t, err) return health @@ -144,6 +143,23 @@ func TestHPA(t *testing.T) { } func TestPod(t *testing.T) { + // Less than 30 minutes + assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{ + "2024-07-17T14:29:51Z": time.Now().UTC().Add(-time.Minute).Format("2006-01-02T15:04:05Z"), + }, "OOMKilled", health.HealthUnhealthy, false) + + // Less than 8 hours + assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{ + "2024-07-17T14:29:51Z": time.Now().UTC().Add(-time.Hour).Format("2006-01-02T15:04:05Z"), + }, "OOMKilled", health.HealthWarning, false) + + // More than 8 hours + assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{ + "2024-07-17T14:29:51Z": "2024-06-17T14:29:51Z", + }, health.HealthStatusRunning, health.HealthHealthy, true) + + assertAppHealth(t, "./testdata/pod-old-restarts.yaml", health.HealthStatusRunning, health.HealthHealthy, true) + assertAppHealth(t, "./testdata/pod-terminating.yaml", health.HealthStatusTerminating, health.HealthWarning, false) status := getHealthStatus("./testdata/pod-terminating.yaml", t, nil) assert.Contains(t, status.Message, "stuck in 'Terminating' for") @@ -160,8 +176,8 @@ func TestPod(t *testing.T) { assertAppHealth(t, "./testdata/pod-succeeded.yaml", health.HealthStatusCompleted, health.HealthHealthy, true) assertAppHealth(t, "./testdata/pod-init-container-fail.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false) - assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]any{ - "deletionTimestamp": &v1.Time{Time: time.Now().Add(-time.Minute)}, + assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]string{ + "2018-12-03T10:16:04Z": time.Now().Add(-time.Minute).Format("2006-01-02T15:04:05Z"), }, health.HealthStatusTerminating, health.HealthUnknown, false) } diff --git a/pkg/health/testdata/pod-high-restart-count.yaml b/pkg/health/testdata/pod-high-restart-count.yaml new file mode 100644 index 0000000..5900e6f --- /dev/null +++ b/pkg/health/testdata/pod-high-restart-count.yaml @@ -0,0 +1,170 @@ +apiVersion: v1 +kind: Pod +metadata: + uid: ba54f218-3435-464c-9f37-75ac7d76104a + name: config-db-5867b6596f-gs79g + labels: + control-plane: config-db + pod-template-hash: 5867b6596f + app.kubernetes.io/name: config-db + app.kubernetes.io/instance: mission-control + namespace: mission-control + generateName: config-db-5867b6596f- + ownerReferences: + - uid: 00427427-8dee-4003-bd4e-496b0cc275d1 + kind: ReplicaSet + name: config-db-5867b6596f + apiVersion: apps/v1 + controller: true + blockOwnerDeletion: true + creationTimestamp: 2024-07-16T13:31:23Z +spec: + volumes: + - name: aws-iam-token + projected: + sources: + - serviceAccountToken: + path: token + audience: sts.amazonaws.com + expirationSeconds: 86400 + defaultMode: 420 + - name: kube-api-access-7jmn6 + projected: + sources: + - serviceAccountToken: + path: token + expirationSeconds: 3607 + - configMap: + name: kube-root-ca.crt + items: + - key: ca.crt + path: ca.crt + - downwardAPI: + items: + - path: namespace + fieldRef: + fieldPath: metadata.namespace + apiVersion: v1 + defaultMode: 420 + nodeName: ip-10-0-6-40.eu-west-1.compute.internal + priority: 0 + dnsPolicy: ClusterFirst + containers: + - env: + - name: DB_URL + valueFrom: + secretKeyRef: + key: DB_URL + name: incident-commander-postgres + - name: NAMESPACE + value: mission-control + - name: AWS_STS_REGIONAL_ENDPOINTS + value: regional + - name: AWS_DEFAULT_REGION + value: eu-west-1 + - name: AWS_REGION + value: eu-west-1 + - name: AWS_ROLE_ARN + value: arn:aws:iam::765618022540:role/eksctl-config-db-sa + - name: AWS_WEB_IDENTITY_TOKEN_FILE + value: /var/run/secrets/eks.amazonaws.com/serviceaccount/token + args: + - operator + - --disable-postgrest=true + - --change-retention-days=60 + - --analysis-retention-days=60 + - --json-logs + - --otel-collector-url=grafana-alloy.monitoring:4317 + - --otel-service-name=config-db + name: config-db + image: public.ecr.aws/k4y9r6y5/config-db:v0.0.400 + command: + - /app/config-db + resources: + limits: + cpu: 500m + memory: 4Gi + requests: + cpu: 200m + memory: 1Gi + volumeMounts: + - name: kube-api-access-7jmn6 + readOnly: true + mountPath: /var/run/secrets/kubernetes.io/serviceaccount + - name: aws-iam-token + readOnly: true + mountPath: /var/run/secrets/eks.amazonaws.com/serviceaccount + livenessProbe: + httpGet: + path: /live + port: 8080 + scheme: HTTP + periodSeconds: 10 + timeoutSeconds: 1 + failureThreshold: 3 + successThreshold: 1 + readinessProbe: + httpGet: + path: /ready + port: 8080 + scheme: HTTP + periodSeconds: 10 + timeoutSeconds: 1 + failureThreshold: 3 + successThreshold: 1 + imagePullPolicy: IfNotPresent + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + tolerations: + - key: node.kubernetes.io/not-ready + effect: NoExecute + operator: Exists + tolerationSeconds: 300 + - key: node.kubernetes.io/unreachable + effect: NoExecute + operator: Exists + tolerationSeconds: 300 + restartPolicy: Always + schedulerName: default-scheduler + serviceAccount: config-db-sa + securityContext: + fsGroup: 1000 + preemptionPolicy: PreemptLowerPriority + enableServiceLinks: true + serviceAccountName: config-db-sa + terminationGracePeriodSeconds: 30 +status: + phase: Running + podIP: 10.0.6.38 + hostIP: 10.0.6.40 + podIPs: + - ip: 10.0.6.38 + qosClass: Burstable + startTime: 2024-07-16T13:31:23Z + conditions: + - type: Initialized + status: "True" + - type: Ready + status: "True" + - type: ContainersReady + status: "True" + - type: PodScheduled + status: "True" + containerStatuses: + - name: config-db + image: public.ecr.aws/k4y9r6y5/config-db:v0.0.400 + ready: true + state: + running: + startedAt: 2024-07-17T14:29:52Z + imageID: public.ecr.aws/k4y9r6y5/config-db@sha256:b8803113097931662bda448b53c6ca256957957d74d5e8fd1fc442cec197b025 + started: true + lastState: + terminated: + reason: OOMKilled + exitCode: 137 + startedAt: 2024-07-17T14:13:28Z + finishedAt: 2024-07-17T14:29:51Z + containerID: containerd://ee5467962528e4a836dfb48cf9f23d7d547eb7e17cb0e96b9ebf698b05d04420 + containerID: containerd://68dab40e5ad9d6a66477dd2d388e2d7bf37607743b48c0aa454a623d4fa7f7a7 + restartCount: 101 diff --git a/pkg/health/testdata/pod-old-restarts.yaml b/pkg/health/testdata/pod-old-restarts.yaml new file mode 100644 index 0000000..265adb0 --- /dev/null +++ b/pkg/health/testdata/pod-old-restarts.yaml @@ -0,0 +1,143 @@ +apiVersion: v1 +kind: Pod +metadata: + uid: ab9e1119-ec17-4b62-bcc4-5f4dad41ce91 + name: cert-manager-webhook-6fb57c4ff5-v5nm6 + labels: + app: webhook + pod-template-hash: 6fb57c4ff5 + app.kubernetes.io/name: webhook + app.kubernetes.io/version: v1.6.1 + app.kubernetes.io/instance: cert-manager + app.kubernetes.io/component: webhook + namespace: cert-manager + annotations: + cni.projectcalico.org/podIP: 10.244.3.109/32 + cni.projectcalico.org/podIPs: 10.244.3.109/32 + cni.projectcalico.org/containerID: 7d6b2c09ed44e337c4b01c3ba909f92c59e3ed1dda58f8205e59fc300b16cf8c + generateName: cert-manager-webhook-6fb57c4ff5- + ownerReferences: + - uid: 9133f8a8-b4ad-4701-9838-0b2f45174d05 + kind: ReplicaSet + name: cert-manager-webhook-6fb57c4ff5 + apiVersion: apps/v1 + controller: true + blockOwnerDeletion: true + creationTimestamp: 2024-01-10T14:33:26Z +spec: + volumes: + - name: kube-api-access-qwsl5 + projected: + sources: + - serviceAccountToken: + path: token + expirationSeconds: 3607 + - configMap: + name: kube-root-ca.crt + items: + - key: ca.crt + path: ca.crt + - downwardAPI: + items: + - path: namespace + fieldRef: + fieldPath: metadata.namespace + apiVersion: v1 + defaultMode: 420 + nodeName: aks-pool1-37358073-vmss000006 + priority: 0 + dnsPolicy: ClusterFirst + containers: + - env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + apiVersion: v1 + args: + - --v=2 + - --secure-port=10250 + - --dynamic-serving-ca-secret-namespace=$(POD_NAMESPACE) + - --dynamic-serving-ca-secret-name=cert-manager-webhook-ca + - --dynamic-serving-dns-names=cert-manager-webhook,cert-manager-webhook.cert-manager,cert-manager-webhook.cert-manager.svc + name: cert-manager + image: quay.io/jetstack/cert-manager-webhook:v1.11.0 + ports: + - name: https + protocol: TCP + containerPort: 10250 + resources: {} + volumeMounts: + - name: kube-api-access-qwsl5 + readOnly: true + mountPath: /var/run/secrets/kubernetes.io/serviceaccount + livenessProbe: + httpGet: + path: /livez + port: 6080 + scheme: HTTP + periodSeconds: 10 + timeoutSeconds: 1 + failureThreshold: 3 + successThreshold: 1 + initialDelaySeconds: 60 + readinessProbe: + httpGet: + path: /healthz + port: 6080 + scheme: HTTP + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 3 + successThreshold: 1 + initialDelaySeconds: 5 + imagePullPolicy: IfNotPresent + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + tolerations: + - key: node.kubernetes.io/not-ready + effect: NoExecute + operator: Exists + tolerationSeconds: 300 + - key: node.kubernetes.io/unreachable + effect: NoExecute + operator: Exists + tolerationSeconds: 300 + restartPolicy: Always + schedulerName: default-scheduler + serviceAccount: cert-manager-webhook + securityContext: + runAsNonRoot: true + preemptionPolicy: PreemptLowerPriority + enableServiceLinks: true + serviceAccountName: cert-manager-webhook + terminationGracePeriodSeconds: 30 +status: + phase: Running + podIP: 10.244.3.109 + hostIP: 10.224.0.4 + podIPs: + - ip: 10.244.3.109 + qosClass: BestEffort + startTime: 2024-01-10T14:33:26Z + conditions: + - type: Initialized + status: "True" + - type: Ready + status: "True" + - type: ContainersReady + status: "True" + - type: PodScheduled + status: "True" + containerStatuses: + - name: cert-manager + image: quay.io/jetstack/cert-manager-webhook:v1.11.0 + ready: true + state: + running: + startedAt: 2024-05-05T16:21:02Z + imageID: quay.io/jetstack/cert-manager-webhook@sha256:6730d96fc382a57cde4f7519d2a32b40013e0d3bace2ea0579c7c051fbbd608d + started: true + lastState: {} + containerID: containerd://a68db464779df377d75d7cf815904e8612544feb7aa000c13ef92d79f99be8cd + restartCount: 257