From 4d7605deab20df36a9c7a5e98669ada95e623ad2 Mon Sep 17 00:00:00 2001 From: Aditya Thebe Date: Wed, 17 Jul 2024 20:51:06 +0545 Subject: [PATCH] feat: detect frequently restarting pods --- pkg/health/health_pod.go | 14 ++ pkg/health/health_test.go | 31 ++-- .../testdata/pod-high-restart-count.yaml | 170 ++++++++++++++++++ 3 files changed, 203 insertions(+), 12 deletions(-) create mode 100644 pkg/health/testdata/pod-high-restart-count.yaml diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index bfbed85..290a129 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -84,6 +84,20 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { health = msg.Health status = msg.Status messages = append(messages, msg.Message) + } else if containerStatus.RestartCount > 2 && containerStatus.LastTerminationState.Terminated != nil { + lastRestarted := containerStatus.LastTerminationState.Terminated.FinishedAt.Time + status = HealthStatusCode(containerStatus.LastTerminationState.Terminated.Reason) + if time.Since(lastRestarted) < time.Minute*30 { + health = HealthUnhealthy + } else { + health = HealthWarning + } + + return &HealthStatus{ + Health: health, + Status: status, + Message: strings.Join(messages, ", "), + }, nil } } diff --git a/pkg/health/health_test.go b/pkg/health/health_test.go index 1970f8d..b82a3bf 100644 --- a/pkg/health/health_test.go +++ b/pkg/health/health_test.go @@ -6,6 +6,7 @@ package health_test import ( "os" + "strings" "testing" "time" @@ -13,7 +14,6 @@ import ( "github.com/flanksource/is-healthy/pkg/lua" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "sigs.k8s.io/yaml" ) @@ -26,7 +26,7 @@ func assertAppHealth(t *testing.T, yamlPath string, expectedStatus health.Health assert.Equal(t, expectedStatus, health.Status) } -func assertAppHealthWithOverwrite(t *testing.T, yamlPath string, overwrites map[string]any, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool) { +func assertAppHealthWithOverwrite(t *testing.T, yamlPath string, overwrites map[string]string, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool) { health := getHealthStatus(yamlPath, t, overwrites) assert.NotNil(t, health) assert.Equal(t, expectedHealth, health.Health) @@ -34,20 +34,19 @@ func assertAppHealthWithOverwrite(t *testing.T, yamlPath string, overwrites map[ assert.Equal(t, expectedStatus, health.Status) } -func getHealthStatus(yamlPath string, t *testing.T, overwrites map[string]any) *health.HealthStatus { +func getHealthStatus(yamlPath string, t *testing.T, overwrites map[string]string) *health.HealthStatus { yamlBytes, err := os.ReadFile(yamlPath) require.NoError(t, err) - var obj unstructured.Unstructured - err = yaml.Unmarshal(yamlBytes, &obj) - require.NoError(t, err) + // Basic, search & replace overwrite for k, v := range overwrites { - switch k { - case "deletionTimestamp": - obj.SetDeletionTimestamp(v.(*v1.Time)) - } + yamlBytes = []byte(strings.ReplaceAll(string(yamlBytes), k, v)) } + var obj unstructured.Unstructured + err = yaml.Unmarshal(yamlBytes, &obj) + require.NoError(t, err) + health, err := health.GetResourceHealth(&obj, lua.ResourceHealthOverrides{}) require.NoError(t, err) return health @@ -144,6 +143,14 @@ func TestHPA(t *testing.T) { } func TestPod(t *testing.T) { + assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{ + "2024-07-17T14:29:51Z": time.Now().Add(-time.Minute).Format("2006-01-02T15:04:05Z"), + }, "OOMKilled", health.HealthUnhealthy, false) + + assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{ + "2024-07-17T14:29:51Z": "2024-06-17T14:29:51Z", + }, "OOMKilled", health.HealthWarning, false) + assertAppHealth(t, "./testdata/pod-terminating.yaml", health.HealthStatusTerminating, health.HealthWarning, false) status := getHealthStatus("./testdata/pod-terminating.yaml", t, nil) assert.Contains(t, status.Message, "stuck in 'Terminating' for") @@ -160,8 +167,8 @@ func TestPod(t *testing.T) { assertAppHealth(t, "./testdata/pod-succeeded.yaml", health.HealthStatusCompleted, health.HealthHealthy, true) assertAppHealth(t, "./testdata/pod-init-container-fail.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false) - assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]any{ - "deletionTimestamp": &v1.Time{Time: time.Now().Add(-time.Minute)}, + assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]string{ + "2018-12-03T10:16:04Z": time.Now().Add(-time.Minute).Format("2006-01-02T15:04:05Z"), }, health.HealthStatusTerminating, health.HealthUnknown, false) } diff --git a/pkg/health/testdata/pod-high-restart-count.yaml b/pkg/health/testdata/pod-high-restart-count.yaml new file mode 100644 index 0000000..5900e6f --- /dev/null +++ b/pkg/health/testdata/pod-high-restart-count.yaml @@ -0,0 +1,170 @@ +apiVersion: v1 +kind: Pod +metadata: + uid: ba54f218-3435-464c-9f37-75ac7d76104a + name: config-db-5867b6596f-gs79g + labels: + control-plane: config-db + pod-template-hash: 5867b6596f + app.kubernetes.io/name: config-db + app.kubernetes.io/instance: mission-control + namespace: mission-control + generateName: config-db-5867b6596f- + ownerReferences: + - uid: 00427427-8dee-4003-bd4e-496b0cc275d1 + kind: ReplicaSet + name: config-db-5867b6596f + apiVersion: apps/v1 + controller: true + blockOwnerDeletion: true + creationTimestamp: 2024-07-16T13:31:23Z +spec: + volumes: + - name: aws-iam-token + projected: + sources: + - serviceAccountToken: + path: token + audience: sts.amazonaws.com + expirationSeconds: 86400 + defaultMode: 420 + - name: kube-api-access-7jmn6 + projected: + sources: + - serviceAccountToken: + path: token + expirationSeconds: 3607 + - configMap: + name: kube-root-ca.crt + items: + - key: ca.crt + path: ca.crt + - downwardAPI: + items: + - path: namespace + fieldRef: + fieldPath: metadata.namespace + apiVersion: v1 + defaultMode: 420 + nodeName: ip-10-0-6-40.eu-west-1.compute.internal + priority: 0 + dnsPolicy: ClusterFirst + containers: + - env: + - name: DB_URL + valueFrom: + secretKeyRef: + key: DB_URL + name: incident-commander-postgres + - name: NAMESPACE + value: mission-control + - name: AWS_STS_REGIONAL_ENDPOINTS + value: regional + - name: AWS_DEFAULT_REGION + value: eu-west-1 + - name: AWS_REGION + value: eu-west-1 + - name: AWS_ROLE_ARN + value: arn:aws:iam::765618022540:role/eksctl-config-db-sa + - name: AWS_WEB_IDENTITY_TOKEN_FILE + value: /var/run/secrets/eks.amazonaws.com/serviceaccount/token + args: + - operator + - --disable-postgrest=true + - --change-retention-days=60 + - --analysis-retention-days=60 + - --json-logs + - --otel-collector-url=grafana-alloy.monitoring:4317 + - --otel-service-name=config-db + name: config-db + image: public.ecr.aws/k4y9r6y5/config-db:v0.0.400 + command: + - /app/config-db + resources: + limits: + cpu: 500m + memory: 4Gi + requests: + cpu: 200m + memory: 1Gi + volumeMounts: + - name: kube-api-access-7jmn6 + readOnly: true + mountPath: /var/run/secrets/kubernetes.io/serviceaccount + - name: aws-iam-token + readOnly: true + mountPath: /var/run/secrets/eks.amazonaws.com/serviceaccount + livenessProbe: + httpGet: + path: /live + port: 8080 + scheme: HTTP + periodSeconds: 10 + timeoutSeconds: 1 + failureThreshold: 3 + successThreshold: 1 + readinessProbe: + httpGet: + path: /ready + port: 8080 + scheme: HTTP + periodSeconds: 10 + timeoutSeconds: 1 + failureThreshold: 3 + successThreshold: 1 + imagePullPolicy: IfNotPresent + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + tolerations: + - key: node.kubernetes.io/not-ready + effect: NoExecute + operator: Exists + tolerationSeconds: 300 + - key: node.kubernetes.io/unreachable + effect: NoExecute + operator: Exists + tolerationSeconds: 300 + restartPolicy: Always + schedulerName: default-scheduler + serviceAccount: config-db-sa + securityContext: + fsGroup: 1000 + preemptionPolicy: PreemptLowerPriority + enableServiceLinks: true + serviceAccountName: config-db-sa + terminationGracePeriodSeconds: 30 +status: + phase: Running + podIP: 10.0.6.38 + hostIP: 10.0.6.40 + podIPs: + - ip: 10.0.6.38 + qosClass: Burstable + startTime: 2024-07-16T13:31:23Z + conditions: + - type: Initialized + status: "True" + - type: Ready + status: "True" + - type: ContainersReady + status: "True" + - type: PodScheduled + status: "True" + containerStatuses: + - name: config-db + image: public.ecr.aws/k4y9r6y5/config-db:v0.0.400 + ready: true + state: + running: + startedAt: 2024-07-17T14:29:52Z + imageID: public.ecr.aws/k4y9r6y5/config-db@sha256:b8803113097931662bda448b53c6ca256957957d74d5e8fd1fc442cec197b025 + started: true + lastState: + terminated: + reason: OOMKilled + exitCode: 137 + startedAt: 2024-07-17T14:13:28Z + finishedAt: 2024-07-17T14:29:51Z + containerID: containerd://ee5467962528e4a836dfb48cf9f23d7d547eb7e17cb0e96b9ebf698b05d04420 + containerID: containerd://68dab40e5ad9d6a66477dd2d388e2d7bf37607743b48c0aa454a623d4fa7f7a7 + restartCount: 101