Skip to content

Commit

Permalink
chore: pod health fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
moshloop committed Nov 20, 2024
1 parent e895391 commit 2a4791b
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 7 deletions.
1 change: 0 additions & 1 deletion pkg/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ const (
HealthStatusEvicted HealthStatusCode = "Evicted"
HealthStatusCompleted HealthStatusCode = "Completed"
HealthStatusCrashLoopBackoff HealthStatusCode = "CrashLoopBackOff"
HealthStatusCrashLoop HealthStatusCode = "CrashLoop"
HealthStatusCrashed HealthStatusCode = "Crashed"
HealthStatusCreating HealthStatusCode = "Creating"
HealthStatusDeleted HealthStatusCode = "Deleted"
Expand Down
2 changes: 1 addition & 1 deletion pkg/health/health_deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ func getReplicaHealth(s ReplicaStatus) *HealthStatus {
hs.Status = HealthStatusStarting
} else if s.Ready == 0 && !isStarting {
hs.Health = HealthUnhealthy
hs.Status = HealthStatusCrashLoop
hs.Status = HealthStatusCrashLoopBackoff
} else if s.Desired == 0 && s.Replicas > 0 {
hs.Status = HealthStatusScalingDown
hs.Health = lo.Ternary(isProgressDeadlineExceeded, HealthWarning, HealthHealthy)
Expand Down
11 changes: 8 additions & 3 deletions pkg/health/health_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,16 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
hr.Health = hr.Health.Worst(lo.Ternary(isReady, HealthHealthy, HealthUnhealthy))
}

if isStarting && hr.Health.IsWorseThan(HealthWarning) &&
(terminated != nil && terminated.Status != HealthStatusOOMKilled) {
if isStarting && hr.Health.IsWorseThan(HealthWarning) {
hr.Health = HealthUnknown
hr.Message = fmt.Sprintf("%s %s", string(hr.Status), hr.Message)
hr.Message = strings.TrimSpace(fmt.Sprintf("%s %s", string(hr.Status), hr.Message))
hr.Status = HealthStatusStarting

if terminated != nil && terminated.Status == HealthStatusOOMKilled {
// an OOMKilled container during startup is unlikely to recover on its own, so report it as unhealthy
hr.Health = HealthUnhealthy
hr.Status = "Starting OOMKilled"
}
}

return &hr, nil
Expand Down
4 changes: 2 additions & 2 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ func TestStatefulSetHealth(t *testing.T) {
assertAppHealthMsg(
t,
"./testdata/statefulset-starting.yaml",
health.HealthStatusCrashLoop,
health.HealthStatusCrashLoopBackoff,
health.HealthUnhealthy,
true,
"0/1 ready",
Expand All @@ -419,7 +419,7 @@ func TestStatefulSetHealth(t *testing.T) {
assertAppHealthMsg(
t,
"./testdata/statefulset-starting.yaml",
health.HealthStatusCrashLoop,
health.HealthStatusCrashLoopBackoff,
health.HealthUnhealthy,
true,
"0/1 ready",
Expand Down
78 changes: 78 additions & 0 deletions pkg/health/testdata/Kubernetes/Pod/container-creating.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Test fixture: a Pod in phase Pending whose only container ("main") is still
# waiting with reason ContainerCreating.
# The expected-status / expected-health annotations below record the result this
# fixture is meant to produce (Starting / unknown) — presumably read by the test
# harness; confirm against the test runner.
apiVersion: v1
kind: Pod
metadata:
# NOTE(review): "@now-5m" looks like a placeholder for a timestamp relative to
# the test run (5 minutes ago) — confirm how the harness expands it.
creationTimestamp: "@now-5m"
# NOTE(review): name and selfLink say "image-pull-backoff" although this file is
# container-creating.yaml — apparently copied from another fixture; confirm
# whether the name matters to any test.
name: image-pull-backoff
namespace: argocd
annotations:
expected-status: Starting
expected-health: unknown
resourceVersion: "155333"
selfLink: /api/v1/namespaces/argocd/pods/image-pull-backoff
uid: 46c1e8de-f61b-11e8-a057-fe5f49266390
spec:
containers:
- image: does-not-exist
imagePullPolicy: Always
name: main
resources: {}
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: default-token-f9jvj
readOnly: true
dnsPolicy: ClusterFirst
nodeName: minikube
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
serviceAccount: default
serviceAccountName: default
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoExecute
key: node.kubernetes.io/not-ready
operator: Exists
tolerationSeconds: 300
- effect: NoExecute
key: node.kubernetes.io/unreachable
operator: Exists
tolerationSeconds: 300
volumes:
- name: default-token-f9jvj
secret:
defaultMode: 420
secretName: default-token-f9jvj
status:
# Pod is initialized and scheduled, but not Ready because container "main" is unready.
conditions:
- lastProbeTime: null
lastTransitionTime: 2018-12-02T10:16:04Z
status: "True"
type: Initialized
- lastProbeTime: null
lastTransitionTime: 2018-12-02T10:16:04Z
message: 'containers with unready status: [main]'
reason: ContainersNotReady
status: "False"
type: Ready
- lastProbeTime: null
lastTransitionTime: 2018-12-02T10:16:04Z
status: "True"
type: PodScheduled
# The single container has not started, has zero restarts, and is waiting in
# ContainerCreating — the state this fixture exists to exercise.
containerStatuses:
- image: does-not-exist
imageID: ""
lastState: {}
name: main
ready: false
started: false
restartCount: 0
state:
waiting:
reason: ContainerCreating
hostIP: 192.168.64.41
phase: Pending
podIP: 172.17.0.9
qosClass: BestEffort
startTime: 2018-12-02T10:16:04Z
1 change: 1 addition & 0 deletions pkg/health/testdata/Kubernetes/Pod/pod-pending.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ status:
lastState: {}
name: main
ready: false
started: true
restartCount: 0
state:
waiting:
Expand Down

0 comments on commit 2a4791b

Please sign in to comment.