Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: deployment/sts health fixes #131

Merged
merged 1 commit into from
Nov 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions pkg/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,15 @@ const (
HealthStatusEvicted HealthStatusCode = "Evicted"
HealthStatusCompleted HealthStatusCode = "Completed"
HealthStatusCrashLoopBackoff HealthStatusCode = "CrashLoopBackOff"
HealthStatusCrashLoop HealthStatusCode = "CrashLoop"
HealthStatusCrashed HealthStatusCode = "Crashed"
HealthStatusCreating HealthStatusCode = "Creating"
HealthStatusDeleted HealthStatusCode = "Deleted"
HealthStatusDeleting HealthStatusCode = "Deleting"
HealthStatusTerminating HealthStatusCode = "Terminating"
HealthStatusError HealthStatusCode = "Error"
HealthStatusRolloutFailed HealthStatusCode = "Rollout Failed"
HealthStatusInaccesible HealthStatusCode = "Inaccesible"
HealthStatusInaccesible HealthStatusCode = "Inaccessible"
HealthStatusInfo HealthStatusCode = "Info"
HealthStatusPending HealthStatusCode = "Pending"
HealthStatusMaintenance HealthStatusCode = "Maintenance"
Expand Down Expand Up @@ -147,7 +148,7 @@ func GetResourceHealth(
terminatingFor := time.Since(obj.GetDeletionTimestamp().Time)
return &HealthStatus{
Status: "TerminatingStalled",
Health: HealthUnhealthy,
Health: HealthWarning,
Message: fmt.Sprintf("terminating for %v", duration.ShortHumanDuration(terminatingFor.Truncate(time.Hour))),
}, nil
}
Expand Down Expand Up @@ -198,10 +199,6 @@ func GetHealthCheckFunc(gvk schema.GroupVersionKind) func(obj *unstructured.Unst
return getNodeHealth
}

if strings.HasSuffix(gvk.Group, ".crossplane.io") || strings.HasSuffix(gvk.Group, ".upbound.io") {
return GetDefaultHealth
}

switch gvk.Group {
case "apps":
switch gvk.Kind {
Expand Down Expand Up @@ -264,5 +261,5 @@ func GetHealthCheckFunc(gvk schema.GroupVersionKind) func(obj *unstructured.Unst
return getHPAHealth
}
}
return nil
return GetDefaultHealth
}
18 changes: 18 additions & 0 deletions pkg/health/health_cnrm_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package health_test

import (
"testing"

"github.com/flanksource/is-healthy/pkg/health"
)

// TestCnrmContainer checks that a CNRM ContainerCluster fixture whose last
// update call failed is reported as unhealthy with an "UpdateFailed" status
// and the provider's error text as the health message.
func TestCnrmContainer(t *testing.T) {
	const wantMsg = "Update call failed: error applying desired state: summary: googleapi: Error 403: Google Compute Engine: Required 'compute.networks.get' permission for 'projects/flanksource-prod/global/networks/flanksource-workload'.\nDetails:\n[\n {\n \"@type\": \"type.googleapis.com/google.rpc.RequestInfo\",\n \"requestId\": \"0xf1e9e3ca2797eb18\"\n },\n {\n \"@type\": \"type.googleapis.com/google.rpc.ErrorInfo\",\n \"domain\": \"container.googleapis.com\",\n \"reason\": \"GCE_PERMISSION_DENIED\"\n }\n]\n, forbidden"

	const fixture = "Kubernetes::ContainerCluster/failed.yaml"
	assertAppHealthMsg(t, fixture, "UpdateFailed", health.HealthUnhealthy, true, wantMsg)
}
145 changes: 81 additions & 64 deletions pkg/health/health_deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ package health

import (
"fmt"
"strings"
"time"

"github.com/samber/lo"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"

"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
)

Expand All @@ -24,85 +26,100 @@ func getDeploymentHealth(obj *unstructured.Unstructured) (*HealthStatus, error)
}
}

func getAppsv1DeploymentHealth(deployment *appsv1.Deployment, obj *unstructured.Unstructured) (*HealthStatus, error) {
var containersWaitingForReadiness []string
for _, container := range deployment.Spec.Template.Spec.Containers {
if container.ReadinessProbe != nil && container.ReadinessProbe.InitialDelaySeconds > 0 {
deadline := deployment.CreationTimestamp.Add(
time.Second * time.Duration(container.ReadinessProbe.InitialDelaySeconds),
)
if time.Now().Before(deadline) {
containersWaitingForReadiness = append(containersWaitingForReadiness, container.Name)
}
}
// ReplicaStatus aggregates the replica counters reported by a replicated
// workload (e.g. a Deployment) together with the raw object and its pod
// template containers, so a single health derivation can be shared.
type ReplicaStatus struct {
	// Object is the resource the counters were read from; its creation
	// timestamp is used to decide whether the workload is still starting.
	Object *unstructured.Unstructured
	// Containers are the pod-template containers; their readiness-probe
	// settings feed GetStartDeadline to compute the startup grace period.
	Containers []corev1.Container
	// Desired, Replicas, Ready, Updated, Unavailable mirror the workload's
	// spec.replicas and status replica counters respectively.
	Desired, Replicas, Ready, Updated, Unavailable int
}

func (rs ReplicaStatus) String() string {
s := fmt.Sprintf("%d/%d ready", rs.Ready, rs.Desired)

if rs.Replicas != rs.Updated {
s += fmt.Sprintf(", %d updating", rs.Replicas-rs.Updated)
}

if len(containersWaitingForReadiness) > 0 {
return &HealthStatus{
Health: HealthUnknown,
Status: HealthStatusStarting,
Message: fmt.Sprintf(
"Container(s) %s is waiting for readiness probe",
strings.Join(containersWaitingForReadiness, ","),
),
}, nil
if rs.Replicas > rs.Desired {
s += fmt.Sprintf(", %d terminating", rs.Replicas-rs.Desired)
}
return s
}

status, err := GetDefaultHealth(obj)
if err != nil {
return status, err
func getReplicaHealth(s ReplicaStatus) *HealthStatus {
hs := &HealthStatus{
Message: s.String(),
}
startDeadline := GetStartDeadline(s.Containers...)
age := time.Since(s.Object.GetCreationTimestamp().Time).Truncate(time.Minute).Abs()

replicas := int32(0)
gs := GetGenericStatus(s.Object)

if deployment.Spec.Replicas != nil {
replicas = *deployment.Spec.Replicas
}
progressing := gs.FindCondition("Progressing")
isStarting := age < startDeadline
isProgressDeadlineExceeded := !isStarting && (progressing.Reason == "ProgressDeadlineExceeded")
hs.Ready = progressing.Status == "True"

if replicas == 0 && deployment.Status.Replicas == 0 {
return &HealthStatus{
Ready: true,
Status: HealthStatusScaledToZero,
Health: HealthUnknown,
}, nil
}
hs.Health = lo.Ternary(s.Ready >= s.Desired, HealthHealthy, lo.Ternary(s.Ready > 0, HealthWarning, HealthUnhealthy))

if deployment.Status.ReadyReplicas == replicas {
status.PrependMessage("%d pods ready", deployment.Status.ReadyReplicas)
} else {
status.PrependMessage("%d of %d pods ready", deployment.Status.ReadyReplicas, replicas)
if s.Desired == 0 && s.Replicas == 0 {
hs.Ready = true
hs.Status = HealthStatusScaledToZero
hs.Health = HealthUnknown
return hs
}

if deployment.Spec.Paused {
status.Ready = false
status.Status = HealthStatusSuspended
return status, err
if s.Replicas == 0 {
if isProgressDeadlineExceeded {
hs.Status = "Failed Create"
hs.Health = HealthUnhealthy
} else {
hs.Status = "Pending"
hs.Health = HealthUnknown
}
} else if s.Ready == 0 && isStarting && !isProgressDeadlineExceeded {
hs.Health = HealthUnknown
hs.Status = HealthStatusStarting
} else if s.Ready == 0 && !isStarting {
hs.Health = HealthUnhealthy
hs.Status = HealthStatusCrashLoop
} else if s.Desired == 0 && s.Replicas > 0 {
hs.Status = HealthStatusScalingDown
hs.Health = lo.Ternary(isProgressDeadlineExceeded, HealthWarning, HealthHealthy)
} else if s.Ready == s.Desired && s.Desired == s.Updated && s.Replicas == s.Desired {
hs.Status = HealthStatusRunning
} else if s.Desired != s.Updated {
hs.Status = HealthStatusUpdating
} else if s.Replicas > s.Desired {
hs.Status = HealthStatusScalingDown
} else if s.Replicas < s.Desired {
hs.Status = HealthStatusScalingUp
}

if deployment.Status.ReadyReplicas > 0 {
status.Status = HealthStatusRunning
if isStarting && hs.Health == HealthUnhealthy {
hs.Health = HealthUnknown
}

if status.Health == HealthUnhealthy {
return status, nil
return hs
}

func getAppsv1DeploymentHealth(deployment *appsv1.Deployment, obj *unstructured.Unstructured) (*HealthStatus, error) {
replicas := int32(0)
if deployment.Spec.Replicas != nil {
replicas = *deployment.Spec.Replicas
}

if deployment.Status.ReadyReplicas < replicas {
status.AppendMessage("%d starting", deployment.Status.Replicas-deployment.Status.ReadyReplicas)
if deployment.Status.Replicas < replicas {
status.AppendMessage("%d creating", replicas-deployment.Status.Replicas)
}
status.Ready = false
status.Status = HealthStatusStarting
} else if deployment.Status.UpdatedReplicas < replicas {
status.AppendMessage("%d updating", replicas-deployment.Status.UpdatedReplicas)
status.Ready = false
status.Status = HealthStatusRollingOut
} else if deployment.Status.Replicas > replicas {
status.AppendMessage("%d pods terminating", deployment.Status.Replicas-replicas)
status.Ready = false
status.Status = HealthStatusScalingDown
replicaHealth := getReplicaHealth(
ReplicaStatus{
Object: obj,
Containers: deployment.Spec.Template.Spec.Containers,
Desired: int(replicas), Replicas: int(deployment.Status.Replicas),
Ready: int(deployment.Status.ReadyReplicas), Updated: int(deployment.Status.UpdatedReplicas),
Unavailable: int(deployment.Status.UnavailableReplicas),
})

if deployment.Spec.Paused {
replicaHealth.Status = HealthStatusSuspended
replicaHealth.Ready = false
}

return status, nil
return replicaHealth, nil
}
76 changes: 26 additions & 50 deletions pkg/health/health_replicaset.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,13 @@ package health

import (
"fmt"
"strings"
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
)

// duration after the creation of a replica set
// within which we never deem the it to be unhealthy.
const replicaSetBufferPeriod = time.Minute * 10

func getReplicaSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
gvk := obj.GroupVersionKind()
switch gvk {
Expand All @@ -29,64 +24,45 @@ func getReplicaSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error)
}
}

func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, error) {
isWithinBufferPeriod := replicaSet.CreationTimestamp.Add(replicaSetBufferPeriod).After(time.Now())

var containersWaitingForReadiness []string
for _, container := range replicaSet.Spec.Template.Spec.Containers {
if container.ReadinessProbe != nil && container.ReadinessProbe.InitialDelaySeconds > 0 {
deadline := replicaSet.CreationTimestamp.Add(
time.Second * time.Duration(container.ReadinessProbe.InitialDelaySeconds),
)
if time.Now().Before(deadline) {
containersWaitingForReadiness = append(containersWaitingForReadiness, container.Name)
}
}
func getAppsv1ReplicaSetHealth(rs *appsv1.ReplicaSet) (*HealthStatus, error) {
replicas := int32(0)
if rs.Spec.Replicas != nil {
replicas = *rs.Spec.Replicas
}

if len(containersWaitingForReadiness) > 0 {
return &HealthStatus{
Health: HealthUnknown,
Status: HealthStatusStarting,
Message: fmt.Sprintf(
"Container(s) %s is waiting for readiness probe",
strings.Join(containersWaitingForReadiness, ","),
),
}, nil
startDeadline := GetStartDeadline(rs.Spec.Template.Spec.Containers...)
age := time.Since(rs.CreationTimestamp.Time).Truncate(time.Minute).Abs()

health := HealthHealthy
if rs.Status.ReadyReplicas == 0 {
if rs.Status.Replicas > 0 && age < startDeadline {
health = HealthUnknown
} else {
health = HealthUnhealthy
}
} else if rs.Status.ReadyReplicas < replicas {
health = HealthWarning
} else if rs.Status.ReadyReplicas >= replicas {
health = HealthHealthy
}

health := HealthUnknown
if (replicaSet.Spec.Replicas == nil || *replicaSet.Spec.Replicas == 0) && replicaSet.Status.Replicas == 0 {
if replicas == 0 && rs.Status.Replicas == 0 {
return &HealthStatus{
Ready: true,
Status: HealthStatusScaledToZero,
Health: health,
}, nil
}

if replicaSet.Spec.Replicas != nil && replicaSet.Status.ReadyReplicas >= *replicaSet.Spec.Replicas {
health = HealthHealthy
} else if replicaSet.Status.ReadyReplicas > 0 {
health = HealthWarning
} else {
health = HealthUnhealthy
}

if (health == HealthUnhealthy || health == HealthWarning) && isWithinBufferPeriod {
// within the buffer period, we don't mark a ReplicaSet as unhealthy
health = HealthUnknown
}

if replicaSet.Generation == replicaSet.Status.ObservedGeneration &&
replicaSet.Status.ReadyReplicas == *replicaSet.Spec.Replicas {
if rs.Generation == rs.Status.ObservedGeneration &&
rs.Status.ReadyReplicas == *rs.Spec.Replicas {
return &HealthStatus{
Health: health,
Status: HealthStatusRunning,
Ready: true,
}, nil
}

failCondition := getAppsv1ReplicaSetCondition(replicaSet.Status, appsv1.ReplicaSetReplicaFailure)
failCondition := getAppsv1ReplicaSetCondition(rs.Status, appsv1.ReplicaSetReplicaFailure)
if failCondition != nil && failCondition.Status == corev1.ConditionTrue {
return &HealthStatus{
Health: health,
Expand All @@ -95,19 +71,19 @@ func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, er
}, nil
}

if replicaSet.Status.ReadyReplicas < *replicaSet.Spec.Replicas {
if rs.Status.ReadyReplicas < *rs.Spec.Replicas {
return &HealthStatus{
Health: health,
Status: HealthStatusScalingUp,
Message: fmt.Sprintf("%d of %d pods ready", replicaSet.Status.ReadyReplicas, *replicaSet.Spec.Replicas),
Message: fmt.Sprintf("%d of %d pods ready", rs.Status.ReadyReplicas, *rs.Spec.Replicas),
}, nil
}

if replicaSet.Status.ReadyReplicas > *replicaSet.Spec.Replicas {
if rs.Status.ReadyReplicas > *rs.Spec.Replicas {
return &HealthStatus{
Health: health,
Status: HealthStatusScalingDown,
Message: fmt.Sprintf("%d pods terminating", replicaSet.Status.ReadyReplicas-*replicaSet.Spec.Replicas),
Message: fmt.Sprintf("%d pods terminating", rs.Status.ReadyReplicas-*rs.Spec.Replicas),
}, nil
}

Expand Down
Loading
Loading