From 5fe44befe038d95c7861d963b1bacd2f2aa786de Mon Sep 17 00:00:00 2001 From: Moshe Immermam Date: Fri, 15 Nov 2024 15:31:38 +0200 Subject: [PATCH] fix: deployment/sts health fixes --- pkg/health/health.go | 11 +- pkg/health/health_cnrm_test.go | 18 + pkg/health/health_deployment.go | 145 +++--- pkg/health/health_replicaset.go | 76 ++-- pkg/health/health_statefulset.go | 79 +--- pkg/health/health_test.go | 140 ++++-- pkg/health/status.go | 15 +- pkg/health/statusMap.yaml | 50 ++- .../Kubernetes::Application/degraded.yaml | 425 ++++++++++++++++++ .../Kubernetes::ContainerCluster/failed.yaml | 79 ++++ pkg/health/testdata/deployment-failed.yaml | 86 ++-- .../deployment-rollout-failed-unhealthy.yaml | 54 +++ .../testdata/deployment-rollout-failed.yaml | 56 +++ pkg/health/testdata/deployment-scaled-up.yaml | 118 +++++ .../testdata/deployment-scaling-down.yaml | 6 +- .../testdata/deployment-scaling-up.yaml | 125 ++++++ pkg/health/testdata/deployment-starting.yaml | 62 +++ pkg/health/testdata/pod-terminated.yaml | 130 ++++++ pkg/health/testdata/statefulset-ondelete.yaml | 1 + 19 files changed, 1373 insertions(+), 303 deletions(-) create mode 100644 pkg/health/health_cnrm_test.go create mode 100644 pkg/health/testdata/Kubernetes::Application/degraded.yaml create mode 100644 pkg/health/testdata/Kubernetes::ContainerCluster/failed.yaml create mode 100644 pkg/health/testdata/deployment-rollout-failed-unhealthy.yaml create mode 100644 pkg/health/testdata/deployment-rollout-failed.yaml create mode 100644 pkg/health/testdata/deployment-scaled-up.yaml create mode 100644 pkg/health/testdata/deployment-scaling-up.yaml create mode 100644 pkg/health/testdata/deployment-starting.yaml create mode 100644 pkg/health/testdata/pod-terminated.yaml diff --git a/pkg/health/health.go b/pkg/health/health.go index 40e2654..30e19c0 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -44,6 +44,7 @@ const ( HealthStatusEvicted HealthStatusCode = "Evicted" HealthStatusCompleted HealthStatusCode = "Completed" HealthStatusCrashLoopBackoff HealthStatusCode = "CrashLoopBackOff" + HealthStatusCrashLoop HealthStatusCode = "CrashLoop" HealthStatusCrashed HealthStatusCode = "Crashed" HealthStatusCreating HealthStatusCode = "Creating" HealthStatusDeleted HealthStatusCode = "Deleted" @@ -51,7 +52,7 @@ const ( HealthStatusTerminating HealthStatusCode = "Terminating" HealthStatusError HealthStatusCode = "Error" HealthStatusRolloutFailed HealthStatusCode = "Rollout Failed" - HealthStatusInaccesible HealthStatusCode = "Inaccesible" + HealthStatusInaccesible HealthStatusCode = "Inaccessible" HealthStatusInfo HealthStatusCode = "Info" HealthStatusPending HealthStatusCode = "Pending" HealthStatusMaintenance HealthStatusCode = "Maintenance" @@ -147,7 +148,7 @@ func GetResourceHealth( terminatingFor := time.Since(obj.GetDeletionTimestamp().Time) return &HealthStatus{ Status: "TerminatingStalled", - Health: HealthUnhealthy, + Health: HealthWarning, Message: fmt.Sprintf("terminating for %v", duration.ShortHumanDuration(terminatingFor.Truncate(time.Hour))), }, nil } @@ -198,10 +199,6 @@ func GetHealthCheckFunc(gvk schema.GroupVersionKind) func(obj *unstructured.Unst return getNodeHealth } - if strings.HasSuffix(gvk.Group, ".crossplane.io") || strings.HasSuffix(gvk.Group, ".upbound.io") { - return GetDefaultHealth - } - switch gvk.Group { case "apps": switch gvk.Kind { @@ -264,5 +261,5 @@ func GetHealthCheckFunc(gvk schema.GroupVersionKind) func(obj *unstructured.Unst return getHPAHealth } } - return nil + return GetDefaultHealth } 
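The hunk above changes GetHealthCheckFunc to fall back to GetDefaultHealth instead of returning nil, so kinds without a dedicated check (including the Crossplane/Upbound groups whose special case was removed) now flow through the generic status map. A minimal sketch of that fallback from a caller's point of view — not part of the patch — assuming the exported helpers shown in this diff and the new `default` entry added to statusMap.yaml later in the patch; the expected output is an assumption based on that entry:

```go
package main

import (
	"fmt"

	"github.com/flanksource/is-healthy/pkg/health"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
)

func main() {
	// An arbitrary CRD with a conventional Ready condition and no dedicated check.
	obj := &unstructured.Unstructured{Object: map[string]interface{}{
		"apiVersion": "example.crossplane.io/v1alpha1",
		"kind":       "Widget",
		"metadata":   map[string]interface{}{"name": "demo"},
		"status": map[string]interface{}{
			"conditions": []interface{}{
				map[string]interface{}{"type": "Ready", "status": "True", "reason": "Available"},
			},
		},
	}}

	// With this patch, unknown GVKs resolve to GetDefaultHealth instead of nil.
	check := health.GetHealthCheckFunc(obj.GroupVersionKind())
	hs, err := check(obj)
	if err != nil {
		panic(err)
	}
	fmt.Println(hs.Health, hs.Ready) // expected: healthy true (via the "default" conditions map)
}
```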
diff --git a/pkg/health/health_cnrm_test.go b/pkg/health/health_cnrm_test.go new file mode 100644 index 0000000..032febc --- /dev/null +++ b/pkg/health/health_cnrm_test.go @@ -0,0 +1,18 @@ +package health_test + +import ( + "testing" + + "github.com/flanksource/is-healthy/pkg/health" +) + +func TestCnrmContainer(t *testing.T) { + assertAppHealthMsg( + t, + "Kubernetes::ContainerCluster/failed.yaml", + "UpdateFailed", + health.HealthUnhealthy, + true, + "Update call failed: error applying desired state: summary: googleapi: Error 403: Google Compute Engine: Required 'compute.networks.get' permission for 'projects/flanksource-prod/global/networks/flanksource-workload'.\nDetails:\n[\n {\n \"@type\": \"type.googleapis.com/google.rpc.RequestInfo\",\n \"requestId\": \"0xf1e9e3ca2797eb18\"\n },\n {\n \"@type\": \"type.googleapis.com/google.rpc.ErrorInfo\",\n \"domain\": \"container.googleapis.com\",\n \"reason\": \"GCE_PERMISSION_DENIED\"\n }\n]\n, forbidden", + ) +} diff --git a/pkg/health/health_deployment.go b/pkg/health/health_deployment.go index 31a99a4..0903804 100644 --- a/pkg/health/health_deployment.go +++ b/pkg/health/health_deployment.go @@ -2,10 +2,12 @@ package health import ( "fmt" - "strings" "time" + "github.com/samber/lo" appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) @@ -24,85 +26,100 @@ func getDeploymentHealth(obj *unstructured.Unstructured) (*HealthStatus, error) } } -func getAppsv1DeploymentHealth(deployment *appsv1.Deployment, obj *unstructured.Unstructured) (*HealthStatus, error) { - var containersWaitingForReadiness []string - for _, container := range deployment.Spec.Template.Spec.Containers { - if container.ReadinessProbe != nil && container.ReadinessProbe.InitialDelaySeconds > 0 { - deadline := deployment.CreationTimestamp.Add( - time.Second * time.Duration(container.ReadinessProbe.InitialDelaySeconds), - ) - if time.Now().Before(deadline) { - containersWaitingForReadiness = append(containersWaitingForReadiness, container.Name) - } - } +type ReplicaStatus struct { + Object *unstructured.Unstructured + Containers []corev1.Container + Desired, Replicas, Ready, Updated, Unavailable int +} + +func (rs ReplicaStatus) String() string { + s := fmt.Sprintf("%d/%d ready", rs.Ready, rs.Desired) + + if rs.Replicas != rs.Updated { + s += fmt.Sprintf(", %d updating", rs.Replicas-rs.Updated) } - if len(containersWaitingForReadiness) > 0 { - return &HealthStatus{ - Health: HealthUnknown, - Status: HealthStatusStarting, - Message: fmt.Sprintf( - "Container(s) %s is waiting for readiness probe", - strings.Join(containersWaitingForReadiness, ","), - ), - }, nil + if rs.Replicas > rs.Desired { + s += fmt.Sprintf(", %d terminating", rs.Replicas-rs.Desired) } + return s +} - status, err := GetDefaultHealth(obj) - if err != nil { - return status, err +func getReplicaHealth(s ReplicaStatus) *HealthStatus { + hs := &HealthStatus{ + Message: s.String(), } + startDeadline := GetStartDeadline(s.Containers...) 
+ age := time.Since(s.Object.GetCreationTimestamp().Time).Truncate(time.Minute).Abs() - replicas := int32(0) + gs := GetGenericStatus(s.Object) - if deployment.Spec.Replicas != nil { - replicas = *deployment.Spec.Replicas - } + progressing := gs.FindCondition("Progressing") + isStarting := age < startDeadline + isProgressDeadlineExceeded := !isStarting && (progressing.Reason == "ProgressDeadlineExceeded") + hs.Ready = progressing.Status == "True" - if replicas == 0 && deployment.Status.Replicas == 0 { - return &HealthStatus{ - Ready: true, - Status: HealthStatusScaledToZero, - Health: HealthUnknown, - }, nil - } + hs.Health = lo.Ternary(s.Ready >= s.Desired, HealthHealthy, lo.Ternary(s.Ready > 0, HealthWarning, HealthUnhealthy)) - if deployment.Status.ReadyReplicas == replicas { - status.PrependMessage("%d pods ready", deployment.Status.ReadyReplicas) - } else { - status.PrependMessage("%d of %d pods ready", deployment.Status.ReadyReplicas, replicas) + if s.Desired == 0 && s.Replicas == 0 { + hs.Ready = true + hs.Status = HealthStatusScaledToZero + hs.Health = HealthUnknown + return hs } - - if deployment.Spec.Paused { - status.Ready = false - status.Status = HealthStatusSuspended - return status, err + if s.Replicas == 0 { + if isProgressDeadlineExceeded { + hs.Status = "Failed Create" + hs.Health = HealthUnhealthy + } else { + hs.Status = "Pending" + hs.Health = HealthUnknown + } + } else if s.Ready == 0 && isStarting && !isProgressDeadlineExceeded { + hs.Health = HealthUnknown + hs.Status = HealthStatusStarting + } else if s.Ready == 0 && !isStarting { + hs.Health = HealthUnhealthy + hs.Status = HealthStatusCrashLoop + } else if s.Desired == 0 && s.Replicas > 0 { + hs.Status = HealthStatusScalingDown + hs.Health = lo.Ternary(isProgressDeadlineExceeded, HealthWarning, HealthHealthy) + } else if s.Ready == s.Desired && s.Desired == s.Updated && s.Replicas == s.Desired { + hs.Status = HealthStatusRunning + } else if s.Desired != s.Updated { + hs.Status = HealthStatusUpdating + } else if s.Replicas > s.Desired { + hs.Status = HealthStatusScalingDown + } else if s.Replicas < s.Desired { + hs.Status = HealthStatusScalingUp } - if deployment.Status.ReadyReplicas > 0 { - status.Status = HealthStatusRunning + if isStarting && hs.Health == HealthUnhealthy { + hs.Health = HealthUnknown } - if status.Health == HealthUnhealthy { - return status, nil + return hs +} + +func getAppsv1DeploymentHealth(deployment *appsv1.Deployment, obj *unstructured.Unstructured) (*HealthStatus, error) { + replicas := int32(0) + if deployment.Spec.Replicas != nil { + replicas = *deployment.Spec.Replicas } - if deployment.Status.ReadyReplicas < replicas { - status.AppendMessage("%d starting", deployment.Status.Replicas-deployment.Status.ReadyReplicas) - if deployment.Status.Replicas < replicas { - status.AppendMessage("%d creating", replicas-deployment.Status.Replicas) - } - status.Ready = false - status.Status = HealthStatusStarting - } else if deployment.Status.UpdatedReplicas < replicas { - status.AppendMessage("%d updating", replicas-deployment.Status.UpdatedReplicas) - status.Ready = false - status.Status = HealthStatusRollingOut - } else if deployment.Status.Replicas > replicas { - status.AppendMessage("%d pods terminating", deployment.Status.Replicas-replicas) - status.Ready = false - status.Status = HealthStatusScalingDown + replicaHealth := getReplicaHealth( + ReplicaStatus{ + Object: obj, + Containers: deployment.Spec.Template.Spec.Containers, + Desired: int(replicas), Replicas: int(deployment.Status.Replicas), 
+ Ready: int(deployment.Status.ReadyReplicas), Updated: int(deployment.Status.UpdatedReplicas), + Unavailable: int(deployment.Status.UnavailableReplicas), + }) + + if deployment.Spec.Paused { + replicaHealth.Status = HealthStatusSuspended + replicaHealth.Ready = false } - return status, nil + return replicaHealth, nil } diff --git a/pkg/health/health_replicaset.go b/pkg/health/health_replicaset.go index e44f9ae..8a3df02 100644 --- a/pkg/health/health_replicaset.go +++ b/pkg/health/health_replicaset.go @@ -2,7 +2,6 @@ package health import ( "fmt" - "strings" "time" appsv1 "k8s.io/api/apps/v1" @@ -10,10 +9,6 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) -// duration after the creation of a replica set -// within which we never deem the it to be unhealthy. -const replicaSetBufferPeriod = time.Minute * 10 - func getReplicaSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error) { gvk := obj.GroupVersionKind() switch gvk { @@ -29,34 +24,28 @@ func getReplicaSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error) } } -func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, error) { - isWithinBufferPeriod := replicaSet.CreationTimestamp.Add(replicaSetBufferPeriod).After(time.Now()) - - var containersWaitingForReadiness []string - for _, container := range replicaSet.Spec.Template.Spec.Containers { - if container.ReadinessProbe != nil && container.ReadinessProbe.InitialDelaySeconds > 0 { - deadline := replicaSet.CreationTimestamp.Add( - time.Second * time.Duration(container.ReadinessProbe.InitialDelaySeconds), - ) - if time.Now().Before(deadline) { - containersWaitingForReadiness = append(containersWaitingForReadiness, container.Name) - } - } +func getAppsv1ReplicaSetHealth(rs *appsv1.ReplicaSet) (*HealthStatus, error) { + replicas := int32(0) + if rs.Spec.Replicas != nil { + replicas = *rs.Spec.Replicas } - - if len(containersWaitingForReadiness) > 0 { - return &HealthStatus{ - Health: HealthUnknown, - Status: HealthStatusStarting, - Message: fmt.Sprintf( - "Container(s) %s is waiting for readiness probe", - strings.Join(containersWaitingForReadiness, ","), - ), - }, nil + startDeadline := GetStartDeadline(rs.Spec.Template.Spec.Containers...) 
+ age := time.Since(rs.CreationTimestamp.Time).Truncate(time.Minute).Abs() + + health := HealthHealthy + if rs.Status.ReadyReplicas == 0 { + if rs.Status.Replicas > 0 && age < startDeadline { + health = HealthUnknown + } else { + health = HealthUnhealthy + } + } else if rs.Status.ReadyReplicas < replicas { + health = HealthWarning + } else if rs.Status.ReadyReplicas >= replicas { + health = HealthHealthy } - health := HealthUnknown - if (replicaSet.Spec.Replicas == nil || *replicaSet.Spec.Replicas == 0) && replicaSet.Status.Replicas == 0 { + if replicas == 0 && rs.Status.Replicas == 0 { return &HealthStatus{ Ready: true, Status: HealthStatusScaledToZero, @@ -64,21 +53,8 @@ func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, er }, nil } - if replicaSet.Spec.Replicas != nil && replicaSet.Status.ReadyReplicas >= *replicaSet.Spec.Replicas { - health = HealthHealthy - } else if replicaSet.Status.ReadyReplicas > 0 { - health = HealthWarning - } else { - health = HealthUnhealthy - } - - if (health == HealthUnhealthy || health == HealthWarning) && isWithinBufferPeriod { - // within the buffer period, we don't mark a ReplicaSet as unhealthy - health = HealthUnknown - } - - if replicaSet.Generation == replicaSet.Status.ObservedGeneration && - replicaSet.Status.ReadyReplicas == *replicaSet.Spec.Replicas { + if rs.Generation == rs.Status.ObservedGeneration && + rs.Status.ReadyReplicas == *rs.Spec.Replicas { return &HealthStatus{ Health: health, Status: HealthStatusRunning, @@ -86,7 +62,7 @@ func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, er }, nil } - failCondition := getAppsv1ReplicaSetCondition(replicaSet.Status, appsv1.ReplicaSetReplicaFailure) + failCondition := getAppsv1ReplicaSetCondition(rs.Status, appsv1.ReplicaSetReplicaFailure) if failCondition != nil && failCondition.Status == corev1.ConditionTrue { return &HealthStatus{ Health: health, @@ -95,19 +71,19 @@ func getAppsv1ReplicaSetHealth(replicaSet *appsv1.ReplicaSet) (*HealthStatus, er }, nil } - if replicaSet.Status.ReadyReplicas < *replicaSet.Spec.Replicas { + if rs.Status.ReadyReplicas < *rs.Spec.Replicas { return &HealthStatus{ Health: health, Status: HealthStatusScalingUp, - Message: fmt.Sprintf("%d of %d pods ready", replicaSet.Status.ReadyReplicas, *replicaSet.Spec.Replicas), + Message: fmt.Sprintf("%d of %d pods ready", rs.Status.ReadyReplicas, *rs.Spec.Replicas), }, nil } - if replicaSet.Status.ReadyReplicas > *replicaSet.Spec.Replicas { + if rs.Status.ReadyReplicas > *rs.Spec.Replicas { return &HealthStatus{ Health: health, Status: HealthStatusScalingDown, - Message: fmt.Sprintf("%d pods terminating", replicaSet.Status.ReadyReplicas-*replicaSet.Spec.Replicas), + Message: fmt.Sprintf("%d pods terminating", rs.Status.ReadyReplicas-*rs.Spec.Replicas), }, nil } diff --git a/pkg/health/health_statefulset.go b/pkg/health/health_statefulset.go index a9967b6..6d308f1 100644 --- a/pkg/health/health_statefulset.go +++ b/pkg/health/health_statefulset.go @@ -2,7 +2,6 @@ package health import ( "fmt" - "time" appsv1 "k8s.io/api/apps/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -16,83 +15,27 @@ func getStatefulSetHealth(obj *unstructured.Unstructured) (*HealthStatus, error) if err := convertFromUnstructured(obj, &sts); err != nil { return nil, err } - return getAppsv1StatefulSetHealth(&sts) + return getAppsv1StatefulSetHealth(&sts, obj) default: return nil, fmt.Errorf("unsupported StatefulSet GVK: %s", gvk) } } -func getAppsv1StatefulSetHealth(sts *appsv1.StatefulSet) 
(*HealthStatus, error) { +func getAppsv1StatefulSetHealth(sts *appsv1.StatefulSet, obj *unstructured.Unstructured) (*HealthStatus, error) { replicas := int32(0) if sts.Spec.Replicas != nil { replicas = *sts.Spec.Replicas } - if replicas == 0 && sts.Status.Replicas == 0 { - return &HealthStatus{ - Status: HealthStatusScaledToZero, - Health: HealthUnknown, - Ready: true, - }, nil - } - - startDeadline := GetStartDeadline(sts.Spec.Template.Spec.Containers...) - age := time.Since(sts.CreationTimestamp.Time).Truncate(time.Minute).Abs() - - health := HealthHealthy - if sts.Status.ReadyReplicas == 0 { - if sts.Status.CurrentReplicas > 0 && age < startDeadline { - health = HealthUnknown - } else { - health = HealthUnhealthy - } - } else if sts.Status.UpdatedReplicas == 0 { - health = HealthWarning - } else if sts.Status.ReadyReplicas >= replicas { - health = HealthHealthy - } - - if sts.Spec.Replicas != nil && sts.Status.ReadyReplicas < *sts.Spec.Replicas { - return &HealthStatus{ - Health: health, - Status: HealthStatusStarting, - Message: fmt.Sprintf("%d of %d pods ready", sts.Status.ReadyReplicas, *sts.Spec.Replicas), - }, nil - } - - if sts.Spec.Replicas != nil && sts.Status.UpdatedReplicas < replicas { - return &HealthStatus{ - Health: health, - Status: HealthStatusRollingOut, - Message: fmt.Sprintf( - "%d of %d pods updated, %d of %d ready", - sts.Status.UpdatedReplicas, - replicas, - sts.Status.ReadyReplicas, - replicas, - ), - }, nil - } + replicaHealth := getReplicaHealth( + ReplicaStatus{ + Object: obj, + Containers: sts.Spec.Template.Spec.Containers, + Desired: int(replicas), Replicas: int(sts.Status.Replicas), + Ready: int(sts.Status.ReadyReplicas), Updated: int(sts.Status.UpdatedReplicas), + }) - if sts.Status.ObservedGeneration == 0 || sts.Generation > sts.Status.ObservedGeneration { - return &HealthStatus{ - Health: health, - Status: HealthStatusRollingOut, - Message: fmt.Sprintf("generation not up to date %d", sts.Generation), - }, nil - } - - if sts.Status.UpdateRevision != "" && sts.Status.CurrentRevision != sts.Status.UpdateRevision { - return &HealthStatus{ - Health: health, - Status: HealthStatusRollingOut, - Message: fmt.Sprintf("revision not up to date %s", sts.Status.UpdateRevision), - }, nil - } + replicaHealth.Ready = sts.Status.Replicas == sts.Status.UpdatedReplicas - return &HealthStatus{ - Ready: true, - Health: health, - Status: HealthStatusRunning, - }, nil + return replicaHealth, nil } diff --git a/pkg/health/health_test.go b/pkg/health/health_test.go index 32d328b..f0b9d14 100644 --- a/pkg/health/health_test.go +++ b/pkg/health/health_test.go @@ -68,12 +68,14 @@ func assertAppHealthMsg( m[overrides[i]] = overrides[i+1] } } - health := getHealthStatus(yamlPath, t, m) - assert.NotNil(t, health) - assert.Equal(t, expectedHealth, health.Health) - assert.Equal(t, expectedReady, health.Ready) - assert.Equal(t, expectedStatus, health.Status) - assert.Equal(t, expectedMsg, health.Message) + t.Run(yamlPath, func(t *testing.T) { + health := getHealthStatus(yamlPath, t, m) + assert.NotNil(t, health) + assert.Equal(t, expectedHealth, health.Health) + assert.Equal(t, expectedReady, health.Ready) + assert.Equal(t, expectedStatus, health.Status) + assert.Equal(t, expectedMsg, health.Message) + }) } func assertAppHealth( @@ -88,6 +90,9 @@ func assertAppHealth( for k, v := range defaultOverrides { m[k] = v } + if len(overrides)%2 == 1 { + assert.FailNow(t, "even number of overrides") + } for i := 0; i < len(overrides); i += 2 { m[overrides[i]] = overrides[i+1] } @@ -251,53 +256,97 
@@ func TestCertificate(t *testing.T) { func TestExternalSecrets(t *testing.T) { b := "../resource_customizations/external-secrets.io/ExternalSecret/testdata/" - assertAppHealth(t, b+"degraded.yaml", "", health.HealthUnhealthy, true) - assertAppHealth(t, b+"progressing.yaml", "Progressing", health.HealthUnknown, false) - assertAppHealth(t, b+"healthy.yaml", "", health.HealthHealthy, true) + assertAppHealth(t, b+"degraded.yaml", "SecretSyncedError", health.HealthUnhealthy, false) + assertAppHealth(t, b+"progressing.yaml", "", health.HealthUnknown, false) + assertAppHealth(t, b+"healthy.yaml", "SecretSynced", health.HealthHealthy, true) } func TestDeploymentHealth(t *testing.T) { - assertAppHealth(t, "./testdata/nginx.yaml", health.HealthStatusRunning, health.HealthHealthy, true) - assertAppHealth( + assertAppHealthMsg(t, "./testdata/nginx.yaml", health.HealthStatusRunning, health.HealthHealthy, true, "1/1 ready") + assertAppHealthMsg( t, - "./testdata/deployment-progressing.yaml", - health.HealthStatusStarting, + "./deployment-scaled-up.yaml", + health.HealthStatusRunning, health.HealthHealthy, - false, + true, + "3/3 ready", ) - assertAppHealth( + + assertAppHealthMsg( + t, + "./deployment-rollout-failed.yaml", + health.HealthStatusUpdating, + health.HealthWarning, + true, + "1/2 ready, 1 updating", + ) + assertAppHealthMsg( + t, + "./testdata/deployment-progressing.yaml", + health.HealthStatusUpdating, + health.HealthWarning, + true, + "1/2 ready, 1 updating", + ) + assertAppHealthMsg( t, "./testdata/deployment-suspended.yaml", health.HealthStatusSuspended, health.HealthHealthy, false, + "1/1 ready, 1 updating, 1 terminating", ) - assertAppHealth(t, "./testdata/deployment-degraded.yaml", health.HealthStatusStarting, health.HealthHealthy, false) - assertAppHealth( + assertAppHealthMsg( + t, + "./testdata/deployment-degraded.yaml", + health.HealthStatusUpdating, + health.HealthWarning, + true, + "1/2 ready, 1 updating", + ) + + assertAppHealthMsg( + t, + "./testdata/deployment-starting.yaml", + health.HealthStatusStarting, + health.HealthUnknown, + true, + "0/2 ready, 1 updating", + ) + assertAppHealthMsg( t, "./testdata/deployment-scaling-down.yaml", health.HealthStatusScalingDown, health.HealthHealthy, - false, + true, + "1/1 ready, 1 updating, 1 terminating", ) - assertAppHealth( + assertAppHealthMsg( t, "./testdata/deployment-failed.yaml", - health.HealthStatusRolloutFailed, + "Failed Create", health.HealthUnhealthy, false, + "0/1 ready", ) } func TestStatefulSetHealth(t *testing.T) { - assertAppHealthMsg(t, "./testdata/statefulset.yaml", health.HealthStatusRunning, health.HealthHealthy, true, "") + assertAppHealthMsg( + t, + "./testdata/statefulset.yaml", + health.HealthStatusRunning, + health.HealthHealthy, + true, + "1/1 ready", + ) assertAppHealthMsg( t, "./testdata/statefulset-starting.yaml", health.HealthStatusStarting, health.HealthUnknown, - false, - "0 of 1 pods ready", + true, + "0/1 ready", "@now", "@now-1m", ) @@ -306,40 +355,44 @@ func TestStatefulSetHealth(t *testing.T) { "./testdata/statefulset-starting.yaml", health.HealthStatusStarting, health.HealthUnknown, - false, - "0 of 1 pods ready", + true, + "0/1 ready", "@now", "@now-5m", ) assertAppHealthMsg( t, "./testdata/statefulset-starting.yaml", - health.HealthStatusStarting, + health.HealthStatusCrashLoop, health.HealthUnhealthy, - false, - "0 of 1 pods ready", + true, + "0/1 ready", "@now", "@now-15m", ) assertAppHealthMsg( t, "./testdata/statefulset-starting.yaml", - health.HealthStatusStarting, + 
health.HealthStatusCrashLoop, health.HealthUnhealthy, - false, - "0 of 1 pods ready", + true, + "0/1 ready", "@now", "@now-1d", ) } func TestStatefulSetOnDeleteHealth(t *testing.T) { - assertAppHealth( + assertAppHealthMsg( t, "./testdata/statefulset-ondelete.yaml", - health.HealthStatusRollingOut, + "TerminatingStalled", health.HealthWarning, false, + "terminating for 1d", + + "@now", + "@now-1d", ) } @@ -378,7 +431,22 @@ func TestIngressHealth(t *testing.T) { } func TestCRD(t *testing.T) { - assertAppHealth(t, "./testdata/knative-service.yaml", health.HealthStatusProgressing, health.HealthUnknown, false) + b := "../resource_customizations/serving.knative.dev/Service/testdata/" + + assertAppHealth(t, "./testdata/knative-service.yaml", "", health.HealthUnknown, false) + assertAppHealth(t, b+"degraded.yaml", "RevisionFailed", health.HealthUnhealthy, false) + assertAppHealth(t, b+"healthy.yaml", "", health.HealthHealthy, true) + assertAppHealth(t, b+"progressing.yaml", "", health.HealthUnknown, false) +} + +func TestCnrmPubSub(t *testing.T) { + b := "../resource_customizations/pubsub.cnrm.cloud.google.com/PubSubTopic/testdata/" + + assertAppHealth(t, b+"dependency_not_found.yaml", "DependencyNotFound", health.HealthUnhealthy, true) + assertAppHealth(t, b+"dependency_not_ready.yaml", "DependencyNotReady", health.HealthUnknown, false) + assertAppHealth(t, b+"up_to_date.yaml", "UpToDate", health.HealthHealthy, true) + assertAppHealth(t, b+"update_failed.yaml", "UpdateFailed", health.HealthUnhealthy, true) + assertAppHealth(t, b+"update_in_progress.yaml", "", health.HealthUnknown, false) } func TestJob(t *testing.T) { @@ -439,8 +507,8 @@ func TestReplicaSet(t *testing.T) { } func TestPod(t *testing.T) { - assertAppHealth(t, "./testdata/terminating-stuck.yaml", "TerminatingStalled", health.HealthUnhealthy, false) - assertAppHealth(t, "./testdata/terminating-namespace.yaml", "TerminatingStalled", health.HealthUnhealthy, false) + assertAppHealth(t, "./testdata/terminating-stuck.yaml", "TerminatingStalled", health.HealthWarning, false) + assertAppHealth(t, "./testdata/terminating-namespace.yaml", "TerminatingStalled", health.HealthWarning, false) assertAppHealthWithOverwrite(t, "./testdata/pod-terminating.yaml", map[string]string{ "2024-07-01T06:52:22Z": time.Now().Add(-time.Minute * 20).UTC().Format("2006-01-02T15:04:05Z"), diff --git a/pkg/health/status.go b/pkg/health/status.go index d853a56..2be0572 100644 --- a/pkg/health/status.go +++ b/pkg/health/status.go @@ -54,17 +54,17 @@ func (s GenericStatus) Int(name string) (int32, bool) { return 0, false } -func (s GenericStatus) FindCondition(name string) *metav1.Condition { +func (s GenericStatus) FindCondition(name string) metav1.Condition { if name == "" || name == NoCondition { - return nil + return metav1.Condition{} } // FindStatusCondition finds the conditionType in conditions. 
for i := range s.Conditions { if s.Conditions[i].Type == name { - return &s.Conditions[i] + return s.Conditions[i] } } - return nil + return metav1.Condition{} } func GetGenericStatus(obj *unstructured.Unstructured) GenericStatus { @@ -195,13 +195,16 @@ func GetDefaultHealth(obj *unstructured.Unstructured) (*HealthStatus, error) { kind = "crossplane.io" } + if strings.Contains(group, "cnrm.cloud.google.com") { + kind = "cnrm.cloud.google.com" + } if statusMap, ok := statusByKind[obj.GetAPIVersion()+"/"+obj.GetKind()]; ok { return GetHealthFromStatus(GetGenericStatus(obj), statusMap) } else if statusMap, ok := statusByKind[kind]; ok { return GetHealthFromStatus(GetGenericStatus(obj), statusMap) + } else { + return GetHealthFromStatus(GetGenericStatus(obj), statusByKind["default"]) } - - return &HealthStatus{}, nil } func GetHealth(obj *unstructured.Unstructured, statusMap StatusMap) (*HealthStatus, error) { diff --git a/pkg/health/statusMap.yaml b/pkg/health/statusMap.yaml index 1a785d2..3162340 100644 --- a/pkg/health/statusMap.yaml +++ b/pkg/health/statusMap.yaml @@ -1,3 +1,8 @@ +default: + conditions: + Ready: + ready: true + health: healthy Issuer: conditions: Ready: @@ -55,29 +60,6 @@ CertificateRequest: status: 'True' type: Ready -Deployment: - conditions: - ReplicaFailure: - health: unhealthy - notReady: true - message: true - Progressing: - ready: true - status: Running - onFalse: - status: Rolling Out - reasons: - ProgressDeadlineExceeded: - health: unhealthy - notReady: true - status: Rollout Failed - order: 1 - message: true - Available: - health: healthy - onFalse: - health: unhealthy - message: true Kustomization: conditions: @@ -185,7 +167,27 @@ image.toolkit.fluxcd.io/v1beta2/ImagePolicy: *flux image.toolkit.fluxcd.io/v1beta2/ImageRepository: *flux image.toolkit.fluxcd.io/v1beta2/ImageUpdateAutomation: *flux -# Not an actual kind. 
+cnrm.cloud.google.com: + conditions: &cnrmconditions + DependencyNotFound: + message: true + health: unhealthy + ready: true + DependencyNotReady: + message: true + health: unknown + UpdateFailed: + message: true + health: unhealthy + ready: true + UpToDate: + message: true + ready: true + health: healthy + Ready: + reasons: *cnrmconditions + + crossplane.io: conditions: Healthy: diff --git a/pkg/health/testdata/Kubernetes::Application/degraded.yaml b/pkg/health/testdata/Kubernetes::Application/degraded.yaml new file mode 100644 index 0000000..6328646 --- /dev/null +++ b/pkg/health/testdata/Kubernetes::Application/degraded.yaml @@ -0,0 +1,425 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + uid: 9c7274c7-c5b9-47d6-ba94-e474ea7e6f59 + name: snipe-it-helm + namespace: argo-apps + annotations: + argocd.argoproj.io/refresh: normal + argocd.argoproj.io/sync-wave: "-5" + argocd.argoproj.io/tracking-id: argo-apps_snipe-it-wrapper:argoproj.io/Application:argo-apps/snipe-it-helm + creationTimestamp: 2024-10-03T15:24:34Z +spec: + source: + helm: + valuesObject: + image: + tag: v7.0.13 + mysql: + enabled: false + config: + snipeit: + env: production + url: https://assets.np-apps.acme.systems + debug: false + timezone: Europe/London + externalSecrets: snipe-it + ingress: + tls: + - hosts: + - assets.np-apps.acme.systems + secretName: snipe-it-tls + path: / + hosts: + - assets.np-apps.acme.systems + enabled: true + pathType: ImplementationSpecific + className: nginx-internal + annotations: + kubernetes.io/tls-acme: "true" + chart: snipeit + repoURL: https://storage.googleapis.com/t3n-helm-charts + targetRevision: 3.4.1 + project: apps-project + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=false + destination: + server: https://kubernetes.default.svc + namespace: snipe-it +status: + sync: + status: Synced + revision: 3.4.1 + comparedTo: + source: + helm: + valuesObject: + mysql: + enabled: false + config: + snipeit: + env: non-prod + url: assets.np-apps.acme.systems + timezone: Europe/London + ingress: + tls: + - hosts: + - assets.np-apps.acme.systems + secretName: snipe-it-tls + path: / + hosts: + - assets.np-apps.acme.systems + enabled: true + pathType: ImplementationSpecific + className: nginx-internal + annotations: + kubernetes.io/tls-acme: "true" + externalSecrets: snipe-it + chart: snipeit + repoURL: https://storage.googleapis.com/t3n-helm-charts + targetRevision: 3.4.1 + destination: + server: https://kubernetes.default.svc + namespace: snipe-it + health: + status: Degraded + history: + - id: 0 + source: + helm: + valuesObject: + mysql: + enabled: false + config: + snipeit: + env: non-prod + url: assets.np-apps.acme.systems + timezone: Europe/London + ingress: + tls: + - hosts: + - assets.np-apps.acme.systems + secretName: snipe-it-tls + path: / + hosts: + - assets.np-apps.acme.systems + enabled: true + pathType: ImplementationSpecific + className: nginx-internal + annotations: + kubernetes.io/tls-acme: "true" + externalSecrets: snipe-it + chart: snipeit + repoURL: https://storage.googleapis.com/t3n-helm-charts + targetRevision: 3.4.1 + revision: 3.4.1 + deployedAt: 2024-10-07T09:39:46Z + deployStartedAt: 2024-10-07T09:39:43Z + - id: 1 + source: + helm: + valuesObject: + mysql: + enabled: false + config: + snipeit: + env: non-prod + url: assets.np-apps.acme.systems + timezone: Europe/London + externalSecrets: snipe-it + ingress: + tls: + - hosts: + - assets.np-apps.acme.systems + secretName: snipe-it-tls + path: / + hosts: + - 
assets.np-apps.acme.systems + enabled: true + pathType: ImplementationSpecific + className: nginx-internal + annotations: + kubernetes.io/tls-acme: "true" + chart: snipeit + repoURL: https://storage.googleapis.com/t3n-helm-charts + targetRevision: 3.4.1 + revision: 3.4.1 + deployedAt: 2024-10-07T10:00:44Z + deployStartedAt: 2024-10-07T10:00:41Z + - id: 2 + source: + helm: + valuesObject: + mysql: + enabled: false + config: + snipeit: + env: non-prod + url: assets.np-apps.acme.systems + debug: true + timezone: Europe/London + externalSecrets: snipe-it + ingress: + tls: + - hosts: + - assets.np-apps.acme.systems + secretName: snipe-it-tls + path: / + hosts: + - assets.np-apps.acme.systems + enabled: true + pathType: ImplementationSpecific + className: nginx-internal + annotations: + kubernetes.io/tls-acme: "true" + chart: snipeit + repoURL: https://storage.googleapis.com/t3n-helm-charts + targetRevision: 3.4.1 + revision: 3.4.1 + deployedAt: 2024-10-07T10:21:51Z + deployStartedAt: 2024-10-07T10:21:45Z + - id: 3 + source: + helm: + valuesObject: + image: + tag: v7.0.13 + mysql: + enabled: false + config: + snipeit: + env: non-prod + url: assets.np-apps.acme.systems + debug: true + timezone: Europe/London + externalSecrets: snipe-it + ingress: + tls: + - hosts: + - assets.np-apps.acme.systems + secretName: snipe-it-tls + path: / + hosts: + - assets.np-apps.acme.systems + enabled: true + pathType: ImplementationSpecific + className: nginx-internal + annotations: + kubernetes.io/tls-acme: "true" + chart: snipeit + repoURL: https://storage.googleapis.com/t3n-helm-charts + targetRevision: 3.4.1 + revision: 3.4.1 + deployedAt: 2024-10-07T11:03:50Z + deployStartedAt: 2024-10-07T11:03:44Z + - id: 4 + source: + helm: + valuesObject: + image: + tag: v7.0.13 + mysql: + enabled: false + config: + snipeit: + env: production + url: https://assets.np-apps.acme.systems + debug: true + timezone: Europe/London + externalSecrets: snipe-it + ingress: + tls: + - hosts: + - assets.np-apps.acme.systems + secretName: snipe-it-tls + path: / + hosts: + - assets.np-apps.acme.systems + enabled: true + pathType: ImplementationSpecific + className: nginx-internal + annotations: + kubernetes.io/tls-acme: "true" + chart: snipeit + repoURL: https://storage.googleapis.com/t3n-helm-charts + targetRevision: 3.4.1 + revision: 3.4.1 + deployedAt: 2024-10-08T10:03:44Z + deployStartedAt: 2024-10-08T10:03:40Z + - id: 5 + source: + helm: + valuesObject: + image: + tag: v7.0.13 + mysql: + enabled: false + config: + snipeit: + env: production + url: https://assets.np-apps.acme.systems + debug: false + timezone: Europe/London + externalSecrets: snipe-it + ingress: + tls: + - hosts: + - assets.np-apps.acme.systems + secretName: snipe-it-tls + path: / + hosts: + - assets.np-apps.acme.systems + enabled: true + pathType: ImplementationSpecific + className: nginx-internal + annotations: + kubernetes.io/tls-acme: "true" + chart: snipeit + repoURL: https://storage.googleapis.com/t3n-helm-charts + targetRevision: 3.4.1 + revision: 3.4.1 + deployedAt: 2024-10-08T13:50:17Z + deployStartedAt: 2024-10-08T13:50:14Z + summary: + images: + - busybox + - snipe/snipe-it:v6.0.14 + externalURLs: + - https://assets.np-apps.acme.systems/ + resources: + - kind: PersistentVolumeClaim + name: snipe-it-helm-snipeit + health: + status: Healthy + status: Synced + version: v1 + namespace: snipe-it + - kind: Secret + name: snipe-it-helm-snipeit + status: Synced + version: v1 + namespace: snipe-it + - kind: Service + name: snipe-it-helm-snipeit + health: + status: 
Healthy + status: Synced + version: v1 + namespace: snipe-it + - kind: Deployment + name: snipe-it-helm-snipeit + group: apps + health: + status: Degraded + message: Deployment "snipe-it-helm-snipeit" exceeded its progress deadline + status: Synced + version: v1 + namespace: snipe-it + - kind: Ingress + name: snipe-it-helm-snipeit + group: networking.k8s.io + health: + status: Healthy + status: Synced + version: v1 + namespace: snipe-it + sourceType: Helm + reconciledAt: 2024-10-07T10:00:30Z + operationState: + phase: Succeeded + message: successfully synced (all tasks run) + operation: + sync: + prune: true + revision: 3.4.1 + syncOptions: + - CreateNamespace=false + retry: + limit: 5 + initiatedBy: + automated: true + startedAt: 2024-10-08T13:50:14Z + finishedAt: 2024-10-08T13:50:17Z + syncResult: + source: + helm: + valuesObject: + image: + tag: v7.0.13 + mysql: + enabled: false + config: + snipeit: + env: production + url: https://assets.np-apps.acme.systems + debug: false + timezone: Europe/London + externalSecrets: snipe-it + ingress: + tls: + - hosts: + - assets.np-apps.acme.systems + secretName: snipe-it-tls + path: / + hosts: + - assets.np-apps.acme.systems + enabled: true + pathType: ImplementationSpecific + className: nginx-internal + annotations: + kubernetes.io/tls-acme: "true" + chart: snipeit + repoURL: https://storage.googleapis.com/t3n-helm-charts + targetRevision: 3.4.1 + revision: 3.4.1 + resources: + - kind: Secret + name: snipe-it-helm-snipeit + group: "" + status: Synced + message: secret/snipe-it-helm-snipeit configured + version: v1 + hookPhase: Running + namespace: snipe-it + syncPhase: Sync + - kind: PersistentVolumeClaim + name: snipe-it-helm-snipeit + group: "" + status: Synced + message: persistentvolumeclaim/snipe-it-helm-snipeit unchanged + version: v1 + hookPhase: Running + namespace: snipe-it + syncPhase: Sync + - kind: Service + name: snipe-it-helm-snipeit + group: "" + status: Synced + message: service/snipe-it-helm-snipeit unchanged + version: v1 + hookPhase: Running + namespace: snipe-it + syncPhase: Sync + - kind: Deployment + name: snipe-it-helm-snipeit + group: apps + status: Synced + message: deployment.apps/snipe-it-helm-snipeit configured + version: v1 + hookPhase: Running + namespace: snipe-it + syncPhase: Sync + - kind: Ingress + name: snipe-it-helm-snipeit + group: networking.k8s.io + status: Synced + message: ingress.networking.k8s.io/snipe-it-helm-snipeit unchanged + version: v1 + hookPhase: Running + namespace: snipe-it + syncPhase: Sync + controllerNamespace: argo-cd diff --git a/pkg/health/testdata/Kubernetes::ContainerCluster/failed.yaml b/pkg/health/testdata/Kubernetes::ContainerCluster/failed.yaml new file mode 100644 index 0000000..b5a5f15 --- /dev/null +++ b/pkg/health/testdata/Kubernetes::ContainerCluster/failed.yaml @@ -0,0 +1,79 @@ +apiVersion: container.cnrm.cloud.google.com/v1beta1 +kind: ContainerCluster +metadata: + uid: f56c9c02-6a1d-440d-9c58-fff66643e546 + name: workload-prod-eu-02 + labels: + kustomize.toolkit.fluxcd.io/name: config-connector + kustomize.toolkit.fluxcd.io/namespace: flux-system + namespace: workload-prod-eu-02 + finalizers: + - cnrm.cloud.google.com/finalizer + - cnrm.cloud.google.com/deletion-defender + annotations: + cnrm.cloud.google.com/project-id: workload-prod-eu-02 + cnrm.cloud.google.com/state-into-spec: merge + cnrm.cloud.google.com/management-conflict-prevention-policy: none + creationTimestamp: 2024-11-13T09:26:00Z +spec: + location: europe-west1 + networkRef: + external: 
projects/flanksource-prod/global/networks/flanksource-workload + description: Flanksource Prod Workload Cluster + networkingMode: VPC_NATIVE + releaseChannel: + channel: STABLE + initialNodeCount: 1 + maintenancePolicy: + dailyMaintenanceWindow: + startTime: 00:00 + clusterAutoscaling: + enabled: false + ipAllocationPolicy: + clusterIpv4CidrBlock: /20 + servicesIpv4CidrBlock: /22 + notificationConfig: + pubsub: + enabled: true + topicRef: + name: workload-prod-eu-02-cluster-notifications + privateClusterConfig: + enablePrivateNodes: true + masterIpv4CidrBlock: 10.1.239.208/28 + enablePrivateEndpoint: true + masterGlobalAccessConfig: + enabled: true + workloadIdentityConfig: + workloadPool: workload-prod-eu-02.svc.id.goog + masterAuthorizedNetworksConfig: + cidrBlocks: + - cidrBlock: 10.1.140.0/22 + displayName: Tailscale exit range + - cidrBlock: 10.1.112.0/20 + displayName: Hub Pods +status: + conditions: + - type: Ready + reason: UpdateFailed + status: "False" + message: >- + Update call failed: error applying desired state: summary: googleapi: + Error 403: Google Compute Engine: Required 'compute.networks.get' + permission for + 'projects/flanksource-prod/global/networks/flanksource-workload'. + + Details: + + [ + { + "@type": "type.googleapis.com/google.rpc.RequestInfo", + "requestId": "0xf1e9e3ca2797eb18" + }, + { + "@type": "type.googleapis.com/google.rpc.ErrorInfo", + "domain": "container.googleapis.com", + "reason": "GCE_PERMISSION_DENIED" + } + ] + + , forbidden diff --git a/pkg/health/testdata/deployment-failed.yaml b/pkg/health/testdata/deployment-failed.yaml index bfb511a..6136a69 100644 --- a/pkg/health/testdata/deployment-failed.yaml +++ b/pkg/health/testdata/deployment-failed.yaml @@ -1,70 +1,70 @@ apiVersion: apps/v1 kind: Deployment metadata: - annotations: - deployment.kubernetes.io/revision: "4" - kubectl.kubernetes.io/last-applied-configuration: | - {"apiVersion":"apps/v1","kind":"Deployment","metadata":{"annotations":{},"labels":{"app.kubernetes.io/instance":"guestbook-default"},"name":"guestbook-ui","namespace":"default"},"spec":{"replicas":1,"selector":{"matchLabels":{"app":"guestbook-ui"}},"template":{"metadata":{"labels":{"app":"guestbook-ui","app.kubernetes.io/instance":"guestbook-default"}},"spec":{"containers":[{"image":"gcr.io/heptio-images/ks-guestbook-demo:0.3","name":"guestbook-ui","ports":[{"containerPort":80}]}]}}}} - creationTimestamp: 2018-07-18T04:40:44Z - generation: 4 + uid: 03e7bf66-2bbb-4ea9-8bdf-0cf7bc6ba8c1 + name: karina labels: - app.kubernetes.io/instance: guestbook-default - name: guestbook-ui - namespace: default - resourceVersion: "13660" - selfLink: /apis/apps/v1/namespaces/default/deployments/guestbook-ui - uid: bb9af0c7-8a44-11e8-9e23-42010aa80010 + control-plane: karina-operator + namespace: platform-system + annotations: + deployment.kubernetes.io/revision: "1" + creationTimestamp: 2023-05-10T08:11:03Z spec: - progressDeadlineSeconds: 600 replicas: 1 - revisionHistoryLimit: 10 selector: matchLabels: - app: guestbook-ui + control-plane: karina-operator strategy: + type: RollingUpdate rollingUpdate: maxSurge: 25% maxUnavailable: 25% - type: RollingUpdate template: - metadata: - creationTimestamp: null - labels: - app: guestbook-ui - app.kubernetes.io/instance: guestbook-default spec: + dnsPolicy: ClusterFirst containers: - - image: gcr.io/heptio-images/ks-guestbook-demo:0.3 + - args: + - operator + - --enable-leader-election + - --log-level=debug + name: karina-operator + image: docker.io/flanksource/karina:v0.51.1 + command: + - 
/bin/karina + resources: + limits: + cpu: 500m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi imagePullPolicy: IfNotPresent - name: guestbook-ui - ports: - - containerPort: 80 - protocol: TCP - resources: {} terminationMessagePath: /dev/termination-log terminationMessagePolicy: File - dnsPolicy: ClusterFirst restartPolicy: Always schedulerName: default-scheduler + serviceAccount: karina securityContext: {} + serviceAccountName: karina terminationGracePeriodSeconds: 30 + metadata: + labels: + control-plane: karina-operator + revisionHistoryLimit: 10 + progressDeadlineSeconds: 600 status: - availableReplicas: 1 conditions: - - lastTransitionTime: 2018-07-18T04:48:48Z - lastUpdateTime: 2018-07-18T04:48:48Z - message: Deployment has minimum availability. - reason: MinimumReplicasAvailable - status: "True" - type: Available - - lastTransitionTime: 2018-07-18T06:29:23Z - lastUpdateTime: 2018-07-18T06:29:23Z - message: ReplicaSet "guestbook-ui-75dd4d49d5" has timed out progressing. + - type: Available + reason: MinimumReplicasUnavailable + status: "False" + message: Deployment does not have minimum availability. + - type: Progressing reason: ProgressDeadlineExceeded status: "False" - type: Progressing - observedGeneration: 4 - readyReplicas: 0 - replicas: 1 + message: ReplicaSet "karina-c7585bd87" has timed out progressing. + - type: ReplicaFailure + reason: FailedCreate + status: "True" + message: 'pods "karina-c7585bd87-" is forbidden: error looking up service + account platform-system/karina: serviceaccount "karina" not found' unavailableReplicas: 1 - updatedReplicas: 1 diff --git a/pkg/health/testdata/deployment-rollout-failed-unhealthy.yaml b/pkg/health/testdata/deployment-rollout-failed-unhealthy.yaml new file mode 100644 index 0000000..512ee39 --- /dev/null +++ b/pkg/health/testdata/deployment-rollout-failed-unhealthy.yaml @@ -0,0 +1,53 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: guestbook-ui +spec: + progressDeadlineSeconds: 600 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + app: guestbook-ui + strategy: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 25% + type: RollingUpdate + template: + metadata: + creationTimestamp: null + labels: + app: guestbook-ui + app.kubernetes.io/instance: guestbook-default + spec: + containers: + - image: gcr.io/heptio-images/ks-guestbook-demo:0.3 + imagePullPolicy: IfNotPresent + name: guestbook-ui + ports: + - containerPort: 80 + protocol: TCP + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 +status: + replicas: 2 + conditions: + - type: Available + reason: MinimumReplicasUnavailable + status: "False" + message: Deployment does not have minimum availability. + - type: Progressing + reason: ProgressDeadlineExceeded + status: "False" + message: ReplicaSet "mission-control-99b64d74c" has timed out progressing.
+ readyReplicas: 1 + updatedReplicas: 1 + availableReplicas: 1 + unavailableReplicas: 1 diff --git a/pkg/health/testdata/deployment-rollout-failed.yaml b/pkg/health/testdata/deployment-rollout-failed.yaml new file mode 100644 index 0000000..86f87e1 --- /dev/null +++ b/pkg/health/testdata/deployment-rollout-failed.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: guestbook-ui +spec: + progressDeadlineSeconds: 600 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + app: guestbook-ui + strategy: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 25% + type: RollingUpdate + template: + metadata: + creationTimestamp: null + labels: + app: guestbook-ui + app.kubernetes.io/instance: guestbook-default + spec: + containers: + - image: gcr.io/heptio-images/ks-guestbook-demo:0.3 + imagePullPolicy: IfNotPresent + name: guestbook-ui + ports: + - containerPort: 80 + protocol: TCP + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 +status: + availableReplicas: 1 + conditions: + - lastTransitionTime: 2018-07-18T04:48:48Z + lastUpdateTime: 2018-07-18T04:48:48Z + message: Deployment has minimum availability. + reason: MinimumReplicasAvailable + status: "True" + type: Available + - lastTransitionTime: 2018-07-18T06:29:23Z + lastUpdateTime: 2018-07-18T06:29:23Z + status: "True" + type: Progressing + observedGeneration: 4 + readyReplicas: 1 + replicas: 2 + unavailableReplicas: 1 + updatedReplicas: 1 diff --git a/pkg/health/testdata/deployment-scaled-up.yaml b/pkg/health/testdata/deployment-scaled-up.yaml new file mode 100644 index 0000000..5e154b4 --- /dev/null +++ b/pkg/health/testdata/deployment-scaled-up.yaml @@ -0,0 +1,118 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + uid: 9e75ed6f-2d20-4c90-b6c7-b88228047670 + name: podinfo + labels: + helm.sh/chart: podinfo-6.5.4 + app.kubernetes.io/name: podinfo + app.kubernetes.io/version: 6.5.4 + app.kubernetes.io/managed-by: Helm + namespace: podinfo + annotations: + meta.helm.sh/release-name: podinfo + meta.helm.sh/release-namespace: podinfo + deployment.kubernetes.io/revision: "1" + creationTimestamp: 2023-12-19T15:50:39Z +spec: + replicas: 3 + selector: + matchLabels: + app.kubernetes.io/name: podinfo + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 25% + maxUnavailable: 1 + template: + spec: + volumes: + - name: data + emptyDir: {} + dnsPolicy: ClusterFirst + containers: + - env: + - name: PODINFO_UI_COLOR + value: "#34577c" + name: podinfo + image: ghcr.io/stefanprodan/podinfo:6.5.4 + ports: + - name: http + protocol: TCP + containerPort: 9898 + - name: http-metrics + protocol: TCP + containerPort: 9797 + - name: grpc + protocol: TCP + containerPort: 9999 + command: + - ./podinfo + - --port=9898 + - --cert-path=/data/cert + - --port-metrics=9797 + - --grpc-port=9999 + - --grpc-service-name=podinfo + - --level=info + - --random-delay=false + - --random-error=false + resources: + requests: + cpu: 1m + memory: 16Mi + volumeMounts: + - name: data + mountPath: /data + livenessProbe: + exec: + command: + - podcli + - check + - http + - localhost:9898/healthz + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + successThreshold: 1 + initialDelaySeconds: 1 + readinessProbe: + exec: + command: + - podcli + - check + - http + - localhost:9898/readyz + periodSeconds: 10 + timeoutSeconds: 5 + 
failureThreshold: 3 + successThreshold: 1 + initialDelaySeconds: 1 + imagePullPolicy: IfNotPresent + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 + metadata: + labels: + app.kubernetes.io/name: podinfo + annotations: + prometheus.io/port: "9898" + prometheus.io/scrape: "true" + revisionHistoryLimit: 10 + progressDeadlineSeconds: 600 +status: + replicas: 3 + conditions: + - type: Available + reason: MinimumReplicasAvailable + status: "True" + message: Deployment has minimum availability. + - type: Progressing + reason: NewReplicaSetAvailable + status: "True" + message: ReplicaSet "podinfo-97c6d4b94" has successfully progressed. + readyReplicas: 3 + updatedReplicas: 3 + availableReplicas: 3 diff --git a/pkg/health/testdata/deployment-scaling-down.yaml b/pkg/health/testdata/deployment-scaling-down.yaml index 8e906a3..ce7359f 100644 --- a/pkg/health/testdata/deployment-scaling-down.yaml +++ b/pkg/health/testdata/deployment-scaling-down.yaml @@ -53,15 +53,11 @@ status: conditions: - lastTransitionTime: 2018-07-18T04:48:48Z lastUpdateTime: 2018-07-18T04:48:48Z - message: Deployment has minimum availability. - reason: MinimumReplicasAvailable - status: "True" type: Available - lastTransitionTime: 2018-07-18T06:29:23Z lastUpdateTime: 2018-07-18T06:29:23Z - message: ReplicaSet "guestbook-ui-75dd4d49d5" has timed out progressing. reason: OK - status: "False" + status: "True" type: Progressing observedGeneration: 4 readyReplicas: 1 diff --git a/pkg/health/testdata/deployment-scaling-up.yaml b/pkg/health/testdata/deployment-scaling-up.yaml new file mode 100644 index 0000000..9d18eca --- /dev/null +++ b/pkg/health/testdata/deployment-scaling-up.yaml @@ -0,0 +1,119 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + uid: 9e75ed6f-2d20-4c90-b6c7-b88228047670 + name: podinfo + labels: + helm.sh/chart: podinfo-6.5.4 + app.kubernetes.io/name: podinfo + app.kubernetes.io/version: 6.5.4 + app.kubernetes.io/managed-by: Helm + namespace: podinfo + annotations: + meta.helm.sh/release-name: podinfo + meta.helm.sh/release-namespace: podinfo + deployment.kubernetes.io/revision: "1" + creationTimestamp: 2023-12-19T15:50:39Z +spec: + replicas: 3 + selector: + matchLabels: + app.kubernetes.io/name: podinfo + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 25% + maxUnavailable: 1 + template: + spec: + volumes: + - name: data + emptyDir: {} + dnsPolicy: ClusterFirst + containers: + - env: + - name: PODINFO_UI_COLOR + value: "#34577c" + name: podinfo + image: ghcr.io/stefanprodan/podinfo:6.5.4 + ports: + - name: http + protocol: TCP + containerPort: 9898 + - name: http-metrics + protocol: TCP + containerPort: 9797 + - name: grpc + protocol: TCP + containerPort: 9999 + command: + - ./podinfo + - --port=9898 + - --cert-path=/data/cert + - --port-metrics=9797 + - --grpc-port=9999 + - --grpc-service-name=podinfo + - --level=info + - --random-delay=false + - --random-error=false + resources: + requests: + cpu: 1m + memory: 16Mi + volumeMounts: + - name: data + mountPath: /data + livenessProbe: + exec: + command: + - podcli + - check + - http + - localhost:9898/healthz + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + successThreshold: 1 + initialDelaySeconds: 1 + readinessProbe: + exec: + command: + - podcli + - check + - http + - localhost:9898/readyz + periodSeconds: 10 + 
timeoutSeconds: 5 + failureThreshold: 3 + successThreshold: 1 + initialDelaySeconds: 1 + imagePullPolicy: IfNotPresent + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 + metadata: + labels: + app.kubernetes.io/name: podinfo + annotations: + prometheus.io/port: "9898" + prometheus.io/scrape: "true" + revisionHistoryLimit: 10 + progressDeadlineSeconds: 600 +status: + replicas: 3 + conditions: + - type: Available + reason: MinimumReplicasUnavailable + status: "False" + message: Deployment does not have minimum availability. + - type: Progressing + reason: NewReplicaSetAvailable + status: "True" + message: ReplicaSet "podinfo-97c6d4b94" has successfully progressed. + readyReplicas: 1 + updatedReplicas: 3 + availableReplicas: 1 + unavailableReplicas: 2 diff --git a/pkg/health/testdata/deployment-starting.yaml b/pkg/health/testdata/deployment-starting.yaml new file mode 100644 index 0000000..eec0a6d --- /dev/null +++ b/pkg/health/testdata/deployment-starting.yaml @@ -0,0 +1,62 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + creationTimestamp: "@now-5m" + labels: + app.kubernetes.io/instance: guestbook-default + name: guestbook-ui + namespace: default +spec: + progressDeadlineSeconds: 600 + replicas: 2 + revisionHistoryLimit: 10 + selector: + matchLabels: + app: guestbook-ui + strategy: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 25% + type: RollingUpdate + template: + metadata: + creationTimestamp: null + labels: + app: guestbook-ui + app.kubernetes.io/instance: guestbook-default + spec: + containers: + - image: gcr.io/heptio-images/ks-guestbook-demo:0.3 + imagePullPolicy: IfNotPresent + name: guestbook-ui + ports: + - containerPort: 80 + protocol: TCP + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 +status: + availableReplicas: 0 + conditions: + - lastTransitionTime: 2018-07-18T04:48:48Z + lastUpdateTime: 2018-07-18T04:48:48Z + reason: MinimumReplicasUnavailable + status: "False" + message: Deployment does not have minimum availability. + type: Available + - lastTransitionTime: 2018-07-18T04:40:44Z + lastUpdateTime: 2018-07-18T06:19:22Z + message: ReplicaSet "guestbook-ui-75dd4d49d5" is progressing. 
+ reason: ReplicaSetUpdated + status: "True" + type: Progressing + observedGeneration: 4 + readyReplicas: 0 + replicas: 2 + unavailableReplicas: 1 + updatedReplicas: 1 diff --git a/pkg/health/testdata/pod-terminated.yaml b/pkg/health/testdata/pod-terminated.yaml new file mode 100644 index 0000000..bc0eecb --- /dev/null +++ b/pkg/health/testdata/pod-terminated.yaml @@ -0,0 +1,130 @@ +apiVersion: v1 +kind: Pod +metadata: + uid: b18e39d3-1301-4ac9-afb9-da3295261aa0 + name: config-test-q9kfv + labels: {} + namespace: flux-system + finalizers: + - batch.kubernetes.io/job-tracking + generateName: config-test- + ownerReferences: + - uid: c9f2c95e-3564-4631-959c-921ac410c030 + kind: Job + name: config-test + apiVersion: batch/v1 + controller: true + blockOwnerDeletion: true + creationTimestamp: 2024-11-14T12:40:17Z +spec: + volumes: + - name: kube-api-access-7hdzn + projected: + sources: + - serviceAccountToken: + path: token + expirationSeconds: 3607 + - configMap: + name: kube-root-ca.crt + items: + - key: ca.crt + path: ca.crt + - downwardAPI: + items: + - path: namespace + fieldRef: + fieldPath: metadata.namespace + apiVersion: v1 + defaultMode: 420 + nodeName: gke-hub-cluster-private-pool-containe-bf9b9895-9gpx + priority: 0 + dnsPolicy: ClusterFirst + containers: + - name: kubeconfig-updater + image: flanksource/base-image:latest + command: + - /bin/bash + - -c + - > + while read -r NAME NAMESPACE; do + CLUSTER=$(kubectl get containercluster $NAME -n $NAMESPACE -o yaml) + LOCATION=$(echo "$CLUSTER" | yq '.spec.location') + PROJECT=$(echo "$CLUSTER" | yq '.metadata.annotations."cnrm.cloud.google.com/project-id"') + export KUBECONFIG="$NAME-$LOCATION-$PROJECT" + export TOKEN=$(gcloud auth print-access-token) + gcloud container clusters get-credentials $NAME --location $LOCATION --project $PROJECT + yq -i '.users[].user.token = strenv(TOKEN) | del(.users[].user.exec)' $KUBECONFIG + kubectl create secret generic $NAME-kubeconfig -n $NAMESPACE --from-file=kubeconfig=$KUBECONFIG --dry-run=client -o yaml | kubectl apply -f - + done < <(kubectl get containercluster -A -o custom-columns=NAME:.metadata.name,NAMESPACE:.metadata.namespace | grep -v NAME) + resources: {} + volumeMounts: + - name: kube-api-access-7hdzn + readOnly: true + mountPath: /var/run/secrets/kubernetes.io/serviceaccount + imagePullPolicy: Always + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + tolerations: + - key: node.kubernetes.io/not-ready + effect: NoExecute + operator: Exists + tolerationSeconds: 300 + - key: node.kubernetes.io/unreachable + effect: NoExecute + operator: Exists + tolerationSeconds: 300 + restartPolicy: OnFailure + schedulerName: default-scheduler + serviceAccount: kustomize-controller + securityContext: {} + preemptionPolicy: PreemptLowerPriority + enableServiceLinks: true + serviceAccountName: kustomize-controller + terminationGracePeriodSeconds: 30 +status: + phase: Running + podIP: 10.1.115.11 + hostIP: 10.1.238.8 + podIPs: + - ip: 10.1.115.11 + hostIPs: + - ip: 10.1.238.8 + qosClass: BestEffort + startTime: 2024-11-14T12:40:17Z + conditions: + - type: ContainersReady + reason: ContainersNotReady + status: "False" + message: "containers with unready status: [kubeconfig-updater]" + - type: Initialized + status: "True" + - type: PodReadyToStartContainers + status: "True" + - type: PodScheduled + status: "True" + - type: Ready + reason: ContainersNotReady + status: "False" + message: "containers with unready status: [kubeconfig-updater]" + containerStatuses: + - name: 
kubeconfig-updater + image: docker.io/flanksource/base-image:latest + ready: false + state: + terminated: + reason: Error + exitCode: 1 + startedAt: 2024-11-14T12:40:22Z + finishedAt: 2024-11-14T12:40:24Z + containerID: containerd://3743fce5828cad78b261d52d2b5c27bfb4436a2ce55f454962fc5669d1c0dff1 + imageID: docker.io/flanksource/base-image@sha256:8d3fe5816e10e0eb0e74ef30dbbc66d54402dcbdab80b72c7461811a05825dbc + started: false + lastState: + terminated: + reason: Error + exitCode: 1 + startedAt: 2024-11-14T12:40:18Z + finishedAt: 2024-11-14T12:40:21Z + containerID: containerd://b4c3b97a5495e10d80202c1879b5aee7d6720c13dde573163f832a3231d35886 + containerID: containerd://3743fce5828cad78b261d52d2b5c27bfb4436a2ce55f454962fc5669d1c0dff1 + restartCount: 1 diff --git a/pkg/health/testdata/statefulset-ondelete.yaml b/pkg/health/testdata/statefulset-ondelete.yaml index deb0204..9f2adfa 100644 --- a/pkg/health/testdata/statefulset-ondelete.yaml +++ b/pkg/health/testdata/statefulset-ondelete.yaml @@ -16,6 +16,7 @@ metadata: namespace: default resourceVersion: "514251" selfLink: /apis/apps/v1/namespaces/default/statefulsets/redis-master + deletionTimestamp: "@now" uid: 1f80ab97-8bf6-11e8-aff0-42010a8a0fc6 spec: podManagementPolicy: OrderedReady
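For context on the messages asserted in health_test.go above ("1/2 ready, 1 updating", "1/1 ready, 1 updating, 1 terminating"), a small sketch of how the shared ReplicaStatus type introduced in health_deployment.go composes them. This is illustrative only, not part of the patch, and assumes the exported ReplicaStatus struct and its String() method exactly as shown in the diff:

```go
package main

import (
	"fmt"

	"github.com/flanksource/is-healthy/pkg/health"
)

func main() {
	// Rolling update in progress: 2 desired, 2 replicas, 1 ready, 1 not yet updated.
	fmt.Println(health.ReplicaStatus{Desired: 2, Replicas: 2, Ready: 1, Updated: 1})
	// => 1/2 ready, 1 updating

	// Scaling down: 1 desired, 2 replicas still present, 1 of them not updated.
	fmt.Println(health.ReplicaStatus{Desired: 1, Replicas: 2, Ready: 1, Updated: 1})
	// => 1/1 ready, 1 updating, 1 terminating
}
```

Both Deployment and StatefulSet checks now build their HealthStatus from this one type, which is why the expected messages in the updated tests share the same "ready/updating/terminating" wording.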