From 3a24f87ae3fa77b3ad7d08a4881f75aefc293507 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nils=20Gustav=20Str=C3=A5b=C3=B8?=
Date: Thu, 17 Oct 2024 16:15:11 +0200
Subject: [PATCH] fix batch status incorrectly transitioning to Waiting when
 k8s job has temporary empty ready count

---
 charts/radix-operator/Chart.yaml |  4 ++--
 pkg/apis/batch/status.go         | 29 +++++++++++++++++------------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/charts/radix-operator/Chart.yaml b/charts/radix-operator/Chart.yaml
index 7076022b4..5f130263f 100644
--- a/charts/radix-operator/Chart.yaml
+++ b/charts/radix-operator/Chart.yaml
@@ -1,7 +1,7 @@
 apiVersion: v2
 name: radix-operator
-version: 1.43.1
-appVersion: 1.63.1
+version: 1.43.2
+appVersion: 1.63.2
 kubeVersion: ">=1.24.0"
 description: Radix Operator
 keywords:
diff --git a/pkg/apis/batch/status.go b/pkg/apis/batch/status.go
index c9c44b3ed..d964d79cc 100644
--- a/pkg/apis/batch/status.go
+++ b/pkg/apis/batch/status.go
@@ -121,30 +121,31 @@ func (s *syncer) buildJobStatuses(ctx context.Context) ([]radixv1.RadixBatchJobS
 }
 
 func (s *syncer) buildBatchJobStatus(ctx context.Context, batchJob *radixv1.RadixBatchJob, allJobs []*batchv1.Job) radixv1.RadixBatchJobStatus {
-	currentStatus := slice.FindAll(s.radixBatch.Status.JobStatuses, func(jobStatus radixv1.RadixBatchJobStatus) bool {
+	currentStatus, hasCurrentStatus := slice.FindFirst(s.radixBatch.Status.JobStatuses, func(jobStatus radixv1.RadixBatchJobStatus) bool {
 		return jobStatus.Name == batchJob.Name
 	})
-	if len(currentStatus) > 0 && isJobStatusDone(currentStatus[0]) {
-		return currentStatus[0]
+	if hasCurrentStatus && isJobStatusDone(currentStatus) {
+		return currentStatus
 	}
 
 	status := radixv1.RadixBatchJobStatus{
 		Name:  batchJob.Name,
 		Phase: radixv1.BatchJobPhaseWaiting,
 	}
-	if len(currentStatus) > 0 {
-		status.Restart = currentStatus[0].Restart
+	if hasCurrentStatus {
+		status.Restart = currentStatus.Restart
+		status.Phase = currentStatus.Phase
 	}
 
 	if isBatchJobStopRequested(batchJob) {
 		status.Phase = radixv1.BatchJobPhaseStopped
 		now := metav1.Now()
 		status.EndTime = &now
-		if len(currentStatus) > 0 {
-			status.CreationTime = currentStatus[0].CreationTime
-			status.StartTime = currentStatus[0].StartTime
-			status.Message = currentStatus[0].Message
-			status.Reason = currentStatus[0].Reason
+		if hasCurrentStatus {
+			status.CreationTime = currentStatus.CreationTime
+			status.StartTime = currentStatus.StartTime
+			status.Message = currentStatus.Message
+			status.Reason = currentStatus.Reason
 		}
 		s.updateJobAndPodStatuses(ctx, batchJob.Name, &status)
 		return status
@@ -158,12 +159,16 @@ func (s *syncer) buildBatchJobStatus(ctx context.Context, batchJob *radixv1.Radi
 	status.CreationTime = &job.CreationTimestamp
 	status.Failed = job.Status.Failed
 
+	var uncountedSucceeded, uncountedFailed int
+	if uncounted := job.Status.UncountedTerminatedPods; uncounted != nil {
+		uncountedSucceeded, uncountedFailed = len(uncounted.Succeeded), len(uncounted.Failed)
+	}
 	jobConditionsSortedDesc := getJobConditionsSortedDesc(job)
-	if job.Status.Succeeded > 0 &&
+	if (job.Status.Succeeded+int32(uncountedSucceeded)) > 0 &&
 		s.setJobStatus(ctx, batchJob, &status, job, jobConditionsSortedDesc, radixv1.BatchJobPhaseSucceeded, batchv1.JobComplete) {
 		return status
 	}
-	if job.Status.Failed == jobBackoffLimit+1 &&
+	if (job.Status.Failed+int32(uncountedFailed)) == jobBackoffLimit+1 &&
 		s.setJobStatus(ctx, batchJob, &status, job, jobConditionsSortedDesc, radixv1.BatchJobPhaseFailed, batchv1.JobFailed) {
 		return status
 	}
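
For reference, below is a minimal, self-contained sketch of the succeeded/failed counting that the status.go hunk above relies on. It assumes only the upstream k8s.io/api/batch/v1 and k8s.io/apimachinery types; the helper name effectiveCounts and the example values are illustrative and not part of this change.

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	"k8s.io/apimachinery/pkg/types"
)

// effectiveCounts returns the succeeded and failed pod counts for a Job,
// including pods the Job controller has already observed as terminated but
// has not yet folded into Status.Succeeded / Status.Failed
// (Status.UncountedTerminatedPods).
func effectiveCounts(js batchv1.JobStatus) (succeeded, failed int32) {
	succeeded, failed = js.Succeeded, js.Failed
	if u := js.UncountedTerminatedPods; u != nil {
		succeeded += int32(len(u.Succeeded))
		failed += int32(len(u.Failed))
	}
	return succeeded, failed
}

func main() {
	// A Job whose pod has finished successfully, but whose controller has not
	// yet moved it from UncountedTerminatedPods into Status.Succeeded.
	js := batchv1.JobStatus{
		Succeeded: 0,
		Failed:    0,
		UncountedTerminatedPods: &batchv1.UncountedTerminatedPods{
			Succeeded: []types.UID{"pod-uid-1"}, // hypothetical pod UID
		},
	}
	s, f := effectiveCounts(js)
	fmt.Println(s, f) // 1 0 — treated as succeeded rather than still waiting
}

The underlying point is that pods listed in UncountedTerminatedPods have already terminated but have not yet been accounted for in the Job's counters, so including them keeps a finished job from transiently being classified as not yet done.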