From cb83d14bab7b0fd91d514af1946dd8f27089f83b Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Sat, 31 Aug 2024 03:52:33 +0900 Subject: [PATCH] Release-1.8: Cherry-pick of #2243 (#2244) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * No cleaning up a job if the job is suspended. Signed-off-by: Michal Szadkowski Signed-off-by: Yuki Iwai Co-authored-by: Michał Szadkowski --- pkg/controller.v1/common/job.go | 2 +- pkg/controller.v1/tensorflow/job_test.go | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/pkg/controller.v1/common/job.go b/pkg/controller.v1/common/job.go index 825afdf8b1..d8a494dd0c 100644 --- a/pkg/controller.v1/common/job.go +++ b/pkg/controller.v1/common/job.go @@ -421,7 +421,7 @@ func (jc *JobController) CleanupJob(runPolicy *apiv1.RunPolicy, jobStatus apiv1. currentTime := time.Now() metaObject, _ := job.(metav1.Object) ttl := runPolicy.TTLSecondsAfterFinished - if ttl == nil { + if ttl == nil || trainutil.IsJobSuspended(runPolicy) { return nil } duration := time.Second * time.Duration(*ttl) diff --git a/pkg/controller.v1/tensorflow/job_test.go b/pkg/controller.v1/tensorflow/job_test.go index c7e5a43c45..df146ef15a 100644 --- a/pkg/controller.v1/tensorflow/job_test.go +++ b/pkg/controller.v1/tensorflow/job_test.go @@ -663,6 +663,30 @@ var _ = Describe("Test for controller.v1/common", func() { wantTFJobIsRemoved: false, wantErr: false, }), + Entry("No error with completionTime is nil if suspended", &cleanUpCases{ + tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, nil), + runPolicy: &kubeflowv1.RunPolicy{ + TTLSecondsAfterFinished: nil, + Suspend: ptr.To(true), + }, + jobStatus: kubeflowv1.JobStatus{ + CompletionTime: nil, + }, + wantTFJobIsRemoved: false, + wantErr: false, + }), + Entry("No error with TTL is set and completionTime is nil, if suspended", &cleanUpCases{ + tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, ptr.To[int32](10)), + runPolicy: 
&kubeflowv1.RunPolicy{ + TTLSecondsAfterFinished: ptr.To[int32](10), + Suspend: ptr.To(true), + }, + jobStatus: kubeflowv1.JobStatus{ + CompletionTime: nil, + }, + wantTFJobIsRemoved: false, + wantErr: false, + }), Entry("Error is occurred since completionTime is nil", &cleanUpCases{ tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, ptr.To[int32](10)), runPolicy: &kubeflowv1.RunPolicy{