diff --git a/Makefile b/Makefile index a3a4c9d7df..abbc5865d7 100644 --- a/Makefile +++ b/Makefile @@ -38,11 +38,11 @@ help: ## Display this help. manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. $(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/apis/kubeflow.org/v1/..." \ - output:crd:artifacts:config=manifests/base/v1/crds \ - output:rbac:artifacts:config=manifests/base/v1/rbac \ - output:webhook:artifacts:config=manifests/v1/base/webhook + output:crd:artifacts:config=manifests/base/crds \ + output:rbac:artifacts:config=manifests/base/rbac \ + output:webhook:artifacts:config=manifests/base/webhook $(CONTROLLER_GEN) "crd:generateEmbeddedObjectMeta=true" paths="./pkg/apis/kubeflow.org/v2alpha1/..." \ - output:crd:artifacts:config=manifests/base/v2alpha1/crds + output:crd:artifacts:config=manifests/v2/base/crds generate: controller-gen ## Generate apidoc, sdk and code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. $(CONTROLLER_GEN) object:headerFile="hack/boilerplate/boilerplate.go.txt" paths="./pkg/apis/..." @@ -97,10 +97,10 @@ docker-push: ## Push docker image with the manager. ##@ Deployment install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config. - $(KUSTOMIZE) build manifests/base/v1/crds | kubectl apply -f - + $(KUSTOMIZE) build manifests/base/crds | kubectl apply -f - uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. - $(KUSTOMIZE) build manifests/base/v1/crds | kubectl delete -f - + $(KUSTOMIZE) build manifests/base/crds | kubectl delete -f - deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. cd manifests/overlays/standalone && $(KUSTOMIZE) edit set image kubeflow/training-operator=${IMG} diff --git a/manifests/base/v1/crds/kubeflow.org_jaxjobs.yaml b/manifests/base/crds/kubeflow.org_jaxjobs.yaml similarity index 100% rename from manifests/base/v1/crds/kubeflow.org_jaxjobs.yaml rename to manifests/base/crds/kubeflow.org_jaxjobs.yaml diff --git a/manifests/base/v1/crds/kubeflow.org_mpijobs.yaml b/manifests/base/crds/kubeflow.org_mpijobs.yaml similarity index 100% rename from manifests/base/v1/crds/kubeflow.org_mpijobs.yaml rename to manifests/base/crds/kubeflow.org_mpijobs.yaml diff --git a/manifests/base/v1/crds/kubeflow.org_paddlejobs.yaml b/manifests/base/crds/kubeflow.org_paddlejobs.yaml similarity index 100% rename from manifests/base/v1/crds/kubeflow.org_paddlejobs.yaml rename to manifests/base/crds/kubeflow.org_paddlejobs.yaml diff --git a/manifests/base/v1/crds/kubeflow.org_pytorchjobs.yaml b/manifests/base/crds/kubeflow.org_pytorchjobs.yaml similarity index 100% rename from manifests/base/v1/crds/kubeflow.org_pytorchjobs.yaml rename to manifests/base/crds/kubeflow.org_pytorchjobs.yaml diff --git a/manifests/base/v1/crds/kubeflow.org_tfjobs.yaml b/manifests/base/crds/kubeflow.org_tfjobs.yaml similarity index 100% rename from manifests/base/v1/crds/kubeflow.org_tfjobs.yaml rename to manifests/base/crds/kubeflow.org_tfjobs.yaml diff --git a/manifests/base/v1/crds/kubeflow.org_xgboostjobs.yaml b/manifests/base/crds/kubeflow.org_xgboostjobs.yaml similarity index 100% rename from manifests/base/v1/crds/kubeflow.org_xgboostjobs.yaml rename to manifests/base/crds/kubeflow.org_xgboostjobs.yaml diff --git a/manifests/base/v1/crds/kustomization.yaml b/manifests/base/crds/kustomization.yaml similarity index 100% rename from manifests/base/v1/crds/kustomization.yaml rename to manifests/base/crds/kustomization.yaml diff --git a/manifests/base/v1/deployment.yaml b/manifests/base/deployment.yaml similarity index 100% rename from manifests/base/v1/deployment.yaml rename to manifests/base/deployment.yaml diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml new file mode 100644 index 0000000000..b140be1441 --- /dev/null +++ b/manifests/base/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./crds + - ./rbac/cluster-role-binding.yaml + - ./rbac/role.yaml + - ./rbac/service-account.yaml + - ./webhook + - service.yaml + - deployment.yaml diff --git a/manifests/base/v1/rbac/cluster-role-binding.yaml b/manifests/base/rbac/cluster-role-binding.yaml similarity index 100% rename from manifests/base/v1/rbac/cluster-role-binding.yaml rename to manifests/base/rbac/cluster-role-binding.yaml diff --git a/manifests/base/v1/rbac/role.yaml b/manifests/base/rbac/role.yaml similarity index 100% rename from manifests/base/v1/rbac/role.yaml rename to manifests/base/rbac/role.yaml diff --git a/manifests/base/v1/rbac/service-account.yaml b/manifests/base/rbac/service-account.yaml similarity index 100% rename from manifests/base/v1/rbac/service-account.yaml rename to manifests/base/rbac/service-account.yaml diff --git a/manifests/base/v1/service.yaml b/manifests/base/service.yaml similarity index 100% rename from manifests/base/v1/service.yaml rename to manifests/base/service.yaml diff --git a/manifests/base/v1/kustomization.yaml b/manifests/base/v1/kustomization.yaml deleted file mode 100644 index 4d175a9693..0000000000 --- a/manifests/base/v1/kustomization.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -resources: - - crds - - rbac/cluster-role-binding.yaml - - rbac/role.yaml - - rbac/service-account.yaml - - webhook - - service.yaml - - deployment.yaml diff --git a/manifests/base/v2alpha1/crds/kustomization.yaml b/manifests/base/v2alpha1/crds/kustomization.yaml deleted file mode 100644 index e1cfc732b4..0000000000 --- a/manifests/base/v2alpha1/crds/kustomization.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -resources: -- kubeflow.org_clustertrainingruntimes.yaml -- kubeflow.org_trainingruntimes.yaml -- kubeflow.org_trainjobs.yaml diff --git a/manifests/base/v1/webhook/kustomization.yaml b/manifests/base/webhook/kustomization.yaml similarity index 100% rename from manifests/base/v1/webhook/kustomization.yaml rename to manifests/base/webhook/kustomization.yaml diff --git a/manifests/base/v1/webhook/kustomizeconfig.yaml b/manifests/base/webhook/kustomizeconfig.yaml similarity index 100% rename from manifests/base/v1/webhook/kustomizeconfig.yaml rename to manifests/base/webhook/kustomizeconfig.yaml diff --git a/manifests/base/v1/webhook/manifests.yaml b/manifests/base/webhook/manifests.yaml similarity index 100% rename from manifests/base/v1/webhook/manifests.yaml rename to manifests/base/webhook/manifests.yaml diff --git a/manifests/base/v1/webhook/patch.yaml b/manifests/base/webhook/patch.yaml similarity index 100% rename from manifests/base/v1/webhook/patch.yaml rename to manifests/base/webhook/patch.yaml diff --git a/manifests/overlays/kubeflow/kustomization.yaml b/manifests/overlays/kubeflow/kustomization.yaml index 3b489691f8..206d900a88 100644 --- a/manifests/overlays/kubeflow/kustomization.yaml +++ b/manifests/overlays/kubeflow/kustomization.yaml @@ -2,7 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: kubeflow resources: - - ../../base/v1 + - ../../base - kubeflow-training-roles.yaml images: - name: kubeflow/training-operator diff --git a/manifests/overlays/standalone/kustomization.yaml b/manifests/overlays/standalone/kustomization.yaml index 0034e30fb9..df72e1dc03 100644 --- a/manifests/overlays/standalone/kustomization.yaml +++ b/manifests/overlays/standalone/kustomization.yaml @@ -2,7 +2,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: kubeflow resources: - - ../../base/v1 + - ../../base - namespace.yaml images: - name: kubeflow/training-operator diff --git a/manifests/base/v2alpha1/crds/kubeflow.org_clustertrainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml similarity index 100% rename from manifests/base/v2alpha1/crds/kubeflow.org_clustertrainingruntimes.yaml rename to manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml diff --git a/manifests/base/v2alpha1/crds/kubeflow.org_trainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml similarity index 100% rename from manifests/base/v2alpha1/crds/kubeflow.org_trainingruntimes.yaml rename to manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml diff --git a/manifests/base/v2alpha1/crds/kubeflow.org_trainjobs.yaml b/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml similarity index 100% rename from manifests/base/v2alpha1/crds/kubeflow.org_trainjobs.yaml rename to manifests/v2/base/crds/kubeflow.org_trainjobs.yaml diff --git a/pkg/controller.v1/common/job.go b/pkg/controller.v1/common/job.go index 825afdf8b1..d8a494dd0c 100644 --- a/pkg/controller.v1/common/job.go +++ b/pkg/controller.v1/common/job.go @@ -421,7 +421,7 @@ func (jc *JobController) CleanupJob(runPolicy *apiv1.RunPolicy, jobStatus apiv1. currentTime := time.Now() metaObject, _ := job.(metav1.Object) ttl := runPolicy.TTLSecondsAfterFinished - if ttl == nil { + if ttl == nil || trainutil.IsJobSuspended(runPolicy) { return nil } duration := time.Second * time.Duration(*ttl) diff --git a/pkg/controller.v1/mpi/suite_test.go b/pkg/controller.v1/mpi/suite_test.go index 2c2ee658af..1335c73c48 100644 --- a/pkg/controller.v1/mpi/suite_test.go +++ b/pkg/controller.v1/mpi/suite_test.go @@ -60,7 +60,7 @@ var _ = BeforeSuite(func() { By("bootstrapping test environment") testEnv = &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "base", "v1", "crds")}, + CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "base", "crds")}, ErrorIfCRDPathMissing: true, } diff --git a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_suite_test.go b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_suite_test.go index df03e178c9..5d3505cb71 100644 --- a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_suite_test.go +++ b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_suite_test.go @@ -64,10 +64,10 @@ var _ = BeforeSuite(func() { By("bootstrapping test environment") testEnv = &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "base", "v1", "crds")}, + CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "base", "crds")}, ErrorIfCRDPathMissing: true, WebhookInstallOptions: envtest.WebhookInstallOptions{ - Paths: []string{filepath.Join("..", "..", "..", "manifests", "base", "v1", "webhook", "manifests.yaml")}, + Paths: []string{filepath.Join("..", "..", "..", "manifests", "base", "webhook", "manifests.yaml")}, }, } diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller_suite_test.go b/pkg/controller.v1/pytorch/pytorchjob_controller_suite_test.go index 99382bff9e..35810c9d1c 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller_suite_test.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller_suite_test.go @@ -65,10 +65,10 @@ var _ = BeforeSuite(func() { By("bootstrapping test environment") testEnv = &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "base", "v1", "crds")}, + CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "base", "crds")}, ErrorIfCRDPathMissing: true, WebhookInstallOptions: envtest.WebhookInstallOptions{ - Paths: []string{filepath.Join("..", "..", "..", "manifests", "base", "v1", "webhook", "manifests.yaml")}, + Paths: []string{filepath.Join("..", "..", "..", "manifests", "base", "webhook", "manifests.yaml")}, }, } diff --git a/pkg/controller.v1/tensorflow/job_test.go b/pkg/controller.v1/tensorflow/job_test.go index c7e5a43c45..df146ef15a 100644 --- a/pkg/controller.v1/tensorflow/job_test.go +++ b/pkg/controller.v1/tensorflow/job_test.go @@ -663,6 +663,30 @@ var _ = Describe("Test for controller.v1/common", func() { wantTFJobIsRemoved: false, wantErr: false, }), + Entry("No error with completionTime is nil if suspended", &cleanUpCases{ + tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, nil), + runPolicy: &kubeflowv1.RunPolicy{ + TTLSecondsAfterFinished: nil, + Suspend: ptr.To(true), + }, + jobStatus: kubeflowv1.JobStatus{ + CompletionTime: nil, + }, + wantTFJobIsRemoved: false, + wantErr: false, + }), + Entry("No error with TTL is set and completionTime is nil, if suspended", &cleanUpCases{ + tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, ptr.To[int32](10)), + runPolicy: &kubeflowv1.RunPolicy{ + TTLSecondsAfterFinished: ptr.To[int32](10), + Suspend: ptr.To(true), + }, + jobStatus: kubeflowv1.JobStatus{ + CompletionTime: nil, + }, + wantTFJobIsRemoved: false, + wantErr: false, + }), Entry("Error is occurred since completionTime is nil", &cleanUpCases{ tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, ptr.To[int32](10)), runPolicy: &kubeflowv1.RunPolicy{ diff --git a/pkg/controller.v1/tensorflow/suite_test.go b/pkg/controller.v1/tensorflow/suite_test.go index b06d1b5c7d..c1824fea5c 100644 --- a/pkg/controller.v1/tensorflow/suite_test.go +++ b/pkg/controller.v1/tensorflow/suite_test.go @@ -67,10 +67,10 @@ var _ = BeforeSuite(func() { By("bootstrapping test environment") testEnv = &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "base", "v1", "crds")}, + CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "base", "crds")}, ErrorIfCRDPathMissing: true, WebhookInstallOptions: envtest.WebhookInstallOptions{ - Paths: []string{filepath.Join("..", "..", "..", "manifests", "base", "v1", "webhook", "manifests.yaml")}, + Paths: []string{filepath.Join("..", "..", "..", "manifests", "base", "webhook", "manifests.yaml")}, }, } diff --git a/pkg/controller.v1/xgboost/suite_test.go b/pkg/controller.v1/xgboost/suite_test.go index 3a18e03ffa..6a61611c5b 100644 --- a/pkg/controller.v1/xgboost/suite_test.go +++ b/pkg/controller.v1/xgboost/suite_test.go @@ -64,10 +64,10 @@ var _ = BeforeSuite(func() { By("bootstrapping test environment") testEnv = &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "base", "v1", "crds")}, + CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "base", "crds")}, ErrorIfCRDPathMissing: true, WebhookInstallOptions: envtest.WebhookInstallOptions{ - Paths: []string{filepath.Join("..", "..", "..", "manifests", "base", "v1", "webhook", "manifests.yaml")}, + Paths: []string{filepath.Join("..", "..", "..", "manifests", "base", "webhook", "manifests.yaml")}, }, }