From c26fbe23dce8259231034b3ca96e3d42734b395c Mon Sep 17 00:00:00 2001 From: Robert Ulbrich Date: Mon, 4 Sep 2023 07:08:42 +0200 Subject: [PATCH 1/8] Removing assignment of service-account for launcher --- pkg/controller.v1/mpi/mpijob_controller.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go index f6ae024dba..af88b1f26a 100644 --- a/pkg/controller.v1/mpi/mpijob_controller.go +++ b/pkg/controller.v1/mpi/mpijob_controller.go @@ -1035,7 +1035,6 @@ func (jc *MPIJobReconciler) newLauncher(mpiJob *kubeflowv1.MPIJob, kubectlDelive jc.PodGroupControl.DecoratePodTemplateSpec(podSpec, mpiJob, rt) } - podSpec.Spec.ServiceAccountName = launcherName podSpec.Spec.InitContainers = append(podSpec.Spec.InitContainers, corev1.Container{ Name: kubectlDeliveryName, Image: kubectlDeliveryImage, From 8c42234fd46c161fd2667e12b96ec05f79e5b3c0 Mon Sep 17 00:00:00 2001 From: Robert Ulbrich Date: Mon, 4 Sep 2023 07:08:42 +0200 Subject: [PATCH 2/8] Removing assignment of service-account for launcher --- pkg/controller.v1/mpi/mpijob_controller.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go index f6ae024dba..af88b1f26a 100644 --- a/pkg/controller.v1/mpi/mpijob_controller.go +++ b/pkg/controller.v1/mpi/mpijob_controller.go @@ -1035,7 +1035,6 @@ func (jc *MPIJobReconciler) newLauncher(mpiJob *kubeflowv1.MPIJob, kubectlDelive jc.PodGroupControl.DecoratePodTemplateSpec(podSpec, mpiJob, rt) } - podSpec.Spec.ServiceAccountName = launcherName podSpec.Spec.InitContainers = append(podSpec.Spec.InitContainers, corev1.Container{ Name: kubectlDeliveryName, Image: kubectlDeliveryImage, From 337f80950f3d5775520d2eecab65df42fcc1eb9e Mon Sep 17 00:00:00 2001 From: Robert Ulbrich Date: Mon, 11 Sep 2023 15:35:53 +0200 Subject: [PATCH 3/8] Increasing timeout in order to avoid e2e test failures --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 2 +- sdk/python/test/e2e/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 22a9b11b83..9a997d89d6 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -97,7 +97,7 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): job_namespace, constants.PYTORCHJOB_KIND, CONTAINER_NAME, - timeout=900, + timeout=3600, ) TRAINING_CLIENT.delete_pytorchjob(JOB_NAME, job_namespace) diff --git a/sdk/python/test/e2e/utils.py b/sdk/python/test/e2e/utils.py index 32994be79c..06b6ff28e3 100644 --- a/sdk/python/test/e2e/utils.py +++ b/sdk/python/test/e2e/utils.py @@ -34,7 +34,7 @@ def verify_unschedulable_job_e2e( def verify_job_e2e( - client: TrainingClient, name: str, namespace: str, job_kind: str, container: str, timeout: int = 600 + client: TrainingClient, name: str, namespace: str, job_kind: str, container: str, timeout: int = 1800 ): """Verify Training Job e2e test.""" From a071eea5da2cfa296255412c15345f70d0bd0de5 Mon Sep 17 00:00:00 2001 From: Robert Ulbrich Date: Thu, 14 Sep 2023 14:24:02 +0200 Subject: [PATCH 4/8] Extending maximum runtime of integration tests --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 4 ++-- sdk/python/test/e2e/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 9a997d89d6..698dadab4f 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -97,7 +97,7 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): job_namespace, constants.PYTORCHJOB_KIND, CONTAINER_NAME, - timeout=3600, + timeout=10000, ) TRAINING_CLIENT.delete_pytorchjob(JOB_NAME, job_namespace) @@ -135,7 +135,7 @@ def test_sdk_e2e(job_namespace): job_namespace, constants.PYTORCHJOB_KIND, CONTAINER_NAME, - timeout=900, + timeout=10000, ) TRAINING_CLIENT.delete_pytorchjob(JOB_NAME, job_namespace) diff --git a/sdk/python/test/e2e/utils.py b/sdk/python/test/e2e/utils.py index 06b6ff28e3..a08e8b8cde 100644 --- a/sdk/python/test/e2e/utils.py +++ b/sdk/python/test/e2e/utils.py @@ -34,7 +34,7 @@ def verify_unschedulable_job_e2e( def verify_job_e2e( - client: TrainingClient, name: str, namespace: str, job_kind: str, container: str, timeout: int = 1800 + client: TrainingClient, name: str, namespace: str, job_kind: str, container: str, timeout: int = 7200 ): """Verify Training Job e2e test.""" From 4de96e7e6dbce9f1c7e947f3e74d3c8eabf94ae2 Mon Sep 17 00:00:00 2001 From: Robert Ulbrich Date: Tue, 19 Sep 2023 07:07:36 +0200 Subject: [PATCH 5/8] Resetting timeout values for E2E tests --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 4 ++-- sdk/python/test/e2e/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 698dadab4f..22a9b11b83 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -97,7 +97,7 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): job_namespace, constants.PYTORCHJOB_KIND, CONTAINER_NAME, - timeout=10000, + timeout=900, ) TRAINING_CLIENT.delete_pytorchjob(JOB_NAME, job_namespace) @@ -135,7 +135,7 @@ def test_sdk_e2e(job_namespace): job_namespace, constants.PYTORCHJOB_KIND, CONTAINER_NAME, - timeout=10000, + timeout=900, ) TRAINING_CLIENT.delete_pytorchjob(JOB_NAME, job_namespace) diff --git a/sdk/python/test/e2e/utils.py b/sdk/python/test/e2e/utils.py index a08e8b8cde..32994be79c 100644 --- a/sdk/python/test/e2e/utils.py +++ b/sdk/python/test/e2e/utils.py @@ -34,7 +34,7 @@ def verify_unschedulable_job_e2e( def verify_job_e2e( - client: TrainingClient, name: str, namespace: str, job_kind: str, container: str, timeout: int = 7200 + client: TrainingClient, name: str, namespace: str, job_kind: str, container: str, timeout: int = 600 ): """Verify Training Job e2e test.""" From e0976eeaf16b59d5abb6faca0502636ad6acfa9b Mon Sep 17 00:00:00 2001 From: Robert Ulbrich Date: Wed, 20 Sep 2023 07:02:40 +0200 Subject: [PATCH 6/8] Using service account name if provided, otherwise create default --- pkg/controller.v1/mpi/mpijob_controller.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go index af88b1f26a..be5645bee2 100644 --- a/pkg/controller.v1/mpi/mpijob_controller.go +++ b/pkg/controller.v1/mpi/mpijob_controller.go @@ -724,8 +724,14 @@ func (jc *MPIJobReconciler) getOrCreateConfigMap(mpiJob *kubeflowv1.MPIJob, work // by this MPIJob, or creates one if it doesn't exist. func (jc *MPIJobReconciler) getOrCreateLauncherServiceAccount(mpiJob *kubeflowv1.MPIJob) (*corev1.ServiceAccount, error) { + saName := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.Spec.ServiceAccountName + + if len(saName) == 0 { + saName = mpiJob.Name + launcherSuffix + } + sa := &corev1.ServiceAccount{} - NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + launcherSuffix} + NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: saName} err := jc.Get(context.Background(), NamespacedName, sa) if err == nil { @@ -1035,6 +1041,10 @@ func (jc *MPIJobReconciler) newLauncher(mpiJob *kubeflowv1.MPIJob, kubectlDelive jc.PodGroupControl.DecoratePodTemplateSpec(podSpec, mpiJob, rt) } + if len(mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.Spec.ServiceAccountName) == 0 { + podSpec.Spec.ServiceAccountName = launcherName + } + podSpec.Spec.InitContainers = append(podSpec.Spec.InitContainers, corev1.Container{ Name: kubectlDeliveryName, Image: kubectlDeliveryImage, @@ -1350,6 +1360,12 @@ func newLauncherRole(mpiJob *kubeflowv1.MPIJob, workerReplicas int32) *rbacv1.Ro // handleObject can discover the MPIJob resource that 'owns' it. func newLauncherRoleBinding(mpiJob *kubeflowv1.MPIJob) *rbacv1.RoleBinding { launcherName := mpiJob.Name + launcherSuffix + saName := launcherName + + if len(mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.Spec.ServiceAccountName) > 0 { + saName = mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.Spec.ServiceAccountName + } + return &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{ Name: launcherName, @@ -1364,7 +1380,7 @@ func newLauncherRoleBinding(mpiJob *kubeflowv1.MPIJob) *rbacv1.RoleBinding { Subjects: []rbacv1.Subject{ { Kind: rbacv1.ServiceAccountKind, - Name: launcherName, + Name: saName, Namespace: mpiJob.Namespace, }, }, From 933e428625c2fa527da3187657dbd8ad0546381b Mon Sep 17 00:00:00 2001 From: Robert Ulbrich Date: Tue, 19 Sep 2023 07:07:36 +0200 Subject: [PATCH 7/8] Resetting timeout values for E2E tests Correcting author --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 4 ++-- sdk/python/test/e2e/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 698dadab4f..22a9b11b83 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -97,7 +97,7 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): job_namespace, constants.PYTORCHJOB_KIND, CONTAINER_NAME, - timeout=10000, + timeout=900, ) TRAINING_CLIENT.delete_pytorchjob(JOB_NAME, job_namespace) @@ -135,7 +135,7 @@ def test_sdk_e2e(job_namespace): job_namespace, constants.PYTORCHJOB_KIND, CONTAINER_NAME, - timeout=10000, + timeout=900, ) TRAINING_CLIENT.delete_pytorchjob(JOB_NAME, job_namespace) diff --git a/sdk/python/test/e2e/utils.py b/sdk/python/test/e2e/utils.py index a08e8b8cde..32994be79c 100644 --- a/sdk/python/test/e2e/utils.py +++ b/sdk/python/test/e2e/utils.py @@ -34,7 +34,7 @@ def verify_unschedulable_job_e2e( def verify_job_e2e( - client: TrainingClient, name: str, namespace: str, job_kind: str, container: str, timeout: int = 7200 + client: TrainingClient, name: str, namespace: str, job_kind: str, container: str, timeout: int = 600 ): """Verify Training Job e2e test.""" From e3ca98c423f22ce9ee1b5956d999322c41732d71 Mon Sep 17 00:00:00 2001 From: Robert Ulbrich Date: Wed, 20 Sep 2023 07:02:40 +0200 Subject: [PATCH 8/8] Using service account name if provided, otherwise create default --- pkg/controller.v1/mpi/mpijob_controller.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go index af88b1f26a..be5645bee2 100644 --- a/pkg/controller.v1/mpi/mpijob_controller.go +++ b/pkg/controller.v1/mpi/mpijob_controller.go @@ -724,8 +724,14 @@ func (jc *MPIJobReconciler) getOrCreateConfigMap(mpiJob *kubeflowv1.MPIJob, work // by this MPIJob, or creates one if it doesn't exist. func (jc *MPIJobReconciler) getOrCreateLauncherServiceAccount(mpiJob *kubeflowv1.MPIJob) (*corev1.ServiceAccount, error) { + saName := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.Spec.ServiceAccountName + + if len(saName) == 0 { + saName = mpiJob.Name + launcherSuffix + } + sa := &corev1.ServiceAccount{} - NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + launcherSuffix} + NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: saName} err := jc.Get(context.Background(), NamespacedName, sa) if err == nil { @@ -1035,6 +1041,10 @@ func (jc *MPIJobReconciler) newLauncher(mpiJob *kubeflowv1.MPIJob, kubectlDelive jc.PodGroupControl.DecoratePodTemplateSpec(podSpec, mpiJob, rt) } + if len(mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.Spec.ServiceAccountName) == 0 { + podSpec.Spec.ServiceAccountName = launcherName + } + podSpec.Spec.InitContainers = append(podSpec.Spec.InitContainers, corev1.Container{ Name: kubectlDeliveryName, Image: kubectlDeliveryImage, @@ -1350,6 +1360,12 @@ func newLauncherRole(mpiJob *kubeflowv1.MPIJob, workerReplicas int32) *rbacv1.Ro // handleObject can discover the MPIJob resource that 'owns' it. func newLauncherRoleBinding(mpiJob *kubeflowv1.MPIJob) *rbacv1.RoleBinding { launcherName := mpiJob.Name + launcherSuffix + saName := launcherName + + if len(mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.Spec.ServiceAccountName) > 0 { + saName = mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.Spec.ServiceAccountName + } + return &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{ Name: launcherName, @@ -1364,7 +1380,7 @@ func newLauncherRoleBinding(mpiJob *kubeflowv1.MPIJob) *rbacv1.RoleBinding { Subjects: []rbacv1.Subject{ { Kind: rbacv1.ServiceAccountKind, - Name: launcherName, + Name: saName, Namespace: mpiJob.Namespace, }, },