Skip to content

Commit

Permalink
E2E tests for managedBy
Browse files Browse the repository at this point in the history
  • Loading branch information
mszadkow committed Sep 11, 2024
1 parent 902c4fc commit 95f0b23
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 13 deletions.
44 changes: 31 additions & 13 deletions sdk/python/test/e2e/test_e2e_pytorchjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,14 +171,10 @@ def test_sdk_e2e(job_namespace):
GANG_SCHEDULER_NAME in GANG_SCHEDULERS,
reason="For plain scheduling",
)
def test_sdk_e2e_managed_by_multikueue(job_namespace):
def test_sdk_e2e_managed_by(job_namespace):
JOB_NAME = "pytorchjob-e2e"
container = generate_container()

#1. Job created with default value - normal procedure
#2. Job created with external valid value - created but not updated at all
#3. Job created with external invalid value - not created

master = KubeflowOrgV1ReplicaSpec(
replicas=1,
restart_policy="OnFailure",
Expand All @@ -201,17 +197,39 @@ def test_sdk_e2e_managed_by_multikueue(job_namespace):
),
)

managed_by = 'kueue.x-k8s.io/multikueue'
pytorchjob = generate_pytorchjob(job_namespace, JOB_NAME, master, worker, managed_by=managed_by)

TRAINING_CLIENT.create_job(job=pytorchjob, namespace=job_namespace)
#1. Job created with default value: 'kubeflow.org/training-operator' - job created and status updated
#2. Job created with kueue value: 'kueue.x-k8s.io/multikueue' - job created but status not updated
#3. Job created with invalid value (not acceptable by the webhook) - job not created
controllers = {
JOB_NAME+"-default-controller": 'kubeflow.org/training-operator',
JOB_NAME+"-multikueue-controller": 'kueue.x-k8s.io/multikueue',
JOB_NAME+"-invalid-controller": 'kueue.x-k8s.io/other-controller',
}
for job_name, managed_by in controllers.items():
pytorchjob = generate_pytorchjob(job_namespace, job_name, master, worker, managed_by=managed_by)
try:
TRAINING_CLIENT.create_job(job=pytorchjob, namespace=job_namespace)
except Exception as e:
if "invalid" in str(job_name):
error_message = f"Failed to create PyTorchJob: {job_namespace}/{job_name}"
assert error_message in str(e), f"Unexpected error: {e}"
else:
raise Exception(f"PyTorchJob E2E fails. Exception: {e}")

logging.info(f"List of created {TRAINING_CLIENT.job_kind}s")
logging.info(TRAINING_CLIENT.list_jobs(job_namespace))
jobs = TRAINING_CLIENT.list_jobs(job_namespace)
logging.info(jobs)

try:
conditions = TRAINING_CLIENT.get_job_conditions(JOB_NAME, job_namespace, TRAINING_CLIENT.job_kind, pytorchjob)
if len(conditions) != 0:
raise Exception(f"PyTorchJob conditions {conditions} should not be updated, externally managed by {managed_by}")
#Only jobs with valid controllers should be created, 2 out of 3 satisfy this condition: 'kubeflow.org/training-operator' and 'kueue.x-k8s.io/multikueue'
if len(jobs) != 2:
raise Exception(f"Too many PyTorchJobs created {jobs}")

for job in jobs:
if job._metadata.name == 'kubeflow.org/training-operator':
utils.verify_job_e2e(TRAINING_CLIENT, job._metadata.name, job_namespace, wait_timeout=900)
if job._metadata.name == 'kueue.x-k8s.io/multikueue':
utils.verify_externally_managed_job_e2e(TRAINING_CLIENT, job._metadata.name, job_namespace)

except Exception as e:
utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
Expand Down
9 changes: 9 additions & 0 deletions sdk/python/test/e2e/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,15 @@
logging.getLogger().setLevel(logging.INFO)


def verify_externally_managed_job_e2e(client: TrainingClient, name: str, namespace: str):
"""Verify externally managed Training Job e2e test."""
logging.info(f"\n\n\n{client.job_kind} is created")
job = client.get_job(name, namespace)
conditions = client.get_job_conditions(name, namespace, client.job_kind, job)
if len(conditions) != 0:
raise Exception(f"{client.job_kind} conditions {conditions} should not be updated, externally managed by {managed_by}")


def verify_unschedulable_job_e2e(client: TrainingClient, name: str, namespace: str):
"""Verify unschedulable Training Job e2e test."""
logging.info(f"\n\n\n{client.job_kind} is creating")
Expand Down

0 comments on commit 95f0b23

Please sign in to comment.