From 15b6cb0bcc0ba9eece05c88046be6c5ca47e6690 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 11:08:04 +0800 Subject: [PATCH 01/86] add e2e test for train API Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 60 +++++++++++ sdk/python/test_e2e/test_e2e_train_api.py | 126 ++++++++++++++++++++++ sdk/python/test_e2e/test_e2e_train_api.sh | 37 +++++++ 3 files changed, 223 insertions(+) create mode 100644 .github/workflows/e2e-test-train-api.yaml create mode 100644 sdk/python/test_e2e/test_e2e_train_api.py create mode 100755 sdk/python/test_e2e/test_e2e_train_api.sh diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml new file mode 100644 index 0000000000..21537e9616 --- /dev/null +++ b/.github/workflows/e2e-test-train-api.yaml @@ -0,0 +1,60 @@ +name: E2E Test with train API + +on: + - pull_request + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + e2e: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] + python-version: [3.8, 3.9, 3.10, 3.11] + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Free-Up Disk Space + uses: ./.github/workflows/free-up-disk-space + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Create k8s Kind Cluster + uses: helm/kind-action@v1.10.0 + with: + node_image: kindest/node:${{ matrix.kubernetes-version }} + cluster_name: training-operator-cluster + kubectl_version: ${{ matrix.kubernetes-version }} + + - name: Build training-operator + run: | + ./scripts/gha/build-image.sh + env: + TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test + + - name: Deploy training operator + run: | + ./scripts/gha/setup-training-operator.sh + env: + KIND_CLUSTER: 
training-operator-cluster + TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test + GANG_SCHEDULER_NAME: "none" + KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + + - name: Run tests + run: | + python3 -m pip install -e sdk/python + ./sdk/python/test_e2e/test_e2e_train_api.sh diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py new file mode 100644 index 0000000000..146b6fe056 --- /dev/null +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -0,0 +1,126 @@ +# Copyright 2024 kubeflow.org. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from kubernetes import client, config + +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + HuggingFaceTrainerParams, + HuggingFaceDatasetParams, +) +from kubeflow.training import TrainingClient +from kubeflow.training import constants + +import logging + +from peft import LoraConfig +import transformers + +import test.e2e.utils as utils + +logging.basicConfig(format="%(message)s") +logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) + +TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) +JOB_NAME = "test-train-api" + + +def test_train_api(job_namespace): + num_workers = 1 + + # Use test case from fine-tuning API tutorial + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. In this example, we will skip evaluation and model checkpoints. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=num_workers, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
+ resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e( + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=60 * 30 + ) + logging.info(f"Training job {JOB_NAME} is succeded.") + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"Training job {JOB_NAME} is failed. Exception: {e}") + + # Verify that training job has correct pods. + pod_names = TRAINING_CLIENT.get_job_pod_names( + name=JOB_NAME, namespace=job_namespace + ) + + # if len(pod_names) != num_workers or f"{JOB_NAME}-worker-0" not in pod_names: + if len(pod_names) != num_workers: + raise Exception(f"Training job has incorrect pods: {pod_names}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + + # Get and print the logs of the master pod + master_pod_name = next((name for name in pod_names if "master" in name), None) + if master_pod_name: + config.load_kube_config() # Load kube config to interact with the cluster + v1 = client.CoreV1Api() + try: + pod_logs = v1.read_namespaced_pod_log( + name=master_pod_name, namespace=job_namespace + ) + logging.info(f"Logs of master pod {master_pod_name}:\n{pod_logs}") + except client.exceptions.ApiException as e: + logging.error(f"Failed to get logs for pod {master_pod_name}: {e}") + + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + +if __name__ == "__main__": + test_train_api(job_namespace="default") diff --git a/sdk/python/test_e2e/test_e2e_train_api.sh b/sdk/python/test_e2e/test_e2e_train_api.sh new file mode 100755 index 0000000000..0ac3f9f4e1 --- /dev/null +++ b/sdk/python/test_e2e/test_e2e_train_api.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# Copyright 2024 The Kubeflow Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This shell script is used to run Katib Experiment. +# Input parameter - path to Experiment yaml. + +set -o errexit +set -o nounset +set -o pipefail + +cd "$(dirname "$0")" + +echo "Training Operator deployments" +kubectl -n kubeflow get deploy +echo "Training Operator services" +kubectl -n kubeflow get svc +echo "Training Operator pods" +kubectl -n kubeflow get pod +echo "Training Operator persistent volume claims" +kubectl get pvc -n kubeflow +echo "Available CRDs" +kubectl get crd + +python test_e2e_train_api.py || (kubectl get pods -n kubeflow && exit 1) \ No newline at end of file From daa00543443fe7cb975afaacca7d024fa8ca3b9c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 11:29:15 +0800 Subject: [PATCH 02/86] fix peft import error Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 21537e9616..bec2981c59 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] - python-version: [3.8, 3.9, 3.10, 3.11] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - name: Checkout uses: actions/checkout@v4 @@ -56,5 +56,5 @@ jobs: - name: Run tests run: | - python3 -m pip 
install -e sdk/python + python3 -m pip install -e sdk/python[huggingface] ./sdk/python/test_e2e/test_e2e_train_api.sh From 8d4af9051f14aaeb8cef6893ca827064e6247fcb Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 12:05:34 +0800 Subject: [PATCH 03/86] update settings of the job Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 146b6fe056..0898c6ab5b 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -37,7 +37,7 @@ def test_train_api(job_namespace): - num_workers = 1 + num_workers = 4 # Use test case from fine-tuning API tutorial # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ @@ -74,10 +74,10 @@ def test_train_api(job_namespace): ), ), num_workers=num_workers, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. + num_procs_per_worker=2, # nproc-per-node parameter for torchrun command. 
resources_per_worker={ - "gpu": 0, - "cpu": 2, + "gpu": 2, + "cpu": 5, "memory": "10G", }, ) @@ -87,7 +87,7 @@ def test_train_api(job_namespace): try: utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=60 * 30 + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=60 * 60 ) logging.info(f"Training job {JOB_NAME} is succeded.") except Exception as e: From 86c31c82e2ffdbdeda1438730e834031859bfce8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 12:09:23 +0800 Subject: [PATCH 04/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 22 +++++++++------------- sdk/python/test_e2e/test_e2e_train_api.sh | 2 +- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 0898c6ab5b..6254827cdf 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -12,23 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from kubernetes import client, config - -from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - HuggingFaceTrainerParams, - HuggingFaceDatasetParams, -) -from kubeflow.training import TrainingClient -from kubeflow.training import constants - import logging - +import test.e2e.utils as utils + +from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams +from kubeflow.training import constants +from kubeflow.training import TrainingClient +from kubernetes import client +from kubernetes import config from peft import LoraConfig import transformers -import test.e2e.utils as utils - logging.basicConfig(format="%(message)s") logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) diff --git a/sdk/python/test_e2e/test_e2e_train_api.sh b/sdk/python/test_e2e/test_e2e_train_api.sh index 0ac3f9f4e1..af41771faf 100755 --- a/sdk/python/test_e2e/test_e2e_train_api.sh +++ b/sdk/python/test_e2e/test_e2e_train_api.sh @@ -34,4 +34,4 @@ kubectl get pvc -n kubeflow echo "Available CRDs" kubectl get crd -python test_e2e_train_api.py || (kubectl get pods -n kubeflow && exit 1) \ No newline at end of file +python test_e2e_train_api.py || (kubectl get pods -n kubeflow && exit 1) From 01870e239c62700df82b762fb96d8d92f242e3f6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 12:16:05 +0800 Subject: [PATCH 05/86] fix format Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index bec2981c59..a663da9f87 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -1,6 +1,6 @@ 
name: E2E Test with train API -on: +on: - pull_request concurrency: diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 6254827cdf..1074bfc530 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -14,7 +14,7 @@ import logging import test.e2e.utils as utils - + from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams From 17f3c33032796e83686169da56b84c855e894964 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 20:46:56 +0800 Subject: [PATCH 06/86] fix error detection Signed-off-by: helenxie-bit --- .../kubeflow/training/constants/constants.py | 4 +- sdk/python/test_e2e/test_e2e_train_api.py | 122 +++++++++++++----- 2 files changed, 89 insertions(+), 37 deletions(-) diff --git a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index 0513c3e31e..0102fe7ef7 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -78,7 +78,7 @@ # TODO (andreyvelich): We should add image tag for Storage Initializer and Trainer. -STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" +STORAGE_INITIALIZER_IMAGE = "docker.io/helenxiehz428/test" STORAGE_INITIALIZER_VOLUME_MOUNT = models.V1VolumeMount( name=STORAGE_INITIALIZER, @@ -90,7 +90,7 @@ claim_name=STORAGE_INITIALIZER ), ) -TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" +TRAINER_TRANSFORMER_IMAGE = "docker.io/helenxiehz428/test_llm4" # TFJob constants. 
TFJOB_KIND = "TFJob" diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 1074bfc530..5ff54ec614 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -14,14 +14,17 @@ import logging import test.e2e.utils as utils +import time from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams from kubeflow.training import constants +from kubeflow.training.utils import utils from kubeflow.training import TrainingClient from kubernetes import client from kubernetes import config +from kubernetes.client.exceptions import ApiException from peft import LoraConfig import transformers @@ -32,10 +35,33 @@ JOB_NAME = "test-train-api" +def get_logs_of_master_pod(job_namespace, num_workers): + # Verify that training job has correct pods. + pod_names = TRAINING_CLIENT.get_job_pod_names( + name=JOB_NAME, namespace=job_namespace + ) + + if len(pod_names) != num_workers: + raise Exception(f"Training job has incorrect pods: {pod_names}") + + # Get and print the logs of the master pod. + master_pod_name = next((name for name in pod_names if "master" in name), None) + if master_pod_name: + config.load_kube_config() # Load kube config to interact with the cluster. + v1 = client.CoreV1Api() + try: + pod_logs = v1.read_namespaced_pod_log( + name=master_pod_name, namespace=job_namespace + ) + logging.info(f"Logs of master pod {master_pod_name}:\n{pod_logs}") + except ApiException as e: + logging.error(f"Failed to get logs for pod {master_pod_name}: {e}") + + def test_train_api(job_namespace): - num_workers = 4 + num_workers = 1 - # Use test case from fine-tuning API tutorial + # Use test case from fine-tuning API tutorial. 
# https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ TRAINING_CLIENT.train( name=JOB_NAME, @@ -70,52 +96,78 @@ def test_train_api(job_namespace): ), ), num_workers=num_workers, # nodes parameter for torchrun command. - num_procs_per_worker=2, # nproc-per-node parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. resources_per_worker={ - "gpu": 2, - "cpu": 5, + "gpu": 0, + "cpu": 2, "memory": "10G", }, ) - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info("---------------------------------------------------------------") + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s:") logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - try: - utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=60 * 60 - ) - logging.info(f"Training job {JOB_NAME} is succeded.") - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"Training job {JOB_NAME} is failed. Exception: {e}") - - # Verify that training job has correct pods. - pod_names = TRAINING_CLIENT.get_job_pod_names( - name=JOB_NAME, namespace=job_namespace - ) + logging.info("---------------------------------------------------------------") + logging.info(f"Training job {JOB_NAME} is running...") - # if len(pod_names) != num_workers or f"{JOB_NAME}-worker-0" not in pod_names: - if len(pod_names) != num_workers: - raise Exception(f"Training job has incorrect pods: {pod_names}") + logging.info("---------------------------------------------------------------") + wait_timeout = 60 * 60 + polling_interval = 15 + for _ in range(round(wait_timeout / polling_interval)): - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + # Get the list of pods associated with the job. 
+ pod_names = TRAINING_CLIENT.get_job_pod_names( + name=JOB_NAME, namespace=job_namespace + ) - # Get and print the logs of the master pod - master_pod_name = next((name for name in pod_names if "master" in name), None) - if master_pod_name: - config.load_kube_config() # Load kube config to interact with the cluster + config.load_kube_config() # Load kube config to interact with the cluster. v1 = client.CoreV1Api() - try: - pod_logs = v1.read_namespaced_pod_log( - name=master_pod_name, namespace=job_namespace + + # Iterate over each pod to check its status. + for pod_name in pod_names: + pod_status = v1.read_namespaced_pod_status( + name=pod_name, namespace=job_namespace ) - logging.info(f"Logs of master pod {master_pod_name}:\n{pod_logs}") - except client.exceptions.ApiException as e: - logging.error(f"Failed to get logs for pod {master_pod_name}: {e}") - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + # Check if any container in the pod has been restarted, indicating a previous failure. + for container_status in pod_status.status.container_statuses: + if container_status.restart_count > 0: + logging.warning( + f"Pod {pod_name} in job {JOB_NAME} has been restarted {container_status.restart_count} times. Retrieving logs..." + ) + + get_logs_of_master_pod(job_namespace, num_workers) + + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + # Raise an exception to indicate that a pod has failed at least once. + raise Exception(f"Training job {JOB_NAME} is failed.") + + # Get Job only once per cycle and check the statuses. + job = TRAINING_CLIENT.get_job( + name=JOB_NAME, + namespace=job_namespace, + job_kind=constants.PYTORCHJOB_KIND, + timeout=constants.DEFAULT_TIMEOUT, + ) + + # Get Job conditions. + conditions = TRAINING_CLIENT.get_job_conditions( + job=job, timeout=constants.DEFAULT_TIMEOUT + ) + + # Check if the job has succeeded. 
+ if utils.has_condition(conditions, constants.JOB_CONDITION_SUCCEEDED): + get_logs_of_master_pod(job_namespace, num_workers) + logging.info("---------------------------------------------------------------") + logging.info(f"Training job {JOB_NAME} is succeeded.") + + logging.info("---------------------------------------------------------------") + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + break + + time.sleep(polling_interval) if __name__ == "__main__": From 0685dc7f9236ba522c798bd6ec9805026239936f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 20:49:52 +0800 Subject: [PATCH 07/86] resolve conflict Signed-off-by: helenxie-bit --- sdk/python/kubeflow/training/constants/constants.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index 0102fe7ef7..d4f638e6b9 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -84,12 +84,7 @@ name=STORAGE_INITIALIZER, mount_path=INIT_CONTAINER_MOUNT_PATH, ) -STORAGE_INITIALIZER_VOLUME = models.V1Volume( - name=STORAGE_INITIALIZER, - persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( - claim_name=STORAGE_INITIALIZER - ), -) + TRAINER_TRANSFORMER_IMAGE = "docker.io/helenxiehz428/test_llm4" # TFJob constants. 
From 83de64b1d4c46300dab1b7b04629b2ba9814a9bc Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 20:52:35 +0800 Subject: [PATCH 08/86] resolve conflict Signed-off-by: helenxie-bit --- sdk/python/kubeflow/training/constants/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index d4f638e6b9..07c98bc787 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -78,14 +78,14 @@ # TODO (andreyvelich): We should add image tag for Storage Initializer and Trainer. -STORAGE_INITIALIZER_IMAGE = "docker.io/helenxiehz428/test" +STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" STORAGE_INITIALIZER_VOLUME_MOUNT = models.V1VolumeMount( name=STORAGE_INITIALIZER, mount_path=INIT_CONTAINER_MOUNT_PATH, ) -TRAINER_TRANSFORMER_IMAGE = "docker.io/helenxiehz428/test_llm4" +TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" # TFJob constants. 
TFJOB_KIND = "TFJob" From f954f2d4a4bf87cf7329f07cb82b4b390cddac71 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 20:56:56 +0800 Subject: [PATCH 09/86] resolve conflict Signed-off-by: helenxie-bit --- sdk/python/kubeflow/training/constants/constants.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index 07c98bc787..0513c3e31e 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -84,7 +84,12 @@ name=STORAGE_INITIALIZER, mount_path=INIT_CONTAINER_MOUNT_PATH, ) - +STORAGE_INITIALIZER_VOLUME = models.V1Volume( + name=STORAGE_INITIALIZER, + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=STORAGE_INITIALIZER + ), +) TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" # TFJob constants. From ff48154314e0dde7fc321855d9fe1a80d612bc4e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 20:59:57 +0800 Subject: [PATCH 10/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 5ff54ec614..d630b61e91 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -20,8 +20,8 @@ from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams from kubeflow.training import constants -from kubeflow.training.utils import utils from kubeflow.training import TrainingClient +from kubeflow.training.utils import utils from kubernetes import client from kubernetes import config from kubernetes.client.exceptions import ApiException @@ -160,10 +160,14 @@ def 
test_train_api(job_namespace): # Check if the job has succeeded. if utils.has_condition(conditions, constants.JOB_CONDITION_SUCCEEDED): get_logs_of_master_pod(job_namespace, num_workers) - logging.info("---------------------------------------------------------------") + logging.info( + "---------------------------------------------------------------" + ) logging.info(f"Training job {JOB_NAME} is succeeded.") - logging.info("---------------------------------------------------------------") + logging.info( + "---------------------------------------------------------------" + ) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) break From 304db5d8b9fb173c01d6eb83d53d583b92a893c7 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 21:28:24 +0800 Subject: [PATCH 11/86] fix NoneType error Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index d630b61e91..4d84404b5a 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -113,7 +113,7 @@ def test_train_api(job_namespace): logging.info("---------------------------------------------------------------") wait_timeout = 60 * 60 - polling_interval = 15 + polling_interval = 30 for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. @@ -130,6 +130,11 @@ def test_train_api(job_namespace): name=pod_name, namespace=job_namespace ) + # Ensure that container_statuses is not None before iterating. + if pod_status.status.container_statuses is None: + logging.warning(f"Pod {pod_name} has no container statuses available yet.") + continue + # Check if any container in the pod has been restarted, indicating a previous failure. 
for container_status in pod_status.status.container_statuses: if container_status.restart_count > 0: From 486154d6e226dfd822f34db96f16fb26e6116eb6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 21:29:32 +0800 Subject: [PATCH 12/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 4d84404b5a..fa11f40221 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -132,7 +132,9 @@ def test_train_api(job_namespace): # Ensure that container_statuses is not None before iterating. if pod_status.status.container_statuses is None: - logging.warning(f"Pod {pod_name} has no container statuses available yet.") + logging.warning( + f"Pod {pod_name} has no container statuses available yet." + ) continue # Check if any container in the pod has been restarted, indicating a previous failure. From 016c41db06ac5c9e3d0a55680aa8000c5676e186 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 10 Aug 2024 07:32:39 +0800 Subject: [PATCH 13/86] test bug Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index fa11f40221..7f07dca409 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -130,6 +130,8 @@ def test_train_api(job_namespace): name=pod_name, namespace=job_namespace ) + get_logs_of_master_pod(job_namespace, num_workers) + # Ensure that container_statuses is not None before iterating. 
if pod_status.status.container_statuses is None: logging.warning( From 1e7bd2339b303e08b874fff695ffdefa8cc1612e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 11 Aug 2024 16:25:37 +0800 Subject: [PATCH 14/86] find bug Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 1 + sdk/python/test_e2e/test_e2e_train_api.py | 30 +++++++++++++++-------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index a663da9f87..7cf10aef86 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -58,3 +58,4 @@ jobs: run: | python3 -m pip install -e sdk/python[huggingface] ./sdk/python/test_e2e/test_e2e_train_api.sh + (kubectl get pods -n default && kubectl describe pod -n default $(kubectl get pods -n default -o jsonpath='{.items[0].metadata.name}'); exit 1) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 7f07dca409..a4dfd7be90 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -13,7 +13,6 @@ # limitations under the License. import logging -import test.e2e.utils as utils import time from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams @@ -124,13 +123,32 @@ def test_train_api(job_namespace): config.load_kube_config() # Load kube config to interact with the cluster. v1 = client.CoreV1Api() + # Get Job only once per cycle and check the statuses. + job = TRAINING_CLIENT.get_job( + name=JOB_NAME, + namespace=job_namespace, + job_kind=constants.PYTORCHJOB_KIND, + timeout=constants.DEFAULT_TIMEOUT, + ) + + for replica_name, replica_status in job.status.replica_statuses.items(): + logging.info( + f"Replica {replica_name} status: {replica_status.succeeded} succeeded, {replica_status.failed} failed." + ) + # Iterate over each pod to check its status. 
for pod_name in pod_names: pod_status = v1.read_namespaced_pod_status( name=pod_name, namespace=job_namespace ) - get_logs_of_master_pod(job_namespace, num_workers) + print("pod_status:") + print(pod_status) + print("pod_status.status:") + print(pod_status.status) + print("pod_status.status.container_statuses:") + print(pod_status.status.container_statuses) + print("continue...") # Ensure that container_statuses is not None before iterating. if pod_status.status.container_statuses is None: @@ -153,14 +171,6 @@ def test_train_api(job_namespace): # Raise an exception to indicate that a pod has failed at least once. raise Exception(f"Training job {JOB_NAME} is failed.") - # Get Job only once per cycle and check the statuses. - job = TRAINING_CLIENT.get_job( - name=JOB_NAME, - namespace=job_namespace, - job_kind=constants.PYTORCHJOB_KIND, - timeout=constants.DEFAULT_TIMEOUT, - ) - # Get Job conditions. conditions = TRAINING_CLIENT.get_job_conditions( job=job, timeout=constants.DEFAULT_TIMEOUT From 1aced614cdbe75900226bde3a2f2ee9fd8bc95ef Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 11 Aug 2024 16:37:05 +0800 Subject: [PATCH 15/86] find bug Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index a4dfd7be90..a89cfff448 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -111,7 +111,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 60 + wait_timeout = 60 * 10 polling_interval = 30 for _ in range(round(wait_timeout / polling_interval)): From 3100aae50155e0bbaa5e4681ac2e13849c47226b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 11 Aug 2024 17:06:12 +0800 Subject: [PATCH 16/86] find bug 
Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 8 +++++++- sdk/python/test_e2e/test_e2e_train_api.py | 4 +--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 7cf10aef86..375b4df442 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -58,4 +58,10 @@ jobs: run: | python3 -m pip install -e sdk/python[huggingface] ./sdk/python/test_e2e/test_e2e_train_api.sh - (kubectl get pods -n default && kubectl describe pod -n default $(kubectl get pods -n default -o jsonpath='{.items[0].metadata.name}'); exit 1) + kubectl get pods -n default + POD_NAME=$(kubectl get pods -n default -o jsonpath='{.items[0].metadata.name}') + kubectl describe pod -n default $POD_NAME + kubectl get pvc -n default + PVC_NAME=$(kubectl get pvc -n default -o jsonpath='{.items[0].metadata.name}') + kubectl describe pvc -n default $PVC_NAME + exit 1 diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index a89cfff448..ea731ef146 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -111,7 +111,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 10 + wait_timeout = 60 * 120 polling_interval = 30 for _ in range(round(wait_timeout / polling_interval)): @@ -142,8 +142,6 @@ def test_train_api(job_namespace): name=pod_name, namespace=job_namespace ) - print("pod_status:") - print(pod_status) print("pod_status.status:") print(pod_status.status) print("pod_status.status.container_statuses:") From e5b9061cee3ef9168bd853ca28a5be95800f3c44 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 07:45:16 +0800 Subject: [PATCH 17/86] add storage_config Signed-off-by: helenxie-bit --- 
.github/workflows/e2e-test-train-api.yaml | 2 +- sdk/python/test_e2e/test_e2e_train_api.py | 30 +++++++++++------------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 375b4df442..4378a01bb6 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -58,7 +58,7 @@ jobs: run: | python3 -m pip install -e sdk/python[huggingface] ./sdk/python/test_e2e/test_e2e_train_api.sh - kubectl get pods -n default + kubectl get pods -n default POD_NAME=$(kubectl get pods -n default -o jsonpath='{.items[0].metadata.name}') kubectl describe pod -n default $POD_NAME kubectl get pvc -n default diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index ea731ef146..e71b5ece11 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -101,6 +101,11 @@ def test_train_api(job_namespace): "cpu": 2, "memory": "10G", }, + storage_config={ + "size": "2Gi", + "storage_class": "ReadWriteOnce", + "access_modes": ["ReadWriteOnce", "ReadOnlyMany"], + } ) logging.info("---------------------------------------------------------------") @@ -111,8 +116,8 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 120 - polling_interval = 30 + wait_timeout = 60 * 30 # 30 minutes. + polling_interval = 30 # 30 seconds. for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. @@ -123,19 +128,6 @@ def test_train_api(job_namespace): config.load_kube_config() # Load kube config to interact with the cluster. v1 = client.CoreV1Api() - # Get Job only once per cycle and check the statuses. 
- job = TRAINING_CLIENT.get_job( - name=JOB_NAME, - namespace=job_namespace, - job_kind=constants.PYTORCHJOB_KIND, - timeout=constants.DEFAULT_TIMEOUT, - ) - - for replica_name, replica_status in job.status.replica_statuses.items(): - logging.info( - f"Replica {replica_name} status: {replica_status.succeeded} succeeded, {replica_status.failed} failed." - ) - # Iterate over each pod to check its status. for pod_name in pod_names: pod_status = v1.read_namespaced_pod_status( @@ -169,6 +161,14 @@ def test_train_api(job_namespace): # Raise an exception to indicate that a pod has failed at least once. raise Exception(f"Training job {JOB_NAME} is failed.") + # Get Job only once per cycle and check the statuses. + job = TRAINING_CLIENT.get_job( + name=JOB_NAME, + namespace=job_namespace, + job_kind=constants.PYTORCHJOB_KIND, + timeout=constants.DEFAULT_TIMEOUT, + ) + # Get Job conditions. conditions = TRAINING_CLIENT.get_job_conditions( job=job, timeout=constants.DEFAULT_TIMEOUT From ffb068523d492abb87587ee937fff78007c47f59 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 07:47:13 +0800 Subject: [PATCH 18/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index e71b5ece11..b419be5141 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -105,7 +105,7 @@ def test_train_api(job_namespace): "size": "2Gi", "storage_class": "ReadWriteOnce", "access_modes": ["ReadWriteOnce", "ReadOnlyMany"], - } + }, ) logging.info("---------------------------------------------------------------") From dc1b48a5be59a8afdcfe522af46bcf9ae6d77ebb Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 14:39:57 +0800 Subject: [PATCH 19/86] reduce pvc size Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 16 
+++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index b419be5141..d1fee95f0f 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -27,7 +27,10 @@ from peft import LoraConfig import transformers -logging.basicConfig(format="%(message)s") +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(message)s", + level=logging.INFO, +) logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) @@ -99,12 +102,10 @@ def test_train_api(job_namespace): resources_per_worker={ "gpu": 0, "cpu": 2, - "memory": "10G", + "memory": "2G", }, storage_config={ "size": "2Gi", - "storage_class": "ReadWriteOnce", - "access_modes": ["ReadWriteOnce", "ReadOnlyMany"], }, ) @@ -119,7 +120,6 @@ def test_train_api(job_namespace): wait_timeout = 60 * 30 # 30 minutes. polling_interval = 30 # 30 seconds. for _ in range(round(wait_timeout / polling_interval)): - # Get the list of pods associated with the job. pod_names = TRAINING_CLIENT.get_job_pod_names( name=JOB_NAME, namespace=job_namespace @@ -134,12 +134,6 @@ def test_train_api(job_namespace): name=pod_name, namespace=job_namespace ) - print("pod_status.status:") - print(pod_status.status) - print("pod_status.status.container_statuses:") - print(pod_status.status.container_statuses) - print("continue...") - # Ensure that container_statuses is not None before iterating. 
if pod_status.status.container_statuses is None: logging.warning( From 889451755ff575e8fd73cb14a2e2afb21c4af370 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 16:11:41 +0800 Subject: [PATCH 20/86] set storage_config Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index d1fee95f0f..b759383c1e 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -106,6 +106,7 @@ def test_train_api(job_namespace): }, storage_config={ "size": "2Gi", + "access_modes": "ReadWriteOnce", }, ) @@ -150,7 +151,7 @@ def test_train_api(job_namespace): get_logs_of_master_pod(job_namespace, num_workers) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + #TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) # Raise an exception to indicate that a pod has failed at least once. raise Exception(f"Training job {JOB_NAME} is failed.") From 36872d725ffbdef69cca50c62a11eee9da701464 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 16:12:14 +0800 Subject: [PATCH 21/86] set storage_config Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index b759383c1e..e85d862f8c 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -118,7 +118,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 30 # 30 minutes. + wait_timeout = 60 * 15 # 30 minutes. polling_interval = 30 # 30 seconds. 
for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. From 7dd8d400a86cdcc6ccde6095e085ba23dab22df4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 16:14:25 +0800 Subject: [PATCH 22/86] set storage_config Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index e85d862f8c..18db6720f7 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -151,7 +151,7 @@ def test_train_api(job_namespace): get_logs_of_master_pod(job_namespace, num_workers) - #TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) # Raise an exception to indicate that a pod has failed at least once. raise Exception(f"Training job {JOB_NAME} is failed.") From 60c322d98e874caf4b08fd46705a9104a576955b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 16:21:53 +0800 Subject: [PATCH 23/86] set storage_config Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 18db6720f7..f0259b1eec 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -106,7 +106,7 @@ def test_train_api(job_namespace): }, storage_config={ "size": "2Gi", - "access_modes": "ReadWriteOnce", + "access_modes": ["ReadWriteOnce"], }, ) From dd970ab825a862f3aff078aa442ead0760102cbe Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 17:33:57 +0800 Subject: [PATCH 24/86] use gpu Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index f0259b1eec..c2ce5655a5 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -100,8 +100,8 @@ def test_train_api(job_namespace): num_workers=num_workers, # nodes parameter for torchrun command. num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. resources_per_worker={ - "gpu": 0, - "cpu": 2, + "gpu": 1, + "cpu": 0, "memory": "2G", }, storage_config={ @@ -118,7 +118,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 15 # 30 minutes. + wait_timeout = 60 * 60 # 1 hour. polling_interval = 30 # 30 seconds. for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. @@ -151,7 +151,7 @@ def test_train_api(job_namespace): get_logs_of_master_pod(job_namespace, num_workers) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + #TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) # Raise an exception to indicate that a pod has failed at least once. 
raise Exception(f"Training job {JOB_NAME} is failed.") From 10bbfa0e6aba2be56776300d90d822f99394be04 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 17:46:40 +0800 Subject: [PATCH 25/86] use gpu Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index c2ce5655a5..167a27afac 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -118,7 +118,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 60 # 1 hour. + wait_timeout = 60 * 10 # 1 hour. polling_interval = 30 # 30 seconds. for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. From d47d6a6c0e01f400aa4a7d66f6d201da2cf1eaf5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 17:48:43 +0800 Subject: [PATCH 26/86] use gpu Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 167a27afac..cb273bd4a5 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -151,7 +151,7 @@ def test_train_api(job_namespace): get_logs_of_master_pod(job_namespace, num_workers) - #TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) # Raise an exception to indicate that a pod has failed at least once. 
raise Exception(f"Training job {JOB_NAME} is failed.") From 4ccd4a76dc91bc0d4bd54b967341bbbf736b9443 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 19:14:59 +0800 Subject: [PATCH 27/86] fix 'set_device' error Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index cb273bd4a5..1898fd570b 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -88,6 +88,8 @@ def test_train_api(job_namespace): disable_tqdm=True, log_level="info", num_train_epochs=1, + no_cuda=True, + use_cpu=True, ), # Set LoRA config to reduce number of trainable model parameters. lora_config=LoraConfig( @@ -100,8 +102,8 @@ def test_train_api(job_namespace): num_workers=num_workers, # nodes parameter for torchrun command. num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. resources_per_worker={ - "gpu": 1, - "cpu": 0, + "gpu": 0, + "cpu": 2, "memory": "2G", }, storage_config={ @@ -118,7 +120,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 10 # 1 hour. + wait_timeout = 60 * 60 # 1 hour. polling_interval = 30 # 30 seconds. for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. 
From 0750322689f36353bd291b9fd2c07d27a8cfc6bf Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 15 Aug 2024 17:03:22 +0800 Subject: [PATCH 28/86] add timeout error Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 1898fd570b..cd19670565 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -88,8 +88,6 @@ def test_train_api(job_namespace): disable_tqdm=True, log_level="info", num_train_epochs=1, - no_cuda=True, - use_cpu=True, ), # Set LoRA config to reduce number of trainable model parameters. lora_config=LoraConfig( @@ -104,10 +102,10 @@ def test_train_api(job_namespace): resources_per_worker={ "gpu": 0, "cpu": 2, - "memory": "2G", + "memory": "10G", }, storage_config={ - "size": "2Gi", + "size": "10Gi", "access_modes": ["ReadWriteOnce"], }, ) @@ -122,7 +120,16 @@ def test_train_api(job_namespace): logging.info("---------------------------------------------------------------") wait_timeout = 60 * 60 # 1 hour. polling_interval = 30 # 30 seconds. - for _ in range(round(wait_timeout / polling_interval)): + start_time = time.time() # Record the start time + + while True: + elapsed_time = time.time() - start_time # Calculate the elapsed time + if elapsed_time > wait_timeout: + # Raise a TimeoutError if the job takes too long + logging.error(f"Training job {JOB_NAME} exceeded the timeout of {wait_timeout} seconds.") + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise TimeoutError(f"Training job {JOB_NAME} did not complete within the allowed time of {wait_timeout} seconds.") + # Get the list of pods associated with the job. 
pod_names = TRAINING_CLIENT.get_job_pod_names( name=JOB_NAME, namespace=job_namespace @@ -156,7 +163,7 @@ def test_train_api(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) # Raise an exception to indicate that a pod has failed at least once. - raise Exception(f"Training job {JOB_NAME} is failed.") + raise Exception(f"Training job {JOB_NAME} has failed.") # Get Job only once per cycle and check the statuses. job = TRAINING_CLIENT.get_job( @@ -177,7 +184,7 @@ def test_train_api(job_namespace): logging.info( "---------------------------------------------------------------" ) - logging.info(f"Training job {JOB_NAME} is succeeded.") + logging.info(f"Training job {JOB_NAME} has succeeded.") logging.info( "---------------------------------------------------------------" @@ -190,3 +197,4 @@ def test_train_api(job_namespace): if __name__ == "__main__": test_train_api(job_namespace="default") + \ No newline at end of file From 5ca0923e98a3d7dde7b5a9e1065d5a258e0f8646 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 15 Aug 2024 17:06:56 +0800 Subject: [PATCH 29/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index cd19670565..317b6d9b88 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -126,9 +126,13 @@ def test_train_api(job_namespace): elapsed_time = time.time() - start_time # Calculate the elapsed time if elapsed_time > wait_timeout: # Raise a TimeoutError if the job takes too long - logging.error(f"Training job {JOB_NAME} exceeded the timeout of {wait_timeout} seconds.") + logging.error( + f"Training job {JOB_NAME} exceeded the timeout of {wait_timeout} seconds." 
+ ) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise TimeoutError(f"Training job {JOB_NAME} did not complete within the allowed time of {wait_timeout} seconds.") + raise TimeoutError( + f"Training job {JOB_NAME} did not complete within the allowed time of {wait_timeout} seconds." + ) # Get the list of pods associated with the job. pod_names = TRAINING_CLIENT.get_job_pod_names( From 387eb8479a53d3cd3c8f01fbe37c6bb40b94d20e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 15 Aug 2024 17:09:30 +0800 Subject: [PATCH 30/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 317b6d9b88..8f7d81ff70 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -201,4 +201,3 @@ def test_train_api(job_namespace): if __name__ == "__main__": test_train_api(job_namespace="default") - \ No newline at end of file From 9cc5429c22c589c21120f1e77b14ecc8079470a8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 15 Aug 2024 17:18:41 +0800 Subject: [PATCH 31/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 8f7d81ff70..dde3161084 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -78,7 +78,7 @@ def test_train_api(job_namespace): repo_id="yelp_review_full", split="train[:8]", ), - # Specify HuggingFace Trainer parameters. In this example, we will skip evaluation and model checkpoints. + # Specify HuggingFace Trainer parameters. 
trainer_parameters=HuggingFaceTrainerParams( training_parameters=transformers.TrainingArguments( output_dir="test_trainer", @@ -131,7 +131,8 @@ def test_train_api(job_namespace): ) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) raise TimeoutError( - f"Training job {JOB_NAME} did not complete within the allowed time of {wait_timeout} seconds." + f"Training job {JOB_NAME} did not complete within the allowed time of " + f"{wait_timeout} seconds." ) # Get the list of pods associated with the job. @@ -159,7 +160,8 @@ def test_train_api(job_namespace): for container_status in pod_status.status.container_statuses: if container_status.restart_count > 0: logging.warning( - f"Pod {pod_name} in job {JOB_NAME} has been restarted {container_status.restart_count} times. Retrieving logs..." + f"Pod {pod_name} in job {JOB_NAME} has been restarted " + f"{container_status.restart_count} times. Retrieving logs..." ) get_logs_of_master_pod(job_namespace, num_workers) From 8a537adf74e22694b330ea6bd60bd0015bc2a68f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 27 Aug 2024 07:12:53 +0800 Subject: [PATCH 32/86] fix typo Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.sh b/sdk/python/test_e2e/test_e2e_train_api.sh index af41771faf..9495a0c3af 100755 --- a/sdk/python/test_e2e/test_e2e_train_api.sh +++ b/sdk/python/test_e2e/test_e2e_train_api.sh @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This shell script is used to run Katib Experiment. -# Input parameter - path to Experiment yaml. +# This shell script is used to run e2e test. 
set -o errexit set -o nounset From e508ef445c14bd8bbb4f909e066834200dbedaa8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 20:18:27 +0800 Subject: [PATCH 33/86] update e2e test for train api Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 67 ------- sdk/python/test/e2e/test_e2e_pytorchjob.py | 80 ++++++++ sdk/python/test_e2e/test_e2e_train_api.py | 205 --------------------- sdk/python/test_e2e/test_e2e_train_api.sh | 36 ---- 4 files changed, 80 insertions(+), 308 deletions(-) delete mode 100644 .github/workflows/e2e-test-train-api.yaml delete mode 100644 sdk/python/test_e2e/test_e2e_train_api.py delete mode 100755 sdk/python/test_e2e/test_e2e_train_api.sh diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml deleted file mode 100644 index 4378a01bb6..0000000000 --- a/.github/workflows/e2e-test-train-api.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: E2E Test with train API - -on: - - pull_request - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - e2e: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] - python-version: ["3.8", "3.9", "3.10", "3.11"] - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Free-Up Disk Space - uses: ./.github/workflows/free-up-disk-space - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Setup Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - - - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.10.0 - with: - node_image: kindest/node:${{ matrix.kubernetes-version }} - cluster_name: training-operator-cluster - kubectl_version: ${{ matrix.kubernetes-version }} - - - name: Build training-operator - run: | - ./scripts/gha/build-image.sh - env: - TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test - - - name: 
Deploy training operator - run: | - ./scripts/gha/setup-training-operator.sh - env: - KIND_CLUSTER: training-operator-cluster - TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test - GANG_SCHEDULER_NAME: "none" - KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} - - - name: Run tests - run: | - python3 -m pip install -e sdk/python[huggingface] - ./sdk/python/test_e2e/test_e2e_train_api.sh - kubectl get pods -n default - POD_NAME=$(kubectl get pods -n default -o jsonpath='{.items[0].metadata.name}') - kubectl describe pod -n default $POD_NAME - kubectl get pvc -n default - PVC_NAME=$(kubectl get pvc -n default -o jsonpath='{.items[0].metadata.name}') - kubectl describe pvc -n default $PVC_NAME - exit 1 diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index c5b28faaf8..508f4d03a3 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -23,6 +23,10 @@ from kubernetes.client import V1Container from kubernetes.client import V1ResourceRequirements +from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams + from kubeflow.training import TrainingClient from kubeflow.training import KubeflowOrgV1ReplicaSpec from kubeflow.training import KubeflowOrgV1PyTorchJob @@ -30,6 +34,10 @@ from kubeflow.training import KubeflowOrgV1RunPolicy from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training import constants +from kubeflow.training.utils import utils + +from peft import LoraConfig +import transformers import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY @@ -240,6 +248,78 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) +@pytest.mark.skipif( + GANG_SCHEDULER_NAME in 
GANG_SCHEDULERS, + reason="For plain scheduling", +) +def test_sdk_e2e_create_from_train_api(job_namespace): + JOB_NAME = "pytorchjob-from-train-api" + + num_workers = 1 + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=num_workers, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. + resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + storage_config={ + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e( + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=1800 + ) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + def generate_pytorchjob( job_namespace: str, job_name: str, diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py deleted file mode 100644 index dde3161084..0000000000 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright 2024 kubeflow.org. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import time - -from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams -from kubeflow.training import constants -from kubeflow.training import TrainingClient -from kubeflow.training.utils import utils -from kubernetes import client -from kubernetes import config -from kubernetes.client.exceptions import ApiException -from peft import LoraConfig -import transformers - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(message)s", - level=logging.INFO, -) -logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) - -TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) -JOB_NAME = "test-train-api" - - -def get_logs_of_master_pod(job_namespace, num_workers): - # Verify that training job has correct pods. 
- pod_names = TRAINING_CLIENT.get_job_pod_names( - name=JOB_NAME, namespace=job_namespace - ) - - if len(pod_names) != num_workers: - raise Exception(f"Training job has incorrect pods: {pod_names}") - - # Get and print the logs of the master pod. - master_pod_name = next((name for name in pod_names if "master" in name), None) - if master_pod_name: - config.load_kube_config() # Load kube config to interact with the cluster. - v1 = client.CoreV1Api() - try: - pod_logs = v1.read_namespaced_pod_log( - name=master_pod_name, namespace=job_namespace - ) - logging.info(f"Logs of master pod {master_pod_name}:\n{pod_logs}") - except ApiException as e: - logging.error(f"Failed to get logs for pod {master_pod_name}: {e}") - - -def test_train_api(job_namespace): - num_workers = 1 - - # Use test case from fine-tuning API tutorial. - # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=num_workers, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
- resources_per_worker={ - "gpu": 0, - "cpu": 2, - "memory": "10G", - }, - storage_config={ - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info("---------------------------------------------------------------") - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s:") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - logging.info("---------------------------------------------------------------") - logging.info(f"Training job {JOB_NAME} is running...") - - logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 60 # 1 hour. - polling_interval = 30 # 30 seconds. - start_time = time.time() # Record the start time - - while True: - elapsed_time = time.time() - start_time # Calculate the elapsed time - if elapsed_time > wait_timeout: - # Raise a TimeoutError if the job takes too long - logging.error( - f"Training job {JOB_NAME} exceeded the timeout of {wait_timeout} seconds." - ) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise TimeoutError( - f"Training job {JOB_NAME} did not complete within the allowed time of " - f"{wait_timeout} seconds." - ) - - # Get the list of pods associated with the job. - pod_names = TRAINING_CLIENT.get_job_pod_names( - name=JOB_NAME, namespace=job_namespace - ) - - config.load_kube_config() # Load kube config to interact with the cluster. - v1 = client.CoreV1Api() - - # Iterate over each pod to check its status. - for pod_name in pod_names: - pod_status = v1.read_namespaced_pod_status( - name=pod_name, namespace=job_namespace - ) - - # Ensure that container_statuses is not None before iterating. - if pod_status.status.container_statuses is None: - logging.warning( - f"Pod {pod_name} has no container statuses available yet." - ) - continue - - # Check if any container in the pod has been restarted, indicating a previous failure. 
- for container_status in pod_status.status.container_statuses: - if container_status.restart_count > 0: - logging.warning( - f"Pod {pod_name} in job {JOB_NAME} has been restarted " - f"{container_status.restart_count} times. Retrieving logs..." - ) - - get_logs_of_master_pod(job_namespace, num_workers) - - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - - # Raise an exception to indicate that a pod has failed at least once. - raise Exception(f"Training job {JOB_NAME} has failed.") - - # Get Job only once per cycle and check the statuses. - job = TRAINING_CLIENT.get_job( - name=JOB_NAME, - namespace=job_namespace, - job_kind=constants.PYTORCHJOB_KIND, - timeout=constants.DEFAULT_TIMEOUT, - ) - - # Get Job conditions. - conditions = TRAINING_CLIENT.get_job_conditions( - job=job, timeout=constants.DEFAULT_TIMEOUT - ) - - # Check if the job has succeeded. - if utils.has_condition(conditions, constants.JOB_CONDITION_SUCCEEDED): - get_logs_of_master_pod(job_namespace, num_workers) - logging.info( - "---------------------------------------------------------------" - ) - logging.info(f"Training job {JOB_NAME} has succeeded.") - - logging.info( - "---------------------------------------------------------------" - ) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - break - - time.sleep(polling_interval) - - -if __name__ == "__main__": - test_train_api(job_namespace="default") diff --git a/sdk/python/test_e2e/test_e2e_train_api.sh b/sdk/python/test_e2e/test_e2e_train_api.sh deleted file mode 100755 index 9495a0c3af..0000000000 --- a/sdk/python/test_e2e/test_e2e_train_api.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2024 The Kubeflow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This shell script is used to run e2e test. - -set -o errexit -set -o nounset -set -o pipefail - -cd "$(dirname "$0")" - -echo "Training Operator deployments" -kubectl -n kubeflow get deploy -echo "Training Operator services" -kubectl -n kubeflow get svc -echo "Training Operator pods" -kubectl -n kubeflow get pod -echo "Training Operator persistent volume claims" -kubectl get pvc -n kubeflow -echo "Available CRDs" -kubectl get crd - -python test_e2e_train_api.py || (kubectl get pods -n kubeflow && exit 1) From 788359bec23bf1b696292a3021b4e191980f3ffd Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 20:21:06 +0800 Subject: [PATCH 34/86] add num_labels Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 508f4d03a3..827b86032d 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -266,6 +266,7 @@ def test_sdk_e2e_create_from_train_api(job_namespace): model_provider_parameters=HuggingFaceModelParams( model_uri="hf://google-bert/bert-base-cased", transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, ), # In order to save test time, use 8 samples from Yelp dataset. 
dataset_provider_parameters=HuggingFaceDatasetParams( From 9b4222e7b08eb09aa002ec5595effc71ede6ee89 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 20:31:37 +0800 Subject: [PATCH 35/86] update pip install Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index ca2b543fc7..b2886587bc 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python[huggingface]; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} From d75938d057761d3afca395326ce1635f39cf7382 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 21:32:23 +0800 Subject: [PATCH 36/86] check disk space Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index b2886587bc..b34981c378 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -92,6 +92,9 @@ jobs: TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + + - name: Check Disk Space + run: df -h - name: Run tests run: | From 1148bc8010b26b26c1e8e83de0a03d8e69ece9f4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 21:54:04 +0800 Subject: [PATCH 37/86] change sequence of e2e tests Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 146 ++++++++++----------- 1 file changed, 
73 insertions(+), 73 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 827b86032d..a8415de4cb 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -128,6 +128,79 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) +@pytest.mark.skipif( + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", +) +def test_sdk_e2e_create_from_train_api(job_namespace): + JOB_NAME = "pytorchjob-from-train-api" + + num_workers = 1 + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=num_workers, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
+ resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + storage_config={ + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e( + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900 + ) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + @pytest.mark.skipif( GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", @@ -248,79 +321,6 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) -@pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, - reason="For plain scheduling", -) -def test_sdk_e2e_create_from_train_api(job_namespace): - JOB_NAME = "pytorchjob-from-train-api" - - num_workers = 1 - - # Use test case from fine-tuning API tutorial. - # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. 
- trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=num_workers, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. - resources_per_worker={ - "gpu": 0, - "cpu": 2, - "memory": "10G", - }, - storage_config={ - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - try: - utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=1800 - ) - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") - - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - - def generate_pytorchjob( job_namespace: str, job_name: str, From d29a85da74aebcb601e39fd8a1eb24ae7cfc030c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 22:32:14 +0800 Subject: [PATCH 38/86] add clean-up after each e2e test of pytorchjob Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 3 - sdk/python/test/e2e/test_e2e_pytorchjob.py | 161 +++++++++++---------- 2 files changed, 88 insertions(+), 76 deletions(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index b34981c378..b2886587bc 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -92,9 +92,6 @@ jobs: TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} - - - name: Check Disk Space - run: df -h - name: Run tests run: | diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index a8415de4cb..f443bd8a30 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -15,6 +15,7 @@ import os import logging import pytest +import subprocess from typing import Optional from kubernetes.client import V1PodTemplateSpec @@ -128,79 +129,6 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) -@pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, - reason="For plain scheduling", -) -def test_sdk_e2e_create_from_train_api(job_namespace): - JOB_NAME = "pytorchjob-from-train-api" - - num_workers = 1 - - # Use test case from fine-tuning API tutorial. 
- # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=num_workers, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. - resources_per_worker={ - "gpu": 0, - "cpu": 2, - "memory": "10G", - }, - storage_config={ - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - try: - utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900 - ) - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") - - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - - @pytest.mark.skipif( GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", @@ -321,6 +249,79 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) +@pytest.mark.skipif( + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", +) +def test_sdk_e2e_create_from_train_api(job_namespace): + JOB_NAME = "pytorchjob-from-train-api" + + num_workers = 1 + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=num_workers, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
+ resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + storage_config={ + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e( + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900 + ) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + def generate_pytorchjob( job_namespace: str, job_name: str, @@ -349,3 +350,17 @@ def generate_container() -> V1Container: args=["--backend", "gloo", "--epochs", "1"], resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}), ) + + +@pytest.fixture(scope="function", autouse=True) +def clean_up_resources(): + # This code runs after each test function + yield + + # Prune all unused Docker images + try: + subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) + subprocess.run(["docker", "system", "df"], check=True) + subprocess.run(["df", "-hT"], check=True) + except subprocess.CalledProcessError as e: + print(f"Error during cleanup: {e}") From 82ea9bee2d2d1185c53f3b18d271e0ee73df3aad Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 08:17:44 +0800 Subject: [PATCH 39/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index f443bd8a30..2f31cb5d77 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -359,8 +359,15 @@ def 
clean_up_resources(): # Prune all unused Docker images try: + # Remove all stopped containers + subprocess.run(["docker", "container", "prune", "-f"], check=True) + # Remove all unused images subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) + # Remove all unused volumes + subprocess.run(["docker", "volume", "prune", "-f"], check=True) + # Remove all unused networks + subprocess.run(["docker", "network", "prune", "-f"], check=True) + # Show Docker disk usage subprocess.run(["docker", "system", "df"], check=True) - subprocess.run(["df", "-hT"], check=True) except subprocess.CalledProcessError as e: - print(f"Error during cleanup: {e}") + print(f"Error during Docker cleanup: {e}") From b45f9f75459180e00431fa6cb131a21c929c871f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 08:35:42 +0800 Subject: [PATCH 40/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 30 +++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 2f31cb5d77..3017b40f86 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -357,17 +357,29 @@ def clean_up_resources(): # This code runs after each test function yield - # Prune all unused Docker images try: - # Remove all stopped containers - subprocess.run(["docker", "container", "prune", "-f"], check=True) - # Remove all unused images + # 1. Remove unnecessary files + print("Freeing up disk space by removing unnecessary files...") + subprocess.run([ + "sudo", "rm", "-rf", + "/usr/share/dotnet", + "/opt/ghc", + "/usr/local/share/boost", + "$AGENT_TOOLSDIRECTORY", + "/usr/local/lib/android", + "/usr/local/share/powershell", + "/usr/share/swift" + ], check=True) + + print("Disk usage after removing unnecessary files:") + subprocess.run(["df", "-hT"], check=True) + + # 2. 
Prune Docker images + print("Pruning Docker images to free up space...") subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) - # Remove all unused volumes - subprocess.run(["docker", "volume", "prune", "-f"], check=True) - # Remove all unused networks - subprocess.run(["docker", "network", "prune", "-f"], check=True) - # Show Docker disk usage + + print("Docker disk usage after pruning images:") subprocess.run(["docker", "system", "df"], check=True) + except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From a204746dc7f048987d9210da86b0f46459dbaf2f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 09:23:19 +0800 Subject: [PATCH 41/86] update cleanup function-add check disk Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 3017b40f86..8815d46f8d 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -358,6 +358,10 @@ def clean_up_resources(): yield try: + # Check contents of /mnt before cleanup + print("Listing contents of /mnt directory before cleanup:") + subprocess.run(["ls", "-lh", "/mnt"], check=True) + # 1. 
Remove unnecessary files print("Freeing up disk space by removing unnecessary files...") subprocess.run([ @@ -368,7 +372,7 @@ def clean_up_resources(): "$AGENT_TOOLSDIRECTORY", "/usr/local/lib/android", "/usr/local/share/powershell", - "/usr/share/swift" + "/usr/share/swift", ], check=True) print("Disk usage after removing unnecessary files:") @@ -381,5 +385,9 @@ def clean_up_resources(): print("Docker disk usage after pruning images:") subprocess.run(["docker", "system", "df"], check=True) + # Check contents of /mnt after cleanup + print("Listing contents of /mnt directory after cleanup:") + subprocess.run(["ls", "-lh", "/mnt"], check=True) + except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From 2d8f8b1de369ec4a8599c9bcfeb0af866d570f71 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 09:49:53 +0800 Subject: [PATCH 42/86] check docker volumes Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 6 ++++++ sdk/python/test/e2e/test_e2e_pytorchjob.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index b2886587bc..2f6c8a8bd5 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -100,6 +100,12 @@ jobs: env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} + # List all Docker volumes to understand disk usage + - name: List Docker volumes + run: | + echo "Listing all Docker volumes:" + docker volume ls + - name: Collect volcano logs if: ${{ failure() && matrix.gang-scheduler-name == 'volcano' }} run: | diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 8815d46f8d..a20763e49e 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -311,7 +311,7 @@ def test_sdk_e2e_create_from_train_api(job_namespace): try: utils.verify_job_e2e( - 
TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900 + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300 ) except Exception as e: utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) From c748d0e3a7213f27b2068e3c5336a5a140957216 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 11:19:11 +0800 Subject: [PATCH 43/86] update cleanup function Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 6 ---- sdk/python/test/e2e/test_e2e_pytorchjob.py | 37 ++++++++-------------- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index 2f6c8a8bd5..b2886587bc 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -100,12 +100,6 @@ jobs: env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} - # List all Docker volumes to understand disk usage - - name: List Docker volumes - run: | - echo "Listing all Docker volumes:" - docker volume ls - - name: Collect volcano logs if: ${{ failure() && matrix.gang-scheduler-name == 'volcano' }} run: | diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index a20763e49e..b7b3c6d786 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -358,36 +358,27 @@ def clean_up_resources(): yield try: - # Check contents of /mnt before cleanup - print("Listing contents of /mnt directory before cleanup:") - subprocess.run(["ls", "-lh", "/mnt"], check=True) - - # 1. 
Remove unnecessary files + # Display disk usage before cleanup + print("Disk usage before removing unnecessary files:") + subprocess.run(["df", "-hT"], check=True) + + # Remove unnecessary docker files print("Freeing up disk space by removing unnecessary files...") subprocess.run([ "sudo", "rm", "-rf", - "/usr/share/dotnet", - "/opt/ghc", - "/usr/local/share/boost", - "$AGENT_TOOLSDIRECTORY", - "/usr/local/lib/android", - "/usr/local/share/powershell", - "/usr/share/swift", + "mnt/docker" ], check=True) - - print("Disk usage after removing unnecessary files:") - subprocess.run(["df", "-hT"], check=True) - # 2. Prune Docker images - print("Pruning Docker images to free up space...") + # Prune Docker images and build cache + print("Pruning Docker images...") subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) - - print("Docker disk usage after pruning images:") - subprocess.run(["docker", "system", "df"], check=True) - # Check contents of /mnt after cleanup - print("Listing contents of /mnt directory after cleanup:") - subprocess.run(["ls", "-lh", "/mnt"], check=True) + print("Clearing Docker build cache...") + subprocess.run(["docker", "builder", "prune", "-f"], check=True) + + # Display disk usage after cleanup + print("Disk usage after removing unnecessary files:") + subprocess.run(["df", "-hT"], check=True) except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From a68e182d1b486c59e516fd30e661d36534c9cc42 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 12:00:16 +0800 Subject: [PATCH 44/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 24 +++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index b7b3c6d786..c5660f0f48 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ 
-35,7 +35,6 @@ from kubeflow.training import KubeflowOrgV1RunPolicy from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training import constants -from kubeflow.training.utils import utils from peft import LoraConfig import transformers @@ -362,23 +361,38 @@ def clean_up_resources(): print("Disk usage before removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) - # Remove unnecessary docker files + # Check detailed disk usage in /mnt + print("Detailed disk usage in /mnt before cleanup:") + subprocess.run(["du", "-sh", "/mnt/*"], check=True) + + # Remove unnecessary docker files from the correct directory print("Freeing up disk space by removing unnecessary files...") subprocess.run([ "sudo", "rm", "-rf", - "mnt/docker" + "/mnt/docker" ], check=True) - # Prune Docker images and build cache + # List open files in /mnt/docker to understand usage + print("Listing open files in /mnt/docker:") + subprocess.run(["lsof", "+D", "/mnt/docker"], check=True) + + # Prune Docker images and volumes print("Pruning Docker images...") subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) + print("Pruning Docker volumes...") + subprocess.run(["docker", "volume", "prune", "-f"], check=True) + print("Clearing Docker build cache...") subprocess.run(["docker", "builder", "prune", "-f"], check=True) - + # Display disk usage after cleanup print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) + # Check detailed disk usage in /mnt after cleanup + print("Detailed disk usage in /mnt after cleanup:") + subprocess.run(["du", "-sh", "/mnt/*"], check=True) + except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From 227129e38a6f65b8b041aabdfeed074ed2dc1cd8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 13:33:07 +0800 Subject: [PATCH 45/86] check docker directory Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 45 
+++++++--------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index c5660f0f48..efb6f9f248 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -361,38 +361,19 @@ def clean_up_resources(): print("Disk usage before removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) - # Check detailed disk usage in /mnt - print("Detailed disk usage in /mnt before cleanup:") - subprocess.run(["du", "-sh", "/mnt/*"], check=True) - - # Remove unnecessary docker files from the correct directory - print("Freeing up disk space by removing unnecessary files...") - subprocess.run([ - "sudo", "rm", "-rf", - "/mnt/docker" - ], check=True) - - # List open files in /mnt/docker to understand usage - print("Listing open files in /mnt/docker:") - subprocess.run(["lsof", "+D", "/mnt/docker"], check=True) - - # Prune Docker images and volumes - print("Pruning Docker images...") - subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) - - print("Pruning Docker volumes...") - subprocess.run(["docker", "volume", "prune", "-f"], check=True) - - print("Clearing Docker build cache...") - subprocess.run(["docker", "builder", "prune", "-f"], check=True) - - # Display disk usage after cleanup - print("Disk usage after removing unnecessary files:") - subprocess.run(["df", "-hT"], check=True) - - # Check detailed disk usage in /mnt after cleanup - print("Detailed disk usage in /mnt after cleanup:") - subprocess.run(["du", "-sh", "/mnt/*"], check=True) + # Check contents of /var/lib/docker before cleanup + print("Listing contents of /var/lib/docker directory before cleanup:") + try: + subprocess.run(["ls", "-lh", "/var/lib/docker"], check=True) + except subprocess.CalledProcessError as e: + print(f"Error listing /var/lib/docker: {e}") + + # Check contents of /mnt/docker before cleanup + print("Listing contents of 
/mnt/docker directory before cleanup:") + try: + subprocess.run(["ls", "-lh", "/mnt/docker"], check=True) + except subprocess.CalledProcessError as e: + print(f"Error listing /mnt/docker: {e}") except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From 79e9e32fd092abe6a625c4bc6a193840c45bf543 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 15:47:37 +0800 Subject: [PATCH 46/86] update pip install and 'num_workers' Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 2 +- sdk/python/test/e2e/test_e2e_pytorchjob.py | 36 ++++++++++------------ 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index b2886587bc..92f8ced2f1 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python[huggingface]; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python/kubeflow/trainer -e sdk/pythoh; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index efb6f9f248..01dad75c82 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -255,8 +255,6 @@ def test_sdk_e2e_create_from_image(job_namespace): def test_sdk_e2e_create_from_train_api(job_namespace): JOB_NAME = "pytorchjob-from-train-api" - num_workers = 1 - # Use test case from fine-tuning API tutorial. 
# https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ TRAINING_CLIENT.train( @@ -292,7 +290,7 @@ def test_sdk_e2e_create_from_train_api(job_namespace): bias="none", ), ), - num_workers=num_workers, # nodes parameter for torchrun command. + num_workers=1, # nodes parameter for torchrun command. num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. resources_per_worker={ "gpu": 0, @@ -357,23 +355,21 @@ def clean_up_resources(): yield try: - # Display disk usage before cleanup - print("Disk usage before removing unnecessary files:") - subprocess.run(["df", "-hT"], check=True) - - # Check contents of /var/lib/docker before cleanup - print("Listing contents of /var/lib/docker directory before cleanup:") - try: - subprocess.run(["ls", "-lh", "/var/lib/docker"], check=True) - except subprocess.CalledProcessError as e: - print(f"Error listing /var/lib/docker: {e}") - - # Check contents of /mnt/docker before cleanup - print("Listing contents of /mnt/docker directory before cleanup:") - try: - subprocess.run(["ls", "-lh", "/mnt/docker"], check=True) - except subprocess.CalledProcessError as e: - print(f"Error listing /mnt/docker: {e}") + # List all volumes and inspect them + print("Listing all Docker volumes:") + subprocess.run(["docker", "volume", "ls"], check=True) + + # Check for stopped containers + print("Checking for stopped containers:") + subprocess.run(["docker", "ps", "-a"], check=True) + + # Remove all stopped containers + print("Removing stopped containers...") + subprocess.run(["docker", "rm", "$(docker ps -a -q)"], shell=True, check=True) + + # Prune unused volumes + print("Pruning unused Docker volumes...") + subprocess.run(["docker", "volume", "prune", "-f"], check=True) except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From b7dbf5c4f39bbbb698760e4e1532e70432ec75d9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 15:48:35 +0800 Subject: [PATCH 47/86] update pip 
install and 'num_workers' Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index 92f8ced2f1..2fed976393 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python/kubeflow/trainer -e sdk/pythoh; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python/kubeflow/trainer -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} From 1f639a71bddb4566e05336a7fe6b0b7cb8bdba62 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 15:56:10 +0800 Subject: [PATCH 48/86] update pip install Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index 2fed976393..234a2988be 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python/kubeflow/trainer -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} From 832273073feaad2d846a3f505cae44e0a4b023be Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 16:57:22 +0800 Subject: [PATCH 49/86] change the value of 'clean_pod_policy' Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 
16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 01dad75c82..41c079ab98 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -332,7 +332,7 @@ def generate_pytorchjob( metadata=V1ObjectMeta(name=job_name, namespace=job_namespace), spec=KubeflowOrgV1PyTorchJobSpec( run_policy=KubeflowOrgV1RunPolicy( - clean_pod_policy="None", + clean_pod_policy="Running", scheduling_policy=scheduling_policy, ), pytorch_replica_specs={"Master": master, "Worker": worker}, @@ -355,10 +355,18 @@ def clean_up_resources(): yield try: + # Display disk usage before cleanup + print("Disk usage before removing unnecessary files:") + subprocess.run(["df", "-hT"], check=True) + # List all volumes and inspect them print("Listing all Docker volumes:") subprocess.run(["docker", "volume", "ls"], check=True) + # Prune unused volumes + print("Pruning unused Docker volumes...") + subprocess.run(["docker", "volume", "prune", "-f"], check=True) + # Check for stopped containers print("Checking for stopped containers:") subprocess.run(["docker", "ps", "-a"], check=True) @@ -367,9 +375,9 @@ def clean_up_resources(): print("Removing stopped containers...") subprocess.run(["docker", "rm", "$(docker ps -a -q)"], shell=True, check=True) - # Prune unused volumes - print("Pruning unused Docker volumes...") - subprocess.run(["docker", "volume", "prune", "-f"], check=True) + # Display disk usage before cleanup + print("Disk usage before removing unnecessary files:") + subprocess.run(["df", "-hT"], check=True) except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From ed105746a23ebd4f130f468703645a4fc69375d3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 18:19:04 +0800 Subject: [PATCH 50/86] change the value of 'update cleanup function Signed-off-by: helenxie-bit --- 
sdk/python/test/e2e/test_e2e_pytorchjob.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 41c079ab98..ac6e1d23f5 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -332,7 +332,7 @@ def generate_pytorchjob( metadata=V1ObjectMeta(name=job_name, namespace=job_namespace), spec=KubeflowOrgV1PyTorchJobSpec( run_policy=KubeflowOrgV1RunPolicy( - clean_pod_policy="Running", + clean_pod_policy="None", scheduling_policy=scheduling_policy, ), pytorch_replica_specs={"Master": master, "Worker": worker}, @@ -379,5 +379,12 @@ def clean_up_resources(): print("Disk usage before removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) + # Remove unnecessary docker files from the correct directory + print("Freeing up disk space by removing unnecessary files...") + subprocess.run([ + "sudo", "rm", "-rf", + "/var/lib/docker" + ], check=True) + except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From 50ed9e8449ea8ccd24767ab5fd0706aa6a6a4c76 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 20:42:09 +0800 Subject: [PATCH 51/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 25 +++------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index ac6e1d23f5..20fa45bacb 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -359,32 +359,13 @@ def clean_up_resources(): print("Disk usage before removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) - # List all volumes and inspect them - print("Listing all Docker volumes:") - subprocess.run(["docker", "volume", "ls"], check=True) - # Prune unused volumes print("Pruning 
unused Docker volumes...") - subprocess.run(["docker", "volume", "prune", "-f"], check=True) - - # Check for stopped containers - print("Checking for stopped containers:") - subprocess.run(["docker", "ps", "-a"], check=True) - - # Remove all stopped containers - print("Removing stopped containers...") - subprocess.run(["docker", "rm", "$(docker ps -a -q)"], shell=True, check=True) + subprocess.run(["docker", "system", "prune", "-a", "--volumes", "-f"], check=True) - # Display disk usage before cleanup - print("Disk usage before removing unnecessary files:") + # Display disk usage after cleanup + print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) - # Remove unnecessary docker files from the correct directory - print("Freeing up disk space by removing unnecessary files...") - subprocess.run([ - "sudo", "rm", "-rf", - "/var/lib/docker" - ], check=True) - except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From b2cd27ab579804dac668abceedaea973f11cb880 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 08:44:10 +0800 Subject: [PATCH 52/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 46 +++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 20fa45bacb..0ee316eb16 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -354,18 +354,62 @@ def clean_up_resources(): # This code runs after each test function yield + docker_accessible = False + + # Check Docker daemon access + try: + result = subprocess.run(["docker", "version"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + print("Docker daemon is accessible.") + print(result.stdout.decode()) + docker_accessible = True + except subprocess.CalledProcessError as e: + print("Error: Docker 
daemon is not accessible.") + print(e.stderr.decode()) + + if not docker_accessible: + print("Skipping Docker cleanup since Docker is not accessible.") + return + try: # Display disk usage before cleanup print("Disk usage before removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) + # Display Docker disk usage before cleanup + print("Docker disk usage before removing unnecessary files:") + subprocess.run(["docker", "system", "df", "-v"], check=True) + + # Display Docker images before cleanup + print("Docker images before removing unnecessary files:") + subprocess.run(["docker", "images"], check=True) + + # Display Docker containers before cleanup + print("Docker containers before removing unnecessary files:") + subprocess.run(["docker", "ps", "-s", "--all"], check=True) + + # Display Docker volumes before cleanup + print("Docker volumess before removing unnecessary files:") + subprocess.run(["docker", "volume", "ls"], check=True) + + # Check Docker root directory disk usage + print("Check Docker root directory:") + subprocess.run(["sudo", "du", "-sh", "/var/lib/docker"], check=True) + + # Check Docker runtime directory disk usage + print("Check Docker runtime directory:") + subprocess.run(["sudo", "du", "-sh", "/var/lib/containerd"], check=True) + # Prune unused volumes print("Pruning unused Docker volumes...") - subprocess.run(["docker", "system", "prune", "-a", "--volumes", "-f"], check=True) + subprocess.run(["sudo", "docker", "system", "prune", "-a", "--volumes", "-f"], check=True) # Display disk usage after cleanup print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) + # Display Docker disk usage after cleanup + print("Docker disk usage after removing unnecessary files:") + subprocess.run(["docker", "system", "df", "-v"], check=True) + except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From 3af5d87c40ee37462e1393feb87f757dd51e11c0 Mon Sep 17 00:00:00 2001 From: 
helenxie-bit Date: Sat, 31 Aug 2024 09:07:13 +0800 Subject: [PATCH 53/86] check docker volumes Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 0ee316eb16..418f6f03ba 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -403,6 +403,21 @@ def clean_up_resources(): print("Pruning unused Docker volumes...") subprocess.run(["sudo", "docker", "system", "prune", "-a", "--volumes", "-f"], check=True) + # Additional check: List volumes and remove large unused ones + print("Listing Docker volumes to check for large unused ones:") + result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) + volumes = result.stdout.decode().splitlines() + for volume in volumes: + inspect_result = subprocess.run(["docker", "volume", "inspect", volume], check=True, stdout=subprocess.PIPE) + volume_details = inspect_result.stdout.decode() + if '"Mountpoint":' in volume_details and '/mnt/' in volume_details: + volume_size = subprocess.run(["sudo", "du", "-sh", f"/mnt/{volume}"], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] + print(f"Volume {volume} size: {volume_size}") + # Example: Remove if larger than 10GB + if float(volume_size[:-1]) > 10: # Adjust this condition as needed + print(f"Removing large unused volume: {volume}") + subprocess.run(["docker", "volume", "rm", volume], check=True) + # Display disk usage after cleanup print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) From 1a0eff33a78b6b65daafb3412cefe2c071669945 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 09:24:58 +0800 Subject: [PATCH 54/86] check docker volumes Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 18 ++++++++++++++---- 1 file changed, 14 
insertions(+), 4 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 418f6f03ba..cd317eb244 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -13,6 +13,7 @@ # limitations under the License. import os +import json import logging import pytest import subprocess @@ -407,16 +408,25 @@ def clean_up_resources(): print("Listing Docker volumes to check for large unused ones:") result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) volumes = result.stdout.decode().splitlines() + for volume in volumes: inspect_result = subprocess.run(["docker", "volume", "inspect", volume], check=True, stdout=subprocess.PIPE) - volume_details = inspect_result.stdout.decode() - if '"Mountpoint":' in volume_details and '/mnt/' in volume_details: - volume_size = subprocess.run(["sudo", "du", "-sh", f"/mnt/{volume}"], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] + volume_details = json.loads(inspect_result.stdout.decode()) + mountpoint = volume_details[0]["Mountpoint"] + + # Check if the mountpoint exists before accessing it + try: + volume_size = subprocess.run(["sudo", "du", "-sh", mountpoint], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] print(f"Volume {volume} size: {volume_size}") # Example: Remove if larger than 10GB - if float(volume_size[:-1]) > 10: # Adjust this condition as needed + size_value = float(volume_size[:-1]) + size_unit = volume_size[-1].upper() + + if size_unit == 'G' and size_value > 10: # Adjust this condition as needed print(f"Removing large unused volume: {volume}") subprocess.run(["docker", "volume", "rm", volume], check=True) + except subprocess.CalledProcessError: + print(f"Volume {volume} not found at expected mountpoint {mountpoint} or cannot access.") # Display disk usage after cleanup print("Disk usage after removing unnecessary files:") From 
604265a8006da171f898a86966e9cd1d2aa72393 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 09:49:56 +0800 Subject: [PATCH 55/86] stop the controller and restart it again to clean up Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 24 ++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index cd317eb244..8efda407a5 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -392,19 +392,12 @@ def clean_up_resources(): print("Docker volumess before removing unnecessary files:") subprocess.run(["docker", "volume", "ls"], check=True) - # Check Docker root directory disk usage - print("Check Docker root directory:") - subprocess.run(["sudo", "du", "-sh", "/var/lib/docker"], check=True) + # Stop the training-operator-control-plane container if running + container_name = "training-operator-cluster-control-plane" + print(f"Stopping container {container_name}...") + subprocess.run(["docker", "stop", container_name], check=True) - # Check Docker runtime directory disk usage - print("Check Docker runtime directory:") - subprocess.run(["sudo", "du", "-sh", "/var/lib/containerd"], check=True) - - # Prune unused volumes - print("Pruning unused Docker volumes...") - subprocess.run(["sudo", "docker", "system", "prune", "-a", "--volumes", "-f"], check=True) - - # Additional check: List volumes and remove large unused ones + # List volumes and remove large unused ones print("Listing Docker volumes to check for large unused ones:") result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) volumes = result.stdout.decode().splitlines() @@ -418,16 +411,21 @@ def clean_up_resources(): try: volume_size = subprocess.run(["sudo", "du", "-sh", mountpoint], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] print(f"Volume {volume} 
size: {volume_size}") + # Example: Remove if larger than 10GB size_value = float(volume_size[:-1]) size_unit = volume_size[-1].upper() if size_unit == 'G' and size_value > 10: # Adjust this condition as needed - print(f"Removing large unused volume: {volume}") + print(f"Removing volume: {volume}") subprocess.run(["docker", "volume", "rm", volume], check=True) except subprocess.CalledProcessError: print(f"Volume {volume} not found at expected mountpoint {mountpoint} or cannot access.") + # Restart the training-operator-control-plane container if necessary + print(f"Starting container {container_name}...") + subprocess.run(["docker", "start", container_name], check=True) + # Display disk usage after cleanup print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) From a4f848f398ad0faf8d34e2ac48e53337458fb1de Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 10:36:21 +0800 Subject: [PATCH 56/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 57 ++-------------------- 1 file changed, 5 insertions(+), 52 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 8efda407a5..75c4525e18 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -355,22 +355,6 @@ def clean_up_resources(): # This code runs after each test function yield - docker_accessible = False - - # Check Docker daemon access - try: - result = subprocess.run(["docker", "version"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - print("Docker daemon is accessible.") - print(result.stdout.decode()) - docker_accessible = True - except subprocess.CalledProcessError as e: - print("Error: Docker daemon is not accessible.") - print(e.stderr.decode()) - - if not docker_accessible: - print("Skipping Docker cleanup since Docker is not accessible.") - return - try: # Display disk 
usage before cleanup print("Disk usage before removing unnecessary files:") @@ -389,44 +373,13 @@ def clean_up_resources(): subprocess.run(["docker", "ps", "-s", "--all"], check=True) # Display Docker volumes before cleanup - print("Docker volumess before removing unnecessary files:") + print("Docker volumes before removing unnecessary files:") subprocess.run(["docker", "volume", "ls"], check=True) - # Stop the training-operator-control-plane container if running - container_name = "training-operator-cluster-control-plane" - print(f"Stopping container {container_name}...") - subprocess.run(["docker", "stop", container_name], check=True) - - # List volumes and remove large unused ones - print("Listing Docker volumes to check for large unused ones:") - result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) - volumes = result.stdout.decode().splitlines() - - for volume in volumes: - inspect_result = subprocess.run(["docker", "volume", "inspect", volume], check=True, stdout=subprocess.PIPE) - volume_details = json.loads(inspect_result.stdout.decode()) - mountpoint = volume_details[0]["Mountpoint"] - - # Check if the mountpoint exists before accessing it - try: - volume_size = subprocess.run(["sudo", "du", "-sh", mountpoint], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] - print(f"Volume {volume} size: {volume_size}") - - # Example: Remove if larger than 10GB - size_value = float(volume_size[:-1]) - size_unit = volume_size[-1].upper() - - if size_unit == 'G' and size_value > 10: # Adjust this condition as needed - print(f"Removing volume: {volume}") - subprocess.run(["docker", "volume", "rm", volume], check=True) - except subprocess.CalledProcessError: - print(f"Volume {volume} not found at expected mountpoint {mountpoint} or cannot access.") - - # Restart the training-operator-control-plane container if necessary - print(f"Starting container {container_name}...") - subprocess.run(["docker", "start", 
container_name], check=True) - - # Display disk usage after cleanup + # Remove unused Docker volumes + print("Remove unused Docker volumes:") + subprocess.run(["docker", "volume", "prune", "--filter", "all=1"], check=True) + print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) From 3e86e90e9eedf9b47dd95894a8550469ba6962a4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 10:52:37 +0800 Subject: [PATCH 57/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 75c4525e18..3d1a31451b 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -378,7 +378,7 @@ def clean_up_resources(): # Remove unused Docker volumes print("Remove unused Docker volumes:") - subprocess.run(["docker", "volume", "prune", "--filter", "all=1"], check=True) + subprocess.run(["docker", "volume", "prune", "--filter", "all=1", "-f"], check=True) print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) From 558330b17069b006bac1a580a1d5628d3e3a5c66 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 13:28:45 +0800 Subject: [PATCH 58/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 27 +++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 3d1a31451b..60d27c1efd 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -378,7 +378,32 @@ def clean_up_resources(): # Remove unused Docker volumes print("Remove unused Docker volumes:") - subprocess.run(["docker", "volume", "prune", "--filter", "all=1", "-f"], 
check=True) + subprocess.run(["docker", "volume", "prune", "-f"], check=True) + + # Additionally list volumes and remove large unused ones + print("Listing Docker volumes to check for large unused ones:") + result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) + volumes = result.stdout.decode().splitlines() + + for volume in volumes: + inspect_result = subprocess.run(["docker", "volume", "inspect", volume], check=True, stdout=subprocess.PIPE) + volume_details = json.loads(inspect_result.stdout.decode()) + mountpoint = volume_details[0]["Mountpoint"] + + # Check if the mountpoint exists before accessing it + try: + volume_size = subprocess.run(["sudo", "du", "-sh", mountpoint], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] + print(f"Volume {volume} size: {volume_size}") + + # Example: Remove if larger than 10GB + size_value = float(volume_size[:-1]) + size_unit = volume_size[-1].upper() + + if size_unit == 'G' and size_value > 10: # Adjust this condition as needed + print(f"Removing volume: {volume}") + subprocess.run(["docker", "volume", "rm", volume], check=True) + except subprocess.CalledProcessError: + print(f"Volume {volume} not found at expected mountpoint {mountpoint} or cannot access.") print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) From d4ed2d81b4435655db95e3e97fcbacee18020705 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 10:26:06 +0800 Subject: [PATCH 59/86] separate e2e test for train api Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 60 +++++++ .github/workflows/integration-tests.yaml | 2 +- sdk/python/test/e2e/test_e2e_pytorchjob.py | 148 +----------------- .../test_train_api/test_e2e_train_api.py | 96 ++++++++++++ 4 files changed, 158 insertions(+), 148 deletions(-) create mode 100644 .github/workflows/e2e-test-train-api.yaml create mode 100644 sdk/python/test_train_api/test_e2e_train_api.py diff 
--git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml new file mode 100644 index 0000000000..182998df51 --- /dev/null +++ b/.github/workflows/e2e-test-train-api.yaml @@ -0,0 +1,60 @@ +name: E2E Test with train API +on: + - pull_request + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + e2e-test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Free-Up Disk Space + uses: ./.github/workflows/free-up-disk-space + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Create k8s Kind Cluster + uses: helm/kind-action@v1.10.0 + with: + node_image: kindest/node:${{ matrix.kubernetes-version }} + cluster_name: training-operator-cluster + kubectl_version: ${{ matrix.kubernetes-version }} + + - name: Build training-operator + run: | + ./scripts/gha/build-image.sh + env: + TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test + + - name: Deploy training operator + run: | + ./scripts/gha/setup-training-operator.sh + env: + KIND_CLUSTER: training-operator-cluster + TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test + GANG_SCHEDULER_NAME: "none" + KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + + - name: Run tests + run: | + pip install pytest + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api --log-cli-level=debug --namespace=default diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index 234a2988be..ca2b543fc7 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ 
-96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 60d27c1efd..8e0739c9b4 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -13,10 +13,8 @@ # limitations under the License. import os -import json import logging import pytest -import subprocess from typing import Optional from kubernetes.client import V1PodTemplateSpec @@ -25,10 +23,6 @@ from kubernetes.client import V1Container from kubernetes.client import V1ResourceRequirements -from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams - from kubeflow.training import TrainingClient from kubeflow.training import KubeflowOrgV1ReplicaSpec from kubeflow.training import KubeflowOrgV1PyTorchJob @@ -37,9 +31,6 @@ from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training import constants -from peft import LoraConfig -import transformers - import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS @@ -249,77 +240,6 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) -@pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, - reason="For plain scheduling", -) -def test_sdk_e2e_create_from_train_api(job_namespace): - JOB_NAME = "pytorchjob-from-train-api" - - # Use 
test case from fine-tuning API tutorial. - # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=1, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. - resources_per_worker={ - "gpu": 0, - "cpu": 2, - "memory": "10G", - }, - storage_config={ - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - try: - utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300 - ) - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") - - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - - def generate_pytorchjob( job_namespace: str, job_name: str, @@ -347,70 +267,4 @@ def generate_container() -> V1Container: image="kubeflow/pytorch-dist-mnist:latest", args=["--backend", "gloo", "--epochs", "1"], resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}), - ) - - -@pytest.fixture(scope="function", autouse=True) -def clean_up_resources(): - # This code runs after each test function - yield - - try: - # Display disk usage before cleanup - print("Disk usage before removing unnecessary files:") - subprocess.run(["df", "-hT"], check=True) - - # Display Docker disk usage before cleanup - print("Docker disk usage before removing unnecessary files:") - subprocess.run(["docker", "system", "df", "-v"], check=True) - - # Display Docker images before cleanup - print("Docker images before removing unnecessary files:") - subprocess.run(["docker", "images"], check=True) - - # Display Docker containers before cleanup - print("Docker containers before removing unnecessary files:") - subprocess.run(["docker", "ps", "-s", "--all"], check=True) - - # Display Docker volumes before cleanup - print("Docker volumes before removing unnecessary files:") - subprocess.run(["docker", "volume", "ls"], check=True) - - # Remove unused Docker volumes - print("Remove unused Docker volumes:") - subprocess.run(["docker", "volume", "prune", "-f"], check=True) - - # Additionally list volumes and remove large unused ones - print("Listing Docker volumes to check for large unused ones:") - result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) - volumes = result.stdout.decode().splitlines() - - for volume in volumes: - inspect_result = subprocess.run(["docker", "volume", "inspect", volume], check=True, stdout=subprocess.PIPE) - volume_details = json.loads(inspect_result.stdout.decode()) - 
mountpoint = volume_details[0]["Mountpoint"] - - # Check if the mountpoint exists before accessing it - try: - volume_size = subprocess.run(["sudo", "du", "-sh", mountpoint], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] - print(f"Volume {volume} size: {volume_size}") - - # Example: Remove if larger than 10GB - size_value = float(volume_size[:-1]) - size_unit = volume_size[-1].upper() - - if size_unit == 'G' and size_value > 10: # Adjust this condition as needed - print(f"Removing volume: {volume}") - subprocess.run(["docker", "volume", "rm", volume], check=True) - except subprocess.CalledProcessError: - print(f"Volume {volume} not found at expected mountpoint {mountpoint} or cannot access.") - - print("Disk usage after removing unnecessary files:") - subprocess.run(["df", "-hT"], check=True) - - # Display Docker disk usage after cleanup - print("Docker disk usage after removing unnecessary files:") - subprocess.run(["docker", "system", "df", "-v"], check=True) - - except subprocess.CalledProcessError as e: - print(f"Error during Docker cleanup: {e}") + ) \ No newline at end of file diff --git a/sdk/python/test_train_api/test_e2e_train_api.py b/sdk/python/test_train_api/test_e2e_train_api.py new file mode 100644 index 0000000000..9fe4e6b731 --- /dev/null +++ b/sdk/python/test_train_api/test_e2e_train_api.py @@ -0,0 +1,96 @@ +# Copyright 2024 kubeflow.org. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import test.e2e.utils as utils + +import transformers +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + HuggingFaceModelParams, + HuggingFaceTrainerParams, +) +from kubeflow.training import TrainingClient, constants +from peft import LoraConfig + +logging.basicConfig(format="%(message)s") +logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) + +TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) +CONTAINER_NAME = "pytorch" + + +def test_sdk_e2e_create_from_train_api(job_namespace): + JOB_NAME = "pytorchjob-from-train-api" + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=1, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
+ resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + storage_config={ + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) From 7a2ae05ce7555194f18194e907f1de20d8811bd9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 10:28:37 +0800 Subject: [PATCH 60/86] fix format Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 182998df51..b60f3e071c 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -14,7 +14,6 @@ jobs: matrix: kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] python-version: ["3.8", "3.9", "3.10", "3.11"] - steps: - name: Checkout uses: actions/checkout@v4 From 9efcce5b08a3b63a60f5dc794664d2ba9339e75b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 14:16:00 +0800 Subject: [PATCH 61/86] fix parameter of namespace Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- sdk/python/test_train_api/test_e2e_train_api.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index b60f3e071c..8b3944277f 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ 
b/.github/workflows/e2e-test-train-api.yaml @@ -56,4 +56,4 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api/test_e2e_train_api.py --log-cli-level=debug diff --git a/sdk/python/test_train_api/test_e2e_train_api.py b/sdk/python/test_train_api/test_e2e_train_api.py index 9fe4e6b731..0918941672 100644 --- a/sdk/python/test_train_api/test_e2e_train_api.py +++ b/sdk/python/test_train_api/test_e2e_train_api.py @@ -31,7 +31,7 @@ CONTAINER_NAME = "pytorch" -def test_sdk_e2e_create_from_train_api(job_namespace): +def test_sdk_e2e_create_from_train_api(job_namespace="default"): JOB_NAME = "pytorchjob-from-train-api" # Use test case from fine-tuning API tutorial. @@ -86,7 +86,7 @@ def test_sdk_e2e_create_from_train_api(job_namespace): logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) try: - utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300) + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) except Exception as e: utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) From a443ea2c4e42382da45b4b0e9c87e2e2cb79c5f9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 15:26:07 +0800 Subject: [PATCH 62/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 8e0739c9b4..c5b28faaf8 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -267,4 +267,4 @@ def generate_container() -> V1Container: 
image="kubeflow/pytorch-dist-mnist:latest", args=["--backend", "gloo", "--epochs", "1"], resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}), - ) \ No newline at end of file + ) From 85fd8e62525adde9a2137bfd0cc3e08d8f54db3b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 15:48:52 +0800 Subject: [PATCH 63/86] reduce resources Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 59 ------------ .github/workflows/integration-tests.yaml | 2 +- sdk/python/test/e2e/test_e2e_pytorchjob.py | 80 ++++++++++++++++ .../test_train_api/test_e2e_train_api.py | 96 ------------------- 4 files changed, 81 insertions(+), 156 deletions(-) delete mode 100644 .github/workflows/e2e-test-train-api.yaml delete mode 100644 sdk/python/test_train_api/test_e2e_train_api.py diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml deleted file mode 100644 index 8b3944277f..0000000000 --- a/.github/workflows/e2e-test-train-api.yaml +++ /dev/null @@ -1,59 +0,0 @@ -name: E2E Test with train API -on: - - pull_request - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - e2e-test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] - python-version: ["3.8", "3.9", "3.10", "3.11"] - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Free-Up Disk Space - uses: ./.github/workflows/free-up-disk-space - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Setup Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - - - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.10.0 - with: - node_image: kindest/node:${{ matrix.kubernetes-version }} - cluster_name: training-operator-cluster - kubectl_version: ${{ matrix.kubernetes-version }} - - - name: Build training-operator - run: | - 
./scripts/gha/build-image.sh - env: - TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test - - - name: Deploy training operator - run: | - ./scripts/gha/setup-training-operator.sh - env: - KIND_CLUSTER: training-operator-cluster - TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test - GANG_SCHEDULER_NAME: "none" - KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} - - - name: Run tests - run: | - pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api/test_e2e_train_api.py --log-cli-level=debug diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index ca2b543fc7..234a2988be 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index c5b28faaf8..cbe1b54039 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -13,8 +13,10 @@ # limitations under the License. 
import os +import json import logging import pytest +import subprocess from typing import Optional from kubernetes.client import V1PodTemplateSpec @@ -23,6 +25,10 @@ from kubernetes.client import V1Container from kubernetes.client import V1ResourceRequirements +from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams + from kubeflow.training import TrainingClient from kubeflow.training import KubeflowOrgV1ReplicaSpec from kubeflow.training import KubeflowOrgV1PyTorchJob @@ -31,6 +37,9 @@ from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training import constants +from peft import LoraConfig +import transformers + import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS @@ -240,6 +249,77 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) +@pytest.mark.skipif( + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", +) +def test_sdk_e2e_create_from_train_api(job_namespace): + JOB_NAME = "pytorchjob-from-train-api" + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. 
+ trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=1, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. + resources_per_worker={ + "gpu": 0, + "cpu": 1, + "memory": "5G", + }, + storage_config={ + "size": "5Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e( + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300 + ) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + def generate_pytorchjob( job_namespace: str, job_name: str, diff --git a/sdk/python/test_train_api/test_e2e_train_api.py b/sdk/python/test_train_api/test_e2e_train_api.py deleted file mode 100644 index 0918941672..0000000000 --- a/sdk/python/test_train_api/test_e2e_train_api.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2024 kubeflow.org. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import test.e2e.utils as utils - -import transformers -from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceDatasetParams, - HuggingFaceModelParams, - HuggingFaceTrainerParams, -) -from kubeflow.training import TrainingClient, constants -from peft import LoraConfig - -logging.basicConfig(format="%(message)s") -logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) - -TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) -CONTAINER_NAME = "pytorch" - - -def test_sdk_e2e_create_from_train_api(job_namespace="default"): - JOB_NAME = "pytorchjob-from-train-api" - - # Use test case from fine-tuning API tutorial. - # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. 
- trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=1, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. - resources_per_worker={ - "gpu": 0, - "cpu": 2, - "memory": "10G", - }, - storage_config={ - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - try: - utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") - - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) From 1a0c455d2f1a8e8df44547258fe0a4ee916c2ab1 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 16:55:30 +0800 Subject: [PATCH 64/86] separate e2e test for train API Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 59 ++++++++++++ .github/workflows/integration-tests.yaml | 2 +- sdk/python/test/e2e/test_e2e_pytorchjob.py | 80 ---------------- .../test_train_api/test_e2e_train_api.py | 96 +++++++++++++++++++ 4 files changed, 156 insertions(+), 81 deletions(-) create mode 100644 .github/workflows/e2e-test-train-api.yaml create mode 100644 sdk/python/test_train_api/test_e2e_train_api.py diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml new file mode 100644 index 0000000000..c3f885cc9b --- /dev/null +++ b/.github/workflows/e2e-test-train-api.yaml @@ -0,0 +1,59 @@ +name: E2E Test with train API +on: + - pull_request + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + e2e-test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] + python-version: ["3.9", "3.10", "3.11"] + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Free-Up Disk Space + uses: ./.github/workflows/free-up-disk-space + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Create k8s Kind Cluster + uses: helm/kind-action@v1.10.0 + with: + node_image: kindest/node:${{ matrix.kubernetes-version }} + cluster_name: training-operator-cluster + kubectl_version: ${{ matrix.kubernetes-version }} + + - name: Build training-operator + run: | + ./scripts/gha/build-image.sh + env: + TRAINING_CI_IMAGE: 
kubeflowtraining/training-operator:test + + - name: Deploy training operator + run: | + ./scripts/gha/setup-training-operator.sh + env: + KIND_CLUSTER: training-operator-cluster + TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test + GANG_SCHEDULER_NAME: "none" + KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + + - name: Run tests + run: | + pip install pytest + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api/test_e2e_train_api.py --log-cli-level=debug diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index 234a2988be..ca2b543fc7 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index cbe1b54039..c5b28faaf8 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -13,10 +13,8 @@ # limitations under the License. 
import os -import json import logging import pytest -import subprocess from typing import Optional from kubernetes.client import V1PodTemplateSpec @@ -25,10 +23,6 @@ from kubernetes.client import V1Container from kubernetes.client import V1ResourceRequirements -from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams - from kubeflow.training import TrainingClient from kubeflow.training import KubeflowOrgV1ReplicaSpec from kubeflow.training import KubeflowOrgV1PyTorchJob @@ -37,9 +31,6 @@ from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training import constants -from peft import LoraConfig -import transformers - import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS @@ -249,77 +240,6 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) -@pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, - reason="For plain scheduling", -) -def test_sdk_e2e_create_from_train_api(job_namespace): - JOB_NAME = "pytorchjob-from-train-api" - - # Use test case from fine-tuning API tutorial. - # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. 
- trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=1, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. - resources_per_worker={ - "gpu": 0, - "cpu": 1, - "memory": "5G", - }, - storage_config={ - "size": "5Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - try: - utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300 - ) - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") - - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - - def generate_pytorchjob( job_namespace: str, job_name: str, diff --git a/sdk/python/test_train_api/test_e2e_train_api.py b/sdk/python/test_train_api/test_e2e_train_api.py new file mode 100644 index 0000000000..0918941672 --- /dev/null +++ b/sdk/python/test_train_api/test_e2e_train_api.py @@ -0,0 +1,96 @@ +# Copyright 2024 kubeflow.org. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import test.e2e.utils as utils + +import transformers +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + HuggingFaceModelParams, + HuggingFaceTrainerParams, +) +from kubeflow.training import TrainingClient, constants +from peft import LoraConfig + +logging.basicConfig(format="%(message)s") +logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) + +TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) +CONTAINER_NAME = "pytorch" + + +def test_sdk_e2e_create_from_train_api(job_namespace="default"): + JOB_NAME = "pytorchjob-from-train-api" + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. 
+ trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=1, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. + resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + storage_config={ + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) From afe4240c62bac0fdfd58f0e35f400f60ecb61065 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 19:24:59 +0800 Subject: [PATCH 65/86] remove go setup Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index c3f885cc9b..471b351272 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -26,11 +26,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Setup Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - - name: Create k8s Kind Cluster uses: helm/kind-action@v1.10.0 with: From 250b830bc5f881d224003d4b4c6bde3b6663cae5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 22:58:41 +0800 Subject: [PATCH 66/86] adjust the version of k8s Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 471b351272..94271e180f 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] + kubernetes-version: ["v1.28.7"] python-version: ["3.9", "3.10", "3.11"] steps: - name: Checkout From c5b39a4821f183c85b95be2c6e9ad52ae746bf8e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 4 Sep 2024 07:49:32 +0800 Subject: [PATCH 67/86] move test file to new place Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- .github/workflows/integration-tests.yaml | 2 +- .../e2e-train-api}/test_e2e_train_api.py | 0 3 files changed, 2 
insertions(+), 2 deletions(-) rename sdk/python/{test_train_api => test/e2e-train-api}/test_e2e_train_api.py (100%) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 94271e180f..dfaf615aff 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -51,4 +51,4 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api/test_e2e_train_api.py --log-cli-level=debug + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index ca2b543fc7..d88f26e77f 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python; pytest -s sdk/python/test/e2e --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} diff --git a/sdk/python/test_train_api/test_e2e_train_api.py b/sdk/python/test/e2e-train-api/test_e2e_train_api.py similarity index 100% rename from sdk/python/test_train_api/test_e2e_train_api.py rename to sdk/python/test/e2e-train-api/test_e2e_train_api.py From fa99a92dd6e798207424d2377a78a17375c54323 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 4 Sep 2024 08:47:10 +0800 Subject: [PATCH 68/86] fix typos Signed-off-by: helenxie-bit --- sdk/python/test/e2e-train-api/test_e2e_train_api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/test/e2e-train-api/test_e2e_train_api.py 
b/sdk/python/test/e2e-train-api/test_e2e_train_api.py index 0918941672..59b4bafa58 100644 --- a/sdk/python/test/e2e-train-api/test_e2e_train_api.py +++ b/sdk/python/test/e2e-train-api/test_e2e_train_api.py @@ -28,7 +28,6 @@ logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) -CONTAINER_NAME = "pytorch" def test_sdk_e2e_create_from_train_api(job_namespace="default"): @@ -90,7 +89,7 @@ def test_sdk_e2e_create_from_train_api(job_namespace="default"): except Exception as e: utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") + raise Exception(f"PyTorchJob create from API E2E fails. Exception: {e}") utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) From f0d8cc4dc03618ff6100166453b0f6b8331a85fe Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 4 Sep 2024 08:56:27 +0800 Subject: [PATCH 69/86] rerun tests Signed-off-by: helenxie-bit --- sdk/python/test/e2e-train-api/test_e2e_train_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/test/e2e-train-api/test_e2e_train_api.py b/sdk/python/test/e2e-train-api/test_e2e_train_api.py index 59b4bafa58..764db97042 100644 --- a/sdk/python/test/e2e-train-api/test_e2e_train_api.py +++ b/sdk/python/test/e2e-train-api/test_e2e_train_api.py @@ -68,8 +68,8 @@ def test_sdk_e2e_create_from_train_api(job_namespace="default"): bias="none", ), ), - num_workers=1, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
+ num_workers=1, + num_procs_per_worker=1, resources_per_worker={ "gpu": 0, "cpu": 2, From d2c3cacfe4308715fdcfbfd34e343942410e3777 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 18:54:39 -0700 Subject: [PATCH 70/86] update install packages Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index dfaf615aff..776aa65a92 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -51,4 +51,4 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug + python3 -m pip install -e sdk/python[huggingface]; pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug From 9f4244909b44f0a8be3c7234d90c808728e94b71 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 20:56:38 -0700 Subject: [PATCH 71/86] build and verify images of storage-initializer and trainer Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 13 +++++++++++-- scripts/gha/build-image.sh | 2 ++ sdk/python/kubeflow/training/api/training_client.py | 9 +++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 776aa65a92..20b19ec769 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -33,11 +33,13 @@ jobs: cluster_name: training-operator-cluster kubectl_version: ${{ matrix.kubernetes-version }} - - name: Build training-operator + - name: Build training-operator, storage-initializer, and trainer images run: | ./scripts/gha/build-image.sh env: TRAINING_CI_IMAGE: 
kubeflowtraining/training-operator:test + STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test + TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - name: Deploy training operator run: | @@ -50,5 +52,12 @@ jobs: - name: Run tests run: | + kind load docker-image ${{ env.STORAGE_INITIALIZER_IMAGE }} --name ${{ env.KIND_CLUSTER }} + kind load docker-image ${{ env.TRAINER_TRANSFORMER_IMAGE_DEFAULT }} --name ${{ env.KIND_CLUSTER }} pip install pytest - python3 -m pip install -e sdk/python[huggingface]; pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug + python3 -m pip install -e sdk/python[huggingface] + pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug + env: + KIND_CLUSTER: training-operator-cluster + STORAGE_INITIALIZER_IMAGE: kubeflowtraining/storage-initializer:test + TRAINER_TRANSFORMER_IMAGE_DEFAULT: kubeflowtraining/trainer:test diff --git a/scripts/gha/build-image.sh b/scripts/gha/build-image.sh index cb4f0fc832..7c2947bdce 100755 --- a/scripts/gha/build-image.sh +++ b/scripts/gha/build-image.sh @@ -22,3 +22,5 @@ set -o nounset set -o pipefail docker build . -t ${TRAINING_CI_IMAGE} -f build/images/training-operator/Dockerfile +docker build . -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile +docker build . 
-t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index 1626f18820..459e16a046 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -258,6 +258,10 @@ def train( ], volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], ) + base_image1=os.getenv( + "STORAGE_INITIALIZER_IMAGE", constants.STORAGE_INITIALIZER_IMAGE_DEFAULT + ) + print("base_image1: " + base_image1) # create app container spec container_spec = utils.get_container_spec( @@ -287,6 +291,11 @@ def train( volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], resources=resources_per_worker, ) + base_image2=os.getenv( + "TRAINER_TRANSFORMER_IMAGE_DEFAULT", + constants.TRAINER_TRANSFORMER_IMAGE_DEFAULT, + ) + print("base_image2: " + base_image2) storage_initializer_volume = models.V1Volume( name=constants.STORAGE_INITIALIZER, From bb406cee573141304c37b35c672a027b40f51d4c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 21:07:18 -0700 Subject: [PATCH 72/86] fix image build error Signed-off-by: helenxie-bit --- scripts/gha/build-image.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gha/build-image.sh b/scripts/gha/build-image.sh index 7c2947bdce..0246d18dc1 100755 --- a/scripts/gha/build-image.sh +++ b/scripts/gha/build-image.sh @@ -22,5 +22,5 @@ set -o nounset set -o pipefail docker build . -t ${TRAINING_CI_IMAGE} -f build/images/training-operator/Dockerfile -docker build . -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile -docker build . 
-t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile +docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f Dockerfile +docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f Dockerfile From f0b6b38515bb10b914cefb32f5ad545dd2f49086 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 21:13:27 -0700 Subject: [PATCH 73/86] fix image build error Signed-off-by: helenxie-bit --- scripts/gha/build-image.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gha/build-image.sh b/scripts/gha/build-image.sh index 0246d18dc1..9ffbb314ed 100755 --- a/scripts/gha/build-image.sh +++ b/scripts/gha/build-image.sh @@ -22,5 +22,5 @@ set -o nounset set -o pipefail docker build . -t ${TRAINING_CI_IMAGE} -f build/images/training-operator/Dockerfile -docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f Dockerfile -docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f Dockerfile +docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile +docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile From 45eb7e082fb129067972c38030f193aa6c811dd5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 22:06:11 -0700 Subject: [PATCH 74/86] check disk space Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 16 +++++++++--- scripts/gha/build-image.sh | 2 -- .../setup-storage-initializer-and-trainer.sh | 25 +++++++++++++++++++ 3 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 scripts/gha/setup-storage-initializer-and-trainer.sh diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 20b19ec769..d6af0a1d55 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -33,13 
+33,11 @@ jobs: cluster_name: training-operator-cluster kubectl_version: ${{ matrix.kubernetes-version }} - - name: Build training-operator, storage-initializer, and trainer images + - name: Build training-operator run: | ./scripts/gha/build-image.sh env: TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test - STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test - TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - name: Deploy training operator run: | @@ -49,11 +47,21 @@ jobs: TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test GANG_SCHEDULER_NAME: "none" KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + + - name: Build and load storage initializer and trainer + run: | + ./scripts/gha/setup-storage-initializer-and-trainer.sh + env: + STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test + TRAINER_CI_IMAGE: kubeflowtraining/trainer:test + + - name: Check disk space + run: df -h - name: Run tests run: | kind load docker-image ${{ env.STORAGE_INITIALIZER_IMAGE }} --name ${{ env.KIND_CLUSTER }} - kind load docker-image ${{ env.TRAINER_TRANSFORMER_IMAGE_DEFAULT }} --name ${{ env.KIND_CLUSTER }} + kind load docker-image ${{ env.TRAINER_TRANSFORMER_IMAGE_DEFAULT }} --name ${{ env.KIND_CLUSTER }} pip install pytest python3 -m pip install -e sdk/python[huggingface] pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug diff --git a/scripts/gha/build-image.sh b/scripts/gha/build-image.sh index 9ffbb314ed..cb4f0fc832 100755 --- a/scripts/gha/build-image.sh +++ b/scripts/gha/build-image.sh @@ -22,5 +22,3 @@ set -o nounset set -o pipefail docker build . 
-t ${TRAINING_CI_IMAGE} -f build/images/training-operator/Dockerfile -docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile -docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile diff --git a/scripts/gha/setup-storage-initializer-and-trainer.sh b/scripts/gha/setup-storage-initializer-and-trainer.sh new file mode 100644 index 0000000000..3f06fa6a5b --- /dev/null +++ b/scripts/gha/setup-storage-initializer-and-trainer.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Copyright 2024 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The script is used to build the Kubeflow storage-initializer and trainer images.
+ + +set -o errexit +set -o nounset +set -o pipefail + +docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile +docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile From f21779494b72a1391197cca75d4b4221e58fe10e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 22:15:25 -0700 Subject: [PATCH 75/86] make 'setup-storage-initializer-and-trainer' executable Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- .../gha/setup-storage-initializer-and-trainer.sh | 0 .../kubeflow/training/api/training_client.py | 14 +++++++------- 3 files changed, 8 insertions(+), 8 deletions(-) mode change 100644 => 100755 scripts/gha/setup-storage-initializer-and-trainer.sh diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index d6af0a1d55..1df822a735 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -48,7 +48,7 @@ jobs: GANG_SCHEDULER_NAME: "none" KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} - - name: Build and load storage initializer and trainer + - name: Build storage initializer and trainer run: | ./scripts/gha/setup-storage-initializer-and-trainer.sh env: diff --git a/scripts/gha/setup-storage-initializer-and-trainer.sh b/scripts/gha/setup-storage-initializer-and-trainer.sh old mode 100644 new mode 100755 diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index 459e16a046..4ca57e7c76 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -258,9 +258,9 @@ def train( ], volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], ) - base_image1=os.getenv( - "STORAGE_INITIALIZER_IMAGE", constants.STORAGE_INITIALIZER_IMAGE_DEFAULT - ) + base_image1 = 
os.getenv( + "STORAGE_INITIALIZER_IMAGE", constants.STORAGE_INITIALIZER_IMAGE_DEFAULT + ) print("base_image1: " + base_image1) # create app container spec @@ -291,10 +291,10 @@ def train( volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], resources=resources_per_worker, ) - base_image2=os.getenv( - "TRAINER_TRANSFORMER_IMAGE_DEFAULT", - constants.TRAINER_TRANSFORMER_IMAGE_DEFAULT, - ) + base_image2 = os.getenv( + "TRAINER_TRANSFORMER_IMAGE_DEFAULT", + constants.TRAINER_TRANSFORMER_IMAGE_DEFAULT, + ) print("base_image2: " + base_image2) storage_initializer_volume = models.V1Volume( From 083e15572dc7982de18519505399bc89f27fcec3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 22:40:03 -0700 Subject: [PATCH 76/86] separate step of loading images Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 1df822a735..198fdd7f1c 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -58,14 +58,30 @@ jobs: - name: Check disk space run: df -h + - name: Load storage initializer + run: | + kind load docker-image ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} + docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} + env: + KIND_CLUSTER: training-operator-cluster + STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test + + - name: Check disk space + run: df -h + + - name: Load trainer + run: | + kind load docker-image ${{ env.TRAINER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} + docker rmi ${{ env.TRAINER_CI_IMAGE }} + env: + KIND_CLUSTER: training-operator-cluster + TRAINER_CI_IMAGE: kubeflowtraining/trainer:test + - name: Run tests run: | - kind load docker-image ${{ env.STORAGE_INITIALIZER_IMAGE }} --name ${{ env.KIND_CLUSTER }} - kind load docker-image ${{ 
env.TRAINER_TRANSFORMER_IMAGE_DEFAULT }} --name ${{ env.KIND_CLUSTER }} pip install pytest python3 -m pip install -e sdk/python[huggingface] pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug env: - KIND_CLUSTER: training-operator-cluster STORAGE_INITIALIZER_IMAGE: kubeflowtraining/storage-initializer:test TRAINER_TRANSFORMER_IMAGE_DEFAULT: kubeflowtraining/trainer:test From dc74844601b9b9a5cd089d56de9ceeb94f283ad3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 22:40:59 -0700 Subject: [PATCH 77/86] check disk space after loading image Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 198fdd7f1c..129e28f3be 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -77,6 +77,9 @@ jobs: KIND_CLUSTER: training-operator-cluster TRAINER_CI_IMAGE: kubeflowtraining/trainer:test + - name: Check disk space + run: df -h + - name: Run tests run: | pip install pytest From de18ef0abfaa1e3344ec6dfcbe2e2303fc432f99 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 23:43:28 -0700 Subject: [PATCH 78/86] clean up and check disk space Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 40 +++++++++++++++++------ 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 129e28f3be..d1cecccea0 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -48,38 +48,58 @@ jobs: GANG_SCHEDULER_NAME: "none" KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + - name: Prune docker images + shell: bash + run: | + docker image prune -a -f + docker system df + df -h + - name: Build storage initializer and trainer run: | 
./scripts/gha/setup-storage-initializer-and-trainer.sh + docker system df + df -h env: STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - name: Check disk space - run: df -h - - name: Load storage initializer run: | kind load docker-image ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} - docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} + docker system df + df -h env: KIND_CLUSTER: training-operator-cluster STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test - - name: Check disk space - run: df -h - + - name: Remove image + run: | + docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE}} + docker system df + df -h + env: + STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test + + - name: Monitor resources usage of node + run: | + echo "Monitor resources usage of node" + kubectl describe nodes training-operator-cluster-control-plane + echo "Monitor resources usage of pods" + kubectl get pods --all-namespaces + echo "Monitor resources usage of storage" + docker exec -it training-operator-cluster-control-plane df -h + - name: Load trainer run: | kind load docker-image ${{ env.TRAINER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} docker rmi ${{ env.TRAINER_CI_IMAGE }} + docker system df + df -h env: KIND_CLUSTER: training-operator-cluster TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - name: Check disk space - run: df -h - - name: Run tests run: | pip install pytest From ef8742ce70d071575cb15d1d10f0f6a4bc125ab4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 00:01:54 -0700 Subject: [PATCH 79/86] prune docker build cache Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index d1cecccea0..a2f52c6cbc 100644 --- 
a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -42,6 +42,8 @@ jobs: - name: Deploy training operator run: | ./scripts/gha/setup-training-operator.sh + docker system df + df -h env: KIND_CLUSTER: training-operator-cluster TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test @@ -76,6 +78,7 @@ jobs: - name: Remove image run: | docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE}} + docker builder prune docker system df df -h env: @@ -87,8 +90,6 @@ jobs: kubectl describe nodes training-operator-cluster-control-plane echo "Monitor resources usage of pods" kubectl get pods --all-namespaces - echo "Monitor resources usage of storage" - docker exec -it training-operator-cluster-control-plane df -h - name: Load trainer run: | From 1eb3ef1b3c74c4598a110dd437b020c43595133a Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 00:13:49 -0700 Subject: [PATCH 80/86] prune docker build cache Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index a2f52c6cbc..66feaadedf 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -78,7 +78,7 @@ jobs: - name: Remove image run: | docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE}} - docker builder prune + docker builder prune --all --force docker system df df -h env: From 1e407a51ecdbb556e225a85aa3f9f0f59ae4bc74 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 16:17:27 -0700 Subject: [PATCH 81/86] adjust sequence of building and loading images Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 54 ++++++++++++------- ...rainer.sh => build-storage-initializer.sh} | 1 - scripts/gha/build-trainer.sh | 24 +++++++++ 3 files changed, 59 insertions(+), 20 deletions(-) rename 
scripts/gha/{setup-storage-initializer-and-trainer.sh => build-storage-initializer.sh} (88%) create mode 100755 scripts/gha/build-trainer.sh diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 66feaadedf..b72fa60ef7 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -57,50 +57,66 @@ jobs: docker system df df -h - - name: Build storage initializer and trainer + - name: Build trainer run: | - ./scripts/gha/setup-storage-initializer-and-trainer.sh + ./scripts/gha/build-trainer.sh docker system df df -h env: - STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test TRAINER_CI_IMAGE: kubeflowtraining/trainer:test + + - name: Clean up build cache + run: | + docker builder prune --all --force + docker volume ls + docker system df + df -h - - name: Load storage initializer + - name: Load trainer run: | - kind load docker-image ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} + kind load docker-image ${{ env.TRAINER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} + docker image prune -a -f + docker volume prune -f docker system df df -h env: KIND_CLUSTER: training-operator-cluster + TRAINER_CI_IMAGE: kubeflowtraining/trainer:test + + - name: Build storage initializer + run: | + ./scripts/gha/build-storage-initializer.sh + docker system df + df -h + env: STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test + TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - name: Remove image + - name: Clean up build cache run: | - docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE}} docker builder prune --all --force + docker volume ls + docker system df + df -h + + - name: Load storage initializer + run: | + kind load docker-image ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} + docker image prune -a -f + docker volume prune -f docker system df df -h env: + KIND_CLUSTER: training-operator-cluster 
STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test - + - name: Monitor resources usage of node run: | echo "Monitor resources usage of node" kubectl describe nodes training-operator-cluster-control-plane echo "Monitor resources usage of pods" kubectl get pods --all-namespaces - - - name: Load trainer - run: | - kind load docker-image ${{ env.TRAINER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} - docker rmi ${{ env.TRAINER_CI_IMAGE }} - docker system df - df -h - env: - KIND_CLUSTER: training-operator-cluster - TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - + - name: Run tests run: | pip install pytest diff --git a/scripts/gha/setup-storage-initializer-and-trainer.sh b/scripts/gha/build-storage-initializer.sh similarity index 88% rename from scripts/gha/setup-storage-initializer-and-trainer.sh rename to scripts/gha/build-storage-initializer.sh index 3f06fa6a5b..261e140a60 100755 --- a/scripts/gha/setup-storage-initializer-and-trainer.sh +++ b/scripts/gha/build-storage-initializer.sh @@ -22,4 +22,3 @@ set -o nounset set -o pipefail docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile -docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile diff --git a/scripts/gha/build-trainer.sh b/scripts/gha/build-trainer.sh new file mode 100755 index 0000000000..87bf229246 --- /dev/null +++ b/scripts/gha/build-trainer.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Copyright 2024 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The script is used to build the Kubeflow trainer image. + + +set -o errexit +set -o nounset +set -o pipefail + +docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile From 751955907a71f8b1b3852e096e8e2c89089e4957 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 16:48:28 -0700 Subject: [PATCH 82/86] move working directory Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index b72fa60ef7..f158bcf5d0 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -83,6 +83,24 @@ jobs: + # Step to move Docker data directory back to / for Storage Initializer build + - name: Move docker data directory back to / for Storage Initializer build + shell: bash + run: | + echo "Stopping docker service ..." + sudo systemctl stop docker + DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker + DOCKER_ROOT_DIR=/mnt/docker + echo "Removing symlink and moving Docker data back to ${DOCKER_DEFAULT_ROOT_DIR}..." + sudo rm -rf ${DOCKER_DEFAULT_ROOT_DIR} + sudo mv ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR} + echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})" + echo "Starting docker service ..."
+ sudo systemctl daemon-reload + sudo systemctl start docker + echo "Docker service status:" + sudo systemctl --no-pager -l -o short status docker + - name: Build storage initializer run: | ./scripts/gha/build-storage-initializer.sh From f5d63c40b65176279ae55bc6cbcbecd37f6aa731 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 17:29:59 -0700 Subject: [PATCH 83/86] delete moving working directory Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 52 ++--------------------- 1 file changed, 3 insertions(+), 49 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index f158bcf5d0..93064415c2 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -42,35 +42,20 @@ jobs: - name: Deploy training operator run: | ./scripts/gha/setup-training-operator.sh - docker system df - df -h env: KIND_CLUSTER: training-operator-cluster TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test GANG_SCHEDULER_NAME: "none" KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} - - - name: Prune docker images - shell: bash - run: | - docker image prune -a -f - docker system df - df -h - name: Build trainer run: | ./scripts/gha/build-trainer.sh + docker builder prune --all --force docker system df df -h env: TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - - name: Clean up build cache - run: | - docker builder prune --all --force - docker volume ls - docker system df - df -h - name: Load trainer run: | @@ -82,40 +67,16 @@ jobs: env: KIND_CLUSTER: training-operator-cluster TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - # Step to move Docker data directory back to / for Storage Initializer build - - name: Move docker data directory back to / for Storage Initializer build - shell: bash - run: | - echo "Stopping docker service ..." 
- sudo systemctl stop docker - DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker - DOCKER_ROOT_DIR=/mnt/docker - echo "Removing symlink and moving Docker data back to ${DOCKER_DEFAULT_ROOT_DIR}..." - sudo rm -rf ${DOCKER_DEFAULT_ROOT_DIR} - sudo mv ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR} - echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})" - echo "Starting docker service ..." - sudo systemctl daemon-reload - sudo systemctl start docker - echo "Docker service status:" - sudo systemctl --no-pager -l -o short status docker - + - name: Build storage initializer run: | ./scripts/gha/build-storage-initializer.sh + docker builder prune --all --force docker system df df -h env: STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - - name: Clean up build cache - run: | - docker builder prune --all --force - docker volume ls - docker system df - df -h - name: Load storage initializer run: | @@ -127,13 +88,6 @@ jobs: env: KIND_CLUSTER: training-operator-cluster STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test - - - name: Monitor resources usage of node - run: | - echo "Monitor resources usage of node" - kubectl describe nodes training-operator-cluster-control-plane - echo "Monitor resources usage of pods" - kubectl get pods --all-namespaces - name: Run tests run: | From 08c8562b0e41b6fe49add34df05b8ff516265c2a Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 17:34:15 -0700 Subject: [PATCH 84/86] fix format Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- sdk/python/kubeflow/training/api/training_client.py | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 93064415c2..c286d9a6e9 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -88,7 +88,7 @@ jobs: env: KIND_CLUSTER: 
training-operator-cluster STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test - + - name: Run tests run: | pip install pytest diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index 4ca57e7c76..1626f18820 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -258,10 +258,6 @@ def train( ], volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], ) - base_image1 = os.getenv( - "STORAGE_INITIALIZER_IMAGE", constants.STORAGE_INITIALIZER_IMAGE_DEFAULT - ) - print("base_image1: " + base_image1) # create app container spec container_spec = utils.get_container_spec( @@ -291,11 +287,6 @@ def train( volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], resources=resources_per_worker, ) - base_image2 = os.getenv( - "TRAINER_TRANSFORMER_IMAGE_DEFAULT", - constants.TRAINER_TRANSFORMER_IMAGE_DEFAULT, - ) - print("base_image2: " + base_image2) storage_initializer_volume = models.V1Volume( name=constants.STORAGE_INITIALIZER, From d2ae5423539e2b17ab65279c8de9b3bc3cda7c24 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 24 Sep 2024 13:13:41 -0700 Subject: [PATCH 85/86] use 'docker system prune' Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index c286d9a6e9..84fba097dd 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -42,6 +42,9 @@ jobs: - name: Deploy training operator run: | ./scripts/gha/setup-training-operator.sh + docker system prune -a -f + docker system df + df -h env: KIND_CLUSTER: training-operator-cluster TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test @@ -60,8 +63,7 @@ jobs: - name: Load trainer run: | kind load docker-image ${{ 
env.TRAINER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} - docker image prune -a -f - docker volume prune -f + docker system prune -a -f docker system df df -h env: @@ -81,8 +83,7 @@ jobs: - name: Load storage initializer run: | kind load docker-image ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} - docker image prune -a -f - docker volume prune -f + docker system prune -a -f docker system df df -h env: From 09fc8a906afaeffeaa7815d0961a9360ccc8b98c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 24 Sep 2024 13:15:47 -0700 Subject: [PATCH 86/86] make the format of the commands to be consistent Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 84fba097dd..fa65402682 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -54,7 +54,7 @@ jobs: - name: Build trainer run: | ./scripts/gha/build-trainer.sh - docker builder prune --all --force + docker builder prune -a -f docker system df df -h env: @@ -73,7 +73,7 @@ jobs: - name: Build storage initializer run: | ./scripts/gha/build-storage-initializer.sh - docker builder prune --all --force + docker builder prune -a -f docker system df df -h env: