From 15b6cb0bcc0ba9eece05c88046be6c5ca47e6690 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 11:08:04 +0800 Subject: [PATCH 01/86] add e2e test for train API Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 60 +++++++++++ sdk/python/test_e2e/test_e2e_train_api.py | 126 ++++++++++++++++++++++ sdk/python/test_e2e/test_e2e_train_api.sh | 37 +++++++ 3 files changed, 223 insertions(+) create mode 100644 .github/workflows/e2e-test-train-api.yaml create mode 100644 sdk/python/test_e2e/test_e2e_train_api.py create mode 100755 sdk/python/test_e2e/test_e2e_train_api.sh diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml new file mode 100644 index 0000000000..21537e9616 --- /dev/null +++ b/.github/workflows/e2e-test-train-api.yaml @@ -0,0 +1,60 @@ +name: E2E Test with train API + +on: + - pull_request + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + e2e: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] + python-version: [3.8, 3.9, 3.10, 3.11] + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Free-Up Disk Space + uses: ./.github/workflows/free-up-disk-space + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Create k8s Kind Cluster + uses: helm/kind-action@v1.10.0 + with: + node_image: kindest/node:${{ matrix.kubernetes-version }} + cluster_name: training-operator-cluster + kubectl_version: ${{ matrix.kubernetes-version }} + + - name: Build training-operator + run: | + ./scripts/gha/build-image.sh + env: + TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test + + - name: Deploy training operator + run: | + ./scripts/gha/setup-training-operator.sh + env: + KIND_CLUSTER: 
training-operator-cluster + TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test + GANG_SCHEDULER_NAME: "none" + KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + + - name: Run tests + run: | + python3 -m pip install -e sdk/python + ./sdk/python/test_e2e/test_e2e_train_api.sh diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py new file mode 100644 index 0000000000..146b6fe056 --- /dev/null +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -0,0 +1,126 @@ +# Copyright 2024 kubeflow.org. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from kubernetes import client, config + +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + HuggingFaceTrainerParams, + HuggingFaceDatasetParams, +) +from kubeflow.training import TrainingClient +from kubeflow.training import constants + +import logging + +from peft import LoraConfig +import transformers + +import test.e2e.utils as utils + +logging.basicConfig(format="%(message)s") +logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) + +TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) +JOB_NAME = "test-train-api" + + +def test_train_api(job_namespace): + num_workers = 1 + + # Use test case from fine-tuning API tutorial + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. In this example, we will skip evaluation and model checkpoints. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=num_workers, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
+ resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e( + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=60 * 30 + ) + logging.info(f"Training job {JOB_NAME} is succeded.") + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"Training job {JOB_NAME} is failed. Exception: {e}") + + # Verify that training job has correct pods. + pod_names = TRAINING_CLIENT.get_job_pod_names( + name=JOB_NAME, namespace=job_namespace + ) + + # if len(pod_names) != num_workers or f"{JOB_NAME}-worker-0" not in pod_names: + if len(pod_names) != num_workers: + raise Exception(f"Training job has incorrect pods: {pod_names}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + + # Get and print the logs of the master pod + master_pod_name = next((name for name in pod_names if "master" in name), None) + if master_pod_name: + config.load_kube_config() # Load kube config to interact with the cluster + v1 = client.CoreV1Api() + try: + pod_logs = v1.read_namespaced_pod_log( + name=master_pod_name, namespace=job_namespace + ) + logging.info(f"Logs of master pod {master_pod_name}:\n{pod_logs}") + except client.exceptions.ApiException as e: + logging.error(f"Failed to get logs for pod {master_pod_name}: {e}") + + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + +if __name__ == "__main__": + test_train_api(job_namespace="default") diff --git a/sdk/python/test_e2e/test_e2e_train_api.sh b/sdk/python/test_e2e/test_e2e_train_api.sh new file mode 100755 index 0000000000..0ac3f9f4e1 --- /dev/null +++ b/sdk/python/test_e2e/test_e2e_train_api.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# Copyright 2024 The Kubeflow Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This shell script is used to run Katib Experiment. +# Input parameter - path to Experiment yaml. + +set -o errexit +set -o nounset +set -o pipefail + +cd "$(dirname "$0")" + +echo "Training Operator deployments" +kubectl -n kubeflow get deploy +echo "Training Operator services" +kubectl -n kubeflow get svc +echo "Training Operator pods" +kubectl -n kubeflow get pod +echo "Training Operator persistent volume claims" +kubectl get pvc -n kubeflow +echo "Available CRDs" +kubectl get crd + +python test_e2e_train_api.py || (kubectl get pods -n kubeflow && exit 1) \ No newline at end of file From daa00543443fe7cb975afaacca7d024fa8ca3b9c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 11:29:15 +0800 Subject: [PATCH 02/86] fix peft import error Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 21537e9616..bec2981c59 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] - python-version: [3.8, 3.9, 3.10, 3.11] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - name: Checkout uses: actions/checkout@v4 @@ -56,5 +56,5 @@ jobs: - name: Run tests run: | - python3 -m pip 
install -e sdk/python + python3 -m pip install -e sdk/python[huggingface] ./sdk/python/test_e2e/test_e2e_train_api.sh From 8d4af9051f14aaeb8cef6893ca827064e6247fcb Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 12:05:34 +0800 Subject: [PATCH 03/86] update settings of the job Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 146b6fe056..0898c6ab5b 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -37,7 +37,7 @@ def test_train_api(job_namespace): - num_workers = 1 + num_workers = 4 # Use test case from fine-tuning API tutorial # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ @@ -74,10 +74,10 @@ def test_train_api(job_namespace): ), ), num_workers=num_workers, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. + num_procs_per_worker=2, # nproc-per-node parameter for torchrun command. 
resources_per_worker={ - "gpu": 0, - "cpu": 2, + "gpu": 2, + "cpu": 5, "memory": "10G", }, ) @@ -87,7 +87,7 @@ def test_train_api(job_namespace): try: utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=60 * 30 + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=60 * 60 ) logging.info(f"Training job {JOB_NAME} is succeded.") except Exception as e: From 86c31c82e2ffdbdeda1438730e834031859bfce8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 12:09:23 +0800 Subject: [PATCH 04/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 22 +++++++++------------- sdk/python/test_e2e/test_e2e_train_api.sh | 2 +- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 0898c6ab5b..6254827cdf 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -12,23 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from kubernetes import client, config - -from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - HuggingFaceTrainerParams, - HuggingFaceDatasetParams, -) -from kubeflow.training import TrainingClient -from kubeflow.training import constants - import logging - +import test.e2e.utils as utils + +from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams +from kubeflow.training import constants +from kubeflow.training import TrainingClient +from kubernetes import client +from kubernetes import config from peft import LoraConfig import transformers -import test.e2e.utils as utils - logging.basicConfig(format="%(message)s") logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) diff --git a/sdk/python/test_e2e/test_e2e_train_api.sh b/sdk/python/test_e2e/test_e2e_train_api.sh index 0ac3f9f4e1..af41771faf 100755 --- a/sdk/python/test_e2e/test_e2e_train_api.sh +++ b/sdk/python/test_e2e/test_e2e_train_api.sh @@ -34,4 +34,4 @@ kubectl get pvc -n kubeflow echo "Available CRDs" kubectl get crd -python test_e2e_train_api.py || (kubectl get pods -n kubeflow && exit 1) \ No newline at end of file +python test_e2e_train_api.py || (kubectl get pods -n kubeflow && exit 1) From 01870e239c62700df82b762fb96d8d92f242e3f6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 12:16:05 +0800 Subject: [PATCH 05/86] fix format Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index bec2981c59..a663da9f87 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -1,6 +1,6 @@ 
name: E2E Test with train API -on: +on: - pull_request concurrency: diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 6254827cdf..1074bfc530 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -14,7 +14,7 @@ import logging import test.e2e.utils as utils - + from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams From 17f3c33032796e83686169da56b84c855e894964 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 20:46:56 +0800 Subject: [PATCH 06/86] fix error detection Signed-off-by: helenxie-bit --- .../kubeflow/training/constants/constants.py | 4 +- sdk/python/test_e2e/test_e2e_train_api.py | 122 +++++++++++++----- 2 files changed, 89 insertions(+), 37 deletions(-) diff --git a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index 0513c3e31e..0102fe7ef7 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -78,7 +78,7 @@ # TODO (andreyvelich): We should add image tag for Storage Initializer and Trainer. -STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" +STORAGE_INITIALIZER_IMAGE = "docker.io/helenxiehz428/test" STORAGE_INITIALIZER_VOLUME_MOUNT = models.V1VolumeMount( name=STORAGE_INITIALIZER, @@ -90,7 +90,7 @@ claim_name=STORAGE_INITIALIZER ), ) -TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" +TRAINER_TRANSFORMER_IMAGE = "docker.io/helenxiehz428/test_llm4" # TFJob constants. 
TFJOB_KIND = "TFJob" diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 1074bfc530..5ff54ec614 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -14,14 +14,17 @@ import logging import test.e2e.utils as utils +import time from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams from kubeflow.training import constants +from kubeflow.training.utils import utils from kubeflow.training import TrainingClient from kubernetes import client from kubernetes import config +from kubernetes.client.exceptions import ApiException from peft import LoraConfig import transformers @@ -32,10 +35,33 @@ JOB_NAME = "test-train-api" +def get_logs_of_master_pod(job_namespace, num_workers): + # Verify that training job has correct pods. + pod_names = TRAINING_CLIENT.get_job_pod_names( + name=JOB_NAME, namespace=job_namespace + ) + + if len(pod_names) != num_workers: + raise Exception(f"Training job has incorrect pods: {pod_names}") + + # Get and print the logs of the master pod. + master_pod_name = next((name for name in pod_names if "master" in name), None) + if master_pod_name: + config.load_kube_config() # Load kube config to interact with the cluster. + v1 = client.CoreV1Api() + try: + pod_logs = v1.read_namespaced_pod_log( + name=master_pod_name, namespace=job_namespace + ) + logging.info(f"Logs of master pod {master_pod_name}:\n{pod_logs}") + except ApiException as e: + logging.error(f"Failed to get logs for pod {master_pod_name}: {e}") + + def test_train_api(job_namespace): - num_workers = 4 + num_workers = 1 - # Use test case from fine-tuning API tutorial + # Use test case from fine-tuning API tutorial. 
# https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ TRAINING_CLIENT.train( name=JOB_NAME, @@ -70,52 +96,78 @@ def test_train_api(job_namespace): ), ), num_workers=num_workers, # nodes parameter for torchrun command. - num_procs_per_worker=2, # nproc-per-node parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. resources_per_worker={ - "gpu": 2, - "cpu": 5, + "gpu": 0, + "cpu": 2, "memory": "10G", }, ) - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info("---------------------------------------------------------------") + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s:") logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - try: - utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=60 * 60 - ) - logging.info(f"Training job {JOB_NAME} is succeded.") - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"Training job {JOB_NAME} is failed. Exception: {e}") - - # Verify that training job has correct pods. - pod_names = TRAINING_CLIENT.get_job_pod_names( - name=JOB_NAME, namespace=job_namespace - ) + logging.info("---------------------------------------------------------------") + logging.info(f"Training job {JOB_NAME} is running...") - # if len(pod_names) != num_workers or f"{JOB_NAME}-worker-0" not in pod_names: - if len(pod_names) != num_workers: - raise Exception(f"Training job has incorrect pods: {pod_names}") + logging.info("---------------------------------------------------------------") + wait_timeout = 60 * 60 + polling_interval = 15 + for _ in range(round(wait_timeout / polling_interval)): - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + # Get the list of pods associated with the job. 
+ pod_names = TRAINING_CLIENT.get_job_pod_names( + name=JOB_NAME, namespace=job_namespace + ) - # Get and print the logs of the master pod - master_pod_name = next((name for name in pod_names if "master" in name), None) - if master_pod_name: - config.load_kube_config() # Load kube config to interact with the cluster + config.load_kube_config() # Load kube config to interact with the cluster. v1 = client.CoreV1Api() - try: - pod_logs = v1.read_namespaced_pod_log( - name=master_pod_name, namespace=job_namespace + + # Iterate over each pod to check its status. + for pod_name in pod_names: + pod_status = v1.read_namespaced_pod_status( + name=pod_name, namespace=job_namespace ) - logging.info(f"Logs of master pod {master_pod_name}:\n{pod_logs}") - except client.exceptions.ApiException as e: - logging.error(f"Failed to get logs for pod {master_pod_name}: {e}") - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + # Check if any container in the pod has been restarted, indicating a previous failure. + for container_status in pod_status.status.container_statuses: + if container_status.restart_count > 0: + logging.warning( + f"Pod {pod_name} in job {JOB_NAME} has been restarted {container_status.restart_count} times. Retrieving logs..." + ) + + get_logs_of_master_pod(job_namespace, num_workers) + + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + # Raise an exception to indicate that a pod has failed at least once. + raise Exception(f"Training job {JOB_NAME} is failed.") + + # Get Job only once per cycle and check the statuses. + job = TRAINING_CLIENT.get_job( + name=JOB_NAME, + namespace=job_namespace, + job_kind=constants.PYTORCHJOB_KIND, + timeout=constants.DEFAULT_TIMEOUT, + ) + + # Get Job conditions. + conditions = TRAINING_CLIENT.get_job_conditions( + job=job, timeout=constants.DEFAULT_TIMEOUT + ) + + # Check if the job has succeeded. 
+ if utils.has_condition(conditions, constants.JOB_CONDITION_SUCCEEDED): + get_logs_of_master_pod(job_namespace, num_workers) + logging.info("---------------------------------------------------------------") + logging.info(f"Training job {JOB_NAME} is succeeded.") + + logging.info("---------------------------------------------------------------") + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + break + + time.sleep(polling_interval) if __name__ == "__main__": From 0685dc7f9236ba522c798bd6ec9805026239936f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 20:49:52 +0800 Subject: [PATCH 07/86] resolve conflict Signed-off-by: helenxie-bit --- sdk/python/kubeflow/training/constants/constants.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index 0102fe7ef7..d4f638e6b9 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -84,12 +84,7 @@ name=STORAGE_INITIALIZER, mount_path=INIT_CONTAINER_MOUNT_PATH, ) -STORAGE_INITIALIZER_VOLUME = models.V1Volume( - name=STORAGE_INITIALIZER, - persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( - claim_name=STORAGE_INITIALIZER - ), -) + TRAINER_TRANSFORMER_IMAGE = "docker.io/helenxiehz428/test_llm4" # TFJob constants. 
From 83de64b1d4c46300dab1b7b04629b2ba9814a9bc Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 20:52:35 +0800 Subject: [PATCH 08/86] resolve conflict Signed-off-by: helenxie-bit --- sdk/python/kubeflow/training/constants/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index d4f638e6b9..07c98bc787 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -78,14 +78,14 @@ # TODO (andreyvelich): We should add image tag for Storage Initializer and Trainer. -STORAGE_INITIALIZER_IMAGE = "docker.io/helenxiehz428/test" +STORAGE_INITIALIZER_IMAGE = "docker.io/kubeflow/storage-initializer" STORAGE_INITIALIZER_VOLUME_MOUNT = models.V1VolumeMount( name=STORAGE_INITIALIZER, mount_path=INIT_CONTAINER_MOUNT_PATH, ) -TRAINER_TRANSFORMER_IMAGE = "docker.io/helenxiehz428/test_llm4" +TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" # TFJob constants. 
TFJOB_KIND = "TFJob" From f954f2d4a4bf87cf7329f07cb82b4b390cddac71 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 20:56:56 +0800 Subject: [PATCH 09/86] resolve conflict Signed-off-by: helenxie-bit --- sdk/python/kubeflow/training/constants/constants.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index 07c98bc787..0513c3e31e 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -84,7 +84,12 @@ name=STORAGE_INITIALIZER, mount_path=INIT_CONTAINER_MOUNT_PATH, ) - +STORAGE_INITIALIZER_VOLUME = models.V1Volume( + name=STORAGE_INITIALIZER, + persistent_volume_claim=models.V1PersistentVolumeClaimVolumeSource( + claim_name=STORAGE_INITIALIZER + ), +) TRAINER_TRANSFORMER_IMAGE = "docker.io/kubeflow/trainer-huggingface" # TFJob constants. From ff48154314e0dde7fc321855d9fe1a80d612bc4e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 20:59:57 +0800 Subject: [PATCH 10/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 5ff54ec614..d630b61e91 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -20,8 +20,8 @@ from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams from kubeflow.training import constants -from kubeflow.training.utils import utils from kubeflow.training import TrainingClient +from kubeflow.training.utils import utils from kubernetes import client from kubernetes import config from kubernetes.client.exceptions import ApiException @@ -160,10 +160,14 @@ def 
test_train_api(job_namespace): # Check if the job has succeeded. if utils.has_condition(conditions, constants.JOB_CONDITION_SUCCEEDED): get_logs_of_master_pod(job_namespace, num_workers) - logging.info("---------------------------------------------------------------") + logging.info( + "---------------------------------------------------------------" + ) logging.info(f"Training job {JOB_NAME} is succeeded.") - logging.info("---------------------------------------------------------------") + logging.info( + "---------------------------------------------------------------" + ) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) break From 304db5d8b9fb173c01d6eb83d53d583b92a893c7 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 21:28:24 +0800 Subject: [PATCH 11/86] fix NoneType error Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index d630b61e91..4d84404b5a 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -113,7 +113,7 @@ def test_train_api(job_namespace): logging.info("---------------------------------------------------------------") wait_timeout = 60 * 60 - polling_interval = 15 + polling_interval = 30 for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. @@ -130,6 +130,11 @@ def test_train_api(job_namespace): name=pod_name, namespace=job_namespace ) + # Ensure that container_statuses is not None before iterating. + if pod_status.status.container_statuses is None: + logging.warning(f"Pod {pod_name} has no container statuses available yet.") + continue + # Check if any container in the pod has been restarted, indicating a previous failure. 
for container_status in pod_status.status.container_statuses: if container_status.restart_count > 0: From 486154d6e226dfd822f34db96f16fb26e6116eb6 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 9 Aug 2024 21:29:32 +0800 Subject: [PATCH 12/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 4d84404b5a..fa11f40221 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -132,7 +132,9 @@ def test_train_api(job_namespace): # Ensure that container_statuses is not None before iterating. if pod_status.status.container_statuses is None: - logging.warning(f"Pod {pod_name} has no container statuses available yet.") + logging.warning( + f"Pod {pod_name} has no container statuses available yet." + ) continue # Check if any container in the pod has been restarted, indicating a previous failure. From 016c41db06ac5c9e3d0a55680aa8000c5676e186 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 10 Aug 2024 07:32:39 +0800 Subject: [PATCH 13/86] test bug Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index fa11f40221..7f07dca409 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -130,6 +130,8 @@ def test_train_api(job_namespace): name=pod_name, namespace=job_namespace ) + get_logs_of_master_pod(job_namespace, num_workers) + # Ensure that container_statuses is not None before iterating. 
if pod_status.status.container_statuses is None: logging.warning( From 1e7bd2339b303e08b874fff695ffdefa8cc1612e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 11 Aug 2024 16:25:37 +0800 Subject: [PATCH 14/86] find bug Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 1 + sdk/python/test_e2e/test_e2e_train_api.py | 30 +++++++++++++++-------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index a663da9f87..7cf10aef86 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -58,3 +58,4 @@ jobs: run: | python3 -m pip install -e sdk/python[huggingface] ./sdk/python/test_e2e/test_e2e_train_api.sh + (kubectl get pods -n default && kubectl describe pod -n default $(kubectl get pods -n default -o jsonpath='{.items[0].metadata.name}'); exit 1) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 7f07dca409..a4dfd7be90 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -13,7 +13,6 @@ # limitations under the License. import logging -import test.e2e.utils as utils import time from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams @@ -124,13 +123,32 @@ def test_train_api(job_namespace): config.load_kube_config() # Load kube config to interact with the cluster. v1 = client.CoreV1Api() + # Get Job only once per cycle and check the statuses. + job = TRAINING_CLIENT.get_job( + name=JOB_NAME, + namespace=job_namespace, + job_kind=constants.PYTORCHJOB_KIND, + timeout=constants.DEFAULT_TIMEOUT, + ) + + for replica_name, replica_status in job.status.replica_statuses.items(): + logging.info( + f"Replica {replica_name} status: {replica_status.succeeded} succeeded, {replica_status.failed} failed." + ) + # Iterate over each pod to check its status. 
for pod_name in pod_names: pod_status = v1.read_namespaced_pod_status( name=pod_name, namespace=job_namespace ) - get_logs_of_master_pod(job_namespace, num_workers) + print("pod_status:") + print(pod_status) + print("pod_status.status:") + print(pod_status.status) + print("pod_status.status.container_statuses:") + print(pod_status.status.container_statuses) + print("continue...") # Ensure that container_statuses is not None before iterating. if pod_status.status.container_statuses is None: @@ -153,14 +171,6 @@ def test_train_api(job_namespace): # Raise an exception to indicate that a pod has failed at least once. raise Exception(f"Training job {JOB_NAME} is failed.") - # Get Job only once per cycle and check the statuses. - job = TRAINING_CLIENT.get_job( - name=JOB_NAME, - namespace=job_namespace, - job_kind=constants.PYTORCHJOB_KIND, - timeout=constants.DEFAULT_TIMEOUT, - ) - # Get Job conditions. conditions = TRAINING_CLIENT.get_job_conditions( job=job, timeout=constants.DEFAULT_TIMEOUT From 1aced614cdbe75900226bde3a2f2ee9fd8bc95ef Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 11 Aug 2024 16:37:05 +0800 Subject: [PATCH 15/86] find bug Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index a4dfd7be90..a89cfff448 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -111,7 +111,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 60 + wait_timeout = 60 * 10 polling_interval = 30 for _ in range(round(wait_timeout / polling_interval)): From 3100aae50155e0bbaa5e4681ac2e13849c47226b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sun, 11 Aug 2024 17:06:12 +0800 Subject: [PATCH 16/86] find bug 
Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 8 +++++++- sdk/python/test_e2e/test_e2e_train_api.py | 4 +--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 7cf10aef86..375b4df442 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -58,4 +58,10 @@ jobs: run: | python3 -m pip install -e sdk/python[huggingface] ./sdk/python/test_e2e/test_e2e_train_api.sh - (kubectl get pods -n default && kubectl describe pod -n default $(kubectl get pods -n default -o jsonpath='{.items[0].metadata.name}'); exit 1) + kubectl get pods -n default + POD_NAME=$(kubectl get pods -n default -o jsonpath='{.items[0].metadata.name}') + kubectl describe pod -n default $POD_NAME + kubectl get pvc -n default + PVC_NAME=$(kubectl get pvc -n default -o jsonpath='{.items[0].metadata.name}') + kubectl describe pvc -n default $PVC_NAME + exit 1 diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index a89cfff448..ea731ef146 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -111,7 +111,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 10 + wait_timeout = 60 * 120 polling_interval = 30 for _ in range(round(wait_timeout / polling_interval)): @@ -142,8 +142,6 @@ def test_train_api(job_namespace): name=pod_name, namespace=job_namespace ) - print("pod_status:") - print(pod_status) print("pod_status.status:") print(pod_status.status) print("pod_status.status.container_statuses:") From e5b9061cee3ef9168bd853ca28a5be95800f3c44 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 07:45:16 +0800 Subject: [PATCH 17/86] add storage_config Signed-off-by: helenxie-bit --- 
.github/workflows/e2e-test-train-api.yaml | 2 +- sdk/python/test_e2e/test_e2e_train_api.py | 30 +++++++++++------------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 375b4df442..4378a01bb6 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -58,7 +58,7 @@ jobs: run: | python3 -m pip install -e sdk/python[huggingface] ./sdk/python/test_e2e/test_e2e_train_api.sh - kubectl get pods -n default + kubectl get pods -n default POD_NAME=$(kubectl get pods -n default -o jsonpath='{.items[0].metadata.name}') kubectl describe pod -n default $POD_NAME kubectl get pvc -n default diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index ea731ef146..e71b5ece11 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -101,6 +101,11 @@ def test_train_api(job_namespace): "cpu": 2, "memory": "10G", }, + storage_config={ + "size": "2Gi", + "storage_class": "ReadWriteOnce", + "access_modes": ["ReadWriteOnce", "ReadOnlyMany"], + } ) logging.info("---------------------------------------------------------------") @@ -111,8 +116,8 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 120 - polling_interval = 30 + wait_timeout = 60 * 30 # 30 minutes. + polling_interval = 30 # 30 seconds. for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. @@ -123,19 +128,6 @@ def test_train_api(job_namespace): config.load_kube_config() # Load kube config to interact with the cluster. v1 = client.CoreV1Api() - # Get Job only once per cycle and check the statuses. 
- job = TRAINING_CLIENT.get_job( - name=JOB_NAME, - namespace=job_namespace, - job_kind=constants.PYTORCHJOB_KIND, - timeout=constants.DEFAULT_TIMEOUT, - ) - - for replica_name, replica_status in job.status.replica_statuses.items(): - logging.info( - f"Replica {replica_name} status: {replica_status.succeeded} succeeded, {replica_status.failed} failed." - ) - # Iterate over each pod to check its status. for pod_name in pod_names: pod_status = v1.read_namespaced_pod_status( @@ -169,6 +161,14 @@ def test_train_api(job_namespace): # Raise an exception to indicate that a pod has failed at least once. raise Exception(f"Training job {JOB_NAME} is failed.") + # Get Job only once per cycle and check the statuses. + job = TRAINING_CLIENT.get_job( + name=JOB_NAME, + namespace=job_namespace, + job_kind=constants.PYTORCHJOB_KIND, + timeout=constants.DEFAULT_TIMEOUT, + ) + # Get Job conditions. conditions = TRAINING_CLIENT.get_job_conditions( job=job, timeout=constants.DEFAULT_TIMEOUT From ffb068523d492abb87587ee937fff78007c47f59 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 07:47:13 +0800 Subject: [PATCH 18/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index e71b5ece11..b419be5141 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -105,7 +105,7 @@ def test_train_api(job_namespace): "size": "2Gi", "storage_class": "ReadWriteOnce", "access_modes": ["ReadWriteOnce", "ReadOnlyMany"], - } + }, ) logging.info("---------------------------------------------------------------") From dc1b48a5be59a8afdcfe522af46bcf9ae6d77ebb Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 14:39:57 +0800 Subject: [PATCH 19/86] reduce pvc size Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 16 
+++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index b419be5141..d1fee95f0f 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -27,7 +27,10 @@ from peft import LoraConfig import transformers -logging.basicConfig(format="%(message)s") +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(message)s", + level=logging.INFO, +) logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) @@ -99,12 +102,10 @@ def test_train_api(job_namespace): resources_per_worker={ "gpu": 0, "cpu": 2, - "memory": "10G", + "memory": "2G", }, storage_config={ "size": "2Gi", - "storage_class": "ReadWriteOnce", - "access_modes": ["ReadWriteOnce", "ReadOnlyMany"], }, ) @@ -119,7 +120,6 @@ def test_train_api(job_namespace): wait_timeout = 60 * 30 # 30 minutes. polling_interval = 30 # 30 seconds. for _ in range(round(wait_timeout / polling_interval)): - # Get the list of pods associated with the job. pod_names = TRAINING_CLIENT.get_job_pod_names( name=JOB_NAME, namespace=job_namespace @@ -134,12 +134,6 @@ def test_train_api(job_namespace): name=pod_name, namespace=job_namespace ) - print("pod_status.status:") - print(pod_status.status) - print("pod_status.status.container_statuses:") - print(pod_status.status.container_statuses) - print("continue...") - # Ensure that container_statuses is not None before iterating. 
if pod_status.status.container_statuses is None: logging.warning( From 889451755ff575e8fd73cb14a2e2afb21c4af370 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 16:11:41 +0800 Subject: [PATCH 20/86] set storage_config Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index d1fee95f0f..b759383c1e 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -106,6 +106,7 @@ def test_train_api(job_namespace): }, storage_config={ "size": "2Gi", + "access_modes": "ReadWriteOnce", }, ) @@ -150,7 +151,7 @@ def test_train_api(job_namespace): get_logs_of_master_pod(job_namespace, num_workers) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + #TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) # Raise an exception to indicate that a pod has failed at least once. raise Exception(f"Training job {JOB_NAME} is failed.") From 36872d725ffbdef69cca50c62a11eee9da701464 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 16:12:14 +0800 Subject: [PATCH 21/86] set storage_config Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index b759383c1e..e85d862f8c 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -118,7 +118,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 30 # 30 minutes. + wait_timeout = 60 * 15 # 30 minutes. polling_interval = 30 # 30 seconds. 
for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. From 7dd8d400a86cdcc6ccde6095e085ba23dab22df4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 16:14:25 +0800 Subject: [PATCH 22/86] set storage_config Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index e85d862f8c..18db6720f7 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -151,7 +151,7 @@ def test_train_api(job_namespace): get_logs_of_master_pod(job_namespace, num_workers) - #TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) # Raise an exception to indicate that a pod has failed at least once. raise Exception(f"Training job {JOB_NAME} is failed.") From 60c322d98e874caf4b08fd46705a9104a576955b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 16:21:53 +0800 Subject: [PATCH 23/86] set storage_config Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 18db6720f7..f0259b1eec 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -106,7 +106,7 @@ def test_train_api(job_namespace): }, storage_config={ "size": "2Gi", - "access_modes": "ReadWriteOnce", + "access_modes": ["ReadWriteOnce"], }, ) From dd970ab825a862f3aff078aa442ead0760102cbe Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 17:33:57 +0800 Subject: [PATCH 24/86] use gpu Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index f0259b1eec..c2ce5655a5 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -100,8 +100,8 @@ def test_train_api(job_namespace): num_workers=num_workers, # nodes parameter for torchrun command. num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. resources_per_worker={ - "gpu": 0, - "cpu": 2, + "gpu": 1, + "cpu": 0, "memory": "2G", }, storage_config={ @@ -118,7 +118,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 15 # 30 minutes. + wait_timeout = 60 * 60 # 1 hour. polling_interval = 30 # 30 seconds. for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. @@ -151,7 +151,7 @@ def test_train_api(job_namespace): get_logs_of_master_pod(job_namespace, num_workers) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + #TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) # Raise an exception to indicate that a pod has failed at least once. 
raise Exception(f"Training job {JOB_NAME} is failed.") From 10bbfa0e6aba2be56776300d90d822f99394be04 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 17:46:40 +0800 Subject: [PATCH 25/86] use gpu Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index c2ce5655a5..167a27afac 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -118,7 +118,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 60 # 1 hour. + wait_timeout = 60 * 10 # 1 hour. polling_interval = 30 # 30 seconds. for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. From d47d6a6c0e01f400aa4a7d66f6d201da2cf1eaf5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 17:48:43 +0800 Subject: [PATCH 26/86] use gpu Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 167a27afac..cb273bd4a5 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -151,7 +151,7 @@ def test_train_api(job_namespace): get_logs_of_master_pod(job_namespace, num_workers) - #TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) # Raise an exception to indicate that a pod has failed at least once. 
raise Exception(f"Training job {JOB_NAME} is failed.") From 4ccd4a76dc91bc0d4bd54b967341bbbf736b9443 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Mon, 12 Aug 2024 19:14:59 +0800 Subject: [PATCH 27/86] fix 'set_device' error Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index cb273bd4a5..1898fd570b 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -88,6 +88,8 @@ def test_train_api(job_namespace): disable_tqdm=True, log_level="info", num_train_epochs=1, + no_cuda=True, + use_cpu=True, ), # Set LoRA config to reduce number of trainable model parameters. lora_config=LoraConfig( @@ -100,8 +102,8 @@ def test_train_api(job_namespace): num_workers=num_workers, # nodes parameter for torchrun command. num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. resources_per_worker={ - "gpu": 1, - "cpu": 0, + "gpu": 0, + "cpu": 2, "memory": "2G", }, storage_config={ @@ -118,7 +120,7 @@ def test_train_api(job_namespace): logging.info(f"Training job {JOB_NAME} is running...") logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 10 # 1 hour. + wait_timeout = 60 * 60 # 1 hour. polling_interval = 30 # 30 seconds. for _ in range(round(wait_timeout / polling_interval)): # Get the list of pods associated with the job. 
From 0750322689f36353bd291b9fd2c07d27a8cfc6bf Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 15 Aug 2024 17:03:22 +0800 Subject: [PATCH 28/86] add timeout error Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 1898fd570b..cd19670565 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -88,8 +88,6 @@ def test_train_api(job_namespace): disable_tqdm=True, log_level="info", num_train_epochs=1, - no_cuda=True, - use_cpu=True, ), # Set LoRA config to reduce number of trainable model parameters. lora_config=LoraConfig( @@ -104,10 +102,10 @@ def test_train_api(job_namespace): resources_per_worker={ "gpu": 0, "cpu": 2, - "memory": "2G", + "memory": "10G", }, storage_config={ - "size": "2Gi", + "size": "10Gi", "access_modes": ["ReadWriteOnce"], }, ) @@ -122,7 +120,16 @@ def test_train_api(job_namespace): logging.info("---------------------------------------------------------------") wait_timeout = 60 * 60 # 1 hour. polling_interval = 30 # 30 seconds. - for _ in range(round(wait_timeout / polling_interval)): + start_time = time.time() # Record the start time + + while True: + elapsed_time = time.time() - start_time # Calculate the elapsed time + if elapsed_time > wait_timeout: + # Raise a TimeoutError if the job takes too long + logging.error(f"Training job {JOB_NAME} exceeded the timeout of {wait_timeout} seconds.") + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise TimeoutError(f"Training job {JOB_NAME} did not complete within the allowed time of {wait_timeout} seconds.") + # Get the list of pods associated with the job. 
pod_names = TRAINING_CLIENT.get_job_pod_names( name=JOB_NAME, namespace=job_namespace @@ -156,7 +163,7 @@ def test_train_api(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) # Raise an exception to indicate that a pod has failed at least once. - raise Exception(f"Training job {JOB_NAME} is failed.") + raise Exception(f"Training job {JOB_NAME} has failed.") # Get Job only once per cycle and check the statuses. job = TRAINING_CLIENT.get_job( @@ -177,7 +184,7 @@ def test_train_api(job_namespace): logging.info( "---------------------------------------------------------------" ) - logging.info(f"Training job {JOB_NAME} is succeeded.") + logging.info(f"Training job {JOB_NAME} has succeeded.") logging.info( "---------------------------------------------------------------" @@ -190,3 +197,4 @@ def test_train_api(job_namespace): if __name__ == "__main__": test_train_api(job_namespace="default") + \ No newline at end of file From 5ca0923e98a3d7dde7b5a9e1065d5a258e0f8646 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 15 Aug 2024 17:06:56 +0800 Subject: [PATCH 29/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index cd19670565..317b6d9b88 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -126,9 +126,13 @@ def test_train_api(job_namespace): elapsed_time = time.time() - start_time # Calculate the elapsed time if elapsed_time > wait_timeout: # Raise a TimeoutError if the job takes too long - logging.error(f"Training job {JOB_NAME} exceeded the timeout of {wait_timeout} seconds.") + logging.error( + f"Training job {JOB_NAME} exceeded the timeout of {wait_timeout} seconds." 
+ ) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise TimeoutError(f"Training job {JOB_NAME} did not complete within the allowed time of {wait_timeout} seconds.") + raise TimeoutError( + f"Training job {JOB_NAME} did not complete within the allowed time of {wait_timeout} seconds." + ) # Get the list of pods associated with the job. pod_names = TRAINING_CLIENT.get_job_pod_names( From 387eb8479a53d3cd3c8f01fbe37c6bb40b94d20e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 15 Aug 2024 17:09:30 +0800 Subject: [PATCH 30/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 317b6d9b88..8f7d81ff70 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -201,4 +201,3 @@ def test_train_api(job_namespace): if __name__ == "__main__": test_train_api(job_namespace="default") - \ No newline at end of file From 9cc5429c22c589c21120f1e77b14ecc8079470a8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 15 Aug 2024 17:18:41 +0800 Subject: [PATCH 31/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py index 8f7d81ff70..dde3161084 100644 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ b/sdk/python/test_e2e/test_e2e_train_api.py @@ -78,7 +78,7 @@ def test_train_api(job_namespace): repo_id="yelp_review_full", split="train[:8]", ), - # Specify HuggingFace Trainer parameters. In this example, we will skip evaluation and model checkpoints. + # Specify HuggingFace Trainer parameters. 
trainer_parameters=HuggingFaceTrainerParams( training_parameters=transformers.TrainingArguments( output_dir="test_trainer", @@ -131,7 +131,8 @@ def test_train_api(job_namespace): ) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) raise TimeoutError( - f"Training job {JOB_NAME} did not complete within the allowed time of {wait_timeout} seconds." + f"Training job {JOB_NAME} did not complete within the allowed time of " + f"{wait_timeout} seconds." ) # Get the list of pods associated with the job. @@ -159,7 +160,8 @@ def test_train_api(job_namespace): for container_status in pod_status.status.container_statuses: if container_status.restart_count > 0: logging.warning( - f"Pod {pod_name} in job {JOB_NAME} has been restarted {container_status.restart_count} times. Retrieving logs..." + f"Pod {pod_name} in job {JOB_NAME} has been restarted " + f"{container_status.restart_count} times. Retrieving logs..." ) get_logs_of_master_pod(job_namespace, num_workers) From 8a537adf74e22694b330ea6bd60bd0015bc2a68f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 27 Aug 2024 07:12:53 +0800 Subject: [PATCH 32/86] fix typo Signed-off-by: helenxie-bit --- sdk/python/test_e2e/test_e2e_train_api.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/test_e2e/test_e2e_train_api.sh b/sdk/python/test_e2e/test_e2e_train_api.sh index af41771faf..9495a0c3af 100755 --- a/sdk/python/test_e2e/test_e2e_train_api.sh +++ b/sdk/python/test_e2e/test_e2e_train_api.sh @@ -14,8 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This shell script is used to run Katib Experiment. -# Input parameter - path to Experiment yaml. +# This shell script is used to run e2e test. 
set -o errexit set -o nounset From e508ef445c14bd8bbb4f909e066834200dbedaa8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 20:18:27 +0800 Subject: [PATCH 33/86] update e2e test for train api Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 67 ------- sdk/python/test/e2e/test_e2e_pytorchjob.py | 80 ++++++++ sdk/python/test_e2e/test_e2e_train_api.py | 205 --------------------- sdk/python/test_e2e/test_e2e_train_api.sh | 36 ---- 4 files changed, 80 insertions(+), 308 deletions(-) delete mode 100644 .github/workflows/e2e-test-train-api.yaml delete mode 100644 sdk/python/test_e2e/test_e2e_train_api.py delete mode 100755 sdk/python/test_e2e/test_e2e_train_api.sh diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml deleted file mode 100644 index 4378a01bb6..0000000000 --- a/.github/workflows/e2e-test-train-api.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: E2E Test with train API - -on: - - pull_request - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - e2e: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] - python-version: ["3.8", "3.9", "3.10", "3.11"] - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Free-Up Disk Space - uses: ./.github/workflows/free-up-disk-space - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Setup Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - - - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.10.0 - with: - node_image: kindest/node:${{ matrix.kubernetes-version }} - cluster_name: training-operator-cluster - kubectl_version: ${{ matrix.kubernetes-version }} - - - name: Build training-operator - run: | - ./scripts/gha/build-image.sh - env: - TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test - - - name: 
Deploy training operator - run: | - ./scripts/gha/setup-training-operator.sh - env: - KIND_CLUSTER: training-operator-cluster - TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test - GANG_SCHEDULER_NAME: "none" - KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} - - - name: Run tests - run: | - python3 -m pip install -e sdk/python[huggingface] - ./sdk/python/test_e2e/test_e2e_train_api.sh - kubectl get pods -n default - POD_NAME=$(kubectl get pods -n default -o jsonpath='{.items[0].metadata.name}') - kubectl describe pod -n default $POD_NAME - kubectl get pvc -n default - PVC_NAME=$(kubectl get pvc -n default -o jsonpath='{.items[0].metadata.name}') - kubectl describe pvc -n default $PVC_NAME - exit 1 diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index c5b28faaf8..508f4d03a3 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -23,6 +23,10 @@ from kubernetes.client import V1Container from kubernetes.client import V1ResourceRequirements +from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams + from kubeflow.training import TrainingClient from kubeflow.training import KubeflowOrgV1ReplicaSpec from kubeflow.training import KubeflowOrgV1PyTorchJob @@ -30,6 +34,10 @@ from kubeflow.training import KubeflowOrgV1RunPolicy from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training import constants +from kubeflow.training.utils import utils + +from peft import LoraConfig +import transformers import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY @@ -240,6 +248,78 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) +@pytest.mark.skipif( + GANG_SCHEDULER_NAME in 
GANG_SCHEDULERS, + reason="For plain scheduling", +) +def test_sdk_e2e_create_from_train_api(job_namespace): + JOB_NAME = "pytorchjob-from-train-api" + + num_workers = 1 + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=num_workers, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. + resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + storage_config={ + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e( + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=1800 + ) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + def generate_pytorchjob( job_namespace: str, job_name: str, diff --git a/sdk/python/test_e2e/test_e2e_train_api.py b/sdk/python/test_e2e/test_e2e_train_api.py deleted file mode 100644 index dde3161084..0000000000 --- a/sdk/python/test_e2e/test_e2e_train_api.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright 2024 kubeflow.org. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import time - -from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams -from kubeflow.training import constants -from kubeflow.training import TrainingClient -from kubeflow.training.utils import utils -from kubernetes import client -from kubernetes import config -from kubernetes.client.exceptions import ApiException -from peft import LoraConfig -import transformers - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(message)s", - level=logging.INFO, -) -logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) - -TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) -JOB_NAME = "test-train-api" - - -def get_logs_of_master_pod(job_namespace, num_workers): - # Verify that training job has correct pods. 
- pod_names = TRAINING_CLIENT.get_job_pod_names( - name=JOB_NAME, namespace=job_namespace - ) - - if len(pod_names) != num_workers: - raise Exception(f"Training job has incorrect pods: {pod_names}") - - # Get and print the logs of the master pod. - master_pod_name = next((name for name in pod_names if "master" in name), None) - if master_pod_name: - config.load_kube_config() # Load kube config to interact with the cluster. - v1 = client.CoreV1Api() - try: - pod_logs = v1.read_namespaced_pod_log( - name=master_pod_name, namespace=job_namespace - ) - logging.info(f"Logs of master pod {master_pod_name}:\n{pod_logs}") - except ApiException as e: - logging.error(f"Failed to get logs for pod {master_pod_name}: {e}") - - -def test_train_api(job_namespace): - num_workers = 1 - - # Use test case from fine-tuning API tutorial. - # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=num_workers, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
- resources_per_worker={ - "gpu": 0, - "cpu": 2, - "memory": "10G", - }, - storage_config={ - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info("---------------------------------------------------------------") - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s:") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - logging.info("---------------------------------------------------------------") - logging.info(f"Training job {JOB_NAME} is running...") - - logging.info("---------------------------------------------------------------") - wait_timeout = 60 * 60 # 1 hour. - polling_interval = 30 # 30 seconds. - start_time = time.time() # Record the start time - - while True: - elapsed_time = time.time() - start_time # Calculate the elapsed time - if elapsed_time > wait_timeout: - # Raise a TimeoutError if the job takes too long - logging.error( - f"Training job {JOB_NAME} exceeded the timeout of {wait_timeout} seconds." - ) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise TimeoutError( - f"Training job {JOB_NAME} did not complete within the allowed time of " - f"{wait_timeout} seconds." - ) - - # Get the list of pods associated with the job. - pod_names = TRAINING_CLIENT.get_job_pod_names( - name=JOB_NAME, namespace=job_namespace - ) - - config.load_kube_config() # Load kube config to interact with the cluster. - v1 = client.CoreV1Api() - - # Iterate over each pod to check its status. - for pod_name in pod_names: - pod_status = v1.read_namespaced_pod_status( - name=pod_name, namespace=job_namespace - ) - - # Ensure that container_statuses is not None before iterating. - if pod_status.status.container_statuses is None: - logging.warning( - f"Pod {pod_name} has no container statuses available yet." - ) - continue - - # Check if any container in the pod has been restarted, indicating a previous failure. 
- for container_status in pod_status.status.container_statuses: - if container_status.restart_count > 0: - logging.warning( - f"Pod {pod_name} in job {JOB_NAME} has been restarted " - f"{container_status.restart_count} times. Retrieving logs..." - ) - - get_logs_of_master_pod(job_namespace, num_workers) - - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - - # Raise an exception to indicate that a pod has failed at least once. - raise Exception(f"Training job {JOB_NAME} has failed.") - - # Get Job only once per cycle and check the statuses. - job = TRAINING_CLIENT.get_job( - name=JOB_NAME, - namespace=job_namespace, - job_kind=constants.PYTORCHJOB_KIND, - timeout=constants.DEFAULT_TIMEOUT, - ) - - # Get Job conditions. - conditions = TRAINING_CLIENT.get_job_conditions( - job=job, timeout=constants.DEFAULT_TIMEOUT - ) - - # Check if the job has succeeded. - if utils.has_condition(conditions, constants.JOB_CONDITION_SUCCEEDED): - get_logs_of_master_pod(job_namespace, num_workers) - logging.info( - "---------------------------------------------------------------" - ) - logging.info(f"Training job {JOB_NAME} has succeeded.") - - logging.info( - "---------------------------------------------------------------" - ) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - break - - time.sleep(polling_interval) - - -if __name__ == "__main__": - test_train_api(job_namespace="default") diff --git a/sdk/python/test_e2e/test_e2e_train_api.sh b/sdk/python/test_e2e/test_e2e_train_api.sh deleted file mode 100755 index 9495a0c3af..0000000000 --- a/sdk/python/test_e2e/test_e2e_train_api.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2024 The Kubeflow Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This shell script is used to run e2e test. - -set -o errexit -set -o nounset -set -o pipefail - -cd "$(dirname "$0")" - -echo "Training Operator deployments" -kubectl -n kubeflow get deploy -echo "Training Operator services" -kubectl -n kubeflow get svc -echo "Training Operator pods" -kubectl -n kubeflow get pod -echo "Training Operator persistent volume claims" -kubectl get pvc -n kubeflow -echo "Available CRDs" -kubectl get crd - -python test_e2e_train_api.py || (kubectl get pods -n kubeflow && exit 1) From 788359bec23bf1b696292a3021b4e191980f3ffd Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 20:21:06 +0800 Subject: [PATCH 34/86] add num_labels Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 508f4d03a3..827b86032d 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -266,6 +266,7 @@ def test_sdk_e2e_create_from_train_api(job_namespace): model_provider_parameters=HuggingFaceModelParams( model_uri="hf://google-bert/bert-base-cased", transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, ), # In order to save test time, use 8 samples from Yelp dataset. 
dataset_provider_parameters=HuggingFaceDatasetParams( From 9b4222e7b08eb09aa002ec5595effc71ede6ee89 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 20:31:37 +0800 Subject: [PATCH 35/86] update pip install Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index ca2b543fc7..b2886587bc 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python[huggingface]; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} From d75938d057761d3afca395326ce1635f39cf7382 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 21:32:23 +0800 Subject: [PATCH 36/86] check disk space Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index b2886587bc..b34981c378 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -92,6 +92,9 @@ jobs: TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + + - name: Check Disk Space + run: df -h - name: Run tests run: | From 1148bc8010b26b26c1e8e83de0a03d8e69ece9f4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 21:54:04 +0800 Subject: [PATCH 37/86] change sequence of e2e tests Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 146 ++++++++++----------- 1 file changed, 
73 insertions(+), 73 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 827b86032d..a8415de4cb 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -128,6 +128,79 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) +@pytest.mark.skipif( + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", +) +def test_sdk_e2e_create_from_train_api(job_namespace): + JOB_NAME = "pytorchjob-from-train-api" + + num_workers = 1 + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=num_workers, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
+ resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + storage_config={ + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e( + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900 + ) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + @pytest.mark.skipif( GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", @@ -248,79 +321,6 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) -@pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, - reason="For plain scheduling", -) -def test_sdk_e2e_create_from_train_api(job_namespace): - JOB_NAME = "pytorchjob-from-train-api" - - num_workers = 1 - - # Use test case from fine-tuning API tutorial. - # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. 
- trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=num_workers, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. - resources_per_worker={ - "gpu": 0, - "cpu": 2, - "memory": "10G", - }, - storage_config={ - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - try: - utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=1800 - ) - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") - - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - - def generate_pytorchjob( job_namespace: str, job_name: str, From d29a85da74aebcb601e39fd8a1eb24ae7cfc030c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Thu, 29 Aug 2024 22:32:14 +0800 Subject: [PATCH 38/86] add clean-up after each e2e test of pytorchjob Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 3 - sdk/python/test/e2e/test_e2e_pytorchjob.py | 161 +++++++++++---------- 2 files changed, 88 insertions(+), 76 deletions(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index b34981c378..b2886587bc 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -92,9 +92,6 @@ jobs: TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} - - - name: Check Disk Space - run: df -h - name: Run tests run: | diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index a8415de4cb..f443bd8a30 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -15,6 +15,7 @@ import os import logging import pytest +import subprocess from typing import Optional from kubernetes.client import V1PodTemplateSpec @@ -128,79 +129,6 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) -@pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, - reason="For plain scheduling", -) -def test_sdk_e2e_create_from_train_api(job_namespace): - JOB_NAME = "pytorchjob-from-train-api" - - num_workers = 1 - - # Use test case from fine-tuning API tutorial. 
- # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=num_workers, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. - resources_per_worker={ - "gpu": 0, - "cpu": 2, - "memory": "10G", - }, - storage_config={ - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - try: - utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900 - ) - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") - - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - - @pytest.mark.skipif( GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", @@ -321,6 +249,79 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) +@pytest.mark.skipif( + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", +) +def test_sdk_e2e_create_from_train_api(job_namespace): + JOB_NAME = "pytorchjob-from-train-api" + + num_workers = 1 + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=num_workers, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
+ resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + storage_config={ + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e( + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900 + ) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + def generate_pytorchjob( job_namespace: str, job_name: str, @@ -349,3 +350,17 @@ def generate_container() -> V1Container: args=["--backend", "gloo", "--epochs", "1"], resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}), ) + + +@pytest.fixture(scope="function", autouse=True) +def clean_up_resources(): + # This code runs after each test function + yield + + # Prune all unused Docker images + try: + subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) + subprocess.run(["docker", "system", "df"], check=True) + subprocess.run(["df", "-hT"], check=True) + except subprocess.CalledProcessError as e: + print(f"Error during cleanup: {e}") From 82ea9bee2d2d1185c53f3b18d271e0ee73df3aad Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 08:17:44 +0800 Subject: [PATCH 39/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index f443bd8a30..2f31cb5d77 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -359,8 +359,15 @@ def 
clean_up_resources(): # Prune all unused Docker images try: + # Remove all stopped containers + subprocess.run(["docker", "container", "prune", "-f"], check=True) + # Remove all unused images subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) + # Remove all unused volumes + subprocess.run(["docker", "volume", "prune", "-f"], check=True) + # Remove all unused networks + subprocess.run(["docker", "network", "prune", "-f"], check=True) + # Show Docker disk usage subprocess.run(["docker", "system", "df"], check=True) - subprocess.run(["df", "-hT"], check=True) except subprocess.CalledProcessError as e: - print(f"Error during cleanup: {e}") + print(f"Error during Docker cleanup: {e}") From b45f9f75459180e00431fa6cb131a21c929c871f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 08:35:42 +0800 Subject: [PATCH 40/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 30 +++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 2f31cb5d77..3017b40f86 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -357,17 +357,29 @@ def clean_up_resources(): # This code runs after each test function yield - # Prune all unused Docker images try: - # Remove all stopped containers - subprocess.run(["docker", "container", "prune", "-f"], check=True) - # Remove all unused images + # 1. Remove unnecessary files + print("Freeing up disk space by removing unnecessary files...") + subprocess.run([ + "sudo", "rm", "-rf", + "/usr/share/dotnet", + "/opt/ghc", + "/usr/local/share/boost", + "$AGENT_TOOLSDIRECTORY", + "/usr/local/lib/android", + "/usr/local/share/powershell", + "/usr/share/swift" + ], check=True) + + print("Disk usage after removing unnecessary files:") + subprocess.run(["df", "-hT"], check=True) + + # 2. 
Prune Docker images + print("Pruning Docker images to free up space...") subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) - # Remove all unused volumes - subprocess.run(["docker", "volume", "prune", "-f"], check=True) - # Remove all unused networks - subprocess.run(["docker", "network", "prune", "-f"], check=True) - # Show Docker disk usage + + print("Docker disk usage after pruning images:") subprocess.run(["docker", "system", "df"], check=True) + except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From a204746dc7f048987d9210da86b0f46459dbaf2f Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 09:23:19 +0800 Subject: [PATCH 41/86] update cleanup function-add check disk Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 3017b40f86..8815d46f8d 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -358,6 +358,10 @@ def clean_up_resources(): yield try: + # Check contents of /mnt before cleanup + print("Listing contents of /mnt directory before cleanup:") + subprocess.run(["ls", "-lh", "/mnt"], check=True) + # 1. 
Remove unnecessary files print("Freeing up disk space by removing unnecessary files...") subprocess.run([ @@ -368,7 +372,7 @@ def clean_up_resources(): "$AGENT_TOOLSDIRECTORY", "/usr/local/lib/android", "/usr/local/share/powershell", - "/usr/share/swift" + "/usr/share/swift", ], check=True) print("Disk usage after removing unnecessary files:") @@ -381,5 +385,9 @@ def clean_up_resources(): print("Docker disk usage after pruning images:") subprocess.run(["docker", "system", "df"], check=True) + # Check contents of /mnt after cleanup + print("Listing contents of /mnt directory after cleanup:") + subprocess.run(["ls", "-lh", "/mnt"], check=True) + except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From 2d8f8b1de369ec4a8599c9bcfeb0af866d570f71 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 09:49:53 +0800 Subject: [PATCH 42/86] check docker volumes Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 6 ++++++ sdk/python/test/e2e/test_e2e_pytorchjob.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index b2886587bc..2f6c8a8bd5 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -100,6 +100,12 @@ jobs: env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} + # List all Docker volumes to understand disk usage + - name: List Docker volumes + run: | + echo "Listing all Docker volumes:" + docker volume ls + - name: Collect volcano logs if: ${{ failure() && matrix.gang-scheduler-name == 'volcano' }} run: | diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 8815d46f8d..a20763e49e 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -311,7 +311,7 @@ def test_sdk_e2e_create_from_train_api(job_namespace): try: utils.verify_job_e2e( - 
TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900 + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300 ) except Exception as e: utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) From c748d0e3a7213f27b2068e3c5336a5a140957216 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 11:19:11 +0800 Subject: [PATCH 43/86] update cleanup function Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 6 ---- sdk/python/test/e2e/test_e2e_pytorchjob.py | 37 ++++++++-------------- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index 2f6c8a8bd5..b2886587bc 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -100,12 +100,6 @@ jobs: env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} - # List all Docker volumes to understand disk usage - - name: List Docker volumes - run: | - echo "Listing all Docker volumes:" - docker volume ls - - name: Collect volcano logs if: ${{ failure() && matrix.gang-scheduler-name == 'volcano' }} run: | diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index a20763e49e..b7b3c6d786 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -358,36 +358,27 @@ def clean_up_resources(): yield try: - # Check contents of /mnt before cleanup - print("Listing contents of /mnt directory before cleanup:") - subprocess.run(["ls", "-lh", "/mnt"], check=True) - - # 1. 
Remove unnecessary files + # Display disk usage before cleanup + print("Disk usage before removing unnecessary files:") + subprocess.run(["df", "-hT"], check=True) + + # Remove unnecessary docker files print("Freeing up disk space by removing unnecessary files...") subprocess.run([ "sudo", "rm", "-rf", - "/usr/share/dotnet", - "/opt/ghc", - "/usr/local/share/boost", - "$AGENT_TOOLSDIRECTORY", - "/usr/local/lib/android", - "/usr/local/share/powershell", - "/usr/share/swift", + "mnt/docker" ], check=True) - - print("Disk usage after removing unnecessary files:") - subprocess.run(["df", "-hT"], check=True) - # 2. Prune Docker images - print("Pruning Docker images to free up space...") + # Prune Docker images and build cache + print("Pruning Docker images...") subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) - - print("Docker disk usage after pruning images:") - subprocess.run(["docker", "system", "df"], check=True) - # Check contents of /mnt after cleanup - print("Listing contents of /mnt directory after cleanup:") - subprocess.run(["ls", "-lh", "/mnt"], check=True) + print("Clearing Docker build cache...") + subprocess.run(["docker", "builder", "prune", "-f"], check=True) + + # Display disk usage after cleanup + print("Disk usage after removing unnecessary files:") + subprocess.run(["df", "-hT"], check=True) except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From a68e182d1b486c59e516fd30e661d36534c9cc42 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 12:00:16 +0800 Subject: [PATCH 44/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 24 +++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index b7b3c6d786..c5660f0f48 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ 
-35,7 +35,6 @@ from kubeflow.training import KubeflowOrgV1RunPolicy from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training import constants -from kubeflow.training.utils import utils from peft import LoraConfig import transformers @@ -362,23 +361,38 @@ def clean_up_resources(): print("Disk usage before removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) - # Remove unnecessary docker files + # Check detailed disk usage in /mnt + print("Detailed disk usage in /mnt before cleanup:") + subprocess.run(["du", "-sh", "/mnt/*"], check=True) + + # Remove unnecessary docker files from the correct directory print("Freeing up disk space by removing unnecessary files...") subprocess.run([ "sudo", "rm", "-rf", - "mnt/docker" + "/mnt/docker" ], check=True) - # Prune Docker images and build cache + # List open files in /mnt/docker to understand usage + print("Listing open files in /mnt/docker:") + subprocess.run(["lsof", "+D", "/mnt/docker"], check=True) + + # Prune Docker images and volumes print("Pruning Docker images...") subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) + print("Pruning Docker volumes...") + subprocess.run(["docker", "volume", "prune", "-f"], check=True) + print("Clearing Docker build cache...") subprocess.run(["docker", "builder", "prune", "-f"], check=True) - + # Display disk usage after cleanup print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) + # Check detailed disk usage in /mnt after cleanup + print("Detailed disk usage in /mnt after cleanup:") + subprocess.run(["du", "-sh", "/mnt/*"], check=True) + except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From 227129e38a6f65b8b041aabdfeed074ed2dc1cd8 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 13:33:07 +0800 Subject: [PATCH 45/86] check docker directory Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 45 
+++++++--------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index c5660f0f48..efb6f9f248 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -361,38 +361,19 @@ def clean_up_resources(): print("Disk usage before removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) - # Check detailed disk usage in /mnt - print("Detailed disk usage in /mnt before cleanup:") - subprocess.run(["du", "-sh", "/mnt/*"], check=True) - - # Remove unnecessary docker files from the correct directory - print("Freeing up disk space by removing unnecessary files...") - subprocess.run([ - "sudo", "rm", "-rf", - "/mnt/docker" - ], check=True) - - # List open files in /mnt/docker to understand usage - print("Listing open files in /mnt/docker:") - subprocess.run(["lsof", "+D", "/mnt/docker"], check=True) - - # Prune Docker images and volumes - print("Pruning Docker images...") - subprocess.run(["docker", "image", "prune", "-a", "-f"], check=True) - - print("Pruning Docker volumes...") - subprocess.run(["docker", "volume", "prune", "-f"], check=True) - - print("Clearing Docker build cache...") - subprocess.run(["docker", "builder", "prune", "-f"], check=True) - - # Display disk usage after cleanup - print("Disk usage after removing unnecessary files:") - subprocess.run(["df", "-hT"], check=True) - - # Check detailed disk usage in /mnt after cleanup - print("Detailed disk usage in /mnt after cleanup:") - subprocess.run(["du", "-sh", "/mnt/*"], check=True) + # Check contents of /var/lib/docker before cleanup + print("Listing contents of /var/lib/docker directory before cleanup:") + try: + subprocess.run(["ls", "-lh", "/var/lib/docker"], check=True) + except subprocess.CalledProcessError as e: + print(f"Error listing /var/lib/docker: {e}") + + # Check contents of /mnt/docker before cleanup + print("Listing contents of 
/mnt/docker directory before cleanup:") + try: + subprocess.run(["ls", "-lh", "/mnt/docker"], check=True) + except subprocess.CalledProcessError as e: + print(f"Error listing /mnt/docker: {e}") except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From 79e9e32fd092abe6a625c4bc6a193840c45bf543 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 15:47:37 +0800 Subject: [PATCH 46/86] update pip install and 'num_workers' Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 2 +- sdk/python/test/e2e/test_e2e_pytorchjob.py | 36 ++++++++++------------ 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index b2886587bc..92f8ced2f1 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python[huggingface]; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python/kubeflow/trainer -e sdk/pythoh; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index efb6f9f248..01dad75c82 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -255,8 +255,6 @@ def test_sdk_e2e_create_from_image(job_namespace): def test_sdk_e2e_create_from_train_api(job_namespace): JOB_NAME = "pytorchjob-from-train-api" - num_workers = 1 - # Use test case from fine-tuning API tutorial. 
# https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ TRAINING_CLIENT.train( @@ -292,7 +290,7 @@ def test_sdk_e2e_create_from_train_api(job_namespace): bias="none", ), ), - num_workers=num_workers, # nodes parameter for torchrun command. + num_workers=1, # nodes parameter for torchrun command. num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. resources_per_worker={ "gpu": 0, @@ -357,23 +355,21 @@ def clean_up_resources(): yield try: - # Display disk usage before cleanup - print("Disk usage before removing unnecessary files:") - subprocess.run(["df", "-hT"], check=True) - - # Check contents of /var/lib/docker before cleanup - print("Listing contents of /var/lib/docker directory before cleanup:") - try: - subprocess.run(["ls", "-lh", "/var/lib/docker"], check=True) - except subprocess.CalledProcessError as e: - print(f"Error listing /var/lib/docker: {e}") - - # Check contents of /mnt/docker before cleanup - print("Listing contents of /mnt/docker directory before cleanup:") - try: - subprocess.run(["ls", "-lh", "/mnt/docker"], check=True) - except subprocess.CalledProcessError as e: - print(f"Error listing /mnt/docker: {e}") + # List all volumes and inspect them + print("Listing all Docker volumes:") + subprocess.run(["docker", "volume", "ls"], check=True) + + # Check for stopped containers + print("Checking for stopped containers:") + subprocess.run(["docker", "ps", "-a"], check=True) + + # Remove all stopped containers + print("Removing stopped containers...") + subprocess.run(["docker", "rm", "$(docker ps -a -q)"], shell=True, check=True) + + # Prune unused volumes + print("Pruning unused Docker volumes...") + subprocess.run(["docker", "volume", "prune", "-f"], check=True) except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From b7dbf5c4f39bbbb698760e4e1532e70432ec75d9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 15:48:35 +0800 Subject: [PATCH 47/86] update pip 
install and 'num_workers' Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index 92f8ced2f1..2fed976393 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python/kubeflow/trainer -e sdk/pythoh; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python/kubeflow/trainer -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} From 1f639a71bddb4566e05336a7fe6b0b7cb8bdba62 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 15:56:10 +0800 Subject: [PATCH 48/86] update pip install Signed-off-by: helenxie-bit --- .github/workflows/integration-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index 2fed976393..234a2988be 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python/kubeflow/trainer -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} From 832273073feaad2d846a3f505cae44e0a4b023be Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 16:57:22 +0800 Subject: [PATCH 49/86] change the value of 'clean_pod_policy' Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 
16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 01dad75c82..41c079ab98 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -332,7 +332,7 @@ def generate_pytorchjob( metadata=V1ObjectMeta(name=job_name, namespace=job_namespace), spec=KubeflowOrgV1PyTorchJobSpec( run_policy=KubeflowOrgV1RunPolicy( - clean_pod_policy="None", + clean_pod_policy="Running", scheduling_policy=scheduling_policy, ), pytorch_replica_specs={"Master": master, "Worker": worker}, @@ -355,10 +355,18 @@ def clean_up_resources(): yield try: + # Display disk usage before cleanup + print("Disk usage before removing unnecessary files:") + subprocess.run(["df", "-hT"], check=True) + # List all volumes and inspect them print("Listing all Docker volumes:") subprocess.run(["docker", "volume", "ls"], check=True) + # Prune unused volumes + print("Pruning unused Docker volumes...") + subprocess.run(["docker", "volume", "prune", "-f"], check=True) + # Check for stopped containers print("Checking for stopped containers:") subprocess.run(["docker", "ps", "-a"], check=True) @@ -367,9 +375,9 @@ def clean_up_resources(): print("Removing stopped containers...") subprocess.run(["docker", "rm", "$(docker ps -a -q)"], shell=True, check=True) - # Prune unused volumes - print("Pruning unused Docker volumes...") - subprocess.run(["docker", "volume", "prune", "-f"], check=True) + # Display disk usage before cleanup + print("Disk usage before removing unnecessary files:") + subprocess.run(["df", "-hT"], check=True) except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From ed105746a23ebd4f130f468703645a4fc69375d3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 18:19:04 +0800 Subject: [PATCH 50/86] change the value of 'update cleanup function Signed-off-by: helenxie-bit --- 
sdk/python/test/e2e/test_e2e_pytorchjob.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 41c079ab98..ac6e1d23f5 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -332,7 +332,7 @@ def generate_pytorchjob( metadata=V1ObjectMeta(name=job_name, namespace=job_namespace), spec=KubeflowOrgV1PyTorchJobSpec( run_policy=KubeflowOrgV1RunPolicy( - clean_pod_policy="Running", + clean_pod_policy="None", scheduling_policy=scheduling_policy, ), pytorch_replica_specs={"Master": master, "Worker": worker}, @@ -379,5 +379,12 @@ def clean_up_resources(): print("Disk usage before removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) + # Remove unnecessary docker files from the correct directory + print("Freeing up disk space by removing unnecessary files...") + subprocess.run([ + "sudo", "rm", "-rf", + "/var/lib/docker" + ], check=True) + except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From 50ed9e8449ea8ccd24767ab5fd0706aa6a6a4c76 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 30 Aug 2024 20:42:09 +0800 Subject: [PATCH 51/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 25 +++------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index ac6e1d23f5..20fa45bacb 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -359,32 +359,13 @@ def clean_up_resources(): print("Disk usage before removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) - # List all volumes and inspect them - print("Listing all Docker volumes:") - subprocess.run(["docker", "volume", "ls"], check=True) - # Prune unused volumes print("Pruning 
unused Docker volumes...") - subprocess.run(["docker", "volume", "prune", "-f"], check=True) - - # Check for stopped containers - print("Checking for stopped containers:") - subprocess.run(["docker", "ps", "-a"], check=True) - - # Remove all stopped containers - print("Removing stopped containers...") - subprocess.run(["docker", "rm", "$(docker ps -a -q)"], shell=True, check=True) + subprocess.run(["docker", "system", "prune", "-a", "--volumes", "-f"], check=True) - # Display disk usage before cleanup - print("Disk usage before removing unnecessary files:") + # Display disk usage after cleanup + print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) - # Remove unnecessary docker files from the correct directory - print("Freeing up disk space by removing unnecessary files...") - subprocess.run([ - "sudo", "rm", "-rf", - "/var/lib/docker" - ], check=True) - except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From b2cd27ab579804dac668abceedaea973f11cb880 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 08:44:10 +0800 Subject: [PATCH 52/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 46 +++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 20fa45bacb..0ee316eb16 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -354,18 +354,62 @@ def clean_up_resources(): # This code runs after each test function yield + docker_accessible = False + + # Check Docker daemon access + try: + result = subprocess.run(["docker", "version"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + print("Docker daemon is accessible.") + print(result.stdout.decode()) + docker_accessible = True + except subprocess.CalledProcessError as e: + print("Error: Docker 
daemon is not accessible.") + print(e.stderr.decode()) + + if not docker_accessible: + print("Skipping Docker cleanup since Docker is not accessible.") + return + try: # Display disk usage before cleanup print("Disk usage before removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) + # Display Docker disk usage before cleanup + print("Docker disk usage before removing unnecessary files:") + subprocess.run(["docker", "system", "df", "-v"], check=True) + + # Display Docker images before cleanup + print("Docker images before removing unnecessary files:") + subprocess.run(["docker", "images"], check=True) + + # Display Docker containers before cleanup + print("Docker containers before removing unnecessary files:") + subprocess.run(["docker", "ps", "-s", "--all"], check=True) + + # Display Docker volumes before cleanup + print("Docker volumess before removing unnecessary files:") + subprocess.run(["docker", "volume", "ls"], check=True) + + # Check Docker root directory disk usage + print("Check Docker root directory:") + subprocess.run(["sudo", "du", "-sh", "/var/lib/docker"], check=True) + + # Check Docker runtime directory disk usage + print("Check Docker runtime directory:") + subprocess.run(["sudo", "du", "-sh", "/var/lib/containerd"], check=True) + # Prune unused volumes print("Pruning unused Docker volumes...") - subprocess.run(["docker", "system", "prune", "-a", "--volumes", "-f"], check=True) + subprocess.run(["sudo", "docker", "system", "prune", "-a", "--volumes", "-f"], check=True) # Display disk usage after cleanup print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) + # Display Docker disk usage after cleanup + print("Docker disk usage after removing unnecessary files:") + subprocess.run(["docker", "system", "df", "-v"], check=True) + except subprocess.CalledProcessError as e: print(f"Error during Docker cleanup: {e}") From 3af5d87c40ee37462e1393feb87f757dd51e11c0 Mon Sep 17 00:00:00 2001 From: 
helenxie-bit Date: Sat, 31 Aug 2024 09:07:13 +0800 Subject: [PATCH 53/86] check docker volumes Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 0ee316eb16..418f6f03ba 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -403,6 +403,21 @@ def clean_up_resources(): print("Pruning unused Docker volumes...") subprocess.run(["sudo", "docker", "system", "prune", "-a", "--volumes", "-f"], check=True) + # Additional check: List volumes and remove large unused ones + print("Listing Docker volumes to check for large unused ones:") + result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) + volumes = result.stdout.decode().splitlines() + for volume in volumes: + inspect_result = subprocess.run(["docker", "volume", "inspect", volume], check=True, stdout=subprocess.PIPE) + volume_details = inspect_result.stdout.decode() + if '"Mountpoint":' in volume_details and '/mnt/' in volume_details: + volume_size = subprocess.run(["sudo", "du", "-sh", f"/mnt/{volume}"], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] + print(f"Volume {volume} size: {volume_size}") + # Example: Remove if larger than 10GB + if float(volume_size[:-1]) > 10: # Adjust this condition as needed + print(f"Removing large unused volume: {volume}") + subprocess.run(["docker", "volume", "rm", volume], check=True) + # Display disk usage after cleanup print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) From 1a0eff33a78b6b65daafb3412cefe2c071669945 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 09:24:58 +0800 Subject: [PATCH 54/86] check docker volumes Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 18 ++++++++++++++---- 1 file changed, 14 
insertions(+), 4 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 418f6f03ba..cd317eb244 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -13,6 +13,7 @@ # limitations under the License. import os +import json import logging import pytest import subprocess @@ -407,16 +408,25 @@ def clean_up_resources(): print("Listing Docker volumes to check for large unused ones:") result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) volumes = result.stdout.decode().splitlines() + for volume in volumes: inspect_result = subprocess.run(["docker", "volume", "inspect", volume], check=True, stdout=subprocess.PIPE) - volume_details = inspect_result.stdout.decode() - if '"Mountpoint":' in volume_details and '/mnt/' in volume_details: - volume_size = subprocess.run(["sudo", "du", "-sh", f"/mnt/{volume}"], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] + volume_details = json.loads(inspect_result.stdout.decode()) + mountpoint = volume_details[0]["Mountpoint"] + + # Check if the mountpoint exists before accessing it + try: + volume_size = subprocess.run(["sudo", "du", "-sh", mountpoint], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] print(f"Volume {volume} size: {volume_size}") # Example: Remove if larger than 10GB - if float(volume_size[:-1]) > 10: # Adjust this condition as needed + size_value = float(volume_size[:-1]) + size_unit = volume_size[-1].upper() + + if size_unit == 'G' and size_value > 10: # Adjust this condition as needed print(f"Removing large unused volume: {volume}") subprocess.run(["docker", "volume", "rm", volume], check=True) + except subprocess.CalledProcessError: + print(f"Volume {volume} not found at expected mountpoint {mountpoint} or cannot access.") # Display disk usage after cleanup print("Disk usage after removing unnecessary files:") From 
604265a8006da171f898a86966e9cd1d2aa72393 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 09:49:56 +0800 Subject: [PATCH 55/86] stop the controller and restart it again to clean up Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 24 ++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index cd317eb244..8efda407a5 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -392,19 +392,12 @@ def clean_up_resources(): print("Docker volumess before removing unnecessary files:") subprocess.run(["docker", "volume", "ls"], check=True) - # Check Docker root directory disk usage - print("Check Docker root directory:") - subprocess.run(["sudo", "du", "-sh", "/var/lib/docker"], check=True) + # Stop the training-operator-control-plane container if running + container_name = "training-operator-cluster-control-plane" + print(f"Stopping container {container_name}...") + subprocess.run(["docker", "stop", container_name], check=True) - # Check Docker runtime directory disk usage - print("Check Docker runtime directory:") - subprocess.run(["sudo", "du", "-sh", "/var/lib/containerd"], check=True) - - # Prune unused volumes - print("Pruning unused Docker volumes...") - subprocess.run(["sudo", "docker", "system", "prune", "-a", "--volumes", "-f"], check=True) - - # Additional check: List volumes and remove large unused ones + # List volumes and remove large unused ones print("Listing Docker volumes to check for large unused ones:") result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) volumes = result.stdout.decode().splitlines() @@ -418,16 +411,21 @@ def clean_up_resources(): try: volume_size = subprocess.run(["sudo", "du", "-sh", mountpoint], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] print(f"Volume {volume} 
size: {volume_size}") + # Example: Remove if larger than 10GB size_value = float(volume_size[:-1]) size_unit = volume_size[-1].upper() if size_unit == 'G' and size_value > 10: # Adjust this condition as needed - print(f"Removing large unused volume: {volume}") + print(f"Removing volume: {volume}") subprocess.run(["docker", "volume", "rm", volume], check=True) except subprocess.CalledProcessError: print(f"Volume {volume} not found at expected mountpoint {mountpoint} or cannot access.") + # Restart the training-operator-control-plane container if necessary + print(f"Starting container {container_name}...") + subprocess.run(["docker", "start", container_name], check=True) + # Display disk usage after cleanup print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) From a4f848f398ad0faf8d34e2ac48e53337458fb1de Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 10:36:21 +0800 Subject: [PATCH 56/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 57 ++-------------------- 1 file changed, 5 insertions(+), 52 deletions(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 8efda407a5..75c4525e18 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -355,22 +355,6 @@ def clean_up_resources(): # This code runs after each test function yield - docker_accessible = False - - # Check Docker daemon access - try: - result = subprocess.run(["docker", "version"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - print("Docker daemon is accessible.") - print(result.stdout.decode()) - docker_accessible = True - except subprocess.CalledProcessError as e: - print("Error: Docker daemon is not accessible.") - print(e.stderr.decode()) - - if not docker_accessible: - print("Skipping Docker cleanup since Docker is not accessible.") - return - try: # Display disk 
usage before cleanup print("Disk usage before removing unnecessary files:") @@ -389,44 +373,13 @@ def clean_up_resources(): subprocess.run(["docker", "ps", "-s", "--all"], check=True) # Display Docker volumes before cleanup - print("Docker volumess before removing unnecessary files:") + print("Docker volumes before removing unnecessary files:") subprocess.run(["docker", "volume", "ls"], check=True) - # Stop the training-operator-control-plane container if running - container_name = "training-operator-cluster-control-plane" - print(f"Stopping container {container_name}...") - subprocess.run(["docker", "stop", container_name], check=True) - - # List volumes and remove large unused ones - print("Listing Docker volumes to check for large unused ones:") - result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) - volumes = result.stdout.decode().splitlines() - - for volume in volumes: - inspect_result = subprocess.run(["docker", "volume", "inspect", volume], check=True, stdout=subprocess.PIPE) - volume_details = json.loads(inspect_result.stdout.decode()) - mountpoint = volume_details[0]["Mountpoint"] - - # Check if the mountpoint exists before accessing it - try: - volume_size = subprocess.run(["sudo", "du", "-sh", mountpoint], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] - print(f"Volume {volume} size: {volume_size}") - - # Example: Remove if larger than 10GB - size_value = float(volume_size[:-1]) - size_unit = volume_size[-1].upper() - - if size_unit == 'G' and size_value > 10: # Adjust this condition as needed - print(f"Removing volume: {volume}") - subprocess.run(["docker", "volume", "rm", volume], check=True) - except subprocess.CalledProcessError: - print(f"Volume {volume} not found at expected mountpoint {mountpoint} or cannot access.") - - # Restart the training-operator-control-plane container if necessary - print(f"Starting container {container_name}...") - subprocess.run(["docker", "start", 
container_name], check=True) - - # Display disk usage after cleanup + # Remove unused Docker volumes + print("Remove unused Docker volumes:") + subprocess.run(["docker", "volume", "prune", "--filter", "all=1"], check=True) + print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) From 3e86e90e9eedf9b47dd95894a8550469ba6962a4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 10:52:37 +0800 Subject: [PATCH 57/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 75c4525e18..3d1a31451b 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -378,7 +378,7 @@ def clean_up_resources(): # Remove unused Docker volumes print("Remove unused Docker volumes:") - subprocess.run(["docker", "volume", "prune", "--filter", "all=1"], check=True) + subprocess.run(["docker", "volume", "prune", "--filter", "all=1", "-f"], check=True) print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) From 558330b17069b006bac1a580a1d5628d3e3a5c66 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 31 Aug 2024 13:28:45 +0800 Subject: [PATCH 58/86] update cleanup function Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 27 +++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 3d1a31451b..60d27c1efd 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -378,7 +378,32 @@ def clean_up_resources(): # Remove unused Docker volumes print("Remove unused Docker volumes:") - subprocess.run(["docker", "volume", "prune", "--filter", "all=1", "-f"], 
check=True) + subprocess.run(["docker", "volume", "prune", "-f"], check=True) + + # Additionally list volumes and remove large unused ones + print("Listing Docker volumes to check for large unused ones:") + result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) + volumes = result.stdout.decode().splitlines() + + for volume in volumes: + inspect_result = subprocess.run(["docker", "volume", "inspect", volume], check=True, stdout=subprocess.PIPE) + volume_details = json.loads(inspect_result.stdout.decode()) + mountpoint = volume_details[0]["Mountpoint"] + + # Check if the mountpoint exists before accessing it + try: + volume_size = subprocess.run(["sudo", "du", "-sh", mountpoint], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] + print(f"Volume {volume} size: {volume_size}") + + # Example: Remove if larger than 10GB + size_value = float(volume_size[:-1]) + size_unit = volume_size[-1].upper() + + if size_unit == 'G' and size_value > 10: # Adjust this condition as needed + print(f"Removing volume: {volume}") + subprocess.run(["docker", "volume", "rm", volume], check=True) + except subprocess.CalledProcessError: + print(f"Volume {volume} not found at expected mountpoint {mountpoint} or cannot access.") print("Disk usage after removing unnecessary files:") subprocess.run(["df", "-hT"], check=True) From d4ed2d81b4435655db95e3e97fcbacee18020705 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 10:26:06 +0800 Subject: [PATCH 59/86] separate e2e test for train api Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 60 +++++++ .github/workflows/integration-tests.yaml | 2 +- sdk/python/test/e2e/test_e2e_pytorchjob.py | 148 +----------------- .../test_train_api/test_e2e_train_api.py | 96 ++++++++++++ 4 files changed, 158 insertions(+), 148 deletions(-) create mode 100644 .github/workflows/e2e-test-train-api.yaml create mode 100644 sdk/python/test_train_api/test_e2e_train_api.py diff 
--git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml new file mode 100644 index 0000000000..182998df51 --- /dev/null +++ b/.github/workflows/e2e-test-train-api.yaml @@ -0,0 +1,60 @@ +name: E2E Test with train API +on: + - pull_request + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + e2e-test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Free-Up Disk Space + uses: ./.github/workflows/free-up-disk-space + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Create k8s Kind Cluster + uses: helm/kind-action@v1.10.0 + with: + node_image: kindest/node:${{ matrix.kubernetes-version }} + cluster_name: training-operator-cluster + kubectl_version: ${{ matrix.kubernetes-version }} + + - name: Build training-operator + run: | + ./scripts/gha/build-image.sh + env: + TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test + + - name: Deploy training operator + run: | + ./scripts/gha/setup-training-operator.sh + env: + KIND_CLUSTER: training-operator-cluster + TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test + GANG_SCHEDULER_NAME: "none" + KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + + - name: Run tests + run: | + pip install pytest + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api --log-cli-level=debug --namespace=default diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index 234a2988be..ca2b543fc7 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ 
-96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 60d27c1efd..8e0739c9b4 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -13,10 +13,8 @@ # limitations under the License. import os -import json import logging import pytest -import subprocess from typing import Optional from kubernetes.client import V1PodTemplateSpec @@ -25,10 +23,6 @@ from kubernetes.client import V1Container from kubernetes.client import V1ResourceRequirements -from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams - from kubeflow.training import TrainingClient from kubeflow.training import KubeflowOrgV1ReplicaSpec from kubeflow.training import KubeflowOrgV1PyTorchJob @@ -37,9 +31,6 @@ from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training import constants -from peft import LoraConfig -import transformers - import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS @@ -249,77 +240,6 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) -@pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, - reason="For plain scheduling", -) -def test_sdk_e2e_create_from_train_api(job_namespace): - JOB_NAME = "pytorchjob-from-train-api" - - # Use 
test case from fine-tuning API tutorial. - # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=1, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. - resources_per_worker={ - "gpu": 0, - "cpu": 2, - "memory": "10G", - }, - storage_config={ - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - try: - utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300 - ) - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") - - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - - def generate_pytorchjob( job_namespace: str, job_name: str, @@ -347,70 +267,4 @@ def generate_container() -> V1Container: image="kubeflow/pytorch-dist-mnist:latest", args=["--backend", "gloo", "--epochs", "1"], resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}), - ) - - -@pytest.fixture(scope="function", autouse=True) -def clean_up_resources(): - # This code runs after each test function - yield - - try: - # Display disk usage before cleanup - print("Disk usage before removing unnecessary files:") - subprocess.run(["df", "-hT"], check=True) - - # Display Docker disk usage before cleanup - print("Docker disk usage before removing unnecessary files:") - subprocess.run(["docker", "system", "df", "-v"], check=True) - - # Display Docker images before cleanup - print("Docker images before removing unnecessary files:") - subprocess.run(["docker", "images"], check=True) - - # Display Docker containers before cleanup - print("Docker containers before removing unnecessary files:") - subprocess.run(["docker", "ps", "-s", "--all"], check=True) - - # Display Docker volumes before cleanup - print("Docker volumes before removing unnecessary files:") - subprocess.run(["docker", "volume", "ls"], check=True) - - # Remove unused Docker volumes - print("Remove unused Docker volumes:") - subprocess.run(["docker", "volume", "prune", "-f"], check=True) - - # Additionally list volumes and remove large unused ones - print("Listing Docker volumes to check for large unused ones:") - result = subprocess.run(["docker", "volume", "ls", "-q"], check=True, stdout=subprocess.PIPE) - volumes = result.stdout.decode().splitlines() - - for volume in volumes: - inspect_result = subprocess.run(["docker", "volume", "inspect", volume], check=True, stdout=subprocess.PIPE) - volume_details = json.loads(inspect_result.stdout.decode()) - 
mountpoint = volume_details[0]["Mountpoint"] - - # Check if the mountpoint exists before accessing it - try: - volume_size = subprocess.run(["sudo", "du", "-sh", mountpoint], check=True, stdout=subprocess.PIPE).stdout.decode().split()[0] - print(f"Volume {volume} size: {volume_size}") - - # Example: Remove if larger than 10GB - size_value = float(volume_size[:-1]) - size_unit = volume_size[-1].upper() - - if size_unit == 'G' and size_value > 10: # Adjust this condition as needed - print(f"Removing volume: {volume}") - subprocess.run(["docker", "volume", "rm", volume], check=True) - except subprocess.CalledProcessError: - print(f"Volume {volume} not found at expected mountpoint {mountpoint} or cannot access.") - - print("Disk usage after removing unnecessary files:") - subprocess.run(["df", "-hT"], check=True) - - # Display Docker disk usage after cleanup - print("Docker disk usage after removing unnecessary files:") - subprocess.run(["docker", "system", "df", "-v"], check=True) - - except subprocess.CalledProcessError as e: - print(f"Error during Docker cleanup: {e}") + ) \ No newline at end of file diff --git a/sdk/python/test_train_api/test_e2e_train_api.py b/sdk/python/test_train_api/test_e2e_train_api.py new file mode 100644 index 0000000000..9fe4e6b731 --- /dev/null +++ b/sdk/python/test_train_api/test_e2e_train_api.py @@ -0,0 +1,96 @@ +# Copyright 2024 kubeflow.org. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import test.e2e.utils as utils + +import transformers +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + HuggingFaceModelParams, + HuggingFaceTrainerParams, +) +from kubeflow.training import TrainingClient, constants +from peft import LoraConfig + +logging.basicConfig(format="%(message)s") +logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) + +TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) +CONTAINER_NAME = "pytorch" + + +def test_sdk_e2e_create_from_train_api(job_namespace): + JOB_NAME = "pytorchjob-from-train-api" + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=1, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
+ resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + storage_config={ + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) From 7a2ae05ce7555194f18194e907f1de20d8811bd9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 10:28:37 +0800 Subject: [PATCH 60/86] fix format Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 182998df51..b60f3e071c 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -14,7 +14,6 @@ jobs: matrix: kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] python-version: ["3.8", "3.9", "3.10", "3.11"] - steps: - name: Checkout uses: actions/checkout@v4 From 9efcce5b08a3b63a60f5dc794664d2ba9339e75b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 14:16:00 +0800 Subject: [PATCH 61/86] fix parameter of namespace Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- sdk/python/test_train_api/test_e2e_train_api.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index b60f3e071c..8b3944277f 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ 
b/.github/workflows/e2e-test-train-api.yaml @@ -56,4 +56,4 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api/test_e2e_train_api.py --log-cli-level=debug diff --git a/sdk/python/test_train_api/test_e2e_train_api.py b/sdk/python/test_train_api/test_e2e_train_api.py index 9fe4e6b731..0918941672 100644 --- a/sdk/python/test_train_api/test_e2e_train_api.py +++ b/sdk/python/test_train_api/test_e2e_train_api.py @@ -31,7 +31,7 @@ CONTAINER_NAME = "pytorch" -def test_sdk_e2e_create_from_train_api(job_namespace): +def test_sdk_e2e_create_from_train_api(job_namespace="default"): JOB_NAME = "pytorchjob-from-train-api" # Use test case from fine-tuning API tutorial. @@ -86,7 +86,7 @@ def test_sdk_e2e_create_from_train_api(job_namespace): logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) try: - utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300) + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) except Exception as e: utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) From a443ea2c4e42382da45b4b0e9c87e2e2cb79c5f9 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 15:26:07 +0800 Subject: [PATCH 62/86] fix format Signed-off-by: helenxie-bit --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 8e0739c9b4..c5b28faaf8 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -267,4 +267,4 @@ def generate_container() -> V1Container: 
image="kubeflow/pytorch-dist-mnist:latest", args=["--backend", "gloo", "--epochs", "1"], resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}), - ) \ No newline at end of file + ) From 85fd8e62525adde9a2137bfd0cc3e08d8f54db3b Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 15:48:52 +0800 Subject: [PATCH 63/86] reduce resources Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 59 ------------ .github/workflows/integration-tests.yaml | 2 +- sdk/python/test/e2e/test_e2e_pytorchjob.py | 80 ++++++++++++++++ .../test_train_api/test_e2e_train_api.py | 96 ------------------- 4 files changed, 81 insertions(+), 156 deletions(-) delete mode 100644 .github/workflows/e2e-test-train-api.yaml delete mode 100644 sdk/python/test_train_api/test_e2e_train_api.py diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml deleted file mode 100644 index 8b3944277f..0000000000 --- a/.github/workflows/e2e-test-train-api.yaml +++ /dev/null @@ -1,59 +0,0 @@ -name: E2E Test with train API -on: - - pull_request - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - e2e-test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] - python-version: ["3.8", "3.9", "3.10", "3.11"] - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Free-Up Disk Space - uses: ./.github/workflows/free-up-disk-space - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Setup Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - - - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.10.0 - with: - node_image: kindest/node:${{ matrix.kubernetes-version }} - cluster_name: training-operator-cluster - kubectl_version: ${{ matrix.kubernetes-version }} - - - name: Build training-operator - run: | - 
./scripts/gha/build-image.sh - env: - TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test - - - name: Deploy training operator - run: | - ./scripts/gha/setup-training-operator.sh - env: - KIND_CLUSTER: training-operator-cluster - TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test - GANG_SCHEDULER_NAME: "none" - KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} - - - name: Run tests - run: | - pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api/test_e2e_train_api.py --log-cli-level=debug diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index ca2b543fc7..234a2988be 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index c5b28faaf8..cbe1b54039 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -13,8 +13,10 @@ # limitations under the License. 
import os +import json import logging import pytest +import subprocess from typing import Optional from kubernetes.client import V1PodTemplateSpec @@ -23,6 +25,10 @@ from kubernetes.client import V1Container from kubernetes.client import V1ResourceRequirements +from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams +from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams + from kubeflow.training import TrainingClient from kubeflow.training import KubeflowOrgV1ReplicaSpec from kubeflow.training import KubeflowOrgV1PyTorchJob @@ -31,6 +37,9 @@ from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training import constants +from peft import LoraConfig +import transformers + import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS @@ -240,6 +249,77 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) +@pytest.mark.skipif( + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", +) +def test_sdk_e2e_create_from_train_api(job_namespace): + JOB_NAME = "pytorchjob-from-train-api" + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. 
+ trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=1, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. + resources_per_worker={ + "gpu": 0, + "cpu": 1, + "memory": "5G", + }, + storage_config={ + "size": "5Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e( + TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300 + ) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + + def generate_pytorchjob( job_namespace: str, job_name: str, diff --git a/sdk/python/test_train_api/test_e2e_train_api.py b/sdk/python/test_train_api/test_e2e_train_api.py deleted file mode 100644 index 0918941672..0000000000 --- a/sdk/python/test_train_api/test_e2e_train_api.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2024 kubeflow.org. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import test.e2e.utils as utils - -import transformers -from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceDatasetParams, - HuggingFaceModelParams, - HuggingFaceTrainerParams, -) -from kubeflow.training import TrainingClient, constants -from peft import LoraConfig - -logging.basicConfig(format="%(message)s") -logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) - -TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) -CONTAINER_NAME = "pytorch" - - -def test_sdk_e2e_create_from_train_api(job_namespace="default"): - JOB_NAME = "pytorchjob-from-train-api" - - # Use test case from fine-tuning API tutorial. - # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. 
- trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=1, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. - resources_per_worker={ - "gpu": 0, - "cpu": 2, - "memory": "10G", - }, - storage_config={ - "size": "10Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - try: - utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") - - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) From 1a0c455d2f1a8e8df44547258fe0a4ee916c2ab1 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 16:55:30 +0800 Subject: [PATCH 64/86] separate e2e test for train API Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 59 ++++++++++++ .github/workflows/integration-tests.yaml | 2 +- sdk/python/test/e2e/test_e2e_pytorchjob.py | 80 ---------------- .../test_train_api/test_e2e_train_api.py | 96 +++++++++++++++++++ 4 files changed, 156 insertions(+), 81 deletions(-) create mode 100644 .github/workflows/e2e-test-train-api.yaml create mode 100644 sdk/python/test_train_api/test_e2e_train_api.py diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml new file mode 100644 index 0000000000..c3f885cc9b --- /dev/null +++ b/.github/workflows/e2e-test-train-api.yaml @@ -0,0 +1,59 @@ +name: E2E Test with train API +on: + - pull_request + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + e2e-test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] + python-version: ["3.9", "3.10", "3.11"] + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Free-Up Disk Space + uses: ./.github/workflows/free-up-disk-space + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Create k8s Kind Cluster + uses: helm/kind-action@v1.10.0 + with: + node_image: kindest/node:${{ matrix.kubernetes-version }} + cluster_name: training-operator-cluster + kubectl_version: ${{ matrix.kubernetes-version }} + + - name: Build training-operator + run: | + ./scripts/gha/build-image.sh + env: + TRAINING_CI_IMAGE: 
kubeflowtraining/training-operator:test + + - name: Deploy training operator + run: | + ./scripts/gha/setup-training-operator.sh + env: + KIND_CLUSTER: training-operator-cluster + TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test + GANG_SCHEDULER_NAME: "none" + KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + + - name: Run tests + run: | + pip install pytest + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api/test_e2e_train_api.py --log-cli-level=debug diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index 234a2988be..ca2b543fc7 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index cbe1b54039..c5b28faaf8 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -13,10 +13,8 @@ # limitations under the License. 
import os -import json import logging import pytest -import subprocess from typing import Optional from kubernetes.client import V1PodTemplateSpec @@ -25,10 +23,6 @@ from kubernetes.client import V1Container from kubernetes.client import V1ResourceRequirements -from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams -from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams - from kubeflow.training import TrainingClient from kubeflow.training import KubeflowOrgV1ReplicaSpec from kubeflow.training import KubeflowOrgV1PyTorchJob @@ -37,9 +31,6 @@ from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training import constants -from peft import LoraConfig -import transformers - import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS @@ -249,77 +240,6 @@ def test_sdk_e2e_create_from_image(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) -@pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, - reason="For plain scheduling", -) -def test_sdk_e2e_create_from_train_api(job_namespace): - JOB_NAME = "pytorchjob-from-train-api" - - # Use test case from fine-tuning API tutorial. - # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ - TRAINING_CLIENT.train( - name=JOB_NAME, - namespace=job_namespace, - # BERT model URI and type of Transformer to train it. - model_provider_parameters=HuggingFaceModelParams( - model_uri="hf://google-bert/bert-base-cased", - transformer_type=transformers.AutoModelForSequenceClassification, - num_labels=5, - ), - # In order to save test time, use 8 samples from Yelp dataset. - dataset_provider_parameters=HuggingFaceDatasetParams( - repo_id="yelp_review_full", - split="train[:8]", - ), - # Specify HuggingFace Trainer parameters. 
- trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - output_dir="test_trainer", - save_strategy="no", - evaluation_strategy="no", - do_eval=False, - disable_tqdm=True, - log_level="info", - num_train_epochs=1, - ), - # Set LoRA config to reduce number of trainable model parameters. - lora_config=LoraConfig( - r=8, - lora_alpha=8, - lora_dropout=0.1, - bias="none", - ), - ), - num_workers=1, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. - resources_per_worker={ - "gpu": 0, - "cpu": 1, - "memory": "5G", - }, - storage_config={ - "size": "5Gi", - "access_modes": ["ReadWriteOnce"], - }, - ) - - logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") - logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - - try: - utils.verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300 - ) - except Exception as e: - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") - - utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) - TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - - def generate_pytorchjob( job_namespace: str, job_name: str, diff --git a/sdk/python/test_train_api/test_e2e_train_api.py b/sdk/python/test_train_api/test_e2e_train_api.py new file mode 100644 index 0000000000..0918941672 --- /dev/null +++ b/sdk/python/test_train_api/test_e2e_train_api.py @@ -0,0 +1,96 @@ +# Copyright 2024 kubeflow.org. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import test.e2e.utils as utils + +import transformers +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceDatasetParams, + HuggingFaceModelParams, + HuggingFaceTrainerParams, +) +from kubeflow.training import TrainingClient, constants +from peft import LoraConfig + +logging.basicConfig(format="%(message)s") +logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) + +TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) +CONTAINER_NAME = "pytorch" + + +def test_sdk_e2e_create_from_train_api(job_namespace="default"): + JOB_NAME = "pytorchjob-from-train-api" + + # Use test case from fine-tuning API tutorial. + # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/ + TRAINING_CLIENT.train( + name=JOB_NAME, + namespace=job_namespace, + # BERT model URI and type of Transformer to train it. + model_provider_parameters=HuggingFaceModelParams( + model_uri="hf://google-bert/bert-base-cased", + transformer_type=transformers.AutoModelForSequenceClassification, + num_labels=5, + ), + # In order to save test time, use 8 samples from Yelp dataset. + dataset_provider_parameters=HuggingFaceDatasetParams( + repo_id="yelp_review_full", + split="train[:8]", + ), + # Specify HuggingFace Trainer parameters. 
+ trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + output_dir="test_trainer", + save_strategy="no", + evaluation_strategy="no", + do_eval=False, + disable_tqdm=True, + log_level="info", + num_train_epochs=1, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config=LoraConfig( + r=8, + lora_alpha=8, + lora_dropout=0.1, + bias="none", + ), + ), + num_workers=1, # nodes parameter for torchrun command. + num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. + resources_per_worker={ + "gpu": 0, + "cpu": 2, + "memory": "10G", + }, + storage_config={ + "size": "10Gi", + "access_modes": ["ReadWriteOnce"], + }, + ) + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob create from function E2E fails. 
Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) From afe4240c62bac0fdfd58f0e35f400f60ecb61065 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 19:24:59 +0800 Subject: [PATCH 65/86] remove go setup Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index c3f885cc9b..471b351272 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -26,11 +26,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Setup Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - - name: Create k8s Kind Cluster uses: helm/kind-action@v1.10.0 with: From 250b830bc5f881d224003d4b4c6bde3b6663cae5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 3 Sep 2024 22:58:41 +0800 Subject: [PATCH 66/86] adjust the version of k8s Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 471b351272..94271e180f 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: false matrix: - kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"] + kubernetes-version: ["v1.28.7"] python-version: ["3.9", "3.10", "3.11"] steps: - name: Checkout From c5b39a4821f183c85b95be2c6e9ad52ae746bf8e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 4 Sep 2024 07:49:32 +0800 Subject: [PATCH 67/86] move test file to new place Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- .github/workflows/integration-tests.yaml | 2 +- .../e2e-train-api}/test_e2e_train_api.py | 0 3 files changed, 2 
insertions(+), 2 deletions(-) rename sdk/python/{test_train_api => test/e2e-train-api}/test_e2e_train_api.py (100%) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 94271e180f..dfaf615aff 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -51,4 +51,4 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api/test_e2e_train_api.py --log-cli-level=debug + python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml index ca2b543fc7..d88f26e77f 100644 --- a/.github/workflows/integration-tests.yaml +++ b/.github/workflows/integration-tests.yaml @@ -96,7 +96,7 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default + python3 -m pip install -e sdk/python; pytest -s sdk/python/test/e2e --log-cli-level=debug --namespace=default env: GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} diff --git a/sdk/python/test_train_api/test_e2e_train_api.py b/sdk/python/test/e2e-train-api/test_e2e_train_api.py similarity index 100% rename from sdk/python/test_train_api/test_e2e_train_api.py rename to sdk/python/test/e2e-train-api/test_e2e_train_api.py From fa99a92dd6e798207424d2377a78a17375c54323 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 4 Sep 2024 08:47:10 +0800 Subject: [PATCH 68/86] fix typos Signed-off-by: helenxie-bit --- sdk/python/test/e2e-train-api/test_e2e_train_api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdk/python/test/e2e-train-api/test_e2e_train_api.py 
b/sdk/python/test/e2e-train-api/test_e2e_train_api.py index 0918941672..59b4bafa58 100644 --- a/sdk/python/test/e2e-train-api/test_e2e_train_api.py +++ b/sdk/python/test/e2e-train-api/test_e2e_train_api.py @@ -28,7 +28,6 @@ logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG) TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) -CONTAINER_NAME = "pytorch" def test_sdk_e2e_create_from_train_api(job_namespace="default"): @@ -90,7 +89,7 @@ def test_sdk_e2e_create_from_train_api(job_namespace="default"): except Exception as e: utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) - raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}") + raise Exception(f"PyTorchJob create from API E2E fails. Exception: {e}") utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) From f0d8cc4dc03618ff6100166453b0f6b8331a85fe Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Wed, 4 Sep 2024 08:56:27 +0800 Subject: [PATCH 69/86] rerun tests Signed-off-by: helenxie-bit --- sdk/python/test/e2e-train-api/test_e2e_train_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/test/e2e-train-api/test_e2e_train_api.py b/sdk/python/test/e2e-train-api/test_e2e_train_api.py index 59b4bafa58..764db97042 100644 --- a/sdk/python/test/e2e-train-api/test_e2e_train_api.py +++ b/sdk/python/test/e2e-train-api/test_e2e_train_api.py @@ -68,8 +68,8 @@ def test_sdk_e2e_create_from_train_api(job_namespace="default"): bias="none", ), ), - num_workers=1, # nodes parameter for torchrun command. - num_procs_per_worker=1, # nproc-per-node parameter for torchrun command. 
+ num_workers=1, + num_procs_per_worker=1, resources_per_worker={ "gpu": 0, "cpu": 2, From d2c3cacfe4308715fdcfbfd34e343942410e3777 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 18:54:39 -0700 Subject: [PATCH 70/86] update install packages Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index dfaf615aff..776aa65a92 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -51,4 +51,4 @@ jobs: - name: Run tests run: | pip install pytest - python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug + python3 -m pip install -e sdk/python[huggingface]; pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug From 9f4244909b44f0a8be3c7234d90c808728e94b71 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 20:56:38 -0700 Subject: [PATCH 71/86] build and verify images of storage-initializer and trainer Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 13 +++++++++++-- scripts/gha/build-image.sh | 2 ++ sdk/python/kubeflow/training/api/training_client.py | 9 +++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 776aa65a92..20b19ec769 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -33,11 +33,13 @@ jobs: cluster_name: training-operator-cluster kubectl_version: ${{ matrix.kubernetes-version }} - - name: Build training-operator + - name: Build training-operator, storage-initializer, and trainer images run: | ./scripts/gha/build-image.sh env: TRAINING_CI_IMAGE: 
kubeflowtraining/training-operator:test + STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test + TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - name: Deploy training operator run: | @@ -50,5 +52,12 @@ jobs: - name: Run tests run: | + kind load docker-image ${{ env.STORAGE_INITIALIZER_IMAGE }} --name ${{ env.KIND_CLUSTER }} + kind load docker-image ${{ env.TRAINER_TRANSFORMER_IMAGE_DEFAULT }} --name ${{ env.KIND_CLUSTER }} pip install pytest - python3 -m pip install -e sdk/python[huggingface]; pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug + python3 -m pip install -e sdk/python[huggingface] + pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug + env: + KIND_CLUSTER: training-operator-cluster + STORAGE_INITIALIZER_IMAGE: kubeflowtraining/storage-initializer:test + TRAINER_TRANSFORMER_IMAGE_DEFAULT: kubeflowtraining/trainer:test diff --git a/scripts/gha/build-image.sh b/scripts/gha/build-image.sh index cb4f0fc832..7c2947bdce 100755 --- a/scripts/gha/build-image.sh +++ b/scripts/gha/build-image.sh @@ -22,3 +22,5 @@ set -o nounset set -o pipefail docker build . -t ${TRAINING_CI_IMAGE} -f build/images/training-operator/Dockerfile +docker build . -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile +docker build . 
-t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index 1626f18820..459e16a046 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -258,6 +258,10 @@ def train( ], volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], ) + base_image1=os.getenv( + "STORAGE_INITIALIZER_IMAGE", constants.STORAGE_INITIALIZER_IMAGE_DEFAULT + ) + print("base_image1: " + base_image1) # create app container spec container_spec = utils.get_container_spec( @@ -287,6 +291,11 @@ def train( volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], resources=resources_per_worker, ) + base_image2=os.getenv( + "TRAINER_TRANSFORMER_IMAGE_DEFAULT", + constants.TRAINER_TRANSFORMER_IMAGE_DEFAULT, + ) + print("base_image2: " + base_image2) storage_initializer_volume = models.V1Volume( name=constants.STORAGE_INITIALIZER, From bb406cee573141304c37b35c672a027b40f51d4c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 21:07:18 -0700 Subject: [PATCH 72/86] fix image build error Signed-off-by: helenxie-bit --- scripts/gha/build-image.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gha/build-image.sh b/scripts/gha/build-image.sh index 7c2947bdce..0246d18dc1 100755 --- a/scripts/gha/build-image.sh +++ b/scripts/gha/build-image.sh @@ -22,5 +22,5 @@ set -o nounset set -o pipefail docker build . -t ${TRAINING_CI_IMAGE} -f build/images/training-operator/Dockerfile -docker build . -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile -docker build . 
-t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile +docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f Dockerfile +docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f Dockerfile From f0b6b38515bb10b914cefb32f5ad545dd2f49086 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 21:13:27 -0700 Subject: [PATCH 73/86] fix image build error Signed-off-by: helenxie-bit --- scripts/gha/build-image.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gha/build-image.sh b/scripts/gha/build-image.sh index 0246d18dc1..9ffbb314ed 100755 --- a/scripts/gha/build-image.sh +++ b/scripts/gha/build-image.sh @@ -22,5 +22,5 @@ set -o nounset set -o pipefail docker build . -t ${TRAINING_CI_IMAGE} -f build/images/training-operator/Dockerfile -docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f Dockerfile -docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f Dockerfile +docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile +docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile From 45eb7e082fb129067972c38030f193aa6c811dd5 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 22:06:11 -0700 Subject: [PATCH 74/86] check disk space Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 16 +++++++++--- scripts/gha/build-image.sh | 2 -- .../setup-storage-initializer-and-trainer.sh | 25 +++++++++++++++++++ 3 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 scripts/gha/setup-storage-initializer-and-trainer.sh diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 20b19ec769..d6af0a1d55 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -33,13 
+33,11 @@ jobs: cluster_name: training-operator-cluster kubectl_version: ${{ matrix.kubernetes-version }} - - name: Build training-operator, storage-initializer, and trainer images + - name: Build training-operator run: | ./scripts/gha/build-image.sh env: TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test - STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test - TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - name: Deploy training operator run: | @@ -49,11 +47,21 @@ jobs: TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test GANG_SCHEDULER_NAME: "none" KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + + - name: Build and load storage initializer and trainer + run: | + ./scripts/gha/setup-storage-initializer-and-trainer.sh + env: + STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test + TRAINER_CI_IMAGE: kubeflowtraining/trainer:test + + - name: Check disk space + run: df -h - name: Run tests run: | kind load docker-image ${{ env.STORAGE_INITIALIZER_IMAGE }} --name ${{ env.KIND_CLUSTER }} - kind load docker-image ${{ env.TRAINER_TRANSFORMER_IMAGE_DEFAULT }} --name ${{ env.KIND_CLUSTER }} + kind load docker-image ${{ env.TRAINER_TRANSFORMER_IMAGE_DEFAULT }} --name ${{ env.KIND_CLUSTER }} pip install pytest python3 -m pip install -e sdk/python[huggingface] pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug diff --git a/scripts/gha/build-image.sh b/scripts/gha/build-image.sh index 9ffbb314ed..cb4f0fc832 100755 --- a/scripts/gha/build-image.sh +++ b/scripts/gha/build-image.sh @@ -22,5 +22,3 @@ set -o nounset set -o pipefail docker build . 
-t ${TRAINING_CI_IMAGE} -f build/images/training-operator/Dockerfile -docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile -docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile diff --git a/scripts/gha/setup-storage-initializer-and-trainer.sh b/scripts/gha/setup-storage-initializer-and-trainer.sh new file mode 100644 index 0000000000..3f06fa6a5b --- /dev/null +++ b/scripts/gha/setup-storage-initializer-and-trainer.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Copyright 2024 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The script is used to build the Kubeflow storage-initializer and trainer images.
+ + +set -o errexit +set -o nounset +set -o pipefail + +docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile +docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile From f21779494b72a1391197cca75d4b4221e58fe10e Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 22:15:25 -0700 Subject: [PATCH 75/86] make 'setup-storage-initializer-and-trainer' executable Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- .../gha/setup-storage-initializer-and-trainer.sh | 0 .../kubeflow/training/api/training_client.py | 14 +++++++------- 3 files changed, 8 insertions(+), 8 deletions(-) mode change 100644 => 100755 scripts/gha/setup-storage-initializer-and-trainer.sh diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index d6af0a1d55..1df822a735 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -48,7 +48,7 @@ jobs: GANG_SCHEDULER_NAME: "none" KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} - - name: Build and load storage initializer and trainer + - name: Build storage initializer and trainer run: | ./scripts/gha/setup-storage-initializer-and-trainer.sh env: diff --git a/scripts/gha/setup-storage-initializer-and-trainer.sh b/scripts/gha/setup-storage-initializer-and-trainer.sh old mode 100644 new mode 100755 diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index 459e16a046..4ca57e7c76 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -258,9 +258,9 @@ def train( ], volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], ) - base_image1=os.getenv( - "STORAGE_INITIALIZER_IMAGE", constants.STORAGE_INITIALIZER_IMAGE_DEFAULT - ) + base_image1 = 
os.getenv( + "STORAGE_INITIALIZER_IMAGE", constants.STORAGE_INITIALIZER_IMAGE_DEFAULT + ) print("base_image1: " + base_image1) # create app container spec @@ -291,10 +291,10 @@ def train( volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], resources=resources_per_worker, ) - base_image2=os.getenv( - "TRAINER_TRANSFORMER_IMAGE_DEFAULT", - constants.TRAINER_TRANSFORMER_IMAGE_DEFAULT, - ) + base_image2 = os.getenv( + "TRAINER_TRANSFORMER_IMAGE_DEFAULT", + constants.TRAINER_TRANSFORMER_IMAGE_DEFAULT, + ) print("base_image2: " + base_image2) storage_initializer_volume = models.V1Volume( From 083e15572dc7982de18519505399bc89f27fcec3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 22:40:03 -0700 Subject: [PATCH 76/86] separate step of loading images Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 1df822a735..198fdd7f1c 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -58,14 +58,30 @@ jobs: - name: Check disk space run: df -h + - name: Load storage initializer + run: | + kind load docker-image ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} + docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} + env: + KIND_CLUSTER: training-operator-cluster + STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test + + - name: Check disk space + run: df -h + + - name: Load trainer + run: | + kind load docker-image ${{ env.TRAINER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} + docker rmi ${{ env.TRAINER_CI_IMAGE }} + env: + KIND_CLUSTER: training-operator-cluster + TRAINER_CI_IMAGE: kubeflowtraining/trainer:test + - name: Run tests run: | - kind load docker-image ${{ env.STORAGE_INITIALIZER_IMAGE }} --name ${{ env.KIND_CLUSTER }} - kind load docker-image ${{ 
env.TRAINER_TRANSFORMER_IMAGE_DEFAULT }} --name ${{ env.KIND_CLUSTER }} pip install pytest python3 -m pip install -e sdk/python[huggingface] pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug env: - KIND_CLUSTER: training-operator-cluster STORAGE_INITIALIZER_IMAGE: kubeflowtraining/storage-initializer:test TRAINER_TRANSFORMER_IMAGE_DEFAULT: kubeflowtraining/trainer:test From dc74844601b9b9a5cd089d56de9ceeb94f283ad3 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 22:40:59 -0700 Subject: [PATCH 77/86] check disk space after loading image Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 198fdd7f1c..129e28f3be 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -77,6 +77,9 @@ jobs: KIND_CLUSTER: training-operator-cluster TRAINER_CI_IMAGE: kubeflowtraining/trainer:test + - name: Check disk space + run: df -h + - name: Run tests run: | pip install pytest From de18ef0abfaa1e3344ec6dfcbe2e2303fc432f99 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 23:43:28 -0700 Subject: [PATCH 78/86] clean up and check disk space Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 40 +++++++++++++++++------ 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 129e28f3be..d1cecccea0 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -48,38 +48,58 @@ jobs: GANG_SCHEDULER_NAME: "none" KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} + - name: Prune docker images + shell: bash + run: | + docker image prune -a -f + docker system df + df -h + - name: Build storage initializer and trainer run: | 
./scripts/gha/setup-storage-initializer-and-trainer.sh + docker system df + df -h env: STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - name: Check disk space - run: df -h - - name: Load storage initializer run: | kind load docker-image ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} - docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} + docker system df + df -h env: KIND_CLUSTER: training-operator-cluster STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test - - name: Check disk space - run: df -h - + - name: Remove image + run: | + docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE}} + docker system df + df -h + env: + STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test + + - name: Monitor resources usage of node + run: | + echo "Monitor resources usage of node" + kubectl describe nodes training-operator-cluster-control-plane + echo "Monitor resources usage of pods" + kubectl get pods --all-namespaces + echo "Monitor resources usage of storage" + docker exec -it training-operator-cluster-control-plane df -h + - name: Load trainer run: | kind load docker-image ${{ env.TRAINER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} docker rmi ${{ env.TRAINER_CI_IMAGE }} + docker system df + df -h env: KIND_CLUSTER: training-operator-cluster TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - name: Check disk space - run: df -h - - name: Run tests run: | pip install pytest From ef8742ce70d071575cb15d1d10f0f6a4bc125ab4 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 00:01:54 -0700 Subject: [PATCH 79/86] prune docker build cache Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index d1cecccea0..a2f52c6cbc 100644 --- 
a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -42,6 +42,8 @@ jobs: - name: Deploy training operator run: | ./scripts/gha/setup-training-operator.sh + docker system df + df -h env: KIND_CLUSTER: training-operator-cluster TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test @@ -76,6 +78,7 @@ jobs: - name: Remove image run: | docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE}} + docker builder prune docker system df df -h env: @@ -87,8 +90,6 @@ jobs: kubectl describe nodes training-operator-cluster-control-plane echo "Monitor resources usage of pods" kubectl get pods --all-namespaces - echo "Monitor resources usage of storage" - docker exec -it training-operator-cluster-control-plane df -h - name: Load trainer run: | From 1eb3ef1b3c74c4598a110dd437b020c43595133a Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 00:13:49 -0700 Subject: [PATCH 80/86] prune docker build cache Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index a2f52c6cbc..66feaadedf 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -78,7 +78,7 @@ jobs: - name: Remove image run: | docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE}} - docker builder prune + docker builder prune --all --force docker system df df -h env: From 1e407a51ecdbb556e225a85aa3f9f0f59ae4bc74 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 16:17:27 -0700 Subject: [PATCH 81/86] adjust sequence of building and loading images Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 54 ++++++++++++------- ...rainer.sh => build-storage-initializer.sh} | 1 - scripts/gha/build-trainer.sh | 24 +++++++++ 3 files changed, 59 insertions(+), 20 deletions(-) rename 
scripts/gha/{setup-storage-initializer-and-trainer.sh => build-storage-initializer.sh} (88%) create mode 100755 scripts/gha/build-trainer.sh diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 66feaadedf..b72fa60ef7 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -57,50 +57,66 @@ jobs: docker system df df -h - - name: Build storage initializer and trainer + - name: Build trainer run: | - ./scripts/gha/setup-storage-initializer-and-trainer.sh + ./scripts/gha/build-trainer.sh docker system df df -h env: - STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test TRAINER_CI_IMAGE: kubeflowtraining/trainer:test + + - name: Clean up build cache + run: | + docker builder prune --all --force + docker volume ls + docker system df + df -h - - name: Load storage initializer + - name: Load trainer run: | - kind load docker-image ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} + kind load docker-image ${{ env.TRAINER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} + docker image prune -a -f + docker volume prune -f docker system df df -h env: KIND_CLUSTER: training-operator-cluster + TRAINER_CI_IMAGE: kubeflowtraining/trainer:test + + - name: Build storage initializer + run: | + ./scripts/gha/build-storage-initializer.sh + docker system df + df -h + env: STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test + TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - name: Remove image + - name: Clean up build cache run: | - docker rmi ${{ env.STORAGE_INITIALIZER_CI_IMAGE}} docker builder prune --all --force + docker volume ls + docker system df + df -h + + - name: Load storage initializer + run: | + kind load docker-image ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} + docker image prune -a -f + docker volume prune -f docker system df df -h env: + KIND_CLUSTER: training-operator-cluster 
STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test - + - name: Monitor resources usage of node run: | echo "Monitor resources usage of node" kubectl describe nodes training-operator-cluster-control-plane echo "Monitor resources usage of pods" kubectl get pods --all-namespaces - - - name: Load trainer - run: | - kind load docker-image ${{ env.TRAINER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} - docker rmi ${{ env.TRAINER_CI_IMAGE }} - docker system df - df -h - env: - KIND_CLUSTER: training-operator-cluster - TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - + - name: Run tests run: | pip install pytest diff --git a/scripts/gha/setup-storage-initializer-and-trainer.sh b/scripts/gha/build-storage-initializer.sh similarity index 88% rename from scripts/gha/setup-storage-initializer-and-trainer.sh rename to scripts/gha/build-storage-initializer.sh index 3f06fa6a5b..261e140a60 100755 --- a/scripts/gha/setup-storage-initializer-and-trainer.sh +++ b/scripts/gha/build-storage-initializer.sh @@ -22,4 +22,3 @@ set -o nounset set -o pipefail docker build sdk/python/kubeflow/storage_initializer -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile -docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile diff --git a/scripts/gha/build-trainer.sh b/scripts/gha/build-trainer.sh new file mode 100755 index 0000000000..87bf229246 --- /dev/null +++ b/scripts/gha/build-trainer.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Copyright 2024 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The script is used to build the Kubeflow trainer image. + + +set -o errexit +set -o nounset +set -o pipefail + +docker build sdk/python/kubeflow/trainer -t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile From 751955907a71f8b1b3852e096e8e2c89089e4957 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 16:48:28 -0700 Subject: [PATCH 82/86] move working directory Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index b72fa60ef7..f158bcf5d0 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -83,6 +83,24 @@ jobs: + # Step to move Docker data directory back to / for Storage Initializer build + - name: Move docker data directory back to / for Storage Initializer build + shell: bash + run: | + echo "Stopping docker service ..." + sudo systemctl stop docker + DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker + DOCKER_ROOT_DIR=/mnt/docker + echo "Removing symlink and moving Docker data back to ${DOCKER_DEFAULT_ROOT_DIR}..." + sudo rm -rf ${DOCKER_DEFAULT_ROOT_DIR} + sudo mv ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR} + echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})" + echo "Starting docker service ..."
+ sudo systemctl daemon-reload + sudo systemctl start docker + echo "Docker service status:" + sudo systemctl --no-pager -l -o short status docker + - name: Build storage initializer run: | ./scripts/gha/build-storage-initializer.sh From f5d63c40b65176279ae55bc6cbcbecd37f6aa731 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 17:29:59 -0700 Subject: [PATCH 83/86] delete moving working directory Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 52 ++--------------------- 1 file changed, 3 insertions(+), 49 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index f158bcf5d0..93064415c2 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -42,35 +42,20 @@ jobs: - name: Deploy training operator run: | ./scripts/gha/setup-training-operator.sh - docker system df - df -h env: KIND_CLUSTER: training-operator-cluster TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test GANG_SCHEDULER_NAME: "none" KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} - - - name: Prune docker images - shell: bash - run: | - docker image prune -a -f - docker system df - df -h - name: Build trainer run: | ./scripts/gha/build-trainer.sh + docker builder prune --all --force docker system df df -h env: TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - - name: Clean up build cache - run: | - docker builder prune --all --force - docker volume ls - docker system df - df -h - name: Load trainer run: | @@ -82,40 +67,16 @@ jobs: env: KIND_CLUSTER: training-operator-cluster TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - # Step to move Docker data directory back to / for Storage Initializer build - - name: Move docker data directory back to / for Storage Initializer build - shell: bash - run: | - echo "Stopping docker service ..." 
- sudo systemctl stop docker - DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker - DOCKER_ROOT_DIR=/mnt/docker - echo "Removing symlink and moving Docker data back to ${DOCKER_DEFAULT_ROOT_DIR}..." - sudo rm -rf ${DOCKER_DEFAULT_ROOT_DIR} - sudo mv ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR} - echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})" - echo "Starting docker service ..." - sudo systemctl daemon-reload - sudo systemctl start docker - echo "Docker service status:" - sudo systemctl --no-pager -l -o short status docker - + - name: Build storage initializer run: | ./scripts/gha/build-storage-initializer.sh + docker builder prune --all --force docker system df df -h env: STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - - - name: Clean up build cache - run: | - docker builder prune --all --force - docker volume ls - docker system df - df -h - name: Load storage initializer run: | @@ -127,13 +88,6 @@ jobs: env: KIND_CLUSTER: training-operator-cluster STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test - - - name: Monitor resources usage of node - run: | - echo "Monitor resources usage of node" - kubectl describe nodes training-operator-cluster-control-plane - echo "Monitor resources usage of pods" - kubectl get pods --all-namespaces - name: Run tests run: | From 08c8562b0e41b6fe49add34df05b8ff516265c2a Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Sat, 21 Sep 2024 17:34:15 -0700 Subject: [PATCH 84/86] fix format Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 2 +- sdk/python/kubeflow/training/api/training_client.py | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 93064415c2..c286d9a6e9 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -88,7 +88,7 @@ jobs: env: KIND_CLUSTER: 
training-operator-cluster STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test - + - name: Run tests run: | pip install pytest diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index 4ca57e7c76..1626f18820 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -258,10 +258,6 @@ def train( ], volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], ) - base_image1 = os.getenv( - "STORAGE_INITIALIZER_IMAGE", constants.STORAGE_INITIALIZER_IMAGE_DEFAULT - ) - print("base_image1: " + base_image1) # create app container spec container_spec = utils.get_container_spec( @@ -291,11 +287,6 @@ def train( volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], resources=resources_per_worker, ) - base_image2 = os.getenv( - "TRAINER_TRANSFORMER_IMAGE_DEFAULT", - constants.TRAINER_TRANSFORMER_IMAGE_DEFAULT, - ) - print("base_image2: " + base_image2) storage_initializer_volume = models.V1Volume( name=constants.STORAGE_INITIALIZER, From d2ae5423539e2b17ab65279c8de9b3bc3cda7c24 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 24 Sep 2024 13:13:41 -0700 Subject: [PATCH 85/86] use 'docker system prune' Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index c286d9a6e9..84fba097dd 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -42,6 +42,9 @@ jobs: - name: Deploy training operator run: | ./scripts/gha/setup-training-operator.sh + docker system prune -a -f + docker system df + df -h env: KIND_CLUSTER: training-operator-cluster TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test @@ -60,8 +63,7 @@ jobs: - name: Load trainer run: | kind load docker-image ${{ 
env.TRAINER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} - docker image prune -a -f - docker volume prune -f + docker system prune -a -f docker system df df -h env: @@ -81,8 +83,7 @@ jobs: - name: Load storage initializer run: | kind load docker-image ${{ env.STORAGE_INITIALIZER_CI_IMAGE }} --name ${{ env.KIND_CLUSTER }} - docker image prune -a -f - docker volume prune -f + docker system prune -a -f docker system df df -h env: From 09fc8a906afaeffeaa7815d0961a9360ccc8b98c Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Tue, 24 Sep 2024 13:15:47 -0700 Subject: [PATCH 86/86] make the format of the commands to be consistent Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 84fba097dd..fa65402682 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -54,7 +54,7 @@ jobs: - name: Build trainer run: | ./scripts/gha/build-trainer.sh - docker builder prune --all --force + docker builder prune -a -f docker system df df -h env: @@ -73,7 +73,7 @@ jobs: - name: Build storage initializer run: | ./scripts/gha/build-storage-initializer.sh - docker builder prune --all --force + docker builder prune -a -f docker system df df -h env: