kubeflow · google-oss-prow · Dec 9, 2024 · Sep 29, 2024 · Sep 29, 2024 · Sep 29, 2024
diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml
@@ -58,40 +58,12 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
 
-      - name: Free-Up Disk Space
-        uses: ./.github/workflows/free-up-disk-space
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
+      - name: Setup E2E Tests
+        uses: ./.github/workflows/setup-e2e-test
         with:
+          kubernetes-version: ${{ matrix.kubernetes-version }}
           python-version: ${{ matrix.python-version }}
-
-      - name: Setup Go
-        uses: actions/setup-go@v5
-        with:
-          go-version-file: go.mod
-
-      - name: Create k8s Kind Cluster
-        uses: helm/kind-action@9fdad0686e6f19fcd572f62516f5e0436f562ee7
-        with:
-          node_image: kindest/node:${{ matrix.kubernetes-version }}
-          cluster_name: training-operator-cluster
-          kubectl_version: ${{ matrix.kubernetes-version }}
-
-      - name: Build training-operator
-        run: |
-          ./scripts/gha/build-image.sh
-        env:
-          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
-
-      - name: Deploy training operator
-        run: |
-          ./scripts/gha/setup-training-operator.sh
-        env:
-          KIND_CLUSTER: training-operator-cluster
-          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
-          GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }}
-          KUBERNETES_VERSION: ${{ matrix.kubernetes-version }}
+          gang-scheduler-name: ${{ matrix.gang-scheduler-name }}
 
       - name: Run tests
         run: |

diff --git a/.github/workflows/setup-e2e-test/action.yaml b/.github/workflows/setup-e2e-test/action.yaml
@@ -0,0 +1,57 @@
+name: Setup E2E test template  # Composite action: prepares the runner, a Kind cluster and the training operator.
+description: A composite action to setup e2e tests
+
+inputs:
+  kubernetes-version:  # Used as the kindest/node image tag, the kubectl version and KUBERNETES_VERSION below.
+    required: true
+    description: Kubernetes version
+  python-version:  # Forwarded to actions/setup-python.
+    required: true
+    description: Python version
+  gang-scheduler-name:  # Exported as GANG_SCHEDULER_NAME to the deploy step.
+    required: false
+    default: "none"
+    description: Gang scheduler name
+
+runs:
+  using: composite
+  steps:
+    - name: Free-Up Disk Space
+      uses: ./.github/workflows/free-up-disk-space  # Local action referenced by repository-relative path.
+
+    - name: Setup Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ inputs.python-version }}
+
+    - name: Setup Go
+      uses: actions/setup-go@v5
+      with:
+        go-version-file: go.mod  # Go toolchain version comes from the repository's go.mod.
+
+    - name: Create k8s Kind Cluster
+      uses: helm/kind-action@9fdad0686e6f19fcd572f62516f5e0436f562ee7  # Pinned to a commit SHA rather than a tag.
+      with:
+        node_image: kindest/node:${{ inputs.kubernetes-version }}
+        cluster_name: training-operator-cluster  # Same name is passed as KIND_CLUSTER in the deploy step.
+        kubectl_version: ${{ inputs.kubernetes-version }}
+
+    - name: Build training-operator
+      shell: bash  # Composite run steps must declare their shell explicitly.
+      run: |
+        ./scripts/gha/build-image.sh
+      env:
+        TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test  # Same image reference is given to the deploy step.
+
+    - name: Deploy training operator  # Runs the deploy script, then prunes Docker data and prints disk usage.
+      shell: bash
+      run: |
+        ./scripts/gha/setup-training-operator.sh
+        docker system prune -a -f
+        docker system df
+        df -h
+      env:
+        KIND_CLUSTER: training-operator-cluster
+        TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
+        GANG_SCHEDULER_NAME: ${{ inputs.gang-scheduler-name }}
+        KUBERNETES_VERSION: ${{ inputs.kubernetes-version }}
diff --git a/.github/workflows/test-example-notebooks.yaml b/.github/workflows/test-example-notebooks.yaml
@@ -0,0 +1,51 @@
+# Runs the create-pytorchjob example notebook on every pull request across a Kubernetes x Python matrix.
+name: Test example notebooks
+
+on:
+  - pull_request
+
+# The job only checks out the repository and tests it locally, so the
+# GITHUB_TOKEN is restricted to read-only access to repository contents.
+permissions:
+  contents: read
+
+# A newer run for the same workflow and ref cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  create-pytorchjob-notebook-test:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        # Versions stay quoted so that e.g. "3.10" is not parsed as the float 3.1.
+        kubernetes-version: ["v1.28.7", "v1.29.2", "v1.30.6"]
+        python-version: ["3.9", "3.10", "3.11"]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      # gang-scheduler-name is not passed, so the action's declared default applies.
+      - name: Setup E2E Tests
+        uses: ./.github/workflows/setup-e2e-test
+        with:
+          kubernetes-version: ${{ matrix.kubernetes-version }}
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Python Dependencies
+        shell: bash
+        run: |
+          pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
+
+      # NOTE(review): presumably -i is the notebook, -n the namespace and -k the SDK
+      # install source; confirm against scripts/run-notebook.sh.
+      - name: Run Jupyter Notebook with Papermill
+        shell: bash
+        run: |
+          ./scripts/run-notebook.sh \
+          -i ./examples/pytorch/image-classification/create-pytorchjob.ipynb \
+          -n default \
+          -k ./sdk/python
diff --git a/examples/pytorch/image-classification/create-pytorchjob.ipynb b/examples/pytorch/image-classification/create-pytorchjob.ipynb
@@ -24,6 +24,20 @@
     "The notebook shows how to use Kubeflow Training SDK to create, get, wait, check and delete PyTorchJob."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "training_python_sdk='kubeflow-training'\n",
+    "namespace='kubeflow-user-example-com'"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -42,12 +56,13 @@
    "outputs": [],
    "source": [
     "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
-    "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python"
+    "# Install Kubeflow Python SDK\n",
+    "!pip install {training_python_sdk}"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -93,7 +108,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -102,12 +117,11 @@
    "outputs": [],
    "source": [
     "name = \"pytorch-dist-mnist-gloo\"\n",
-    "namespace = \"kubeflow-user-example-com\"\n",
     "container_name = \"pytorch\"\n",
     "\n",
     "container = V1Container(\n",
     "    name=container_name,\n",
-    "    image=\"gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0\",\n",
+    "    image=\"kubeflow/pytorch-dist-mnist:latest\",\n",
     "    args=[\"--backend\", \"gloo\"],\n",
     ")\n",
     "\n",
@@ -157,7 +171,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -176,8 +190,8 @@
     "# Namespace will be reused in every APIs.\n",
     "training_client = TrainingClient(namespace=namespace)\n",
     "\n",
-    "# If `job_kind` is not set in `TrainingClient`, we need to set it for each API.\n",
-    "training_client.create_job(pytorchjob, job_kind=constants.PYTORCHJOB_KIND)"
+    "# `job_kind` is omitted below, so the default job kind of `TrainingClient` is used.\n",
+    "training_client.create_job(pytorchjob)"
    ]
   },
   {
@@ -195,7 +209,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -214,7 +228,7 @@
     }
    ],
    "source": [
-    "training_client.get_job(name, job_kind=constants.PYTORCHJOB_KIND).metadata.name"
+    "training_client.get_job(name).metadata.name"
    ]
   },
   {
@@ -230,7 +244,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -260,7 +274,7 @@
     }
    ],
    "source": [
-    "training_client.get_job_conditions(name=name, job_kind=constants.PYTORCHJOB_KIND)"
+    "training_client.get_job_conditions(name=name)"
    ]
   },
   {
@@ -276,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -302,7 +316,7 @@
     }
    ],
    "source": [
-    "pytorchjob = training_client.wait_for_job_conditions(name=name, job_kind=constants.PYTORCHJOB_KIND)\n",
+    "pytorchjob = training_client.wait_for_job_conditions(name=name)\n",
     "\n",
     "print(f\"Succeeded number of replicas: {pytorchjob.status.replica_statuses['Master'].succeeded}\")"
    ]
@@ -320,7 +334,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -339,7 +353,7 @@
     }
    ],
    "source": [
-    "training_client.is_job_succeeded(name=name, job_kind=constants.PYTORCHJOB_KIND)"
+    "training_client.is_job_succeeded(name=name)"
    ]
   },
   {
@@ -355,7 +369,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -476,7 +490,7 @@
     }
    ],
    "source": [
-    "training_client.get_job_logs(name=name, job_kind=constants.PYTORCHJOB_KIND)"
+    "training_client.get_job_logs(name=name)"
    ]
   },
   {
@@ -492,7 +506,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"