Added test for create-pytorchjob.ipynb python notebook #2274

Merged
Changes from 63 commits

Commits (66)
c90bbaf
Added test for create-pytorchjob.ipynb
saileshd1402 Sep 29, 2024
f8fd24c
fix yaml syntax
saileshd1402 Sep 29, 2024
89023ce
Fix uses path
saileshd1402 Sep 29, 2024
62be575
Add actions/checkout
saileshd1402 Sep 29, 2024
9ea7155
Add bash to action.yaml
saileshd1402 Sep 29, 2024
da99ec8
Install pip dependencies step
saileshd1402 Sep 29, 2024
4595f32
Add quotes for args
saileshd1402 Sep 29, 2024
8b744b1
Add jupyter
saileshd1402 Sep 29, 2024
c6d1925
Add nbformat_minor: 5 to fix invalid format error
saileshd1402 Sep 29, 2024
1124ee8
Fix job name
saileshd1402 Sep 29, 2024
f882cf3
test papermill-args-yaml
saileshd1402 Sep 29, 2024
5494fb1
testing multi line args
saileshd1402 Sep 29, 2024
eb7c4be
testing multi line args1
saileshd1402 Sep 29, 2024
93b6c66
testing multi line args2
saileshd1402 Sep 29, 2024
e5aca68
testing multi line args3
saileshd1402 Sep 29, 2024
c8b1aff
Parameterize sdk install
saileshd1402 Sep 29, 2024
9145412
Remove unnecessary output
saileshd1402 Sep 29, 2024
e704b7f
nbformat normailze
saileshd1402 Sep 29, 2024
dc6a517
[SDK] Training Client Conditions related unit tests (#2253)
Bobbins228 Sep 30, 2024
c0b64e0
[SDK] test: add unit test for list_jobs method of the training_client…
seanlaii Oct 3, 2024
2e7d3c2
KEP-2170: Generate clientset, openapi spec for the V2 APIs (#2273)
varshaprasad96 Oct 10, 2024
040ba8f
[SDK] Use torchrun to create PyTorchJob from function (#2276)
andreyvelich Oct 11, 2024
f20969b
[SDK] test: add unit test for get_job_logs method of the training_cli…
seanlaii Oct 12, 2024
4ff5052
[v2alpha] Move GV related codebase (#2281)
varshaprasad96 Oct 14, 2024
24cea1b
KEP-2170: Implement runtime framework (#2248)
tenzen-y Oct 17, 2024
936620d
Add DeepSpeed Example with Pytorch Operator (#2235)
Syulin7 Oct 17, 2024
cdbc22e
KEP-2170: Rename TrainingRuntimeRef to RuntimeRef API (#2283)
andreyvelich Oct 17, 2024
5692b53
KEP-2170: Adding CEL validations on v2 TrainJob CRD (#2260)
akshaychitneni Oct 19, 2024
e6954eb
Upgrade Deepspeed demo dependencies (#2294)
Syulin7 Oct 20, 2024
009f207
KEP-2170: Add manifests for Kubeflow Training V2 (#2289)
andreyvelich Oct 21, 2024
7793706
FSDP Example for T5 Fine-Tuning and PyTorchJob (#2286)
andreyvelich Oct 22, 2024
7f61c50
KEP-2170: Implement TrainJob Reconciler to manage objects (#2295)
tenzen-y Oct 23, 2024
13dcb6b
Remove Prometheus Monitoring doc (#2301)
sophie0730 Oct 23, 2024
b4c0d40
KEP-2170: Decouple JobSet from TrainJob (#2296)
tenzen-y Oct 23, 2024
d315aa2
KEP-2170: Strictly verify the CRD marker validation and defaulting in…
tenzen-y Oct 24, 2024
4d4d2c8
KEP-2170: Initialize runtimes before the manager starts (#2306)
tenzen-y Oct 24, 2024
82d0535
KEP-2170: Generate Python SDK for Kubeflow Training V2 (#2310)
andreyvelich Oct 27, 2024
32854c0
KEP-2170: Create model and dataset initializers (#2303)
andreyvelich Oct 27, 2024
6df87f9
KEP-2170: Implement JobSet, PlainML, and Torch Plugins (#2308)
andreyvelich Oct 31, 2024
ce2febf
KEP-2170: Implement Initializer builders in the JobSet plugin (#2316)
andreyvelich Nov 1, 2024
e1505ac
KEP-2170: Add the TrainJob state transition design (#2298)
tenzen-y Nov 2, 2024
ec176e3
Update tf job examples to tf v2 (#2270)
YosiElias Nov 4, 2024
cc0ef4d
KEP-2170: Add TrainJob conditions (#2322)
tenzen-y Nov 9, 2024
3f5c458
Pin Gloo repository in JAX Dockerfile to a specific commit (#2329)
sandipanpanda Nov 18, 2024
94b8414
[fix] Resolve v2alpha API exceptions (#2317)
varshaprasad96 Nov 22, 2024
ceb4369
Upgrade Kubernetes to v1.30.7 (#2332)
astefanutti Nov 27, 2024
0c4a8d2
Ignore cache exporting errors in the image building workflows (#2336)
tenzen-y Nov 27, 2024
83da2af
KEP-2170: Add Torch Distributed Runtime (#2328)
andreyvelich Nov 28, 2024
b5a8a72
Refine the server-side apply installation args (#2337)
tenzen-y Nov 28, 2024
05baf72
Add openapi-generator CLI option to skip SDK v2 test generation (#2338)
astefanutti Nov 28, 2024
618bf6e
Upgrade kustomization files to Kustomize v5 (#2326)
oksanabaza Nov 28, 2024
1bb35da
Pin accelerate package version in trainer (#2340)
gavrissh Nov 29, 2024
745c445
Replace papermill command with bash script
saileshd1402 Dec 2, 2024
0cd3791
Typo fix
saileshd1402 Dec 2, 2024
651672d
Move Checkout step outside action.yaml file
saileshd1402 Dec 2, 2024
e607e6d
Add newline EOF in script
saileshd1402 Dec 2, 2024
0540b90
Pass python dependencies as args and pin versions
saileshd1402 Dec 2, 2024
8c7f517
Update Usage
saileshd1402 Dec 2, 2024
caeffab
Install dependencies in yaml
saileshd1402 Dec 2, 2024
b545c80
merge conflit fix
saileshd1402 Dec 2, 2024
87999f1
fix ipynb
saileshd1402 Dec 2, 2024
0ee9ca5
set bash flags
saileshd1402 Dec 2, 2024
4ea4bde
Update script args and add more kubernetes versions for tests
saileshd1402 Dec 2, 2024
72dd617
add gang-scheduler-name to template
saileshd1402 Dec 3, 2024
d3e9031
move go setup to template
saileshd1402 Dec 3, 2024
21a6129
remove -p parameter from script
saileshd1402 Dec 9, 2024
30 changes: 3 additions & 27 deletions .github/workflows/integration-tests.yaml
@@ -58,41 +58,17 @@ jobs:
- name: Checkout
uses: actions/checkout@v4

- name: Free-Up Disk Space
uses: ./.github/workflows/free-up-disk-space

- name: Setup Python
uses: actions/setup-python@v5
- name: Setup E2E Tests
uses: ./.github/workflows/setup-e2e-test
with:
kubernetes-version: ${{ matrix.kubernetes-version }}
python-version: ${{ matrix.python-version }}

- name: Setup Go
uses: actions/setup-go@v5
with:
go-version-file: go.mod

- name: Create k8s Kind Cluster
uses: helm/kind-action@9fdad0686e6f19fcd572f62516f5e0436f562ee7
with:
node_image: kindest/node:${{ matrix.kubernetes-version }}
cluster_name: training-operator-cluster
kubectl_version: ${{ matrix.kubernetes-version }}

- name: Build training-operator
run: |
./scripts/gha/build-image.sh
env:
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test

- name: Deploy training operator
run: |
./scripts/gha/setup-training-operator.sh
env:
KIND_CLUSTER: training-operator-cluster
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }}
KUBERNETES_VERSION: ${{ matrix.kubernetes-version }}

- name: Run tests
run: |
pip install pytest
48 changes: 48 additions & 0 deletions .github/workflows/setup-e2e-test/action.yaml
@@ -0,0 +1,48 @@
name: Setup E2E test template
description: A composite action to setup e2e tests

inputs:
kubernetes-version:
required: true
description: kubernetes version
python-version:
required: true
description: Python version
Member:

Should we set the matrix with Kubernetes and Python versions as part of our setup-e2e-test template ?
Right now, we set it in the integration-tests.yaml.
So we can keep it consistent for our E2Es + Notebooks tests.
WDYT @tenzen-y @Electronic-Waste @saileshd1402 ?

Contributor Author:

One small thing: if other steps need some of these versions (for example, gang-scheduler-name here), we will need to export them as environment variables via GITHUB_ENV so that subsequent steps can access them. Is there another way, or is this fine?
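For reference, a composite-action step hands a value to later steps of the calling job by appending to the file that `GITHUB_ENV` points at. The sketch below simulates that mechanism locally (the `volcano` value is purely illustrative, not from this PR; in Actions the runner provides the `GITHUB_ENV` path):

```shell
# Simulate GITHUB_ENV locally; on a real runner this path is pre-set.
GITHUB_ENV="$(mktemp)"

# A composite-action step would run exactly this kind of append:
echo "GANG_SCHEDULER_NAME=volcano" >> "$GITHUB_ENV"

# Subsequent steps in the same job then see GANG_SCHEDULER_NAME as an
# ordinary environment variable.
cat "$GITHUB_ENV"
```

This only propagates within the job that ran the composite action, which is why the discussion below settles on setting the gang scheduler outside the template.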

Member:

Oh, I see. I guess we only use scheduler plugins for integration tests.
@kubeflow/wg-training-leads @saileshd1402 do we want to test our Notebooks with various scheduling plugins as well?
Or do we want to limit the tests that we run with gang-scheduling?

Member:

> Should we set the matrix with Kubernetes and Python versions as part of our setup-e2e-test template ?
> Right now, we set it in the integration-tests.yaml.
> So we can keep it consistent for our E2Es + Notebooks tests.
> WDYT @tenzen-y @Electronic-Waste @saileshd1402 ?

I think so. It would be better if we could execute e2e tests with multiple Kubernetes and Python versions.

Member:

I guess gang-scheduling could be limited to integration-tests.yaml only, since it would be a bit redundant to test it again in the notebook tests.

Member:

@saileshd1402 Maybe to unblock this PR, we can just use GITHUB_ENV for now and set the gang-scheduler only for integration tests.
For the V2 tests, we can come back to this discussion.

saileshd1402 (Contributor Author), Dec 8, 2024:

I found out that we can't use matrix inside a single composite action; it can only be used in job/workflow files. This is because a composite action avoids duplication of steps but can't be used to create more jobs the way a workflow file can. There are also Reusable Workflows, but those can't be used in this case since they spawn a separate workflow to run the template, which means we can't use them to set up the environment of the current job. Related docs: matrix strategies and composite actions.

Member:

I see, thanks for checking!


runs:
using: composite
steps:
- name: Free-Up Disk Space
uses: ./.github/workflows/free-up-disk-space

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ inputs.python-version }}

- name: Create k8s Kind Cluster
uses: helm/kind-action@9fdad0686e6f19fcd572f62516f5e0436f562ee7
with:
node_image: kindest/node:${{ inputs.kubernetes-version }}
cluster_name: training-operator-cluster
kubectl_version: ${{ inputs.kubernetes-version }}

- name: Build training-operator
shell: bash
run: |
./scripts/gha/build-image.sh
env:
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test

- name: Deploy training operator
shell: bash
run: |
./scripts/gha/setup-training-operator.sh
docker system prune -a -f
docker system df
df -h
env:
KIND_CLUSTER: training-operator-cluster
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
GANG_SCHEDULER_NAME: "none"
KUBERNETES_VERSION: ${{ inputs.kubernetes-version }}
39 changes: 39 additions & 0 deletions .github/workflows/test-example-notebooks.yaml
@@ -0,0 +1,39 @@
name: Test example notebooks

on:
- pull_request

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
create-pytorchjob-notebook-test:
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
kubernetes-version: ["v1.28.7", "v1.29.2", "v1.30.6"]
python-version: ["3.9", "3.10", "3.11"]
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Setup E2E Tests
uses: ./.github/workflows/setup-e2e-test
with:
kubernetes-version: ${{ matrix.kubernetes-version }}
python-version: ${{ matrix.python-version }}

- name: Install Python Dependencies
run: |
pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
- name: Run Jupyter Notebook with Papermill
shell: bash
run: |
./scripts/run-notebook.sh \
-i ./examples/pytorch/image-classification/create-pytorchjob.ipynb \
-n default \
-k ./sdk/python
52 changes: 33 additions & 19 deletions examples/pytorch/image-classification/create-pytorchjob.ipynb
@@ -24,6 +24,20 @@
"The notebook shows how to use Kubeflow Training SDK to create, get, wait, check and delete PyTorchJob."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"training_python_sdk='kubeflow-training'\n",
"namespace='kubeflow-user-example-com'"
]
},
{
"cell_type": "markdown",
"metadata": {
@@ -42,12 +56,13 @@
"outputs": [],
"source": [
"# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
"!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python"
"# Install Kubeflow Python SDK\n",
"!pip install {training_python_sdk}"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -93,7 +108,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -102,12 +117,11 @@
"outputs": [],
"source": [
"name = \"pytorch-dist-mnist-gloo\"\n",
"namespace = \"kubeflow-user-example-com\"\n",
"container_name = \"pytorch\"\n",
"\n",
"container = V1Container(\n",
" name=container_name,\n",
" image=\"gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0\",\n",
" image=\"kubeflow/pytorch-dist-mnist:latest\",\n",
" args=[\"--backend\", \"gloo\"],\n",
")\n",
"\n",
@@ -157,7 +171,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -176,8 +190,8 @@
"# Namespace will be reused in every APIs.\n",
"training_client = TrainingClient(namespace=namespace)\n",
"\n",
"# If `job_kind` is not set in `TrainingClient`, we need to set it for each API.\n",
"training_client.create_job(pytorchjob, job_kind=constants.PYTORCHJOB_KIND)"
"# `job_kind` is set in `TrainingClient`\n",
"training_client.create_job(pytorchjob)"
]
},
{
@@ -195,7 +209,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -214,7 +228,7 @@
}
],
"source": [
"training_client.get_job(name, job_kind=constants.PYTORCHJOB_KIND).metadata.name"
"training_client.get_job(name).metadata.name"
]
},
{
@@ -230,7 +244,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -260,7 +274,7 @@
}
],
"source": [
"training_client.get_job_conditions(name=name, job_kind=constants.PYTORCHJOB_KIND)"
"training_client.get_job_conditions(name=name)"
]
},
{
@@ -276,7 +290,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -302,7 +316,7 @@
}
],
"source": [
"pytorchjob = training_client.wait_for_job_conditions(name=name, job_kind=constants.PYTORCHJOB_KIND)\n",
"pytorchjob = training_client.wait_for_job_conditions(name=name)\n",
"\n",
"print(f\"Succeeded number of replicas: {pytorchjob.status.replica_statuses['Master'].succeeded}\")"
]
@@ -320,7 +334,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -339,7 +353,7 @@
}
],
"source": [
"training_client.is_job_succeeded(name=name, job_kind=constants.PYTORCHJOB_KIND)"
"training_client.is_job_succeeded(name=name)"
]
},
{
@@ -355,7 +369,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -476,7 +490,7 @@
}
],
"source": [
"training_client.get_job_logs(name=name, job_kind=constants.PYTORCHJOB_KIND)"
"training_client.get_job_logs(name=name)"
]
},
{
@@ -492,7 +506,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
78 changes: 78 additions & 0 deletions scripts/run-notebook.sh
@@ -0,0 +1,78 @@
#!/bin/bash

# Copyright 2024 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This bash script is used to run the example notebooks

set -o errexit
set -o nounset
set -o pipefail

NOTEBOOK_INPUT=""
NOTEBOOK_OUTPUT="-" # outputs to console
PAPERMILL_PARAMS=()
NAMESPACE="default"
TRAINING_PYTHON_SDK="./sdk/python"

usage() {
echo "Usage: $0 -i <input_notebook> [-o <output_notebook>] [-p \"<param> <value>\"...] [-k <sdk_path>] [-n <namespace>]"
echo "Options:"
echo " -i Input notebook (required)"
echo "  -o  Output notebook (optional, defaults to '-' which outputs to console)"
echo " -p Papermill parameters (optional), pass param name and value pair (in quotes whitespace separated)"
echo " -k Kubeflow Training Operator Python SDK (optional)"
Member:

Do you want to name it as -sdk to make it clearer ?

Contributor Author:

The current implementation uses "getopts", which accepts only single-character option names. I used it so the argument parsing stays short and clean. I can do longer names as well, but should I then update the other args to have longer names too?

Member:

Oh, I see. I think, it's fine to keep it as -k in that case.
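For context, POSIX `getopts` does indeed handle only single-character flags, each declared in the optstring with a trailing `:` when it takes a value. A minimal sketch of the parsing pattern the script relies on (the flag names mirror the script's `-i` and `-n`; the function wrapper is just for illustration):

```shell
# Minimal getopts sketch mirroring the script's single-char flag parsing.
parse_args() {
  local input="" namespace="default"
  local OPTIND=1 opt   # reset OPTIND so the function is reusable
  while getopts "i:n:" opt; do
    case "$opt" in
      i) input="$OPTARG" ;;      # notebook input path
      n) namespace="$OPTARG" ;;  # kubernetes namespace
    esac
  done
  echo "input=$input namespace=$namespace"
}

parse_args -i nb.ipynb -n kubeflow   # → input=nb.ipynb namespace=kubeflow
```

Long options (e.g. `-sdk` or `--sdk`) would require hand-rolled parsing or non-POSIX `getopt`, which is why keeping `-k` is the pragmatic choice here.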

echo " -n Kubernetes namespace used by tests"
echo " -h Show this help message"
echo "NOTE: papermill, jupyter and ipykernel are required Python dependencies to run Notebooks"
exit 1
}

while getopts "i:o:p:k:n:h" opt; do
case "$opt" in
i) NOTEBOOK_INPUT="$OPTARG" ;; # -i for notebook input path
o) NOTEBOOK_OUTPUT="$OPTARG" ;; # -o for notebook output path
p) PAPERMILL_PARAMS+=("$OPTARG") ;; # -p for papermill parameters
Member:

Since you named the other papermill parameter -k, should we name the namespace parameter -n?

Member:

@saileshd1402 Please can you check it, so we can merge the PR ?

saileshd1402 (Contributor Author), Dec 9, 2024:

I added it already I think. Can you please check the latest commits once?

andreyvelich (Member), Dec 9, 2024:

I think, you should remove -p parameter from the flags of this script since it is no longer needed

Member:

E.g. I mean this part:

for param in "${PAPERMILL_PARAMS[@]}"; do
papermill_cmd="$papermill_cmd -p $param"
done

Contributor Author:

Oh, understood: you're saying we should remove the custom papermill parameters from this script. We may need them in the future, but I guess we can add them back if and when necessary. I'll remove them for this PR.

Member:

Yeah, let's add them in the future once we need them.

k) TRAINING_PYTHON_SDK="$OPTARG" ;; # -k for training operator python sdk
n) NAMESPACE="$OPTARG" ;; # -n for kubernetes namespace used by tests
h) usage ;; # -h for help (usage)
*) usage; exit 1 ;;
esac
done

if [ -z "$NOTEBOOK_INPUT" ]; then
echo "Error: -i notebook input path is required."
exit 1
fi

papermill_cmd="papermill $NOTEBOOK_INPUT $NOTEBOOK_OUTPUT -p training_python_sdk $TRAINING_PYTHON_SDK -p namespace $NAMESPACE"
# Add papermill parameters (param name and value)
for param in "${PAPERMILL_PARAMS[@]}"; do
papermill_cmd="$papermill_cmd -p $param"
done
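The loop above builds the command as a flat string and relies on word splitting when `$papermill_cmd` is later expanded unquoted, which breaks if a parameter value contains spaces. A bash-array variant (a sketch, not part of the PR; the notebook path and parameter values are illustrative) keeps each argument intact:

```shell
# Sketch: collect papermill arguments in a bash array instead of a string.
cmd=(papermill "in.ipynb" "-")
cmd+=(-p training_python_sdk "./sdk/python")
cmd+=(-p namespace "default")

# "${cmd[@]}" expands to one word per element, so values containing
# spaces survive; running would be: "${cmd[@]}"
echo "${cmd[*]}"
```

Since all current parameter values are simple tokens, the string-based approach in the script works; the array form only matters once arbitrary user-supplied values are passed through `-p`.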

if ! command -v papermill &> /dev/null; then
echo "Error: papermill is not installed. Please install papermill to proceed."
exit 1
fi

echo "Running command: $papermill_cmd"
$papermill_cmd

if [ $? -ne 0 ]; then
echo "Error: papermill execution failed." >&2
exit 1
fi

echo "Notebook execution completed successfully"