# Add e2e test for train API #49 - workflow file for this run.
# NOTE: this file was flagged as containing bidirectional Unicode text that may
# be interpreted or compiled differently than what appears below. To review,
# open the file in an editor that reveals hidden Unicode characters.
# Display name of the workflow.
name: E2E Test with train API

# Trigger: run for pull request events.
on:
  - pull_request

# Runs are grouped by workflow name and git ref; starting a new run in a group
# cancels the run that is still in progress for that group.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Builds the training-operator, storage-initializer and trainer images,
  # loads them into a Kind cluster, and runs the train API e2e test with
  # pytest, once per matrix entry.
  e2e-test:
    runs-on: ubuntu-latest
    strategy:
      # Let the remaining matrix entries finish when one of them fails.
      fail-fast: false
      matrix:
        kubernetes-version: ["v1.28.7"]
        # Quoted so that "3.10" stays a string instead of the float 3.1.
        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Free-Up Disk Space
        # Repository-local action, resolved from the checked-out tree.
        uses: ./.github/workflows/free-up-disk-space

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Create k8s Kind Cluster
        # NOTE(review): the version tag of this action was lost to e-mail
        # obfuscation ("helm/[email protected]") in the reviewed copy.
        # "v1" is a placeholder - restore the exact tag the repository pins.
        uses: helm/kind-action@v1
        with:
          node_image: kindest/node:${{ matrix.kubernetes-version }}
          cluster_name: training-operator-cluster
          kubectl_version: ${{ matrix.kubernetes-version }}

      - name: Build training-operator
        run: ./scripts/gha/build-image.sh
        env:
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test

      - name: Deploy training operator
        run: ./scripts/gha/setup-training-operator.sh
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
          GANG_SCHEDULER_NAME: "none"
          KUBERNETES_VERSION: ${{ matrix.kubernetes-version }}

      # Several steps below end with `docker system df` and `df -h` so that
      # the job log records Docker and filesystem disk usage at that point.
      - name: Prune docker images
        shell: bash
        run: |
          docker image prune -a -f
          docker system df
          df -h

      - name: Build storage initializer and trainer
        run: |
          ./scripts/gha/setup-storage-initializer-and-trainer.sh
          docker system df
          df -h
        env:
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      - name: Load storage initializer
        # The step-level env values are read as quoted shell variables rather
        # than being spliced into the script text with ${{ env.* }}.
        run: |
          kind load docker-image "${STORAGE_INITIALIZER_CI_IMAGE}" --name "${KIND_CLUSTER}"
          docker system df
          df -h
        env:
          KIND_CLUSTER: training-operator-cluster
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test

      - name: Remove image
        # Deletes the runner-side copy of the image that the previous step
        # loaded into the Kind cluster.
        run: |
          docker rmi "${STORAGE_INITIALIZER_CI_IMAGE}"
          docker system df
          df -h
        env:
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test

      - name: Monitor resources usage of node
        # `docker exec` runs without -it: the runner provides no TTY, and
        # requesting one fails with "the input device is not a TTY".
        run: |
          echo "Monitor resources usage of node"
          kubectl describe nodes training-operator-cluster-control-plane
          echo "Monitor resources usage of pods"
          kubectl get pods --all-namespaces
          echo "Monitor resources usage of storage"
          docker exec training-operator-cluster-control-plane df -h

      - name: Load trainer
        # Loads the trainer image into the Kind cluster, then deletes the
        # runner-side copy.
        run: |
          kind load docker-image "${TRAINER_CI_IMAGE}" --name "${KIND_CLUSTER}"
          docker rmi "${TRAINER_CI_IMAGE}"
          docker system df
          df -h
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      - name: Run tests
        # The extras spec is quoted so the shell never treats "[...]" as a
        # glob pattern; both installs go through the same interpreter's pip.
        run: |
          python3 -m pip install pytest
          python3 -m pip install -e "sdk/python[huggingface]"
          pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug
        env:
          STORAGE_INITIALIZER_IMAGE: kubeflowtraining/storage-initializer:test
          TRAINER_TRANSFORMER_IMAGE_DEFAULT: kubeflowtraining/trainer:test