diff --git a/.github/workflows/merlin.yml b/.github/workflows/merlin.yml index 5e118094f..0081382e3 100644 --- a/.github/workflows/merlin.yml +++ b/.github/workflows/merlin.yml @@ -441,10 +441,10 @@ jobs: k3d registry create $LOCAL_REGISTRY --port $LOCAL_REGISTRY_PORT k3d cluster create $K3D_CLUSTER --image rancher/k3s:${K3S_VERSION} --port 80:80@loadbalancer \ --k3s-arg '--disable=traefik,metrics-server@server:*' \ - --k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<1%,nodefs.available<1%@server:0' \ - --k3s-arg '--kubelet-arg=eviction-minimum-reclaim=imagefs.available=1%,nodefs.available=1%@server:0' \ - --k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<1%,nodefs.available<1%@agent:*' \ - --k3s-arg '--kubelet-arg=eviction-minimum-reclaim=imagefs.available=1%,nodefs.available=1%@agent:*' + --k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<0.5%,nodefs.available<0.5%@server:0' \ + --k3s-arg '--kubelet-arg=eviction-minimum-reclaim=imagefs.available=0.5%,nodefs.available=0.5%@server:0' \ + --k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<0.5%,nodefs.available<0.5%@agent:*' \ + --k3s-arg '--kubelet-arg=eviction-minimum-reclaim=imagefs.available=0.5%,nodefs.available=0.5%@agent:*' - name: Setup cluster working-directory: merlin/scripts/e2e run: ./setup-cluster.sh merlin-cluster @@ -470,8 +470,10 @@ jobs: - name: Deploy merlin and mlp working-directory: merlin/scripts/e2e run: ./deploy-merlin.sh ${{ env.INGRESS_HOST }} ${{ env.LOCAL_REGISTRY }}:${{ env.LOCAL_REGISTRY_PORT }} ${{ needs.create-version.outputs.version }} ${{ github.ref }} ${{ env.MERLIN_CHART_VERSION }} - - name: Prune docker image to make some space - run: docker image prune --all --force + - name: Prune docker system to make some space + run: docker system prune --all --force + - name: Print space consumption + run: sudo df -h - name: Run E2E Test timeout-minutes: 30 id: run-e2e-test @@ -482,6 +484,11 @@ jobs: continue-on-error: true working-directory: merlin/scripts/e2e run: ./debug-e2e.sh + - name: "Print post-e2e test space consumption" + if: always() + continue-on-error: true + working-directory: merlin/scripts/e2e + run: sudo df -h release: uses: ./.github/workflows/release.yml diff --git a/python/sdk/test/conftest.py b/python/sdk/test/conftest.py index e00d86146..6b8f8ff6e 100644 --- a/python/sdk/test/conftest.py +++ b/python/sdk/test/conftest.py @@ -82,7 +82,12 @@ def model(project, mlflow_url, api_client): @pytest.fixture def version(project, model, mlflow_url, api_client): mlflow.set_tracking_uri(mlflow_url) - r = mlflow.start_run() + run_id = None + if not mlflow.get_experiment_by_name("unit_test_experiment"): + run_id = mlflow.create_experiment( + name="unit_test_experiment" + ) + r = mlflow.start_run(experiment_id=run_id) mlflow.end_run() v = cl.Version( id=1, diff --git a/python/sdk/test/integration_test.py b/python/sdk/test/integration_test.py index 07bebd175..d477eb2dc 100644 --- a/python/sdk/test/integration_test.py +++ b/python/sdk/test/integration_test.py @@ -78,6 +78,13 @@ def test_model_version_with_labels( ) assert len(should_not_exist_versions) == 0 + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.integration @pytest.mark.dependency() @@ -109,6 +116,13 @@ def test_sklearn( merlin.undeploy(v) + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.integration @pytest.mark.dependency() @@ -141,6 +155,14 @@ def test_xgboost( merlin.undeploy(v) + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + + @pytest.mark.integration @pytest.mark.dependency() @pytest.mark.parametrize( @@ -190,6 +212,12 @@ def test_model_schema( merlin.undeploy(v) + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") @pytest.mark.integration @@ -239,6 +267,13 @@ def test_mlflow_tracking( # artifact_dir = os.listdir('test/downloaded_artifact') # assert len(artifact_dir) > 0 # not empty directory + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.integration @pytest.mark.dependency() @@ -271,6 +306,13 @@ def test_tensorflow( merlin.undeploy(v) + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.pytorch @pytest.mark.integration @@ -296,6 +338,13 @@ def test_pytorch(integration_test_url, project_name, use_google_oauth, requests) merlin.undeploy(v) + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.serving @pytest.mark.integration @@ -337,6 +386,13 @@ def test_set_traffic(integration_test_url, project_name, use_google_oauth, reque # Undeploy other running model version endpoints undeploy_all_version() + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.serving @pytest.mark.integration @@ -378,6 +434,13 @@ def test_serve_traffic(integration_test_url, project_name, use_google_oauth, req # Undeploy other running model version endpoints undeploy_all_version() + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.integration def test_multi_env(integration_test_url, project_name, use_google_oauth, requests): @@ -408,6 +471,13 @@ def test_multi_env(integration_test_url, project_name, use_google_oauth, request merlin.undeploy(v) + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.integration @pytest.mark.parametrize( @@ -451,6 +521,13 @@ def test_resource_request( merlin.undeploy(v) + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.gpu @pytest.mark.integration @@ -549,6 +626,13 @@ def test_logger( merlin.undeploy(v) + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.customtransformer @pytest.mark.integration @@ -723,6 +807,13 @@ def test_standard_transformer_without_feast( ) # asserts lhs = rhs, with tolerance merlin.undeploy(v) + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.feast @pytest.mark.integration @@ -976,6 +1067,13 @@ def test_custom_model_without_artifact( # Undeploy other running model version endpoints undeploy_all_version() + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.integration @pytest.mark.parametrize( @@ -1013,6 +1111,13 @@ def test_custom_model_with_artifact( # Undeploy other running model version endpoints undeploy_all_version() + print(f"Deleting model version created...") + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.raw_deployment @pytest.mark.integration @@ -1099,6 +1204,14 @@ def test_deployment_mode_for_serving_model( merlin.stop_serving_traffic(model_endpoint.environment_name) undeploy_all_version() + print(f"Deleting model version created...") + try: + v1.delete_model_version() + v2.delete_model_version() + except Exception as e: + print(e) + print(f"Model version deleted.") + @pytest.mark.integration def test_redeploy_model(integration_test_url, project_name, use_google_oauth, requests): @@ -1183,6 +1296,14 @@ def test_redeploy_model(integration_test_url, project_name, use_google_oauth, re undeploy_all_version() + print(f"Deleting model versions created...") + for v in merlin.active_model().list_version(): + try: + v.delete_model_version() + except Exception as e: + print(e) + print(f"Model versions deleted.") + def deployment_mode_suffix(deployment_mode: DeploymentMode): return deployment_mode.value.lower()[0:1] diff --git a/scripts/e2e/debug-e2e.sh b/scripts/e2e/debug-e2e.sh index bd28305aa..63f881d31 100755 --- a/scripts/e2e/debug-e2e.sh +++ b/scripts/e2e/debug-e2e.sh @@ -4,6 +4,10 @@ echo "::group::Get nodes" kubectl get nodes echo "::endgroup::" +echo "::group::Describe nodes" +kubectl describe nodes +echo "::endgroup::" + echo "::group::Get all events" kubectl get events -A echo "::endgroup::" diff --git a/scripts/e2e/run-e2e.sh b/scripts/e2e/run-e2e.sh index 3d1f9c1fa..988c00bf3 100755 --- a/scripts/e2e/run-e2e.sh +++ b/scripts/e2e/run-e2e.sh @@ -33,4 +33,4 @@ kubectl create namespace ${E2E_PROJECT_NAME} --dry-run=client -o yaml | kubectl cd ../../python/sdk pip install pipenv==2023.7.23 pipenv install --dev --skip-lock --python ${PYTHON_VERSION} -pipenv run pytest -n=4 -W=ignore --cov=merlin -m "not (gpu or feast or batch or pyfunc or local_server_test or cli or customtransformer)" --durations=0 +pipenv run pytest -n=1 -W=ignore --cov=merlin -m "not (gpu or feast or batch or pyfunc or local_server_test or cli or customtransformer)" --durations=0