fix(cicd): Reduce memory footprint of e2e tests (#534)

# Description Recently the e2e test job has been [failing](https://github.com/caraml-dev/merlin/actions/runs/7880641930/job/21503099773) due to errors in deploying a model version, which would produce messages such as: ``` An error occurred (XMinioStorageFull) when calling the PutObject operation: Storage backend has reached its minimum free disk threshold. Please delete a few objects to proceed. ``` Upon digging further, there appears to be a lack of ephemeral storage on the k3d node we set up in the e2e tests, which causes a deployment's pods (the transformer usually) to be evicted: ``` The node was low on resource: ephemeral-storage. Threshold quantity: 778512530, available: 748868Ki. ``` Apparently we’ve been encountering this problem for some time already (@ariefrahmansyah tried to debug and address this in a previous PR #446), and we already tried some of the workarounds suggested here: - https://k3d.io/v5.4.6/faq/faq/#pods-evicted-due-to-lack-of-disk-space - k3d-io/k3d#133 As a quick fix to work around these issues, given that some of the workarounds have already been implemented, this PR introduces some changes to the e2e test setup: - Deleting the model version created after each e2e test - Making the e2e tests (run using pytest) serialised (not [parallel](https://github.com/caraml-dev/merlin/blob/e110a96763e96333518b53ef6b9c18785fd08fa0/scripts/e2e/run-e2e.sh#L36)) - Using docker system prune instead of just docker image prune [before](https://github.com/caraml-dev/merlin/blob/e110a96763e96333518b53ef6b9c18785fd08fa0/.github/workflows/merlin.yml#L474) running the e2e tests - Lowering the kubelet eviction thresholds even further from 1% to 0.5% # Modifications - `.github/workflows/merlin.yml` - Reduction of kubelet eviction thresholds and replacement of docker image pruning with a system-level pruning - `python/sdk/test/integration_test.py` - Addition of steps to delete model versions created after each test run - `scripts/e2e/run-e2e.sh` - Reduction of pytest workers from 4 
to 1 to remove parallelism # Tests # Checklist - [x] Added PR label - [x] Added unit test, integration, and/or e2e tests - [x] Tested locally - [ ] Updated documentation - [ ] Update Swagger spec if the PR introduces API changes - [ ] Regenerated Golang and Python client if the PR introduces API changes # Release Notes ```release-note NONE ```
caraml-dev · Feb 14, 2024 · e4a0a01 · e4a0a01
1 parent 1269dbf
commit e4a0a01
Show file tree

Hide file tree

Showing 5 changed files with 145 additions and 8 deletions.
diff --git a/.github/workflows/merlin.yml b/.github/workflows/merlin.yml
@@ -441,10 +441,10 @@ jobs:
           k3d registry create $LOCAL_REGISTRY --port $LOCAL_REGISTRY_PORT
           k3d cluster create $K3D_CLUSTER --image rancher/k3s:${K3S_VERSION} --port 80:80@loadbalancer \
             --k3s-arg '--disable=traefik,metrics-server@server:*' \
-            --k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<1%,nodefs.available<1%@server:0' \
-            --k3s-arg '--kubelet-arg=eviction-minimum-reclaim=imagefs.available=1%,nodefs.available=1%@server:0' \
-            --k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<1%,nodefs.available<1%@agent:*' \
-            --k3s-arg '--kubelet-arg=eviction-minimum-reclaim=imagefs.available=1%,nodefs.available=1%@agent:*'
+            --k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<0.5%,nodefs.available<0.5%@server:0' \
+            --k3s-arg '--kubelet-arg=eviction-minimum-reclaim=imagefs.available=0.5%,nodefs.available=0.5%@server:0' \
+            --k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<0.5%,nodefs.available<0.5%@agent:*' \
+            --k3s-arg '--kubelet-arg=eviction-minimum-reclaim=imagefs.available=0.5%,nodefs.available=0.5%@agent:*'
       - name: Setup cluster
         working-directory: merlin/scripts/e2e
         run: ./setup-cluster.sh merlin-cluster
@@ -470,8 +470,10 @@ jobs:
       - name: Deploy merlin and mlp
         working-directory: merlin/scripts/e2e
         run: ./deploy-merlin.sh ${{ env.INGRESS_HOST }} ${{ env.LOCAL_REGISTRY }}:${{ env.LOCAL_REGISTRY_PORT }} ${{ needs.create-version.outputs.version }} ${{ github.ref }} ${{ env.MERLIN_CHART_VERSION }}
-      - name: Prune docker image to make some space
-        run: docker image prune --all --force
+      - name: Prune docker system to make some space
+        run: docker system prune --all --force
+      - name: Print space consumption
+        run: sudo df -h
       - name: Run E2E Test
         timeout-minutes: 30
         id: run-e2e-test
@@ -482,6 +484,11 @@ jobs:
         continue-on-error: true
         working-directory: merlin/scripts/e2e
         run: ./debug-e2e.sh
+      - name: "Print post-e2e test space consumption"
+        if: always()
+        continue-on-error: true
+        working-directory: merlin/scripts/e2e
+        run: sudo df -h
 
   release:
     uses: ./.github/workflows/release.yml

diff --git a/python/sdk/test/conftest.py b/python/sdk/test/conftest.py
@@ -82,7 +82,12 @@ def model(project, mlflow_url, api_client):
 @pytest.fixture
 def version(project, model, mlflow_url, api_client):
     mlflow.set_tracking_uri(mlflow_url)
-    r = mlflow.start_run()
+    run_id = None
+    if not mlflow.get_experiment_by_name("unit_test_experiment"):
+        run_id = mlflow.create_experiment(
+            name="unit_test_experiment"
+        )
+    r = mlflow.start_run(experiment_id=run_id)
     mlflow.end_run()
     v = cl.Version(
         id=1,

diff --git a/python/sdk/test/integration_test.py b/python/sdk/test/integration_test.py
@@ -78,6 +78,13 @@ def test_model_version_with_labels(
     )
     assert len(should_not_exist_versions) == 0
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.integration
 @pytest.mark.dependency()
@@ -109,6 +116,13 @@ def test_sklearn(
 
     merlin.undeploy(v)
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.integration
 @pytest.mark.dependency()
@@ -141,6 +155,14 @@ def test_xgboost(
 
     merlin.undeploy(v)
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
+
 @pytest.mark.integration
 @pytest.mark.dependency()
 @pytest.mark.parametrize(
@@ -190,6 +212,12 @@ def test_model_schema(
 
     merlin.undeploy(v)
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
 
 
 @pytest.mark.integration
@@ -239,6 +267,13 @@ def test_mlflow_tracking(
         # artifact_dir = os.listdir('test/downloaded_artifact')
         # assert len(artifact_dir) > 0  # not empty directory
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.integration
 @pytest.mark.dependency()
@@ -271,6 +306,13 @@ def test_tensorflow(
 
     merlin.undeploy(v)
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.pytorch
 @pytest.mark.integration
@@ -296,6 +338,13 @@ def test_pytorch(integration_test_url, project_name, use_google_oauth, requests)
 
     merlin.undeploy(v)
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.serving
 @pytest.mark.integration
@@ -337,6 +386,13 @@ def test_set_traffic(integration_test_url, project_name, use_google_oauth, reque
     # Undeploy other running model version endpoints
     undeploy_all_version()
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.serving
 @pytest.mark.integration
@@ -378,6 +434,13 @@ def test_serve_traffic(integration_test_url, project_name, use_google_oauth, req
     # Undeploy other running model version endpoints
     undeploy_all_version()
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.integration
 def test_multi_env(integration_test_url, project_name, use_google_oauth, requests):
@@ -408,6 +471,13 @@ def test_multi_env(integration_test_url, project_name, use_google_oauth, request
 
     merlin.undeploy(v)
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.integration
 @pytest.mark.parametrize(
@@ -451,6 +521,13 @@ def test_resource_request(
 
     merlin.undeploy(v)
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.gpu
 @pytest.mark.integration
@@ -549,6 +626,13 @@ def test_logger(
 
     merlin.undeploy(v)
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.customtransformer
 @pytest.mark.integration
@@ -723,6 +807,13 @@ def test_standard_transformer_without_feast(
     )  # asserts lhs = rhs, with tolerance
     merlin.undeploy(v)
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.feast
 @pytest.mark.integration
@@ -976,6 +1067,13 @@ def test_custom_model_without_artifact(
     # Undeploy other running model version endpoints
     undeploy_all_version()
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.integration
 @pytest.mark.parametrize(
@@ -1013,6 +1111,13 @@ def test_custom_model_with_artifact(
     # Undeploy other running model version endpoints
     undeploy_all_version()
 
+    print(f"Deleting model version created...")
+    try:
+        v.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.raw_deployment
 @pytest.mark.integration
@@ -1099,6 +1204,14 @@ def test_deployment_mode_for_serving_model(
     merlin.stop_serving_traffic(model_endpoint.environment_name)
     undeploy_all_version()
 
+    print(f"Deleting model version created...")
+    try:
+        v1.delete_model_version()
+        v2.delete_model_version()
+    except Exception as e:
+        print(e)
+    print(f"Model version deleted.")
+
 
 @pytest.mark.integration
 def test_redeploy_model(integration_test_url, project_name, use_google_oauth, requests):
@@ -1183,6 +1296,14 @@ def test_redeploy_model(integration_test_url, project_name, use_google_oauth, re
 
     undeploy_all_version()
 
+    print(f"Deleting model versions created...")
+    for v in merlin.active_model().list_version():
+        try:
+            v.delete_model_version()
+        except Exception as e:
+            print(e)
+    print(f"Model versions deleted.")
+
 
 def deployment_mode_suffix(deployment_mode: DeploymentMode):
     return deployment_mode.value.lower()[0:1]

diff --git a/scripts/e2e/debug-e2e.sh b/scripts/e2e/debug-e2e.sh
@@ -4,6 +4,10 @@ echo "::group::Get nodes"
 kubectl get nodes
 echo "::endgroup::"
 
+echo "::group::Describe nodes"
+kubectl describe nodes
+echo "::endgroup::"
+
 echo "::group::Get all events"
 kubectl get events -A
 echo "::endgroup::"

diff --git a/scripts/e2e/run-e2e.sh b/scripts/e2e/run-e2e.sh
@@ -33,4 +33,4 @@ kubectl create namespace ${E2E_PROJECT_NAME} --dry-run=client -o yaml | kubectl
 cd ../../python/sdk
 pip install pipenv==2023.7.23
 pipenv install --dev --skip-lock --python ${PYTHON_VERSION}
-pipenv run pytest -n=4 -W=ignore --cov=merlin -m "not (gpu or feast or batch or pyfunc or local_server_test or cli or customtransformer)" --durations=0
+pipenv run pytest -n=1 -W=ignore --cov=merlin -m "not (gpu or feast or batch or pyfunc or local_server_test or cli or customtransformer)" --durations=0