From 4b6f0027d1ecb2e9d4a7a1f7f2bb688f40406810 Mon Sep 17 00:00:00 2001 From: Michal Hucko Date: Mon, 30 Sep 2024 13:36:18 +0200 Subject: [PATCH] Add uats CI for bundle on EKS (#276) * Add tests for running UATs --------- Co-authored-by: Noha Ihab <49988746+NohaIhab@users.noreply.github.com> --- .github/cluster.yaml | 12 +- .github/dependencies.yaml | 10 ++ .github/workflows/deploy-eks.yaml | 158 ++++++++++++++------ scripts/gh-actions/parse_versions.py | 25 ++++ tests/integration/test_bundle_deployment.py | 119 +++++++++++++++ tox.ini | 20 +++ 6 files changed, 294 insertions(+), 50 deletions(-) create mode 100644 .github/dependencies.yaml create mode 100644 scripts/gh-actions/parse_versions.py create mode 100644 tests/integration/test_bundle_deployment.py diff --git a/.github/cluster.yaml b/.github/cluster.yaml index 8f95251a..a5be0a2f 100644 --- a/.github/cluster.yaml +++ b/.github/cluster.yaml @@ -1,7 +1,7 @@ apiVersion: eksctl.io/v1alpha5 availabilityZones: -- {{ region }}a -- {{ region }}b +- eu-central-1a +- eu-central-1b cloudWatch: clusterLogging: {} iam: @@ -10,8 +10,6 @@ iam: addons: - name: aws-ebs-csi-driver serviceAccountRoleARN: "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy" - wellKnownPolicies: - ebsCSIController: true kind: ClusterConfig kubernetesNetworkConfig: ipFamily: IPv4 @@ -35,6 +33,6 @@ managedNodeGroups: alpha.eksctl.io/nodegroup-type: managed volumeSize: 100 metadata: - name: mlflow-bundle-test - region: {{ region }} - version: "1.24" + name: mlflow-test + region: eu-central-1 + version: "1.26" diff --git a/.github/dependencies.yaml b/.github/dependencies.yaml new file mode 100644 index 00000000..4d7cd679 --- /dev/null +++ b/.github/dependencies.yaml @@ -0,0 +1,10 @@ +"2.15": + K8S_VERSION: "1.29" + JUJU_VERSION: "3.4" + JUJU_VERSION_WITH_PATCH: "3.4.4" + UATS_BRANCH: "main" +latest: + K8S_VERSION: "1.29" + JUJU_VERSION: "3.4" + JUJU_VERSION_WITH_PATCH: "3.4.4" + UATS_BRANCH: "main" diff --git 
a/.github/workflows/deploy-eks.yaml b/.github/workflows/deploy-eks.yaml index d99ebd9c..399bf5b9 100644 --- a/.github/workflows/deploy-eks.yaml +++ b/.github/workflows/deploy-eks.yaml @@ -1,32 +1,85 @@ -name: Create EKS cluster, deploy MLflow and run bundle test +name: Create EKS cluster, deploy CKF and MLflow and run MLflow bundle UATs on: workflow_dispatch: # This event allows manual triggering from the Github UI - secrets: - BUNDLE_KUBEFLOW_EKS_AWS_ACCESS_KEY_ID: - required: true - BUNDLE_KUBEFLOW_EKS_AWS_SECRET_ACCESS_KEY: - required: true inputs: - region: - description: 'Insert the AWS Region name in which the script will deploy the EKS cluster.' + bundle_version: + description: 'Comma-separated list of bundle versions e.g. 2.15, latest. Make sure that the corresponding K8s version is supported by the cloud.' + default: '2.15, latest' + required: true + k8s_version: + description: 'Kubernetes version to be used for the EKS cluster' + required: false + uats_branch: + description: 'Branch to run the UATs from e.g. main or track/1.9. By default, this is defined by the dependencies.yaml file.' 
required: false - default: 'eu-central-1' - type: string schedule: - - cron: "23 0 * * 2" + - cron: "23 0 * * 4" jobs: - deploy-mlflow-to-eks: + preprocess-input: + runs-on: ubuntu-22.04 + outputs: + processed_bundle_versions: ${{ steps.process_bundle_versions.outputs.bundle_versions }} + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Process bundle versions + id: process_bundle_versions + run: python scripts/gh-actions/parse_versions.py + + deploy-ckf-to-eks: + needs: preprocess-input runs-on: ubuntu-22.04 + strategy: + matrix: + bundle_version: ${{ fromJSON(needs.preprocess-input.outputs.processed_bundle_versions) }} + fail-fast: false + env: + PYTHON_VERSION: "3.8" steps: - name: Checkout repository uses: actions/checkout@v2 - - name: Install tox + - name: Run YAML to Github Output Action + id: yaml-output + uses: christian-ci/action-yaml-github-output@v2 + with: + file_path: ".github/dependencies.yaml" + main_key: ${{ matrix.bundle_version }} + + - name: Update ENV variables from inputs if available run: | - python -m pip install --upgrade pip - pip install tox - + K8S_VERSION=${{ inputs.k8s_version || env.K8S_VERSION }} + echo "K8S_VERSION=${K8S_VERSION}" >> $GITHUB_ENV + UATS_BRANCH=${{ inputs.uats_branch || env.UATS_BRANCH }} + echo "UATS_BRANCH=${UATS_BRANCH}" >> $GITHUB_ENV + + # Remove once https://github.com/canonical/bundle-kubeflow/issues/761 + # is resolved and applied to uats repository. 
+ - name: Install python ${{ env.PYTHON_VERSION }} + run: | + sudo add-apt-repository ppa:deadsnakes/ppa -y + sudo apt update -y + sudo apt install python${{ env.PYTHON_VERSION }} python${{ env.PYTHON_VERSION }}-distutils python${{ env.PYTHON_VERSION }}-venv -y + + - name: Install CLI tools + run: | + wget https://bootstrap.pypa.io/get-pip.py + python${{ env.PYTHON_VERSION }} get-pip.py + python${{ env.PYTHON_VERSION }} -m pip install tox + sudo snap install charmcraft --classic + # We need to install from binary because of this https://bugs.launchpad.net/juju/+bug/2007575 + curl -LO https://launchpad.net/juju/${{ env.JUJU_VERSION }}/${{ env.JUJU_VERSION_WITH_PATCH }}/+download/juju-${{ env.JUJU_VERSION_WITH_PATCH }}-linux-amd64.tar.xz + tar xf juju-${{ env.JUJU_VERSION_WITH_PATCH }}-linux-amd64.tar.xz + sudo install -o root -g root -m 0755 juju /usr/local/bin/juju + juju version + - name: Configure AWS Credentials env: AWS_ACCESS_KEY_ID: ${{ secrets.BUNDLE_KUBEFLOW_EKS_AWS_ACCESS_KEY_ID }} @@ -34,11 +87,11 @@ jobs: run: | aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY - aws configure set default.region ${{ inputs.region }} + aws configure set default.region eu-central-1 - name: Install kubectl run: | - sudo snap install kubectl --classic --channel=1.24/stable + sudo snap install kubectl --classic --channel=${{ env.K8S_VERSION }}/stable mkdir ~/.kube kubectl version --client @@ -50,54 +103,73 @@ jobs: sudo mv /tmp/eksctl /usr/local/bin eksctl version - - name: Install juju - run: | - sudo snap install juju --classic --channel=2.9/stable - sudo snap install charmcraft --classic - juju version - - name: Create cluster run: | - sed -i "s/{{ region }}/${{ inputs.region }}/" .github/cluster.yaml + VERSION=${{ matrix.bundle_version }} + VERSION_WITHOUT_DOT="${VERSION//.}" + yq e ".metadata.name |= \"mlflow-test-$VERSION_WITHOUT_DOT\"" -i .github/cluster.yaml + yq e ".metadata.version |= \"${{ 
env.K8S_VERSION }}\"" -i .github/cluster.yaml eksctl create cluster -f .github/cluster.yaml kubectl get nodes - name: Setup juju run: | - juju add-k8s kubeflow --client - juju bootstrap --no-gui kubeflow kubeflow-controller + juju add-k8s eks --client + juju bootstrap eks kubeflow-controller juju add-model kubeflow - name: Test bundle deployment + run: | + tox -vve test_bundle_deployment-${{ matrix.bundle_version }} -- --model kubeflow --keep-models -vv -s + + - name: Run Kubeflow UATs run: | - tox -vve bundle-test -- --model kubeflow --keep-models -vv -s - + git clone https://github.com/canonical/charmed-kubeflow-uats.git ~/charmed-kubeflow-uats + cd ~/charmed-kubeflow-uats + git checkout ${{ env.UATS_BRANCH }} + tox -e mlflow-remote + # On failure, capture debugging resources - - name: Get all kubernetes resources - run: kubectl get all -A - if: failure() + - name: Save debug artifacts + uses: canonical/kubeflow-ci/actions/dump-charm-debug-artifacts@main + if: failure() || cancelled() + # On failure, capture debugging resources - name: Get juju status run: juju status - if: failure() + if: failure() || cancelled() + + - name: Get juju debug logs + run: juju debug-log --replay --no-tail + if: failure() || cancelled() + + - name: Get all kubernetes resources + run: kubectl get all -A + if: failure() || cancelled() + + - name: Get logs from pods with status = Pending + run: kubectl -n kubeflow get pods | tail -n +2 | grep Pending | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100 + if: failure() || cancelled() - - name: Get workload logs - run: kubectl logs --tail 100 -ntesting -lapp.kubernetes.io/name=mlflow-server - if: failure() + - name: Get logs from pods with status = Failed + run: kubectl -n kubeflow get pods | tail -n +2 | grep Failed | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100 + if: failure() || cancelled() - - name: Get operator logs - run: kubectl logs --tail 100 -ntesting 
-loperator.juju.is/name=mlflow-server - if: failure() + - name: Get logs from pods with status = CrashLoopBackOff + run: kubectl -n kubeflow get pods | tail -n +2 | grep CrashLoopBackOff | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100 + if: failure() || cancelled() - - name: Remove eks + - name: Delete EKS cluster if: always() run: | - eksctl delete cluster --name=mlflow-bundle-test + VERSION=${{ matrix.bundle_version }} + VERSION_WITHOUT_DOT="${VERSION//.}" + eksctl delete cluster --region eu-central-1 --name=mlflow-test-$VERSION_WITHOUT_DOT delete-unattached-volumes: if: always() uses: ./.github/workflows/delete-aws-volumes.yaml secrets: inherit with: - region: ${{ inputs.region }} - needs: [deploy-mlflow-to-eks] + region: eu-central-1 + needs: [deploy-ckf-to-eks] diff --git a/scripts/gh-actions/parse_versions.py b/scripts/gh-actions/parse_versions.py new file mode 100644 index 00000000..7f768f55 --- /dev/null +++ b/scripts/gh-actions/parse_versions.py @@ -0,0 +1,25 @@ +import os +import sys +import json + +# Parse the versions given as a comma-separated list and return a JSON array +def parse_versions(input_versions): + # Default version string if the input is empty + if not input_versions: + input_versions = "2.15,latest" + else: + # Remove whitespace between entries + input_versions = input_versions.replace(" ", "") + + # Convert to JSON array + json_array = json.dumps(input_versions.split(",")) + return json_array + +if __name__ == "__main__": + # Read the input of the Github Action from the environment variable + input_versions = os.getenv('INPUT_BUNDLE_VERSION', '') + json_array = parse_versions(input_versions) + print(f"bundle_versions={json_array}") + with open(os.environ['GITHUB_OUTPUT'], 'a') as output_file: + output_file.write(f"bundle_versions={json_array}\n") + diff --git a/tests/integration/test_bundle_deployment.py b/tests/integration/test_bundle_deployment.py new file mode 100644 index 00000000..ace90fc3 
--- /dev/null +++ b/tests/integration/test_bundle_deployment.py @@ -0,0 +1,119 @@ +import os + +import aiohttp +import lightkube +import pytest +from lightkube.resources.core_v1 import Service +from pytest_operator.plugin import OpsTest + +# Environment variables +KUBEFLOW_CHANNEL = os.environ.get( + "KUBEFLOW_CHANNEL", "1.9/stable" +) # Default to '1.9/stable' if not set +RESOURCE_DISPATCHER_CHANNEL = os.environ.get( + "RESOURCE_DISPATCHER_CHANNEL", "2.0/stable" +) # Default to '2.0/stable' if not set + + +@pytest.fixture() +def lightkube_client() -> lightkube.Client: + client = lightkube.Client(field_manager="kubeflow") + return client + + +@pytest.fixture +def bundle_path() -> str: + return os.environ.get("BUNDLE_PATH").replace('"', "") + + +async def deploy_bundle(ops_test: OpsTest, bundle_path, trust: bool) -> None: + """Deploy a bundle from file using juju CLI.""" + run_args = ["juju", "deploy", "-m", ops_test.model_full_name, f"{bundle_path}"] + if trust: + run_args.append("--trust") + retcode, stdout, stderr = await ops_test.run(*run_args) + print(stdout) + assert retcode == 0, f"Deploy failed: {(stderr or stdout).strip()}" + + +class TestCharm: + @pytest.mark.abort_on_fail + async def test_deploy_bundles_and_resource_dispatcher( + self, ops_test: OpsTest, lightkube_client, bundle_path + ): + """ + Deploy the Kubeflow bundle, a custom bundle from the given bundle path, + and the resource-dispatcher charm. Then, integrate the components + and wait for the model to become active and idle. 
+ """ + # Deploy Kubeflow with channel and trust + await ops_test.model.deploy( + entity_url="kubeflow", + channel=KUBEFLOW_CHANNEL, + trust=True, + ) + + # Deploy the bundle path + await deploy_bundle(ops_test, bundle_path, trust=True) + + # Deploy resource-dispatcher with its channel and trust + await ops_test.model.deploy( + entity_url="resource-dispatcher", + channel=RESOURCE_DISPATCHER_CHANNEL, + trust=True, + ) + + # Relate services as per Juju integrations + await ops_test.model.relate("mlflow-server:secrets", "resource-dispatcher:secrets") + await ops_test.model.relate( + "mlflow-server:pod-defaults", "resource-dispatcher:pod-defaults" + ) + await ops_test.model.relate( + "mlflow-minio:object-storage", "kserve-controller:object-storage" + ) + await ops_test.model.relate( + "kserve-controller:service-accounts", "resource-dispatcher:service-accounts" + ) + await ops_test.model.relate("kserve-controller:secrets", "resource-dispatcher:secrets") + await ops_test.model.relate("mlflow-server:ingress", "istio-pilot:ingress") + await ops_test.model.relate("mlflow-server:dashboard-links", "kubeflow-dashboard:links") + + # Wait for the model to become active and idle + await ops_test.model.wait_for_idle( + status="active", + raise_on_blocked=False, + raise_on_error=False, + timeout=1500, + ) + + # Verify deployment by checking the public URL + url = get_public_url(lightkube_client, "kubeflow") + result_status, result_text = await fetch_response(url) + assert result_status == 200 + assert "Log in to Your Account" in result_text + assert "Email Address" in result_text + assert "Password" in result_text + + +def get_public_url(lightkube_client: lightkube.Client, bundle_name: str): + """Extracts public URL from service istio-ingressgateway-workload.""" + ingressgateway_svc = lightkube_client.get( + Service, "istio-ingressgateway-workload", namespace=bundle_name + ) + address = ( + ingressgateway_svc.status.loadBalancer.ingress[0].hostname + or 
ingressgateway_svc.status.loadBalancer.ingress[0].ip + ) + public_url = f"http://{address}" + return public_url + + +async def fetch_response(url, headers=None): + """Fetch provided URL and return (status, text).""" + result_status = 0 + result_text = "" + async with aiohttp.ClientSession() as session: + async with session.get(url=url, headers=headers) as response: + result_status = response.status + result_text = await response.text() + return result_status, str(result_text) diff --git a/tox.ini b/tox.ini index aeb1806c..b6b3c6e8 100644 --- a/tox.ini +++ b/tox.ini @@ -106,3 +106,23 @@ allowlist_externals = commands = tflint --chdir=terraform --recursive description = Check Terraform code against coding style standards + +[testenv:test_bundle_deployment-{2.15,latest}] +commands = + pytest -v --tb native --asyncio-mode=auto {[vars]tst_path}/integration/test_bundle_deployment.py --keep-models --log-cli-level=INFO -s {posargs} +setenv = + 2.15: BUNDLE_PATH = "./releases/2.15/stable/mlflow/bundle.yaml" + 2.15: KUBEFLOW_CHANNEL = 1.9/stable + 2.15: RESOURCE_DISPATCHER_CHANNEL = 2.0/stable + latest: BUNDLE_PATH = "./releases/latest/edge/mlflow/bundle.yaml" + latest: KUBEFLOW_CHANNEL = 1.9/stable + latest: RESOURCE_DISPATCHER_CHANNEL = 2.0/stable +deps = + aiohttp + lightkube + pytest-operator + tenacity + ops>=2.3.0 + 2.15: juju<4.0.0 + latest: juju<4.0.0 +description = Test bundle deployment