diff --git a/.github/workflows/robot.yml b/.github/workflows/robot.yml new file mode 100644 index 000000000..66d438995 --- /dev/null +++ b/.github/workflows/robot.yml @@ -0,0 +1,39 @@ +name: Robot + +on: + push: + branches: [main] + paths: + - .github/workflows/robot.yml + - dev/robot/** + schedule: + # https://crontab.guru/#30_12_*_*_1 + - cron: "30 12 * * 1" + +jobs: + install: + runs-on: ubuntu-latest + + concurrency: robot + environment: e2e-robot + + defaults: + run: + working-directory: dev/robot + + env: + PY_COLORS: true + ANSIBLE_FORCE_COLOR: true + + steps: + - uses: actions/checkout@v4 + + - name: Install robot server + env: + ROBOT_USER: ${{ secrets.ROBOT_USER }} + ROBOT_PASSWORD: ${{ secrets.ROBOT_PASSWORD }} + ROBOT_SSH_KEY: ${{ secrets.ROBOT_SSH_KEY }} + run: | + ansible-galaxy install -r requirements.yml + + dev/robot/with-ssh-agent ansible-playbook -vv install.yml diff --git a/.github/workflows/test_e2e.yml b/.github/workflows/test_e2e.yml index e5e1264d8..4d5a1ef75 100644 --- a/.github/workflows/test_e2e.yml +++ b/.github/workflows/test_e2e.yml @@ -5,20 +5,28 @@ on: branches: [main] jobs: cloud: - name: Cloud ${{ matrix.k3s }} + name: cloud ${{ matrix.k3s }} + runs-on: ubuntu-latest + permissions: id-token: write - runs-on: ubuntu-latest + + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.k3s }} + cancel-in-progress: true + strategy: + fail-fast: false # Continue tests matrix if a flaky run occurs. matrix: - # All k3s after January 2024 break our e2e tests, we hardcode - # the versions for now until we can fix the source of this. 
- k3s: [ v1.26.12+k3s1, v1.27.9+k3s1, v1.28.5+k3s1, v1.29.0+k3s1 ] - fail-fast: false + k3s: + - v1.26 + - v1.27 + - v1.28 + - v1.29 env: - K3S_VERSION: ${{ matrix.k3s }} - SCOPE: gha-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.k3s }} + K3S_CHANNEL: ${{ matrix.k3s }} + ENV: gha-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.k3s }} # Domain must be available in the account running the tests. This domain is available in the account # running the public integration tests. @@ -33,9 +41,10 @@ jobs: - uses: hetznercloud/setup-hcloud@v1 - - uses: hetznercloud/tps-action@main + - uses: opentofu/setup-opentofu@v1 with: - token: ${{ secrets.HCLOUD_TOKEN }} + tofu_version: v1.7.2 # renovate: datasource=github-releases depName=opentofu/opentofu + tofu_wrapper: false - uses: yokawasa/action-setup-kube-tools@v0.11.1 with: @@ -43,30 +52,29 @@ jobs: helm kubectl skaffold - helm: v3.15.1 - kubectl: v1.29.0 - skaffold: v2.12.0 + helm: v3.15.2 # renovate: datasource=github-releases depName=helm/helm + kubectl: v1.29.6 # renovate: datasource=github-releases depName=kubernetes/kubernetes + skaffold: v2.12.0 # renovate: datasource=github-releases depName=GoogleContainerTools/skaffold + - name: Install k3sup run: | curl -sLS https://get.k3sup.dev | sh - - name: Setup test environment - run: | - source <(hack/dev-up.sh) - - # make exported env variables available to following jobs - echo "KUBECONFIG=$KUBECONFIG" >> "$GITHUB_ENV" - echo "SKAFFOLD_DEFAULT_REPO=$SKAFFOLD_DEFAULT_REPO" >> "$GITHUB_ENV" - echo "CONTROL_IP=$CONTROL_IP" >> "$GITHUB_ENV" - - - name: Build and Deploy HCCM + - uses: hetznercloud/tps-action@main + with: + token: ${{ secrets.HCLOUD_TOKEN }} + + - name: Setup environment + run: make -C dev up + + - name: Run skaffold run: | - skaffold build --tag="e2e-${GITHUB_RUN_ID}-${GITHUB_RUN_NUMBER}" - tag=$(skaffold build --tag="e2e-${GITHUB_RUN_ID}-${GITHUB_RUN_NUMBER}" --quiet --output="{{ (index .Builds 0).Tag }}") - skaffold deploy 
--images=hetznercloud/hcloud-cloud-controller-manager=$tag + source dev/files/env.sh + skaffold run - name: Run tests run: | + source dev/files/env.sh go test ./tests/e2e -tags e2e -v -race -timeout 60m -coverprofile=coverage.txt - name: Upload coverage reports to Codecov @@ -74,66 +82,60 @@ jobs: with: token: ${{ secrets.CODECOV_TOKEN }} - - name: Download logs & events + - name: Dump logs & events if: always() continue-on-error: true run: | + source dev/files/env.sh mkdir debug-logs + + echo "::group::hccm.log" kubectl logs \ --namespace kube-system \ --selector app.kubernetes.io/name=hcloud-cloud-controller-manager \ --all-containers \ --prefix=true \ --tail=-1 \ - > debug-logs/hccm.log - + | tee debug-logs/hccm.log + echo "::endgroup::" + + echo "::group::events.yaml" kubectl get events \ --all-namespaces \ --sort-by=.firstTimestamp \ --output yaml \ - > debug-logs/events.yaml - - - name: Show HCCM Logs on Failure - if: failure() - continue-on-error: true - run: | - echo "::group::hccm.log" - cat debug-logs/hccm.log + | tee debug-logs/events.yaml echo "::endgroup::" - - name: Cleanup test environment + - name: Cleanup if: always() continue-on-error: true - run: | - hack/dev-down.sh + run: make -C dev down - name: Persist debug artifacts if: always() continue-on-error: true uses: actions/upload-artifact@v4 with: - name: debug-logs-${{ env.SCOPE }} + name: debug-logs-${{ env.ENV }} path: debug-logs/ robot: - name: Robot + runs-on: ubuntu-latest + permissions: id-token: write - # Make sure that only one Job is using the server at a time - concurrency: robot-test-server + # Make sure that only one job is using the server at a time + concurrency: robot environment: e2e-robot env: - K3S_VERSION: v1.29.0+k3s1 - SCOPE: gha-${{ github.run_id }}-${{ github.run_attempt }}-robot + K3S_CHANNEL: v1.29 + ENV: gha-${{ github.run_id }}-${{ github.run_attempt }}-robot - # Disable routes in dev-env, not supported for Robot. 
- ROUTES_ENABLED: "false" ROBOT_ENABLED: "true" - SERVER_NUMBER: ${{ vars.SERVER_NUMBER }} - runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -143,9 +145,10 @@ jobs: - uses: hetznercloud/setup-hcloud@v1 - - uses: hetznercloud/tps-action@main + - uses: opentofu/setup-opentofu@v1 with: - token: ${{ secrets.HCLOUD_TOKEN }} + tofu_version: v1.7.2 # renovate: datasource=github-releases depName=opentofu/opentofu + tofu_wrapper: false - uses: yokawasa/action-setup-kube-tools@v0.11.1 with: @@ -153,92 +156,73 @@ jobs: helm kubectl skaffold - helm: v3.15.1 - kubectl: v1.29.0 - skaffold: v2.12.0 + helm: v3.15.2 # renovate: datasource=github-releases depName=helm/helm + kubectl: v1.29.6 # renovate: datasource=github-releases depName=kubernetes/kubernetes + skaffold: v2.12.0 # renovate: datasource=github-releases depName=GoogleContainerTools/skaffold + - name: Install k3sup run: | curl -sLS https://get.k3sup.dev | sh - - name: Setup test environment - env: - ROBOT_USER: ${{ secrets.ROBOT_USER }} - ROBOT_PASSWORD: ${{ secrets.ROBOT_PASSWORD }} - run: | - source <(hack/dev-up.sh) - - # make exported env variables available to following jobs - echo "KUBECONFIG=$KUBECONFIG" >> "$GITHUB_ENV" - echo "SKAFFOLD_DEFAULT_REPO=$SKAFFOLD_DEFAULT_REPO" >> "$GITHUB_ENV" - echo "CONTROL_IP=$CONTROL_IP" >> "$GITHUB_ENV" - - - name: Build and Deploy HCCM - run: | - skaffold build --tag="e2e-${GITHUB_RUN_ID}-${GITHUB_RUN_NUMBER}" - tag=$(skaffold build --tag="e2e-${GITHUB_RUN_ID}-${GITHUB_RUN_NUMBER}" --quiet --output="{{ (index .Builds 0).Tag }}") - skaffold deploy \ - --profile=robot \ - --images=hetznercloud/hcloud-cloud-controller-manager=$tag + - uses: hetznercloud/tps-action@main + with: + token: ${{ secrets.HCLOUD_TOKEN }} - - name: Setup Robot Server + - name: Setup environment env: + ROBOT_SSH_KEY: ${{ secrets.ROBOT_SSH_KEY }} ROBOT_USER: ${{ secrets.ROBOT_USER }} ROBOT_PASSWORD: ${{ secrets.ROBOT_PASSWORD }} + run: | + dev/robot/with-ssh-agent make -C dev up - # Nicer 
output - PY_COLORS: true - ANSIBLE_FORCE_COLOR: true - working-directory: hack/robot-e2e + - name: Run skaffold run: | - ansible-galaxy install -r requirements.yml - echo "::group::ansible-playbook e2e-setup-robot-server.yml" - ansible-playbook e2e-setup-robot-server.yml -e scope=$SCOPE -e server_number=$SERVER_NUMBER -vvv - echo "::endgroup::" + source dev/files/env.sh + skaffold run --profile=robot - name: Run tests env: ROBOT_USER: ${{ secrets.ROBOT_USER }} ROBOT_PASSWORD: ${{ secrets.ROBOT_PASSWORD }} run: | + source dev/files/env.sh go test ./tests/e2e -tags e2e,robot -v -timeout 60m - - name: Download logs & events + - name: Dump logs & events if: always() continue-on-error: true run: | + source dev/files/env.sh mkdir debug-logs + + echo "::group::hccm.log" kubectl logs \ --namespace kube-system \ --selector app.kubernetes.io/name=hcloud-cloud-controller-manager \ --all-containers \ --prefix=true \ --tail=-1 \ - > debug-logs/hccm.log - + | tee debug-logs/hccm.log + echo "::endgroup::" + + echo "::group::events.yaml" kubectl get events \ --all-namespaces \ --sort-by=.firstTimestamp \ --output yaml \ - > debug-logs/events.yaml - - - name: Show HCCM Logs on Failure - if: failure() - continue-on-error: true - run: | - echo "::group::hccm.log" - cat debug-logs/hccm.log + | tee debug-logs/events.yaml echo "::endgroup::" - - name: Cleanup test environment + - name: Cleanup if: always() continue-on-error: true - run: | - hack/dev-down.sh + run: make -C dev down - name: Persist debug artifacts if: always() continue-on-error: true uses: actions/upload-artifact@v4 with: - name: debug-logs-${{ env.SCOPE }} + name: debug-logs-${{ env.ENV }} path: debug-logs/ diff --git a/README.md b/README.md index 296b5f70f..8ebb4bdeb 100644 --- a/README.md +++ b/README.md @@ -200,132 +200,99 @@ Current Kubernetes Releases: https://kubernetes.io/releases/ | 1.24 | v1.17.2 | https://github.com/hetznercloud/hcloud-cloud-controller-manager/releases/download/v1.17.2/ccm.yaml | | 1.23 | 
v1.13.2 | https://github.com/hetznercloud/hcloud-cloud-controller-manager/releases/download/v1.13.2/ccm.yaml | -## Unit tests +## Development -To run unit tests locally, execute +### Setup a development environment -```sh -go test ./... -``` +To set up a development environment, make sure you installed the following tools: -Check that your go version is up-to-date, tests might fail if it is not. +- [tofu](https://opentofu.org/) +- [k3sup](https://github.com/alexellis/k3sup) +- [docker](https://www.docker.com/) +- [skaffold](https://skaffold.dev/) -## E2E Tests +1. Configure a `HCLOUD_TOKEN` in your shell session. -The Hetzner Cloud cloud controller manager was tested against all -supported Kubernetes versions. We also test against the same k3s -releases (Sample: When we support testing against Kubernetes 1.20.x we -also try to support k3s 1.20.x). We try to keep compatibility with k3s -but never guarantee this. +> [!WARNING] +> The development environment runs on Hetzner Cloud servers which will induce costs. -You can run the tests with the following commands. Keep in mind, that -these tests run on real cloud servers and will create Load Balancers -that will be billed. +2. Deploy the development cluster: -**Test Server Setup:** +```sh +make -C dev up +``` -1x CPX21 (Ubuntu 18.04) +3. Load the generated configuration to access the development cluster: -**Requirements: Docker and Go 1.22** +```sh +source dev/files/env.sh +``` -1. Configure your environment correctly +4. 
Check that the development cluster is healthy: -```bash -export HCLOUD_TOKEN= -export K8S_VERSION=k8s-1.21.0 # The specific (latest) version is needed here -export USE_SSH_KEYS=key1,key2 # Name or IDs of your SSH Keys within the Hetzner Cloud, the servers will be accessable with that keys -export USE_NETWORKS=yes # if `yes` this identidicates that the tests should provision the server with cilium as CNI and also enable the Network related tests -## Optional configuration env vars: -export TEST_DEBUG_MODE=yes # With this env you can toggle the output of the provision and test commands. With `yes` it will log the whole output to stdout -export KEEP_SERVER_ON_FAILURE=yes # Keep the test server after a test failure. +```sh +kubectl get nodes -o wide ``` -2. Run the tests +5. Start developing hcloud-cloud-controller-manager in the development cluster: -```bash -go test ./tests/e2e -tags e2e -v -timeout 60m +```sh +skaffold dev ``` -The tests will now run and cleanup themselves afterward. Sometimes it might happen that you need to clean up the -project manually via the [Hetzner Cloud Console](https://console.hetzner.cloud) or -the [hcloud-cli](https://github.com/hetznercloud/cli) . +On code change, skaffold will rebuild the image, redeploy it and print all logs. 
-For easier debugging on the server we always configure the latest version of -the [hcloud-cli](https://github.com/hetznercloud/cli) with the given `HCLOUD_TOKEN` and a few bash aliases on the host: +⚠️ Do not forget to clean up the development cluster once you are finished: -```bash -alias k="kubectl" -alias ksy="kubectl -n kube-system" -alias kgp="kubectl get pods" -alias kgs="kubectl get services" +```sh +make -C dev down ``` -The test suite is split in three parts: +### Run the unit tests -- **General Part**: Sets up the test env & checks if the HCCM Pod is properly running - - Build Tag: `e2e` -- **Cloud Part**: Tests regular functionality against a Cloud-only environment - - Build Tag: `e2e && !robot` -- **Robot Part**: Tests Robot functionality against a Cloud+Robot environment - - Build Tag: `e2e && robot` +To run the unit tests, make sure you installed the following tools: -## Local test setup -This repository provides [skaffold](https://skaffold.dev/) to easily deploy / debug this controller on demand +- [Go](https://go.dev/) -### Requirements -1. Install [hcloud-cli](https://github.com/hetznercloud/cli) -2. Install [k3sup](https://github.com/alexellis/k3sup) -3. Install [cilium](https://github.com/cilium/cilium-cli) -4. Install [docker](https://www.docker.com/) +1. Run the following command to run the unit tests: -You will also need to set a `HCLOUD_TOKEN` in your shell session -### Manual Installation guide -1. Create an SSH key - -Assuming you already have created an ssh key via `ssh-keygen` -``` -hcloud ssh-key create --name ssh-key-ccm-test --public-key-from-file ~/.ssh/id_rsa.pub +```sh +go test ./... ``` -2. Create a server -``` -hcloud server create --name ccm-test-server --image ubuntu-20.04 --ssh-key ssh-key-ccm-test --type cx22 ``` +### Run the kubernetes e2e tests -3. 
Setup k3s on this server -``` -k3sup install --ip $(hcloud server ip ccm-test-server) --local-path=/tmp/kubeconfig --cluster --k3s-channel=v1.23 --k3s-extra-args='--no-flannel --no-deploy=servicelb --no-deploy=traefik --disable-cloud-controller --disable-network-policy --kubelet-arg=cloud-provider=external' ``` -- The kubeconfig will be created under `/tmp/kubeconfig` -- Kubernetes version can be configured via `--k3s-channel` +Before running the e2e tests, make sure you followed the [Setup a development environment](#setup-a-development-environment) steps. -4. Switch your kubeconfig to the test cluster. Very important: exporting this like -``` -export KUBECONFIG=/tmp/kubeconfig ``` +1. Run the kubernetes e2e tests using the following command: -5. Install cilium + test your cluster -``` -cilium install +```sh +source dev/files/env.sh +go test ./tests/e2e -tags e2e -v ``` -6. Add your secret to the cluster -``` -kubectl -n kube-system create secret generic hcloud --from-literal="token=$HCLOUD_TOKEN" ``` +### Development with Robot -7. Deploy the hcloud-cloud-controller-manager -``` -SKAFFOLD_DEFAULT_REPO=your_docker_hub_username skaffold dev +If you want to work on the Robot support, you need to make some changes to the above setup. + +This requires that you have a Robot Server in the same account you use for the development. The server needs to be set up with the Ansible Playbook `dev/robot/install.yml` and configured in `dev/robot/inventory.yml`. + +1. Set these environment variables: + +```shell +export ROBOT_ENABLED=true + +export ROBOT_USER= +export ROBOT_PASSWORD= ``` -- `docker login` required -- Skaffold is using your own Docker Hub repo to push the HCCM image. -- After the first run, you might need to set the image to "public" on hub.docker.com +2. Continue with the environment setup until you reach the `skaffold` step. Run `skaffold dev --profile=robot` instead. -On code change, Skaffold will repack the image & deploy it to your test cluster again. 
It will also stream logs from the hccm Deployment. +3. We have another suite of tests for Robot. You can run these with: -*After setting this up, only the command from step 7 is required!*= +```sh +go test ./tests/e2e -tags e2e,robot -v +``` ## License diff --git a/dev/.gitignore b/dev/.gitignore new file mode 100644 index 000000000..fa77144e7 --- /dev/null +++ b/dev/.gitignore @@ -0,0 +1,5 @@ +.terraform/ +terraform.tfstate* +*.auto.tfvars + +files/ diff --git a/dev/.terraform.lock.hcl b/dev/.terraform.lock.hcl new file mode 100644 index 000000000..6d22d4982 --- /dev/null +++ b/dev/.terraform.lock.hcl @@ -0,0 +1,114 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. + +provider "registry.opentofu.org/hashicorp/helm" { + version = "2.14.0" + constraints = "2.14.0" + hashes = [ + "h1:ibK3MM61pVjBwBcrro56OLTHwUhhNglvGG9CloLvliI=", + "zh:1c84ca8c274564c46497e89055139c7af64c9e1a8dd4f1cd4c68503ac1322fb8", + "zh:211a763173934d30c2e49c0cc828b1e34a528b0fdec8bf48d2bb3afadd4f9095", + "zh:3dca0b703a2f82d3e283a9e9ca6259a3b9897b217201f3cddf430009a1ca00c9", + "zh:40c5cfd48dcef54e87129e19d31c006c2e3309ee6c09d566139eaf315a59a369", + "zh:6f23c00ca1e2663e2a208a7491aa6dbe2604f00e0af7e23ef9323206e8f2fc81", + "zh:77f8cfc4888600e0d12da137bbdb836de160db168dde7af26c2e44cf00cbf057", + "zh:97b99c945eafa9bafc57c3f628d496356ea30312c3df8dfac499e0f3ff6bf0c9", + "zh:a01cfc53e50d5f722dc2aabd26097a8e4d966d343ffd471034968c2dc7a8819d", + "zh:b69c51e921fe8c91e38f4a82118d0b6b0f47f6c71a76f506fde3642ecbf39911", + "zh:fb8bfc7b8106bef58cc5628c024103f0dd5276d573fe67ac16f343a2b38ecee8", + ] +} + +provider "registry.opentofu.org/hashicorp/kubernetes" { + version = "2.31.0" + constraints = "2.31.0" + hashes = [ + "h1:z2qlqn6WbrjbezwQo4vvlwAgVUGz59klzDU4rlYhYi8=", + "zh:0dd25babf78a88a61dd329b8c18538a295ea63630f1b69575e7898c89307da39", + "zh:3138753e4b2ce6e9ffa5d65d73e9236169ff077c10089c7dc71031a0a139ff6d", + 
"zh:644f94692dc33de0bb1183c307ae373efbf4ef4cb92654ccc646a5716edf9593", + "zh:6cc630e43193220b1599e3227286cc4e3ca195910e8c56b6bacb50c5b5176dbf", + "zh:764173875e77aa482da4dca9fec5f77c455d028848edfc394aa7dac5dfed6afd", + "zh:7b1d380362d50ffbb3697483036ae351b0571e93b33754255cde6968e62b839f", + "zh:a1d93ca3d8d1ecdd3b69242d16ff21c91b34e2e98f02a3b2d02c908aeb45189b", + "zh:b471d0ab56dbf19c95fba68d2ef127bdb353be96a2be4c4a3dcd4d0db4b4180a", + "zh:d610f725ded4acd3d31a240472bb283aa5e657ed020395bdefea18d094b8c2bf", + "zh:d7f3ddd636ad5af6049922f212feb24830b7158410819c32073bf81c359cd2fa", + ] +} + +provider "registry.opentofu.org/hashicorp/local" { + version = "2.5.1" + constraints = "2.5.1" + hashes = [ + "h1:GgW5qncKu4KnXLE1ZYv5iwmhSYtTNzsOvJAOQIyFR7E=", + "zh:031c2c2070672b7e78e0aa15560839278dc57fe7cf1e58a617ac13c67b31d5fb", + "zh:1ef64ea4f8382cd538a76f3d319f405d18130dc3280f1c16d6aaa52a188ecaa4", + "zh:422ce45691b2f384dbd4596fdc8209d95cb43d85a82aaa0173089d38976d6e96", + "zh:7415fbd8da72d9363ba55dd8115837714f9534f5a9a518ec42268c2da1b9ed2f", + "zh:92aa22d071339c8ef595f18a9f9245c287266c80689f5746b26e10eaed04d542", + "zh:9cd0d99f5d3be835d6336c19c4057af6274e193e677ecf6370e5b0de12b4aafe", + "zh:a8c1525b389be5809a97f02aa7126e491ba518f97f57ed3095a3992f2134bb8f", + "zh:b336fa75f72643154b07c09b3968e417a41293358a54fe03efc0db715c5451e6", + "zh:c66529133599a419123ad2e42874afbd9aba82bd1de2b15cc68d2a1e665d4c8e", + "zh:c7568f75ba6cb7c3660b69eaab8b0e4278533bd9a7a4c33ee6590cc7e69743ea", + ] +} + +provider "registry.opentofu.org/hashicorp/null" { + version = "3.2.2" + constraints = "3.2.2" + hashes = [ + "h1:xN1tSeF/rUBfaddk/AVqk4i65z/MMM9uVZWd2cWCCH0=", + "zh:00e5877d19fb1c1d8c4b3536334a46a5c86f57146fd115c7b7b4b5d2bf2de86d", + "zh:1755c2999e73e4d73f9de670c145c9a0dc5a373802799dff06a0e9c161354163", + "zh:2b29d706353bc9c4edda6a2946af3322abe94372ffb421d81fa176f1e57e33be", + "zh:34f65259c6d2bd51582b6da536e782b181b23725782b181193b965f519fbbacd", + 
"zh:370f6eb744475926a1fa7464d82d46ad83c2e1148b4b21681b4cec4d75b97969", + "zh:5950bdb23b4fcc6431562d7eba3dea37844aa4220c4da2eb898ae3e4d1b64ec4", + "zh:8f3d5c8d4b9d497fec36953a227f80c76d37fc8431b683a23fb1c42b9cccbf8a", + "zh:8f6eb5e65c047bf490ad3891efecefc488503b65898d4ee106f474697ba257d7", + "zh:a7040eed688316fe00379574c72bb8c47dbe2638b038bb705647cbf224de8f72", + "zh:e561f28df04d9e51b75f33004b7767a53c45ad96e3375d86181ba1363bffbc77", + ] +} + +provider "registry.opentofu.org/hashicorp/tls" { + version = "4.0.5" + constraints = "4.0.5" + hashes = [ + "h1:zEH0OgSkeXDqNWzmOUWDczrUwyyujAHvnbW79qdxVMI=", + "zh:05a7dc3ac92005485714f87541ad6d0d478988b478c5774227a7d39b01660050", + "zh:547e0def44080456169bf77c21037aa6dc9e7f3e644a8f6a2c5fc3e6c15cf560", + "zh:6842b03d050ae1a4f1aaed2a2b1ca707eae84ae45ae492e4bb57c3d48c26e1f1", + "zh:6ced0a9eaaba12377f3a9b08df2fd9b83ae3cb357f859eb6aecf24852f718d9a", + "zh:766bcdf71a7501da73d4805d05764dcb7c848619fa7c04b3b9bd514e5ce9e4aa", + "zh:84cc8617ce0b9a3071472863f43152812e5e8544802653f636c866ef96f1ed34", + "zh:b1939e0d44c89315173b78228c1cf8660a6924604e75ced7b89e45196ce4f45e", + "zh:ced317916e13326766427790b1d8946c4151c4f3b0efd8f720a3bc24abe065fa", + "zh:ec9ff3412cf84ba81ca88328b62c17842b803ef406ae19152c13860b356b259c", + "zh:ff064f0071e98702e542e1ce00c0465b7cd186782fe9ccab8b8830cac0f10dd4", + ] +} + +provider "registry.opentofu.org/hetznercloud/hcloud" { + version = "1.47.0" + constraints = "1.47.0" + hashes = [ + "h1:aqEPcSpaWhKqbMs7c7Pf5ot6Tye7ntRitWsuNGPRPfk=", + "zh:0759f0c23d0e59baab3382320eef4eb314e0c5967b6ef67ff07135da07a97b34", + "zh:0e9ca84c4059d6d7e2c9f13d3c2b1cd91f7d9a47bedcb4b80c7c77d536eff887", + "zh:17a033ac4650a39ddacf3265a449edabaea528f81542c4e63e254272d5aac340", + "zh:2997c76a500e42b7519b24fa1f8646d9baab70c68277f80394560d3e1fd06e6d", + "zh:37f3fe7bb34cac63c69123e43e5426bab75816b3665dbe7125276a8d2ee6b2d8", + "zh:45d4b04dc470f24ad96c1c0b6636ea5422c659004f3e472c863bc50130fabf25", + 
"zh:46df99d972a78af6875565e53a73df66d870c474a20cd90e9e0a3092aa25197f", + "zh:4b5bb8d49366ad895c6c767efe16a1b8143802414abfe3fdb1184cbbecf424eb", + "zh:55c6199eb401c4b0a6c948ceac8b50f352e252e1c985903ed173bf26ad0f109e", + "zh:7b6efe897bffa37248064155a699e67953350b5b9a5476456c0160ce59254557", + "zh:7bc004bcb649ce1ec70e2cf848392e10a1edbcbf11b3292a4cc5c5d49bd769e4", + "zh:e1b17b7595f158fbb3021afa8869b541b5c10bdd2d8d2b2b3eaa82200b104ddd", + "zh:f741ca40e8e99a3e4114ad108ea2b5a5bccbedb008326c7f647f250580e69c0e", + "zh:fae9c7f8d08a447bb0972529f6db06999c35391046320206041a988aeca6b54c", + ] +} diff --git a/dev/Dockerfile b/dev/Dockerfile new file mode 100644 index 000000000..18616db35 --- /dev/null +++ b/dev/Dockerfile @@ -0,0 +1,23 @@ +FROM golang:1.22 as builder + +WORKDIR /build + +ADD go.mod go.sum /build/ +RUN go mod download + +ADD . /build/ +RUN ls -al + +ARG CGO_ENABLED=0 +# `skaffold debug` sets SKAFFOLD_GO_GCFLAGS to disable compiler optimizations +ARG SKAFFOLD_GO_GCFLAGS +RUN go build -gcflags="$SKAFFOLD_GO_GCFLAGS" -o hcloud-cloud-controller-manager.bin github.com/hetznercloud/hcloud-cloud-controller-manager + +FROM alpine:3.20 + +RUN apk add --no-cache \ + bash \ + ca-certificates + +COPY --from=builder /build/hcloud-cloud-controller-manager.bin /bin/hcloud-cloud-controller-manager +ENTRYPOINT ["/bin/hcloud-cloud-controller-manager"] diff --git a/dev/Makefile b/dev/Makefile new file mode 100644 index 000000000..37bf08d30 --- /dev/null +++ b/dev/Makefile @@ -0,0 +1,35 @@ +SHELL = bash +.SHELLFLAGS = -e -c +.ONESHELL: + +ENV ?= dev +K3S_CHANNEL ?= stable +ROBOT_ENABLED ?= false + +env.auto.tfvars: + @echo 'name = "$(ENV)"' > "$@" + @echo 'hcloud_token = "$(HCLOUD_TOKEN)"' >> "$@" + @echo 'k3s_channel = "$(K3S_CHANNEL)"' >> "$@" + @echo 'robot_enabled = "$(ROBOT_ENABLED)"' >> "$@" + @echo 'robot_user = "$(ROBOT_USER)"' >> "$@" + @echo 'robot_password = "$(ROBOT_PASSWORD)"' >> "$@" + +.terraform: + tofu init + +validate: .terraform + tofu validate + +up: .terraform 
env.auto.tfvars + tofu apply -auto-approve + $(MAKE) port-forward + +down: .terraform env.auto.tfvars + tofu destroy -auto-approve + +port-forward: + source files/env.sh + bash files/registry-port-forward.sh + +clean: + rm -Rf files/ .terraform/ terraform.tfstate* env.auto.tfvars diff --git a/dev/main.tf b/dev/main.tf new file mode 100644 index 000000000..bf1518699 --- /dev/null +++ b/dev/main.tf @@ -0,0 +1,13 @@ +module "dev" { + source = "github.com/hetznercloud/kubernetes-dev-env?ref=v0.4.0" + + name = "hccm-${replace(var.name, "/[^a-zA-Z0-9-_]/", "-")}" + worker_count = 1 + # We deploy hccm through skaffold, it's the application under development/test. + deploy_hccm = false + use_cloud_routes = !var.robot_enabled + + hcloud_token = var.hcloud_token + + k3s_channel = var.k3s_channel +} diff --git a/dev/robot.tf b/dev/robot.tf new file mode 100644 index 000000000..3659a5fef --- /dev/null +++ b/dev/robot.tf @@ -0,0 +1,105 @@ +locals { + ansible_inventory = yamldecode(file("${path.module}/robot/inventory.yml")) + robot_ipv4 = local.ansible_inventory["all"]["hosts"]["hccm-test0"]["ansible_host"] +} + +resource "null_resource" "reset_robot" { + count = var.robot_enabled ? 1 : 0 + triggers = { + # Wait for the control-plane to be initialized, and re-join the new cluster if the + # control-plane server changed. + control_id = module.dev.control_server_ipv4 + } + + connection { + host = local.robot_ipv4 + } + provisioner "remote-exec" { + inline = [ + # Only reboot if the node was already provisioned since the last reboot + "stat /etc/rancher/k3s 1>/dev/null && systemctl reboot ; exit 0", + ] + } + + provisioner "remote-exec" { + connection { + timeout = "3m" + } + + inline = [ + "whoami" + ] + } + + provisioner "local-exec" { + command = <<-EOT + ssh-copy-id \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + -i ${module.dev.ssh_public_key_filename} \ + root@${local.robot_ipv4} + EOT + } +} + +module "registry_robot" { + count = var.robot_enabled ? 
1 : 0 + depends_on = [null_resource.reset_robot] + + source = "github.com/hetznercloud/kubernetes-dev-env//k3s_registry?ref=v0.4.0" + + server = { id = "0", ipv4_address = local.robot_ipv4 } +} + +resource "null_resource" "k3sup_robot" { + count = var.robot_enabled ? 1 : 0 + depends_on = [module.registry_robot.0] + + triggers = { + # Wait the control-plane to be initialized, and re-join the new cluster if the + # control-plane server changed. + control_id = module.dev.control_server_ipv4 + } + + connection { + host = local.robot_ipv4 + } + + provisioner "local-exec" { + // We already use overlayfs for the root file system on the server. + // This caused an issue with the overlayfs default snapshotter in + // containerd. `--snapshotter=native` avoids this issue. We have not + // noticed any negative performance impact from this, as the whole + // filesystem is only kept in memory. + command = <<-EOT + k3sup join \ + --ssh-key='${module.dev.ssh_private_key_filename}' \ + --ip='${local.robot_ipv4}' \ + --server-ip='${module.dev.control_server_ipv4}' \ + --k3s-channel='${var.k3s_channel}' \ + --k3s-extra-args="\ + --kubelet-arg='cloud-provider=external' \ + --node-ip='${local.robot_ipv4}' \ + --node-label instance.hetzner.cloud/is-root-server=true \ + --snapshotter=native" \ + EOT + } +} + +provider "kubernetes" { + config_path = module.dev.kubeconfig_filename +} + +resource "kubernetes_secret_v1" "robot_credentials" { + count = var.robot_enabled ? 
1 : 0 + + metadata { + name = "robot" + namespace = "kube-system" + } + + data = { + robot-user = var.robot_user + robot-password = var.robot_password + } +} diff --git a/dev/robot/.gitignore b/dev/robot/.gitignore new file mode 100644 index 000000000..187518b85 --- /dev/null +++ b/dev/robot/.gitignore @@ -0,0 +1 @@ +ansible_* diff --git a/hack/robot-e2e/ansible.cfg b/dev/robot/ansible.cfg similarity index 52% rename from hack/robot-e2e/ansible.cfg rename to dev/robot/ansible.cfg index a42f02eb9..47ae51c0d 100644 --- a/hack/robot-e2e/ansible.cfg +++ b/dev/robot/ansible.cfg @@ -1,8 +1,13 @@ [defaults] +collections_path = ansible_collections +roles_path = ansible_roles + inventory = ${PWD}/inventory.yml -host_key_checking = False +host_key_checking = false stdout_callback = community.general.yaml interpreter_python = /usr/bin/python3 +fact_caching = memory +retry_files_enabled = false [ssh_connection] pipelining = True diff --git a/dev/robot/install.yml b/dev/robot/install.yml new file mode 100644 index 000000000..ec191c441 --- /dev/null +++ b/dev/robot/install.yml @@ -0,0 +1,123 @@ +--- +- name: Boot to rescue + hosts: all + gather_facts: false + + vars: + # SSH keys to add to the server. Must already exist in Robot. 
+ authorized_keys: + - fc:3c:b4:42:c9:bf:f9:6e:be:58:d3:12:40:c7:13:b2 # julian.toelle + - 21:ea:a3:5c:02:a6:0c:42:0b:ef:b5:60:ae:5b:07:9b # jonas.lammler + + module_defaults: + group/community.hrobot.robot: + hetzner_user: "{{ lookup('ansible.builtin.env', 'ROBOT_USER') }}" + hetzner_password: "{{ lookup('ansible.builtin.env', 'ROBOT_PASSWORD') }}" + + tasks: + - name: Upload CI SSH Key # noqa args[module] + delegate_to: localhost + community.hrobot.ssh_key: + name: hcloud-cloud-controller-manager + public_key: ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIA/+QBJNjrtqJjdsOqOwfELUS+3M3g0QRbn7pQaXfw7f ci@2024-06-27 + state: present + register: ci_ssh_key + + - name: Enable rescue system # noqa args[module] + delegate_to: localhost + community.hrobot.boot: + server_number: "{{ server_number }}" + rescue: + authorized_keys: "{{ authorized_keys + [ci_ssh_key.fingerprint] }}" + os: linux + # do not log rescue password + no_log: "{{ lookup('ansible.builtin.env', 'CI', default=false) | bool }}" + + - name: Reboot into rescue system # noqa args[module] + delegate_to: localhost + community.hrobot.reset: + server_number: "{{ server_number }}" + reset_type: hardware # only type that does not require a separate reset for starting again + + - name: Wait for the server to be reachable + ansible.builtin.wait_for_connection: + delay: 5 + +- name: Install server from rescue + hosts: all + gather_facts: true + tasks: + - name: Deploy installimage autosetup + ansible.builtin.copy: + content: | + HOSTNAME {{ inventory_hostname }} + + DRIVE1 /dev/sda + DRIVE2 /dev/sdb + + SWRAID 1 + SWRAIDLEVEL 0 + + BOOTLOADER grub + + PART /boot ext4 1024M + PART lvm vg0 all + + LV vg0 root / ext4 10G + + IMAGE /root/.oldroot/nfs/images/Ubuntu-2404-noble-amd64-base.tar.gz + dest: /autosetup + owner: root + group: root + mode: "0644" + + - name: Run installimage + ansible.builtin.command: + # -t => Take over rescue system SSH public keys + cmd: /root/.oldroot/nfs/install/installimage -t yes + changed_when: 
true + + - name: Reboot + ansible.builtin.reboot: + + - name: Wait for the server to be reachable + ansible.builtin.wait_for_connection: + delay: 5 + +- name: Configure server + hosts: all + gather_facts: true + tasks: + - name: Run apt update + ansible.builtin.apt: + update_cache: true + + - name: Removed unneeded packages + ansible.builtin.apt: + name: [snapd, unattended-upgrades, ubuntu-pro-client] + purge: true + state: absent + + - name: Run apt upgrade # noqa package-latest + ansible.builtin.apt: + name: "*" + state: latest + + - name: Removed orphan packages + ansible.builtin.apt: + autoremove: true + purge: true + state: absent + + - name: Deploy overlayroot config + ansible.builtin.copy: + content: | + overlayroot=tmpfs + dest: /etc/overlayroot.conf + owner: root + group: root + mode: "0644" + + - name: Reboot + ansible.builtin.reboot: + reboot_timeout: 300 diff --git a/dev/robot/inventory.yml b/dev/robot/inventory.yml new file mode 100644 index 000000000..6b4f3ab51 --- /dev/null +++ b/dev/robot/inventory.yml @@ -0,0 +1,7 @@ +all: + hosts: + hccm-test0: + ansible_host: 142.132.203.104 + ansible_user: root + ansible_ssh_common_args: -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null + server_number: 2159083 diff --git a/hack/robot-e2e/requirements.yml b/dev/robot/requirements.yml similarity index 58% rename from hack/robot-e2e/requirements.yml rename to dev/robot/requirements.yml index d2df60aa8..8369dcb3e 100644 --- a/hack/robot-e2e/requirements.yml +++ b/dev/robot/requirements.yml @@ -1,3 +1,3 @@ collections: + - name: community.general - name: community.hrobot - - name: community.general \ No newline at end of file diff --git a/dev/robot/with-ssh-agent b/dev/robot/with-ssh-agent new file mode 100755 index 000000000..3a37e47d7 --- /dev/null +++ b/dev/robot/with-ssh-agent @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -ueo pipefail + +# required for ssh-copy-id +mkdir -p ~/.ssh + +# start ssh agent +eval "$(ssh-agent -s)" + +ssh_key_path="$(mktemp 
./ssh_key.XXXXXXXXXX)" + +# ensure ssh agent and keys are cleaned on exit +# shellcheck disable=SC2064 +trap "kill '$SSH_AGENT_PID'; rm -f '$ssh_key_path'" EXIT + +# load robot ssh key +install --mode=600 <(echo -n "$ROBOT_SSH_KEY") "$ssh_key_path" +ssh-add "$ssh_key_path" + +# run the wrapped command +"$@" diff --git a/dev/variables.tf b/dev/variables.tf new file mode 100644 index 000000000..4773c6c49 --- /dev/null +++ b/dev/variables.tf @@ -0,0 +1,28 @@ +# General +variable "name" { + type = string +} +variable "k3s_channel" { + type = string +} + +# Hetzner Cloud +variable "hcloud_token" { + type = string + sensitive = true +} + + +# Hetzner Robot +variable "robot_enabled" { + type = bool + default = false +} +variable "robot_user" { + type = string + sensitive = true +} +variable "robot_password" { + type = string + sensitive = true +} \ No newline at end of file diff --git a/hack/Dockerfile b/hack/Dockerfile deleted file mode 100644 index 226074ece..000000000 --- a/hack/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM golang:1.22 as builder -WORKDIR /hccm -ADD go.mod go.sum /hccm/ -RUN go mod download -ADD . 
/hccm/ -RUN ls -al -# `skaffold debug` sets SKAFFOLD_GO_GCFLAGS to disable compiler optimizations -ARG SKAFFOLD_GO_GCFLAGS -RUN CGO_ENABLED=0 go build -gcflags="${SKAFFOLD_GO_GCFLAGS}" -o hcloud-cloud-controller-manager.bin github.com/hetznercloud/hcloud-cloud-controller-manager - -FROM alpine:3.20 -RUN apk add --no-cache ca-certificates bash -COPY --from=builder /hccm/hcloud-cloud-controller-manager.bin /bin/hcloud-cloud-controller-manager -ENTRYPOINT ["/bin/hcloud-cloud-controller-manager"] diff --git a/hack/dev-down.sh b/hack/dev-down.sh deleted file mode 100755 index 87e513cdd..000000000 --- a/hack/dev-down.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash -set -ue -o pipefail -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - -scope="${SCOPE:-dev}" -scope=${scope//[^a-zA-Z0-9_]/-} -scope_name=hccm-${scope} -label="managedby=hack" - -if [[ "${ALL:-}" == "" ]]; then - label="$label,scope=$scope_name" - rm -f $SCRIPT_DIR/.ssh-$scope* $SCRIPT_DIR/.kubeconfig-$scope $SCRIPT_DIR/.token-$scope $SCRIPT_DIR/.reg-pf* -else - rm -f $SCRIPT_DIR/.ssh* $SCRIPT_DIR/.kubeconfig* $SCRIPT_DIR/.token-* $SCRIPT_DIR/.reg-pf* -fi - -for instance in $(hcloud server list -o noheader -o columns=id -l $label); do - ( - hcloud server delete $instance - ) & -done - - -for key in $(hcloud ssh-key list -o noheader -o columns=name -l $label); do - ( - hcloud ssh-key delete $key - ) & -done - - -for key in $(hcloud network list -o noheader -o columns=name -l $label); do - ( - hcloud network delete $key - ) & -done - -wait diff --git a/hack/dev-up.sh b/hack/dev-up.sh deleted file mode 100755 index 139ad36a5..000000000 --- a/hack/dev-up.sh +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/bin/env bash -set -ueo pipefail -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - -if [[ -n "${DEBUG:-}" ]]; then set -x; fi - -# Redirect all stdout to stderr. -{ - if ! 
hcloud version >/dev/null; then echo "ERROR: 'hcloud' CLI not found, please install it and make it available on your \$PATH"; exit 1; fi - if ! k3sup version >/dev/null; then echo "ERROR: 'k3sup' not found, please install it and make it available on your \$PATH"; exit 1; fi - if ! helm version >/dev/null; then echo "ERROR: 'helm' not found, please install it and make it available on your \$PATH"; exit 1; fi - if [[ "${HCLOUD_TOKEN:-}" == "" ]]; then echo "ERROR: please set \$HCLOUD_TOKEN"; exit 1; fi - - # We run a lot of subshells below for speed. If any encounter an error, we shut down the whole process group, pronto. - function error() { - echo "Onoes, something went wrong! :( The output above might have some clues." - kill 0 - } - - trap error ERR - - image_name=${IMAGE_NAME:-ubuntu-22.04} - instance_count=${INSTANCES:-1} - instance_type=${INSTANCE_TYPE:-cpx11} - location=${LOCATION:-fsn1} - network_zone=${NETWORK_ZONE:-eu-central} - ssh_keys=${SSH_KEYS:-} - # All k3s after January 2024 break our e2e tests, we hardcode - # the versions for now until we can fix the source of this. 
- # channel=${K3S_CHANNEL:-stable} - k3s_version=${K3S_VERSION:-v1.28.5+k3s1} - network_cidr=${NETWORK_CIDR:-10.0.0.0/8} - subnet_cidr=${SUBNET_CIDR:-10.0.0.0/24} - cluster_cidr=${CLUSTER_CIDR:-10.244.0.0/16} - routes_enabled=${ROUTES_ENABLED:-true} - scope="${SCOPE:-dev}" - scope=${scope//[^a-zA-Z0-9_]/-} - scope_name=hccm-${scope} - label="managedby=hack,scope=$scope_name" - ssh_private_key="$SCRIPT_DIR/.ssh-$scope" - k3s_opts=${K3S_OPTS:-"--kubelet-arg cloud-provider=external"} - k3s_server_opts=${K3S_SERVER_OPTS:-"--disable-cloud-controller --disable=traefik --disable=servicelb --flannel-backend=none --disable=local-storage --cluster-cidr ${cluster_cidr}"} - - echo -n "$HCLOUD_TOKEN" > "$SCRIPT_DIR/.token-$scope" - - export KUBECONFIG="$SCRIPT_DIR/.kubeconfig-$scope" - - ssh_command="ssh -i $ssh_private_key -o StrictHostKeyChecking=off -o BatchMode=yes -o ConnectTimeout=5" - - # Generate SSH keys and upload publkey to Hetzner Cloud. - ( trap error ERR - [[ ! -f $ssh_private_key ]] && ssh-keygen -t ed25519 -f $ssh_private_key -C '' -N '' - [[ ! -f $ssh_private_key.pub ]] && ssh-keygen -y -f $ssh_private_key > $ssh_private_key.pub - if ! hcloud ssh-key describe $scope_name >/dev/null 2>&1; then - hcloud ssh-key create --label $label --name $scope_name --public-key-from-file $ssh_private_key.pub - fi - ) & - - # Create Network - ( trap error ERR - if ! hcloud network describe $scope_name >/dev/null 2>&1; then - hcloud network create --label $label --ip-range $network_cidr --name $scope_name - hcloud network add-subnet --network-zone $network_zone --type cloud --ip-range $subnet_cidr $scope_name - fi - ) & - - - for num in $(seq $instance_count); do - # Create server and initialize Kubernetes on it with k3sup. - ( trap error ERR - - server_name="$scope_name-$num" - - # Maybe cluster is already up and node is already there. 
- if kubectl get node $server_name >/dev/null 2>&1; then - exit 0 - fi - - ip=$(hcloud server ip $server_name 2>/dev/null || true) - - if [[ -z "${ip:-}" ]]; then - # Wait for SSH key - until hcloud ssh-key describe $scope_name >/dev/null 2>&1; do sleep 1; done - until hcloud network describe $scope_name 2>&1 | grep $subnet_cidr >/dev/null; do sleep 1; done - - createcmd="hcloud server create --image $image_name --label $label --location $location --name $server_name --ssh-key=$scope_name --type $instance_type --network $scope_name" - for key in $ssh_keys; do - createcmd+=" --ssh-key $key" - done - $createcmd - ip=$(hcloud server ip $server_name) - fi - - # Wait for SSH. - until [ "$($ssh_command root@$ip echo ok 2>/dev/null)" = "ok" ]; do - sleep 1 - done - - $ssh_command root@$ip 'mkdir -p /etc/rancher/k3s && cat > /etc/rancher/k3s/registries.yaml' < $SCRIPT_DIR/k3s-registries.yaml - - private_ip=$(hcloud server describe $server_name -o format="{{ (index .PrivateNet 0).IP }}") - k3s_node_ip_opts="--node-ip ${ip}" - if [[ "$routes_enabled" == "true" ]]; then - # Only advertise the private IP if we have routing enabled, to avoid issues where the nodes can - # not communicate with each other on the advertised addresses (ie. Robot Servers) - k3s_node_ip_opts="--node-external-ip ${ip} --node-ip ${private_ip}" - fi - - if [[ "$num" == "1" ]]; then - # First node is control plane. - k3sup install --print-config=false --ip $ip --k3s-version "${k3s_version}" --k3s-extra-args "${k3s_server_opts} ${k3s_opts} ${k3s_node_ip_opts}" --local-path $KUBECONFIG --ssh-key $ssh_private_key - else - # All subsequent nodes are initialized as workers. - - # Can't go any further until control plane has bootstrapped a bit though. 
- until $ssh_command root@$(hcloud server ip $scope_name-1 || true) stat /etc/rancher/node/password >/dev/null 2>&1; do - sleep 1 - done - - k3sup join --server-ip $(hcloud server ip $scope_name-1) --ip $ip --k3s-channel $channel --k3s-extra-args "${k3s_opts} ${k3s_node_ip_opts}" --ssh-key $ssh_private_key - fi - ) & - - # Wait for this node to show up in the cluster. - ( trap error ERR; set +x - until kubectl wait --for=condition=Ready node/$scope_name-$num >/dev/null 2>&1; do sleep 1; done - echo $scope_name-$num is up and in cluster - ) & - done - - ( trap error ERR - # Control plane init tasks. - # This is running in parallel with the server init, above. - - # Wait for control plane to look alive. - until kubectl get nodes >/dev/null 2>&1; do sleep 1; done; - - # Deploy private registry. - ( trap error ERR - if ! helm status -n kube-system registry >/dev/null 2>&1; then - helm upgrade -install registry docker-registry \ - --repo=https://helm.twun.io \ - -n kube-system \ - --version 2.2.2 \ - --set service.clusterIP=10.43.0.2 \ - --set 'tolerations[0].key=node.cloudprovider.kubernetes.io/uninitialized' \ - --set 'tolerations[0].operator=Exists' - fi - ) & - - # Install Cilium. - ( trap error ERR - if ! helm status -n kube-system cilium >/dev/null 2>&1; then - values=( - --set ipam.mode=kubernetes - ) - if [[ "$routes_enabled" == "true" ]]; then - # When using the Network Routes, we do not need (or want) Cilium to handle these ranges - values+=( - --set tunnel=disabled - --set ipv4NativeRoutingCIDR="$cluster_cidr" - ) - fi - helm upgrade -install cilium cilium --repo https://helm.cilium.io/ -n kube-system --version 1.13.1 "${values[@]}" - fi) & - - # Create HCLOUD_TOKEN Secret for hcloud-cloud-controller-manager. - ( trap error ERR - if ! 
kubectl -n kube-system get secret hcloud >/dev/null 2>&1; then - data=( - --from-literal="token=$HCLOUD_TOKEN" - --from-literal="network=$scope_name" - ) - if [[ -v ROBOT_USER ]]; then - data+=( - --from-literal="robot-user=$ROBOT_USER" - --from-literal="robot-password=$ROBOT_PASSWORD" - ) - fi - kubectl -n kube-system create secret generic hcloud "${data[@]}" - fi) & - wait - ) & - wait - echo "Success - cluster fully initialized and ready, why not see for yourself?" - echo '$ kubectl get nodes' - kubectl get nodes - export CONTROL_IP=$(hcloud server ip "$scope_name-1") -} >&2 - -echo "export KUBECONFIG=$KUBECONFIG" -$SCRIPT_DIR/registry-port-forward.sh -echo "export SKAFFOLD_DEFAULT_REPO=localhost:30666" -echo "export CONTROL_IP=$CONTROL_IP" diff --git a/hack/k3s-registries.yaml b/hack/k3s-registries.yaml deleted file mode 100644 index 8c808b121..000000000 --- a/hack/k3s-registries.yaml +++ /dev/null @@ -1,3 +0,0 @@ -mirrors: - localhost:30666: - endpoint: ["http://10.43.0.2:5000"] diff --git a/hack/registry-port-forward.sh b/hack/registry-port-forward.sh deleted file mode 100755 index 082079d24..000000000 --- a/hack/registry-port-forward.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash -set -ue -o pipefail -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - -{ -until kubectl -n kube-system --timeout=30s rollout status deployment/registry-docker-registry >/dev/null 2>&1; do sleep 1; done -old_pid=$(cat $SCRIPT_DIR/.reg-pf 2>/dev/null || true) -if [[ -n "$old_pid" ]]; then - echo "killing old port-forward with PID $old_pid" - kill $old_pid || true -fi - -nohup kubectl port-forward -n kube-system svc/registry-docker-registry 30666:5000 >$SCRIPT_DIR/.reg-pf.out 2>$SCRIPT_DIR/.reg-pf.err & -} >&2 - -echo $! 
> $SCRIPT_DIR/.reg-pf diff --git a/hack/robot-e2e/autosetup.j2 b/hack/robot-e2e/autosetup.j2 deleted file mode 100644 index 473a629f6..000000000 --- a/hack/robot-e2e/autosetup.j2 +++ /dev/null @@ -1,15 +0,0 @@ -HOSTNAME {{ server_name }} - -DRIVE1 /dev/sda -DRIVE2 /dev/sdb - -# We do not care at all about data consistency/availability, as we reprovision for every test run -SWRAID 1 -SWRAIDLEVEL 0 - -BOOTLOADER grub - -PART /boot ext3 1024M -PART / ext4 all - -IMAGE /root/.oldroot/nfs/images/Ubuntu-2204-jammy-amd64-base.tar.gz diff --git a/hack/robot-e2e/e2e-setup-robot-server.yml b/hack/robot-e2e/e2e-setup-robot-server.yml deleted file mode 100644 index 4414965d1..000000000 --- a/hack/robot-e2e/e2e-setup-robot-server.yml +++ /dev/null @@ -1,101 +0,0 @@ ---- -- name: Prepare Reinstall - hosts: localhost - connection: local - gather_facts: false - - vars: - scope: dev - # Additional SSH keys to add to the server for debugging. Must already exist in Robot. - authorized_keys: [] - - module_defaults: - group/community.hrobot.robot: - hetzner_user: "{{ lookup('ansible.builtin.env', 'ROBOT_USER') }}" - hetzner_password: "{{ lookup('ansible.builtin.env', 'ROBOT_PASSWORD') }}" - - tasks: - - name: Get Server Info - community.hrobot.server_info: - server_number: "{{ server_number }}" - register: server_info - - - name: Set Server Facts - ansible.builtin.set_fact: - server_ip: "{{ server_info.servers[0].server_ip }}" - server_name: "{{ server_info.servers[0].server_name }}" - - - name: Create SSH Key - community.hrobot.ssh_key: - name: "hccm-{{ scope }}" - public_key: "{{ lookup('file', '../.ssh-{{ scope }}.pub') }}" - state: present - register: ssh_key - - - name: Enable Rescue System - community.hrobot.boot: - server_number: "{{ server_number }}" - rescue: - authorized_keys: "{{ authorized_keys + [ ssh_key.fingerprint ] }}" - os: linux - - - name: Reset Server (to get to Rescue System) - community.hrobot.reset: - server_number: "{{ server_number }}" - reset_type: hardware # 
only type that does not require a separate reset for starting again - - - name: Wait for SSH - ansible.builtin.wait_for: - host: "{{ server_ip }}" - port: "{{ 22 }}" - search_regex: SSH - -- name: Install OS to Server - hosts: all - gather_facts: false - tasks: - - name: Write autosetup - ansible.builtin.template: - src: autosetup.j2 - dest: /autosetup - vars: - server_name: "{{ hostvars['localhost']['server_name'] }}" - - - name: installimage - # -t => Take over rescue system SSH public keys - ansible.builtin.command: /root/.oldroot/nfs/install/installimage -t yes - - - name: Reboot - ansible.builtin.reboot: - # 5 minutes should be enough for a reboot, and in case - # there is some issue, we can abort earlier. - reboot_timeout: 300 - - - name: Create k3s directory - ansible.builtin.file: - path: /etc/rancher/k3s - state: directory - - - name: Prepare Local Registry - ansible.builtin.copy: - src: ../k3s-registries.yaml - dest: /etc/rancher/k3s/registries.yaml - -- name: Join Kubernetes Cluster - hosts: localhost - connection: local - gather_facts: false - vars: - control_ip: "{{ lookup('ansible.builtin.env', 'CONTROL_IP') }}" - k3s_channel: stable - scope: dev - - tasks: - - name: k3sup - ansible.builtin.command: >- - k3sup join - --server-ip={{ control_ip | ansible.builtin.mandatory }} - --ip={{ server_ip }} - --k3s-channel={{ k3s_channel }} - --k3s-extra-args="--kubelet-arg cloud-provider=external --node-label instance.hetzner.cloud/is-root-server=true" - --ssh-key ../.ssh-{{ scope }} diff --git a/hack/robot-e2e/inventory.yml b/hack/robot-e2e/inventory.yml deleted file mode 100644 index c3f8c2f02..000000000 --- a/hack/robot-e2e/inventory.yml +++ /dev/null @@ -1,8 +0,0 @@ -all: - hosts: - # TODO: Dynamic inventory - hccm-test: - ansible_host: 142.132.203.104 - ansible_user: root - ansible_ssh_private_key_file: ../.ssh-{{ scope }} - ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' diff --git a/skaffold.yaml b/skaffold.yaml 
index 5e407e575..4a34333a0 100644 --- a/skaffold.yaml +++ b/skaffold.yaml @@ -1,19 +1,17 @@ -apiVersion: skaffold/v4beta3 +apiVersion: skaffold/v4beta11 kind: Config metadata: name: cloud-controller-manager build: artifacts: - - image: hetznercloud/hcloud-cloud-controller-manager - runtimeType: go + - image: docker.io/hetznercloud/hcloud-cloud-controller-manager docker: - dockerfile: hack/Dockerfile - cacheFrom: - - hetznercloud/hcloud-cloud-controller-manager:buildcache + dockerfile: dev/Dockerfile local: useBuildkit: true insecureRegistries: - localhost:30666 + manifests: helm: releases: @@ -33,3 +31,9 @@ profiles: - op: add path: /manifests/helm/releases/0/setValues/robot.enabled value: true + - op: add + path: /manifests/helm/releases/0/setValues/env.ROBOT_USER.valueFrom.secretKeyRef.name + value: robot + - op: add + path: /manifests/helm/releases/0/setValues/env.ROBOT_PASSWORD.valueFrom.secretKeyRef.name + value: robot diff --git a/tests/e2e/cloud_test.go b/tests/e2e/cloud_test.go index ac2b7fdf6..b9bfdcb39 100644 --- a/tests/e2e/cloud_test.go +++ b/tests/e2e/cloud_test.go @@ -25,10 +25,10 @@ func TestNodeSetCorrectNodeLabelsAndIPAddresses(t *testing.T) { ctx := context.Background() - node, err := testCluster.k8sClient.CoreV1().Nodes().Get(ctx, "hccm-"+testCluster.scope+"-1", metav1.GetOptions{}) + node, err := testCluster.k8sClient.CoreV1().Nodes().Get(ctx, testCluster.ControlNodeName(), metav1.GetOptions{}) assert.NoError(t, err) - server, _, err := testCluster.hcloud.Server.Get(ctx, "hccm-"+testCluster.scope+"-1") + server, _, err := testCluster.hcloud.Server.Get(ctx, testCluster.ControlNodeName()) if err != nil { return } @@ -191,12 +191,12 @@ func TestRouteNetworksPodIPsAreAccessible(t *testing.T) { t.Parallel() err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { - node, err := testCluster.k8sClient.CoreV1().Nodes().Get(ctx, "hccm-"+testCluster.scope+"-1", 
metav1.GetOptions{}) + node, err := testCluster.k8sClient.CoreV1().Nodes().Get(ctx, testCluster.ControlNodeName(), metav1.GetOptions{}) if err != nil { return false, err } - network, _, err := testCluster.hcloud.Network.Get(ctx, "hccm-"+testCluster.scope) + network, _, err := testCluster.hcloud.Network.Get(ctx, testCluster.NetworkName()) if err != nil { return false, err } @@ -239,7 +239,7 @@ func TestRouteDeleteCorrectRoutes(t *testing.T) { t.Fatal(err) } - network, _, err := testCluster.hcloud.Network.Get(ctx, "hccm-"+testCluster.scope) + network, _, err := testCluster.hcloud.Network.Get(ctx, testCluster.NetworkName()) if err != nil { t.Fatal(err) } @@ -280,7 +280,7 @@ func TestRouteDeleteCorrectRoutes(t *testing.T) { } err = wait.PollUntilContextTimeout(ctx, 1*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { - network, _, err = testCluster.hcloud.Network.Get(ctx, "hccm-"+testCluster.scope) + network, _, err = testCluster.hcloud.Network.Get(ctx, testCluster.NetworkName()) if err != nil { return false, err } diff --git a/tests/e2e/testing.go b/tests/e2e/testing.go index 6bc232351..d51046c71 100644 --- a/tests/e2e/testing.go +++ b/tests/e2e/testing.go @@ -9,7 +9,6 @@ import ( "math/rand" "net/http" "os" - "regexp" "strconv" "testing" "time" @@ -28,7 +27,6 @@ import ( ) var rng *rand.Rand -var scopeButcher = regexp.MustCompile(`[^a-zA-Z0-9_]`) func init() { rng = rand.New(rand.NewSource(time.Now().UnixNano())) @@ -39,27 +37,14 @@ type TestCluster struct { hrobot hrobot.RobotClient k8sClient *kubernetes.Clientset certificates []*hcloud.Certificate - scope string - certDomain string + // envName is used as the name prefix in all resources + envName string + certDomain string } func (tc *TestCluster) Start() error { - tc.scope = os.Getenv("SCOPE") - if tc.scope == "" { - tc.scope = "dev" - } - tc.scope = scopeButcher.ReplaceAllString(tc.scope, "-") - // Hetzner Cloud Client token := os.Getenv("HCLOUD_TOKEN") - if token == "" { - buf, err := 
os.ReadFile(fmt.Sprintf("../../hack/.token-%s", tc.scope)) - if err != nil { - return err - } - token = string(buf) - } - if token == "" { return fmt.Errorf("no valid HCLOUD_TOKEN found") } @@ -67,9 +52,9 @@ func (tc *TestCluster) Start() error { opts := []hcloud.ClientOption{ hcloud.WithToken(token), hcloud.WithApplication("hcloud-ccm-testsuite", "1.0"), + hcloud.WithPollBackoffFunc(hcloud.ExponentialBackoff(2, 1*time.Second)), } - hcloudClient := hcloud.NewClient(opts...) - tc.hcloud = hcloudClient + tc.hcloud = hcloud.NewClient(opts...) // Hetzner Robot Client if enabled := os.Getenv("ROBOT_ENABLED"); enabled == "true" { @@ -78,10 +63,7 @@ func (tc *TestCluster) Start() error { tc.hrobot = hrobot.NewBasicAuthClient(robotUser, robotPassword) } - err := os.Setenv("KUBECONFIG", "../../hack/.kubeconfig-"+tc.scope) - if err != nil { - return err - } + tc.envName = os.Getenv("ENV_NAME") loadingRules := clientcmd.NewDefaultClientConfigLoadingRules() configOverrides := &clientcmd.ConfigOverrides{} @@ -142,6 +124,21 @@ func (tc *TestCluster) CreateTLSCertificate(t *testing.T, baseName string) *hclo return cert } +// NetworkName returns the network name. +func (tc *TestCluster) NetworkName() string { + return tc.envName +} + +// ControlNodeName returns the control node name. +func (tc *TestCluster) ControlNodeName() string { + return fmt.Sprintf("%s-control", tc.envName) +} + +// WorkerNodeName returns the worker node name, zero indexed. +func (tc *TestCluster) WorkerNodeName(index int) string { + return fmt.Sprintf("%s-worker-%d", tc.envName, index) +} + type lbTestHelper struct { podName string port int