diff --git a/.github/workflows/package-build-ofed.yml b/.github/workflows/package-build-ofed.yml new file mode 100644 index 000000000..798e0c4bf --- /dev/null +++ b/.github/workflows/package-build-ofed.yml @@ -0,0 +1,254 @@ +--- +name: Build OFED packages +on: + workflow_dispatch: + inputs: + rocky9: + description: Build Rocky Linux 9 + type: boolean + default: true + secrets: + KAYOBE_VAULT_PASSWORD: + required: true + CLOUDS_YAML: + required: true + OS_APPLICATION_CREDENTIAL_ID: + required: true + OS_APPLICATION_CREDENTIAL_SECRET: + required: true + +env: + ANSIBLE_FORCE_COLOR: "True" + KAYOBE_ENVIRONMENT: ci-builder + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} +jobs: + overcloud-ofed-packages: + name: Build OFED packages + if: inputs.rocky9 && github.repository == 'stackhpc/stackhpc-kayobe-config' # honour the rocky9 input; never run outside this repository + runs-on: arc-skc-host-image-builder-runner + permissions: {} + steps: + - name: Install Package + uses: ConorMacBride/install-package@main + with: + apt: git unzip nodejs python3-pip python3-venv openssh-server openssh-client jq + + - name: Start the SSH service + run: | + sudo /etc/init.d/ssh start + + - name: Checkout + uses: actions/checkout@v4 + with: + path: src/kayobe-config + + - name: Determine OpenStack release + id: openstack_release + run: | + BRANCH=$(awk -F'=' '/defaultbranch/ {print $2}' src/kayobe-config/.gitreview) + echo "openstack_release=${BRANCH}" | sed -E "s,(stable|unmaintained)/,," >> $GITHUB_OUTPUT + + - name: Clone StackHPC Kayobe repository + uses: actions/checkout@v4 + with: + repository: stackhpc/kayobe + ref: refs/heads/stackhpc/${{ steps.openstack_release.outputs.openstack_release }} + path: src/kayobe + + - name: Install Kayobe + run: | + mkdir -p venvs && + pushd venvs && + python3 -m venv kayobe && + source kayobe/bin/activate && + pip install -U pip && + pip install ../src/kayobe + + - name: Install terraform + uses: hashicorp/setup-terraform@v2 + + - name: Initialise terraform + run: terraform init + working-directory: ${{ 
github.workspace }}/src/kayobe-config/terraform/aio + + - name: Generate SSH keypair + run: ssh-keygen -f id_rsa -N '' + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + + - name: Generate clouds.yaml + run: | + cat << EOF > clouds.yaml + ${{ secrets.CLOUDS_YAML }} + EOF + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + + - name: Output image tag + id: image_tag + run: | + echo image_tag=$(grep stackhpc_rocky_9_overcloud_host_image_version: src/kayobe-config/etc/kayobe/pulp-host-image-versions.yml | awk '{print $2}') >> $GITHUB_OUTPUT + + # The image name is always overcloud-rocky-9-<image tag>; the tag comes from the image_tag step + - name: Output image name + id: image_name + run: | + echo image_name=overcloud-rocky-9-${{ steps.image_tag.outputs.image_tag }} >> $GITHUB_OUTPUT + + - name: Generate terraform.tfvars + run: | + cat << EOF > terraform.tfvars + ssh_public_key = "id_rsa.pub" + ssh_username = "cloud-user" + aio_vm_name = "skc-ofed-builder" + aio_vm_image = "${{ env.VM_IMAGE }}" + aio_vm_flavor = "en1.medium" + aio_vm_network = "stackhpc-ci" + aio_vm_subnet = "stackhpc-ci" + aio_vm_interface = "ens3" + EOF + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + env: + VM_IMAGE: ${{ steps.image_name.outputs.image_name }} + + - name: Terraform Plan + run: terraform plan + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + env: + OS_CLOUD: "openstack" + OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }} + OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }} + + - name: Terraform Apply + run: | + for attempt in $(seq 5); do + if terraform apply -auto-approve; then + echo "Created infrastructure on attempt $attempt" + exit 0 + fi + echo "Failed to create infrastructure on attempt $attempt" + sleep 10 + terraform destroy -auto-approve + sleep 60 + done + echo "Failed to create infrastructure after $attempt attempts" + exit 1 + 
working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + env: + OS_CLOUD: "openstack" + OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }} + OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }} + + - name: Get Terraform outputs + id: tf_outputs # the next step reads this step's stdout via steps.tf_outputs.outputs.stdout + run: | + terraform output -json + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + + - name: Write Terraform outputs + run: | + cat << EOF > src/kayobe-config/etc/kayobe/environments/ci-builder/tf-outputs.yml + ${{ steps.tf_outputs.outputs.stdout }} + EOF + + - name: Write Terraform network config + run: | + cat << EOF > src/kayobe-config/etc/kayobe/environments/ci-builder/tf-network-allocation.yml + --- + aio_ips: + builder: "{{ access_ip_v4.value }}" + EOF + + - name: Write Terraform network interface config + run: | + mkdir -p src/kayobe-config/etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/seed + rm -f src/kayobe-config/etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/seed/network-interfaces + cat << EOF > src/kayobe-config/etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/seed/network-interfaces + admin_interface: "{{ access_interface.value }}" + aio_interface: "{{ access_interface.value }}" + EOF + + - name: Manage SSH keys + run: | + mkdir -p ~/.ssh + touch ~/.ssh/authorized_keys + cat src/kayobe-config/terraform/aio/id_rsa.pub >> ~/.ssh/authorized_keys + cp src/kayobe-config/terraform/aio/id_rsa* ~/.ssh/ + + - name: Bootstrap the control host + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe control host bootstrap + + - name: Run growroot playbook + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/growroot.yml + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + 
+ - name: Configure the seed host (Builder VM) + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe seed host configure --skip-tags network,docker + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + + - name: Run a distro-sync + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe seed host command run --become --command "dnf distro-sync --refresh --assumeyes" + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + + - name: Reset BLS entries on the seed host + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/reset-bls-entries.yml \ + -e "reset_bls_host=ofed-builder" + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + + - name: Disable noexec in /var/tmp + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe seed host command run --become --command "sed -i 's/noexec,//g' /etc/fstab" + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + + - name: Reboot to apply the kernel update + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/reboot.yml + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + + - name: Run OFED builder playbook + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/build-ofed-rocky.yml + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + + - name: Run OFED upload playbook + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe playbook run 
src/kayobe-config/etc/kayobe/ansible/push-ofed.yml + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + + - name: Destroy + run: terraform destroy -auto-approve + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + env: + OS_CLOUD: openstack + OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }} + OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }} + if: always() diff --git a/doc/source/contributor/index.rst b/doc/source/contributor/index.rst index 988957541..0073c48a2 100644 --- a/doc/source/contributor/index.rst +++ b/doc/source/contributor/index.rst @@ -12,3 +12,4 @@ This guide is for contributors of the StackHPC Kayobe configuration project. environments/index package-updates pre-commit + ofed diff --git a/doc/source/contributor/ofed.rst b/doc/source/contributor/ofed.rst new file mode 100644 index 000000000..e53b0f125 --- /dev/null +++ b/doc/source/contributor/ofed.rst @@ -0,0 +1,55 @@ +==== +OFED +==== + +Warning: Experimental workflow subject to change + +This section documents the workflow for building OFED packages for Release train integration. + +The workflow builds the OFED kernel modules against the latest available kernel in Release train +(as configured in SKC) and compiles them into RPM packages to be uploaded to Ark. Additionally, +this workflow downloads the userspace OFED packages from the Nvidia repository and uploads these +to Ark. + +Workflow +======== + +The workflow uses workflow_dispatch to manually request an OFED build, which will deploy a builder +VM, apply kayobe config to the builder, upgrade the kernel, reboot, then run two Ansible playbooks +for building and uploading OFED to Ark. + +Pre-requisites +-------------- + +Before building OFED packages, the workflow will ensure that: + +* A full distro-sync has taken place, ensuring the kernel is upgraded. 
+ +* The bootloader has been configured to use the latest kernel. + +* noexec is disabled in the temporary logical volume. + +build-ofed +---------- + +Currently we only support building Rocky Linux 9 OFED packages. + +In order to set up OFED, we're required to build kernel modules for the OFED drivers as +the kernels we provide in release train are unsupported by OFED. To accomplish this we +will need to use the doca-kernel-support tool from the doca-extra repository. + +We will need to install dependencies in order to build the OFED kernel modules, and these +are installed at the beginning of the build playbook. We also install base and appstream +dependencies of userspace OFED packages here; this is intended to stop these dependencies +being pulled in later when we download the OFED packages from the doca-host repository. + +At the end of the playbook following the kernel module build, the OFED userspace packages +are downloaded from the upstream repository in order to upload these to Ark. + +push-ofed +--------- + +As we're not syncing OFED from any upstream source, and are instead creating our own +repository of custom packages, we will be required to set up the Pulp distribution/publication +and upload the content directly to Ark. This playbook uses the Pulp CLI to upload the RPMs +to Ark. 
diff --git a/etc/kayobe/ansible/build-ofed-rocky.yml b/etc/kayobe/ansible/build-ofed-rocky.yml new file mode 100644 index 000000000..4c5b74bba --- /dev/null +++ b/etc/kayobe/ansible/build-ofed-rocky.yml @@ -0,0 +1,73 @@ +--- +- name: Build OFED packages + become: true + hosts: ofed-builder + gather_facts: false + tasks: + - name: Check whether noexec is enabled for /var/tmp + ansible.builtin.lineinfile: + path: "/etc/fstab" + regexp: "noexec" + state: absent + changed_when: false + check_mode: true # detection only: /etc/fstab is never edited by this task + register: result + failed_when: result.found # fail the play if any fstab line still matches noexec + + - name: Install package dependencies + ansible.builtin.dnf: + name: + - kpartx + - perl + - rpm-build + - automake + - patch + - kernel + - kernel-devel + - autoconf + - pciutils + - kernel-modules-extra + - kernel-rpm-macros + - lsof + - libtool + - tk + - gcc-gfortran + - tcl + - createrepo + - cmake-filesystem + - libnl3-devel + - python3-devel + state: latest + update_cache: true + + - name: Add DOCA host repository package + ansible.builtin.dnf: + name: https://developer.nvidia.com/downloads/networking/secure/doca-sdk/DOCA_2.8/doca-host-2.8.0-204000_{{ stackhpc_pulp_doca_ofed_version }}_rhel9{{ stackhpc_pulp_repo_rocky_9_minor_version }}.x86_64.rpm + disable_gpg_check: true + + - name: Install DOCA extra packages + ansible.builtin.dnf: + name: doca-extra + + - name: Create build directory + ansible.builtin.file: + path: /home/cloud-user/ofed + state: directory + mode: "0777" # quoted so the mode reaches Ansible as a string, not a YAML octal integer + + - name: Set build directory + ansible.builtin.replace: + path: /opt/mellanox/doca/tools/doca-kernel-support + regexp: 'TMP_DIR=\$1' + replace: 'TMP_DIR=/home/cloud-user/ofed' + + - name: Build OFED kernel modules + ansible.builtin.shell: + cmd: | + /opt/mellanox/doca/tools/doca-kernel-support + + - name: Download OFED userspace packages + ansible.builtin.dnf: + name: doca-ofed-userspace + download_only: true + download_dir: /home/cloud-user/ofed diff --git a/etc/kayobe/ansible/push-ofed.yml b/etc/kayobe/ansible/push-ofed.yml new file mode 
100644 index 000000000..c0214a0b0 --- /dev/null +++ b/etc/kayobe/ansible/push-ofed.yml @@ -0,0 +1,55 @@ +--- +- name: Push OFED packages + hosts: ofed-builder + tasks: + - name: Install python dependencies + ansible.builtin.pip: + name: pulp-cli + + - name: Create Pulp repository for OFED + pulp.squeezer.rpm_repository: + pulp_url: "{{ stackhpc_release_pulp_url }}" + username: "{{ stackhpc_release_pulp_username }}" + password: "{{ stackhpc_release_pulp_password }}" + name: "{{ stackhpc_pulp_repo_doca_ofed_rhel9.name }}" + state: present + retries: "{{ pulp_timeout_retries | default(3) }}" + + - name: Lookup Pulp RPMs on builder + ansible.builtin.find: + paths: "/home/cloud-user/ofed" + patterns: "*.rpm" # only RPM files are handed to the upload task below + register: rpm_dir + + - name: Upload OFED RPMs to Pulp + ansible.builtin.shell: + cmd: | + pulp \ + --base-url '{{ stackhpc_release_pulp_url }}' \ + --username '{{ stackhpc_release_pulp_username }}' \ + --password '{{ stackhpc_release_pulp_password }}' \ + rpm content \ + --type package upload \ + --repository '{{ stackhpc_pulp_repo_doca_ofed_rhel9.name }}' \ + --file '{{ item.path }}' + with_items: "{{ rpm_dir.files }}" + no_log: true # the command line carries the Pulp password + + - name: Create Pulp publication for OFED + pulp.squeezer.rpm_publication: + pulp_url: "{{ stackhpc_release_pulp_url }}" + username: "{{ stackhpc_release_pulp_username }}" + password: "{{ stackhpc_release_pulp_password }}" + repository: "{{ stackhpc_pulp_repo_doca_ofed_rhel9.name }}" + state: present + register: publication + + - name: Create Pulp distribution for OFED + pulp.squeezer.rpm_distribution: + pulp_url: "{{ stackhpc_release_pulp_url }}" + username: "{{ stackhpc_release_pulp_username }}" + password: "{{ stackhpc_release_pulp_password }}" + name: "{{ stackhpc_pulp_repo_doca_ofed_rhel9.distribution_name }}" + publication: "{{ publication.publication.pulp_href }}" + base_path: "{{ stackhpc_pulp_repo_doca_ofed_rhel9.base_path }}" + state: present diff --git a/etc/kayobe/ansible/reset-bls-entries.yml b/etc/kayobe/ansible/reset-bls-entries.yml 
index 59e968cba..68989d1bb 100644 --- a/etc/kayobe/ansible/reset-bls-entries.yml +++ b/etc/kayobe/ansible/reset-bls-entries.yml @@ -5,7 +5,7 @@ # https://opendev.org/openstack/diskimage-builder/src/branch/master/diskimage_builder/elements/rhel/post-install.d/03-reset-bls-entries - name: Reset BLS entries - hosts: overcloud + hosts: "{{ reset_bls_host | default('overcloud') }}" become: true tags: - reset-bls-entries diff --git a/etc/kayobe/environments/ci-builder/inventory/hosts b/etc/kayobe/environments/ci-builder/inventory/hosts index 33fda8b73..759e41184 100644 --- a/etc/kayobe/environments/ci-builder/inventory/hosts +++ b/etc/kayobe/environments/ci-builder/inventory/hosts @@ -1,3 +1,7 @@ # A 'seed' host used for building images. + +[ofed-builder:children] +seed + [seed] builder diff --git a/etc/kayobe/ofed.yml b/etc/kayobe/ofed.yml new file mode 100644 index 000000000..696e3c93b --- /dev/null +++ b/etc/kayobe/ofed.yml @@ -0,0 +1,12 @@ +--- +# DOCA OFED configuration + +# DOCA OFED version. Quoted so YAML keeps it as a string (an unquoted 24.10 would load as the float 24.1). +stackhpc_pulp_doca_ofed_version: "24.07" + +# DOCA OFED repositories +stackhpc_pulp_repo_doca_ofed_rhel9: + name: "Mellanox Technologies doca_ofed {{ stackhpc_pulp_doca_ofed_version }} Rocky 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}" + url: "{{ stackhpc_release_pulp_content_url }}/doca_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" # NOTE(review): path starts with doca_ofed/ but base_path starts with mlnx_ofed/ - confirm this is intended + distribution_name: "mlnx-ofed-{{ stackhpc_pulp_doca_ofed_version }}-rocky-9-{{ stackhpc_pulp_repo_rocky_9_minor_version }}" + base_path: "mlnx_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" diff --git a/releasenotes/notes/ofed-workflow-119225f36a9265c3.yaml b/releasenotes/notes/ofed-workflow-119225f36a9265c3.yaml new file mode 100644 index 000000000..b371b1a17 --- /dev/null +++ b/releasenotes/notes/ofed-workflow-119225f36a9265c3.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + Implement an OFED workflow that builds kernel modules + to 
support OFED drivers in release train kernels and + uploads OFED kernel/userspace drivers to Ark.