diff --git a/roles/sideload_kernel/README.md b/roles/sideload_kernel/README.md
new file mode 100644
index 000000000..02251d43b
--- /dev/null
+++ b/roles/sideload_kernel/README.md
@@ -0,0 +1,73 @@
+# Sideload-kernel utility role
+
+This role will side-load a given realtime kernel onto an OpenShift SNO instance for
+development and testing purposes. It will replace the stock standard kernel, or
+the stock realtime kernel if that has been enabled via MachineConfig or
+NodeTuningOperator CRs.
+
+Note: Only supported for side-loading custom realtime kernels, and only tested
+on SNO installations.
+
+## Variables
+
+| Variable                    | Default     | Required | Description           |
+| --------------------------- | ----------- | -------- | --------------------- |
+| sideload_kernel_uri         | undefined   | Yes      | The full URI to the kernel-rt-core rpm package to be installed, or the special value `"reset"` to reset back to the original kernel. |
+| sideload_kernel_namespace   | `"default"` | No       | The namespace where the job and pods run that conduct the change. Must be privileged. |
+| sideload_kernel_force       | `false`     | No       | Forces re-creation of the kubernetes job even if no changes occurred. |
+| sideload_kernel_job_timeout | `15`        | No       | The amount of time to wait for the sideload operation to complete (in minutes). |
+| sideload_kernel_base_image  | `"ubi9"`    | No       | The image used to run the script on the cluster. |
+| k8s_auth                    | `{}`        | No       | See the "Authentication" section below. |
+
+## Requirements
+
+- python3-kubernetes (or [kubernetes python library](https://pypi.org/project/kubernetes/))
+
+## Authentication
+
+The steps taken by this role require proper kubernetes authentication be set up
+on the ansible host (or localhost) for the cluster in question. This may be
+done in one of the following ways:
+
+- If the ansible host has a valid kubeconfig in ~/.kube/config, this will be
+  used by default.
+- You can set the appropriate environment variables via the `k8s_auth` role
+  variable. These will be named K8S_AUTH_* and are outlined in [kubernetes.core.k8s](https://galaxy.ansible.com/ui/repo/published/kubernetes/core/content/module/k8s/)
+
+## Usage example
+
+- Side-load a kernel, assuming ~/.kube/config is authenticated to
+  the proper kubernetes cluster:
+
+```yaml
+- name: Sideload a kernel
+  ansible.builtin.include_role:
+    name: redhatci.ocp.sideload_kernel
+  vars:
+    sideload_kernel_uri: "https://example.com/packages/kernel/5.14.0/417.el9/x86_64/kernel-rt-core-5.14.0-417.el9.x86_64.rpm"
+```
+
+- Side-load a kernel, specifying the path to an alternative
+  kubeconfig file:
+
+```yaml
+- name: Sideload a kernel
+  ansible.builtin.include_role:
+    name: redhatci.ocp.sideload_kernel
+  vars:
+    sideload_kernel_uri: "https://mymirror.local/kernels/kernel-rt-core-5.14.0-417.el9.x86_64.rpm"
+    k8s_auth:
+      K8S_AUTH_KUBECONFIG: /var/lib/clusterauth/kubeconfig
+```
+
+- Reset back to the original kernel
+
+```yaml
+- name: Reset to the original kernel
+  ansible.builtin.include_role:
+    name: redhatci.ocp.sideload_kernel
+  vars:
+    sideload_kernel_uri: "reset"
+    k8s_auth:
+      K8S_AUTH_KUBECONFIG: /var/lib/clusterauth/kubeconfig
+```
diff --git a/roles/sideload_kernel/defaults/main.yml b/roles/sideload_kernel/defaults/main.yml
new file mode 100644
index 000000000..e42e1cb4a
--- /dev/null
+++ b/roles/sideload_kernel/defaults/main.yml
@@ -0,0 +1,6 @@
+---
+sideload_kernel_namespace: default
+sideload_kernel_force: false
+sideload_kernel_job_timeout: 15
+sideload_kernel_base_image: ubi9
+k8s_auth: {}
diff --git a/roles/sideload_kernel/files/flip_kernel b/roles/sideload_kernel/files/flip_kernel
new file mode 100755
index 000000000..45c2402fe
--- /dev/null
+++ b/roles/sideload_kernel/files/flip_kernel
@@ -0,0 +1,224 @@
+#!/bin/bash -eu
+#
+# Note: This script is designed to run inside a privileged OCP container that
+# has been chroot'd into the host filesystem, as it deals directly with
+# rpm-ostree.
+#
+# Upon execution, one of 3 results will happen
+# - It will return 0 if the desired kernel is installed and active
+# - It will return 1 if there is any error
+# - It will reboot the system if the requested kernel is not yet active
+#
+# Run this script with '-h' for usage instructions.
+#
+
+# Package names that make up the standard and the realtime kernel sets
+NONRT=(kernel{-core,-modules,-modules-core,-modules-extra})
+RT=(kernel-rt{-core,-modules,-modules-core,-modules-extra})
+# Directory that remote RPMs are downloaded into before installation
+PREFETCH=/tmp
+
+# Stages removal of the base kernel packages and installation of the RPMs
+# given in $@ as their replacement.
+replace_standard_kernel() {
+    local -a install=()
+    for pkg in "$@"; do
+        install+=(--install "${pkg}")
+    done
+    rpm-ostree override remove kernel "${NONRT[@]}" "${install[@]}"
+}
+
+# Stages removal of all layered packages and installation of the RPMs given
+# in $@ in their place.
+reinstall_kernel_packages() {
+    local -a install=()
+    for pkg in "$@"; do
+        install+=(--install "${pkg}")
+    done
+    rpm-ostree uninstall --all "${install[@]}"
+}
+
+# Succeeds if the first deployment reported by rpm-ostree lists kernel-core
+# among its requested base removals.
+replaced_base_kernel() {
+    rpm-ostree status --json | jq -r '.deployments[0]."requested-base-removals"[]' | grep -q kernel-core
+}
+
+# Succeeds if the first deployment reported by rpm-ostree is marked as staged.
+is_rpmostree_staged() {
+    [[ $(rpm-ostree status --json | jq '.deployments[0].staged') == "true" ]]
+}
+
+# Succeeds if the package named by the RPM path or URI in $1 is known to rpm.
+is_pkg_installed() {
+    local basename
+    basename=$(basename "$1" .rpm)
+    rpm -q "$basename" >/dev/null
+}
+
+# Reboots the host if rpm-ostree has a staged deployment; returns 1 otherwise.
+reboot_if_staged() {
+    rpm-ostree status
+    if is_rpmostree_staged; then
+        # TODO: There is a race with MCD which may run 'rpm-ostree cleanup -p'
+        # on us and blow away our staged changes before they take effect.
+        # Attempt to make the window smaller by rebooting immediately
+        echo "Rebooting!"
+        systemctl reboot
+    else
+        echo "No changes staged!"
+        return 1
+    fi
+}
+
+# Stages the kernel RPMs given in $@ and reboots into them. Does nothing and
+# returns 0 if the package named by $1 is already installed.
+inject_kernel_rpms() {
+    if is_pkg_installed "$1"; then
+        echo "$(basename "$1") is already installed; nothing to do!"
+        uname -r
+        return 0
+    fi
+
+    if replaced_base_kernel; then
+        reinstall_kernel_packages "$@" || return $?
+    else
+        replace_standard_kernel "$@" || return $?
+    fi
+    reboot_if_staged
+}
+
+# Installs the kernel-*.rpm packages found in the directory $1 (default: /tmp).
+inject_from_dir() {
+    local pkgdir=${1:-/tmp}
+    local -a rpms
+    # The glob has to stay outside the quotes, otherwise it is never expanded
+    rpms=("$pkgdir"/kernel-*.rpm)
+    if [[ ! -e ${rpms[0]} ]]; then
+        echo "Error: No kernel-*.rpm packages found in '$pkgdir'"
+        return 1
+    fi
+    inject_kernel_rpms "${rpms[@]}"
+}
+
+# Collects the RPM for every package name in $4.. located at "$2/<name>-$3"
+# into the array variable named by $1. The RPMs are downloaded into $PREFETCH
+# if that directory exists; otherwise each URI is only probed and recorded.
+get_kernel_rpms_from_uri() {
+    local -n uris=$1; shift
+    uris=()
+    local prefix=$1; shift
+    local suffix=$1; shift
+    for pkg in "$@"; do
+        local uri="$prefix/$pkg-$suffix"
+        local fetch="-I"
+        if [[ -d $PREFETCH ]]; then
+            pushd "$PREFETCH" >/dev/null || return 1
+            fetch="-O"
+        fi
+        # NOTE(review): -k disables TLS certificate verification for the download
+        if ! curl "$fetch" -fksS "$uri" >/dev/null; then
+            echo "Error: Could not fetch RPM '$uri'"
+            # Do not leave the directory stack modified on failure
+            if [[ -d $PREFETCH ]]; then
+                popd >/dev/null || true
+            fi
+            return 1
+        fi
+        if [[ -d $PREFETCH ]]; then
+            uris+=("$PREFETCH/$(basename "$uri")")
+            popd >/dev/null || return 1
+        else
+            uris+=("$uri")
+        fi
+    done
+}
+
+# Derives the full package set from the kernel[-rt]-core RPM URI in $1,
+# fetches it and installs it.
+inject_from_uri() {
+    local base=$1
+    local -a rpms
+    local prefix suffix
+    local -a pkglist
+    if [[ $base =~ .*/kernel-core-.*\.rpm ]]; then
+        prefix=${base%%/kernel-core-*}
+        suffix=${base##*/kernel-core-}
+        pkglist=("${NONRT[@]}")
+    elif [[ $base =~ .*/kernel-rt-core-.*\.rpm ]]; then
+        prefix=${base%%/kernel-rt-core-*}
+        suffix=${base##*/kernel-rt-core-}
+        pkglist=("${RT[@]}")
+    else
+        echo "URI must point to either the kernel-rt-core or kernel-core RPM"
+        return 1
+    fi
+
+    get_kernel_rpms_from_uri rpms "$prefix" "$suffix" "${pkglist[@]}" || return 1
+    inject_kernel_rpms "${rpms[@]}"
+}
+
+# Reverts to the base kernel via 'rpm-ostree reset' and reboots, unless the
+# base kernel has not been replaced.
+reset_kernel() {
+    if ! replaced_base_kernel; then
+        echo "The standard kernel is already running; nothing to do!"
+        uname -r
+        return 0
+    fi
+
+    rpm-ostree reset
+    reboot_if_staged
+}
+
+# Prints usage instructions. Always returns 1 so that it can end an error path.
+usage() {
+    echo "Switches to arbitrary kernel-[rt-]*.rpm packages"
+    echo
+    echo "Usage:"
+    echo " $(basename "$0") http[s]://full-uri-to/kernel-[rt-]core-\$SOMEVERSION.rpm"
+    echo " Fetches the kernel packages at the URI given, provided:"
+    echo " - the URI must be to the 'kernel-rt-core' or 'kernel-core' package"
+    echo " - the other packages must be in the same directory on the HTTP server"
+    echo
+    echo " $(basename "$0") /path/to/pkgdir"
+    echo " Installs a set of kernel packages from a local directory"
+    echo " Must only contain one set of kernel packages"
+    echo
+    echo " $(basename "$0") reset"
+    echo " Resets the kernel back to the default standard kernel"
+    return 1
+}
+
+# Entry point: dispatches on $1 (-h/--help, "reset", an http[s] URI or a local
+# directory).
+main() {
+    # Default to empty so a missing argument is reported via usage() rather
+    # than aborting with an "unbound variable" error under 'set -u'
+    local target=${1:-}
+
+    if [[ $target == "-h" || $target == "--help" ]]; then
+        # usage() returns 1, which under 'set -e' would abort the script
+        # before the 'return 0' below is reached
+        usage || true
+        return 0
+    fi
+
+    if [[ $(id -u) -ne 0 ]]; then
+        echo "This script must be run as root"
+        return 1
+    fi
+
+    if [[ $target == "reset" ]]; then
+        reset_kernel
+    elif [[ $target =~ ^https?:// ]]; then
+        inject_from_uri "$target"
+    elif [[ -d $target ]]; then
+        inject_from_dir "$target"
+    else
+        echo "'$target' was not a URI or a local directory."
+        usage
+    fi
+}
+
+[[ "${BASH_SOURCE[0]}" == "$0" ]] && main "$@"
diff --git a/roles/sideload_kernel/tasks/main.yml b/roles/sideload_kernel/tasks/main.yml
new file mode 100644
index 000000000..8bab1abe2
--- /dev/null
+++ b/roles/sideload_kernel/tasks/main.yml
@@ -0,0 +1,129 @@
+---
+- name: Validate required variables
+  ansible.builtin.assert:
+    that:
+      - sideload_kernel_uri is defined
+      - sideload_kernel_uri | string | length > 0
+    fail_msg: "sideload_kernel_uri must be set to a kernel-rt-core RPM URI or 'reset'"
+
+- name: Sideload-kernel job namespace
+  community.kubernetes.k8s:
+    state: present
+    api_version: v1
+    kind: Namespace
+    name: "{{ sideload_kernel_namespace }}"
+  environment: "{{ k8s_auth }}"
+
+- name: Sideload-kernel configmap
+  community.kubernetes.k8s:
+    state: present
+    apply: true
+    definition:
+      apiVersion: v1
+      kind: ConfigMap
+      metadata:
+        name: sideload-kernel
+        namespace: "{{ sideload_kernel_namespace }}"
+      data:
+        KERNEL_URI: "{{ sideload_kernel_uri }}"
+        flip_kernel: "{{ lookup('file', 'flip_kernel') }}"
+  environment: "{{ k8s_auth }}"
+  register: configmap_created
+
+- name: Clean out any previous jobs
+  community.kubernetes.k8s:
+    state: absent
+    api_version: batch/v1
+    kind: Job
+    name: flip-kernel
+    namespace: "{{ sideload_kernel_namespace }}"
+  environment: "{{ k8s_auth }}"
+  when: configmap_created.changed or sideload_kernel_force | bool
+
+- name: Sideload-kernel job
+  community.kubernetes.k8s:
+    state: present
+    definition:
+      apiVersion: batch/v1
+      kind: Job
+      metadata:
+        name: flip-kernel
+        namespace: "{{ sideload_kernel_namespace }}"
+      spec:
+        backoffLimit: 3
+        # TODO: Add a selector for multi-node cluster cases?
+        template:
+          spec:
+            containers:
+              - command:
+                  - /bin/bash
+                args:
+                  - "-c"
+                  # KERNEL_URI is quoted so the shell passes it on as one unmodified argument
+                  - 'install /script/flip_kernel /host/tmp && chroot /host /tmp/flip_kernel "${KERNEL_URI}"'
+                env:
+                  - name: KERNEL_URI
+                    valueFrom:
+                      configMapKeyRef:
+                        name: sideload-kernel
+                        key: KERNEL_URI
+                image: "{{ sideload_kernel_base_image }}"
+                imagePullPolicy: IfNotPresent
+                name: flipper
+                securityContext:
+                  privileged: true
+                  runAsUser: 0
+                volumeMounts:
+                  - mountPath: /host
+                    name: host
+                  - mountPath: /script
+                    name: script
+            hostIPC: true
+            hostNetwork: true
+            hostPID: true
+            restartPolicy: Never
+            volumes:
+              - name: host
+                hostPath:
+                  path: /
+                  type: Directory
+              - name: script
+                configMap:
+                  name: sideload-kernel
+  environment: "{{ k8s_auth }}"
+
+- name: Warn about a potentially long wait
+  ansible.builtin.debug:
+    msg: "Waiting for the job to reach a completed state (may take up to {{ sideload_kernel_job_timeout }} mins)..."
+
+- name: Wait for job completion
+  community.kubernetes.k8s_info:
+    api_version: batch/v1
+    kind: Job
+    name: flip-kernel
+    namespace: "{{ sideload_kernel_namespace }}"
+  environment: "{{ k8s_auth }}"
+  register: job_state
+  # Look the terminal condition up by type rather than by position: the first
+  # entry in the conditions list is not guaranteed to be Complete or Failed
+  until: >-
+    not job_state.failed
+    and job_state.resources | length > 0
+    and job_state.resources[0].status.conditions | default([])
+    | selectattr('type', 'in', ['Complete', 'Failed'])
+    | selectattr('status', 'equalto', 'True')
+    | list | length > 0
+  # A successful job may take a long time, including waiting for a host reboot.
+  # One retry every 15 seconds gives 4 retries per minute of timeout.
+  retries: "{{ (sideload_kernel_job_timeout | int) * 4 }}"
+  delay: 15
+
+- name: Ensure the job completed successfully
+  ansible.builtin.fail:
+    msg: "Job state is {{ job_state.resources[0].status.conditions | map(attribute='type') | join(', ') }}"
+  when: >-
+    job_state.resources[0].status.conditions
+    | selectattr('type', 'equalto', 'Complete')
+    | selectattr('status', 'equalto', 'True')
+    | list | length == 0
+...