From 56922f44503152ea51b840856f6bd8ee71207efb Mon Sep 17 00:00:00 2001
From: Simon Gerber
Date: Thu, 2 Nov 2023 14:15:33 +0100
Subject: [PATCH] Add how-to for force rebooting all nodes in a machine config pool

---
 .../ROOT/pages/how-tos/force-reboot.adoc | 90 +++++++++++++++++++
 docs/modules/ROOT/partials/nav.adoc      |  1 +
 2 files changed, 91 insertions(+)
 create mode 100644 docs/modules/ROOT/pages/how-tos/force-reboot.adoc

diff --git a/docs/modules/ROOT/pages/how-tos/force-reboot.adoc b/docs/modules/ROOT/pages/how-tos/force-reboot.adoc
new file mode 100644
index 00000000..75594f89
--- /dev/null
+++ b/docs/modules/ROOT/pages/how-tos/force-reboot.adoc
@@ -0,0 +1,90 @@
+= Force reboot of all nodes in a machine config pool
+
+== Starting situation
+
+* You have admin-level access to the OpenShift 4 cluster
+* You want to trigger node reboots for a whole machine config pool
+
+== Prerequisites
+
+The following CLI utilities need to be available:
+
+* `kubectl`
+* `oc` (The commands assume you have v4.13 or newer)
+* `jq`
+
+== Reboot nodes
+
+. Select the machine config pool for which you want to reboot all nodes
++
+[source,bash]
+----
+MCP=<name> <1>
+----
+<1> Replace with the name of the machine config pool for which you want to reboot the nodes
+
+. List all nodes belonging to the pool
++
+[source,bash]
+----
+node_selector=$( \
+  kubectl get mcp "${MCP}" -ojsonpath='{.spec.nodeSelector.matchLabels}' | \
+  jq -r '. as $root | [. | keys[] | "\(.)=\($root[.])"] | join(",")' \
+)
+kubectl get nodes -l $node_selector
+----
+
+. Prepare the nodes for a force machine config resync
++
+[source,bash]
+----
+for node in $(kubectl get nodes -oname -l $node_selector); do
+  oc --as=cluster-admin debug $node -- chroot /host touch /run/machine-config-daemon-force
+done
+----
+
+. Select an old rendered machine config for the pool
++
+[TIP]
+====
+The command selects the second newest rendered machine config.
+The exact value doesn't matter, but we want to overwrite the `currentConfig` annotation with an existing machine config, so that the operator doesn't mark the nodes as degraded.
+====
++
+[source,bash]
+----
+old_mc=$(kubectl get mc -o json | \
+  jq --arg mcp rendered-$MCP -r \
+    '[.items[] | select(.metadata.name | contains($mcp))]
+    | sort_by(.metadata.creationTimestamp) | reverse
+    | .[1] | .metadata.name' \
+)
+----
+
+. Trigger machine config daemon resync for *one node at a time*
++
+[IMPORTANT]
+====
+Don't do this for multiple nodes at the same time: all the nodes for which this step is executed are immediately drained and rebooted.
+====
++
+[source,bash]
+----
+timeout=300s <1>
+for node in $(kubectl get node -o name -l $node_selector); do
+  echo "Rebooting $node"
+  kubectl annotate --overwrite $node \
+    machineconfiguration.openshift.io/currentConfig=$old_mc
+  echo "Waiting for drain... (up to $timeout)"
+  if ! oc wait --timeout=$timeout $node --for condition=ready=false; then
+    echo "$node didn't drain and reboot, please check status, aborting loop"
+    break
+  fi
+  echo "Waiting for reboot completed... (up to $timeout)"
+  if ! oc wait --timeout=$timeout $node --for condition=ready; then
+    echo "$node didn't become ready, please check status, aborting loop"
+    break
+  fi
+done
+----
+<1> Adjust if you expect node drains and reboots to be slower or faster than 5 minutes
diff --git a/docs/modules/ROOT/partials/nav.adoc b/docs/modules/ROOT/partials/nav.adoc
index 03cf0b54..93593d5f 100644
--- a/docs/modules/ROOT/partials/nav.adoc
+++ b/docs/modules/ROOT/partials/nav.adoc
@@ -188,6 +188,7 @@
 * Day two operations
 ** xref:oc4:ROOT:how-tos/maintenance_troubleshooting.adoc[Maintenance troubleshooting]
 ** xref:oc4:ROOT:how-tos/debug-nodes.adoc[Debugging Nodes]
+** xref:oc4:ROOT:how-tos/force-reboot.adoc[]
 ** Runbooks
 *** xref:oc4:ROOT:how-tos/monitoring/runbooks/maintenance_alerts.adoc[MaintenanceAlertFiring]