From f1a6d83750782aa00d99eb98bd967dfef93a9105 Mon Sep 17 00:00:00 2001 From: "Gao,Yan" Date: Tue, 10 Nov 2020 13:51:59 +0100 Subject: [PATCH 1/3] Fix: agent: prevent gethosts action and timeout validation from hanging on list and dump commands if any of the devices is silently blocked If any of the configured SBD devices is silently blocked without any explicit I/O error from kernel, fencing will get stuck and time out, even if the majority of the devices are still available. On fencing, list and dump commands are called first. Under this situation, the commands will print output but get stuck on exit_aio() on exit, and enter D state. With this commit, sbd fence agent asynchronously calls the commands individually for the devices, waits for any successful return and collects the output, so that it prevents execution of sbd fence agent from hanging. --- agent/sbd.in | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 2 deletions(-) diff --git a/agent/sbd.in b/agent/sbd.in index 174cb87..82ad2f6 100644 --- a/agent/sbd.in +++ b/agent/sbd.in @@ -32,6 +32,126 @@ SBD_DEVS=${sbd_device%;} sbd_device=${SBD_DEVS//;/ -d } +sbd_cmd_output() { + local pid=$1 + local fd=$2 + local call_wait=$3 + local any_output=0 + local failed=0 + local rc=0 + running=0 + unknown_hanging=0 + + # Async IO timeout defaults to 3 seconds + while read -t 5 line; do + echo "$line" + any_output=1 + + # Indicator of failure in case that stderr is retrieved + if [[ "$line" == *"sbd failed"* ]]; then + failed=1 + fi + done <&$fd + + # Command exited + if ! $(kill -0 $pid > /dev/null 2>&1); then + # Safe now to retrieve any remaining output without specifying timeout + while read line; do + echo "$line" + any_output=1 + + if [[ "$line" == *"sbd failed"* ]]; then + failed=1 + fi + done <&$fd + + # Determine the exit status + # bash's wait command only recognizes the latest child even if the pids of the previous children were saved.
+ if [ $call_wait -ne 0 ]; then + wait $pid + return $? + # Let's assume one that printed anything other than explicit failure to stdout has succeeded. + elif [ $any_output -ne 0 -a $failed -eq 0 ]; then + return 0 + else + return 1 + fi + + # Command still existing + else + running=1 + # Failed but hanging. Don't wait for it any more. + if [ $failed -ne 0 ]; then + return 1 + else + unknown_hanging=1 + return 1 + fi + fi + + return $rc +} + +sbd_cmd_get_stdout() { + local devices=${SBD_DEVS//;/ } + local cmd="$1" + local rc=0 + local success_count=0 + local unknown_hanging_procs="" + + for device in $devices; do + exec {fd}< <(sbd -d $device $cmd) + pid=$! + + sbd_cmd_output $pid $fd 1 + cmd_rc=$? + + if [ $cmd_rc -eq 0 ]; then + success_count=$((success_count + 1)) + else + rc=$cmd_rc + fi + + if [ $unknown_hanging -ne 0 ]; then + unknown_hanging_procs+="$pid:$fd " + fi + done + + if [ -z "$unknown_hanging_procs" -o $success_count -gt 0 ]; then + return $rc + fi + + # We didn't get any successful output + # Desperately wait for the ones hanging in unknown state + while true; do + local running_count=0 + + for proc in $unknown_hanging_procs; do + pid=${proc%:*} + fd=${proc#*:} + + sbd_cmd_output $pid $fd 0 + cmd_rc=$? + + if [ $cmd_rc -eq 0 ]; then + success_count=$((success_count + 1)) + else + rc=$cmd_rc + fi + + if [ $running -ne 0 ]; then + running_count=$((running_count + 1)) + fi + done + + if [ $success_count -gt 0 -o $running_count -eq 0 ]; then + return $rc + fi + done + + return $rc +} + sbd_check_device() { if [ -z "$sbd_device" ]; then ha_log.sh err "No sbd device(s) found in the configuration." 
@@ -44,7 +164,7 @@ sbd_validate_timeout() { yes|true|1|YES|TRUE|ja|on|ON) return ;; esac crm_timeout=$[$(crm_attribute -t crm_config -G -n stonith-timeout -d 20s -q | sed -e 's/\(.*\)s/\1/' -e 's/\(.*\)m/\1*60/')] - sbd_timeout=$(sbd -d $sbd_device dump | perl -ne 'if (/msgwait.*: (\d+)/) { print "$1\n"; }' | head -n 1) + sbd_timeout=$(sbd_cmd_get_stdout dump | perl -ne 'if (/msgwait.*: (\d+)/) { print "$1\n"; }' | head -n 1) if [ -z "$sbd_timeout" -o "$sbd_timeout" = "0" ]; then return fi @@ -66,7 +186,7 @@ sbd_validate_timeout() { case $1 in gethosts) sbd_check_device - echo `sbd -d $sbd_device list | cut -f2 | sort | uniq` + echo `sbd_cmd_get_stdout list | cut -f2 | sort | uniq` exit 0 ;; off|reset) From 97821f60f64f0ef98dd5edf7a824386f1d1be3dd Mon Sep 17 00:00:00 2001 From: "Gao,Yan" Date: Tue, 10 Nov 2020 17:13:52 +0100 Subject: [PATCH 2/3] Fix: agent: prevent off/reset action from hanging on message command if any of the devices is silently blocked Differently from list and dump commands, message command is actually kind of already asynchronous. Rather than directly accessing the devices, it spawns multiple writing child processes in parallel, one for each device, and waits for the majority of them to finish writing the poison pill and returns, even if the minority gets stuck in D state. But if it's called by stonith command, sbd fence agent process will enter the "defunct" state and get stuck. This commit prevents that by asynchronously calling message command with a subshell. --- agent/sbd.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/agent/sbd.in b/agent/sbd.in index 82ad2f6..0057023 100644 --- a/agent/sbd.in +++ b/agent/sbd.in @@ -196,7 +196,8 @@ off|reset) case "$crashdump" in yes|true|1|YES|TRUE|ja|on|ON) message="crashdump" ;; esac - sbd -d $sbd_device message $2 $message + exec {fd}< <(sbd -d $sbd_device message $2 $message) + wait $! exit $?
;; status) From 4b98b42af24335cfd32e8d18391531882dd7eb31 Mon Sep 17 00:00:00 2001 From: "Gao,Yan" Date: Thu, 12 Nov 2020 12:45:50 +0100 Subject: [PATCH 3/3] Fix: agent: prevent status action from hanging on list command if any of the devices is silently blocked If any of the configured SBD devices is silently blocked without any explicit I/O error from kernel, status action will get stuck on list command which will hang on exit_aio() and enter D state. With this commit, sbd fence agent asynchronously calls list command individually for the devices and won't wait for any devices that have actually already been reported as failed, so that it prevents hanging under such a situation. --- agent/sbd.in | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/agent/sbd.in b/agent/sbd.in index 0057023..5c77ae0 100644 --- a/agent/sbd.in +++ b/agent/sbd.in @@ -152,6 +152,34 @@ sbd_cmd_get_stdout() { return $rc } +sbd_cmd_check_error() { + local devices=${SBD_DEVS//;/ } + local cmd="$1" + local rc=0 + + for device in $devices; do + exec {fd}< <(sbd -d $device $cmd 2>&1 >/dev/null) + pid=$! + + while true; do + sbd_cmd_output $pid $fd 1 + local cmd_rc=$? + + # No need to wait for a hanging one that has reported "sbd failed" + # But have to wait for a hanging one in unknown state + if [ $unknown_hanging -eq 0 ]; then + if [ $cmd_rc -ne 0 ]; then + rc=$cmd_rc + fi + + break + fi + done + done + + return $rc +} + sbd_check_device() { if [ -z "$sbd_device" ]; then ha_log.sh err "No sbd device(s) found in the configuration." @@ -203,7 +231,7 @@ off|reset) status) sbd_check_device sbd_validate_timeout - error_output=$(sbd -d $sbd_device list 2>&1 >/dev/null) + error_output=$(sbd_cmd_check_error list) if [ $? -ne 0 ]; then error_message=$(echo "$error_output" | grep -v "please check the logs") ha_log.sh err "sbd list failed: $error_message"