From 48ed6e6d6510f42743e4463970e27f05637e4982 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Tue, 4 Jul 2023 14:40:19 +0200
Subject: [PATCH] Filesystem: improve stop-action and allow setting term/kill signals and signal_delay for large filesystems

---
 heartbeat/Filesystem | 80 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 70 insertions(+), 10 deletions(-)

diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index 65a9dffb5..fe608ebfd 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -71,6 +71,9 @@ OCF_RESKEY_run_fsck_default="auto"
 OCF_RESKEY_fast_stop_default="no"
 OCF_RESKEY_force_clones_default="false"
 OCF_RESKEY_force_unmount_default="true"
+OCF_RESKEY_term_signals_default="TERM"
+OCF_RESKEY_kill_signals_default="KILL"
+OCF_RESKEY_signal_delay_default="1"
 
 # RHEL specific defaults
 if is_redhat_based; then
@@ -104,6 +107,9 @@ if [ -z "${OCF_RESKEY_fast_stop}" ]; then
 fi
 : ${OCF_RESKEY_force_clones=${OCF_RESKEY_force_clones_default}}
 : ${OCF_RESKEY_force_unmount=${OCF_RESKEY_force_unmount_default}}
+: ${OCF_RESKEY_term_signals=${OCF_RESKEY_term_signals_default}}
+: ${OCF_RESKEY_kill_signals=${OCF_RESKEY_kill_signals_default}}
+: ${OCF_RESKEY_signal_delay=${OCF_RESKEY_signal_delay_default}}
 
 # Variables used by multiple methods
 HOSTOS=$(uname)
@@ -266,6 +272,30 @@ block if unresponsive nfs mounts are in use on the system.
 <content type="boolean" default="${OCF_RESKEY_force_unmount_default}" />
 </parameter>
 
+<parameter name="term_signals">
+<longdesc lang="en">
+Signals (names or numbers, whitespace separated) to send processes during graceful termination phase in stop-action.
+</longdesc>
+<shortdesc lang="en">Signals (names or numbers, whitespace separated) to send processes during graceful termination phase in stop-action</shortdesc>
+<content type="string" default="${OCF_RESKEY_term_signals_default}" />
+</parameter>
+
+<parameter name="kill_signals">
+<longdesc lang="en">
+Signals (names or numbers, whitespace separated) to send processes during forceful killing phase in stop-action.
+</longdesc>
+<shortdesc lang="en">Signals (names or numbers, whitespace separated) to send processes during forceful killing phase in stop-action</shortdesc>
+<content type="string" default="${OCF_RESKEY_kill_signals_default}" />
+</parameter>
+
+<parameter name="signal_delay">
+<longdesc lang="en">
+How many seconds to wait after sending term/kill signals to processes in stop-action.
+</longdesc>
+<shortdesc lang="en">How many seconds to wait after sending term/kill signals to processes in stop-action</shortdesc>
+<content type="integer" default="${OCF_RESKEY_signal_delay_default}" />
+</parameter>
+
 </parameters>
 
 <actions>
@@ -663,19 +693,49 @@ try_umount() {
 	}
 	return $OCF_ERR_GENERIC
 }
-fs_stop() {
-	local SUB="$1" timeout=$2 sig cnt
-	for sig in TERM KILL; do
-		cnt=$((timeout/2)) # try half time with TERM
-		while [ $cnt -gt 0 ]; do
-			try_umount "$SUB" &&
-				return $OCF_SUCCESS
-			ocf_exit_reason "Couldn't unmount $SUB; trying cleanup with $sig"
+timeout_child() {
+	local pid="$1" timeout="$2" killer ret
+
+	# start job in the background that will KILL the given process after timeout expires
+	sleep $timeout && kill -s KILL $pid &
+	killer=$!
+
+	# block until the child process either exits on its own or gets killed by the above killer pipeline
+	wait $pid
+	ret=$?
+
+	# ret is above 128 (128 + signal number, e.g. 137 for KILL) if the timeout expired and the child was killed
+	[ $ret -lt 128 ] && kill -s KILL $killer
+	return $ret
+}
+fs_stop_loop() {
+	local SUB="$1" signals="$2" sig
+	while true; do
+		for sig in $signals; do
 			signal_processes "$SUB" $sig
-			cnt=$((cnt-1))
-			sleep 1
 		done
+		sleep $OCF_RESKEY_signal_delay
+		try_umount "$SUB" && return $OCF_SUCCESS
 	done
+}
+fs_stop() {
+	local SUB="$1" timeout=$2 grace_time ret
+	grace_time=$((timeout/2))
+
+	# try gracefully terminating processes for up to half of the configured timeout
+	fs_stop_loop "$SUB" "$OCF_RESKEY_term_signals" &
+	timeout_child $! $grace_time
+	ret=$?
+	[ $ret -eq $OCF_SUCCESS ] && return $ret
+
+	# try killing them for the rest of the timeout
+	fs_stop_loop "$SUB" "$OCF_RESKEY_kill_signals" &
+	timeout_child $! $grace_time
+	ret=$?
+	[ $ret -eq $OCF_SUCCESS ] && return $ret
+
+	# timeout expired
+	ocf_exit_reason "Couldn't unmount $SUB within given timeout"
 	return $OCF_ERR_GENERIC
 }
 