From 5a84bdea6067c4f1ae9e935e8b5f02ddfaa55555 Mon Sep 17 00:00:00 2001 From: Oyvind Albrigtsen Date: Wed, 25 Sep 2024 16:21:18 +0200 Subject: [PATCH] - Filesystem: dont sleep during stop-action when there are no processes to kill, and only use force argument for network filesystems after sending kill_signals - Filesystem: try umount first during stop-action, and avoid potential "Argument list too long" for force_unmount=safe - AWS agents: use awscli2 Resolves: RHEL-58038 Resolves: RHEL-59576 Resolves: RHEL-46233 --- ...es-only-send-force-net-fs-after-kill.patch | 106 ++++++++++++++++++ ...-first-avoid-arguments-list-too-long.patch | 100 +++++++++++++++++ ha-cloud-support-aws.patch | 49 -------- resource-agents.spec | 24 +++- 4 files changed, 225 insertions(+), 54 deletions(-) create mode 100644 RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch create mode 100644 RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch delete mode 100644 ha-cloud-support-aws.patch diff --git a/RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch b/RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch new file mode 100644 index 0000000..1ae87c4 --- /dev/null +++ b/RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch @@ -0,0 +1,106 @@ +From d66a52cfb25f5436255ecc65a407c0166a720146 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Tue, 3 Sep 2024 12:55:28 +0200 +Subject: [PATCH 1/2] Filesystem: dont sleep during stop-action when there are + no processes to kill + +Thanks @SatomiOSAWA for the initial code. 
+--- + heartbeat/Filesystem | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 3eb520e0c..f54969f20 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -685,12 +685,13 @@ signal_processes() { + pids=$(get_pids "$dir") + if [ -z "$pids" ]; then + ocf_log info "No processes on $dir were signalled. force_unmount is set to '$FORCE_UNMOUNT'" +- return ++ return 1 + fi + for pid in $pids; do + ocf_log info "sending signal $sig to: $(ps -f $pid | tail -1)" + kill -s $sig $pid + done ++ return 0 + } + try_umount() { + local SUB="$1" +@@ -717,12 +718,13 @@ timeout_child() { + return $ret + } + fs_stop_loop() { +- local SUB="$1" signals="$2" sig ++ local SUB="$1" signals="$2" sig send_signal + while true; do ++ send_signal=false + for sig in $signals; do +- signal_processes "$SUB" $sig ++ signal_processes "$SUB" $sig && send_signal=true + done +- sleep $OCF_RESKEY_signal_delay ++ $send_signal && sleep $OCF_RESKEY_signal_delay + try_umount "$SUB" && return $OCF_SUCCESS + done + } + +From cb6aaffc260eea0f0fee6fab44393c6cf12b8a83 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Mon, 9 Sep 2024 10:58:12 +0200 +Subject: [PATCH 2/2] Filesystem: only use $umount_force after sending + kill_signals + +--- + heartbeat/Filesystem | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index f54969f20..4dd962fd9 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -694,8 +694,8 @@ signal_processes() { + return 0 + } + try_umount() { +- local SUB="$1" +- $UMOUNT $umount_force "$SUB" ++ local force_arg="$1" SUB="$2" ++ $UMOUNT $force_arg "$SUB" + list_mounts | grep "${TAB}${SUB}${TAB}" >/dev/null 2>&1 || { + ocf_log info "unmounted $SUB successfully" + return $OCF_SUCCESS +@@ -718,14 +718,14 @@ timeout_child() { + return $ret + } + fs_stop_loop() { +- local SUB="$1" signals="$2" sig send_signal ++ 
local force_arg="$1" SUB="$2" signals="$3" sig send_signal + while true; do + send_signal=false + for sig in $signals; do + signal_processes "$SUB" $sig && send_signal=true + done + $send_signal && sleep $OCF_RESKEY_signal_delay +- try_umount "$SUB" && return $OCF_SUCCESS ++ try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS + done + } + fs_stop() { +@@ -733,13 +733,13 @@ fs_stop() { + grace_time=$((timeout/2)) + + # try gracefully terminating processes for up to half of the configured timeout +- fs_stop_loop "$SUB" "$OCF_RESKEY_term_signals" & ++ fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" & + timeout_child $! $grace_time + ret=$? + [ $ret -eq $OCF_SUCCESS ] && return $ret + + # try killing them for the rest of the timeout +- fs_stop_loop "$SUB" "$OCF_RESKEY_kill_signals" & ++ fs_stop_loop "$umount_force" "$SUB" "$OCF_RESKEY_kill_signals" & + timeout_child $! $grace_time + ret=$? + [ $ret -eq $OCF_SUCCESS ] && return $ret diff --git a/RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch b/RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch new file mode 100644 index 0000000..561e29a --- /dev/null +++ b/RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch @@ -0,0 +1,100 @@ +From f02afd0fadb581ca0fc9798beaf28044cf211200 Mon Sep 17 00:00:00 2001 +From: Lars Ellenberg +Date: Wed, 18 Sep 2024 11:53:52 +0200 +Subject: [PATCH 1/2] Filesystem: on stop, try umount directly, before scanning + for users + +48ed6e6d (Filesystem: improve stop-action and allow setting term/kill signals and signal_delay for large filesystems, 2023-07-04) +changed the logic from +"try umount; if that fails, find and kill users; repeat" to +"try to find and kill users; then try umount; repeat" + +But even just walking /proc may take "a long time" on busy systems, +and may still turn up with "no users found". 
+ +It will take even longer for "force_unmount=safe" +(observed 8 to 10 seconds just for "get_pids()" with "safe" to return nothing) +than for "force_unmount=yes" (still ~ 2 to 3 seconds), +but it will take "a long time" in any case. +(BTW, that may be longer than the hardcoded default of 6 seconds for "fast_stop", +which is also the default on many systems now) + +If the dependencies are properly configured, +there should be no users left, +and the umount should just work. + +Revert back to "try umount first", and only then try to find "rogue" users. +--- + heartbeat/Filesystem | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 4dd962fd9..99bddaf62 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -732,6 +732,11 @@ fs_stop() { + local SUB="$1" timeout=$2 grace_time ret + grace_time=$((timeout/2)) + ++ # Just walking /proc may take "a long time", even if we don't find any users of this FS. ++ # If dependencies are properly configured, umount should just work. ++ # Only if that fails, try to find and kill processes that still use it. ++ try_umount "" "$SUB" && return $OCF_SUCCESS ++ + # try gracefully terminating processes for up to half of the configured timeout + fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" & + timeout_child $! $grace_time + +From b42d698f12aaeb871f4cc6a3c0327a27862b4376 Mon Sep 17 00:00:00 2001 +From: Lars Ellenberg +Date: Wed, 18 Sep 2024 13:42:38 +0200 +Subject: [PATCH 2/2] Filesystem: stop/get_pids to be signaled + +The "safe" way to get process ids that may be using a particular filesystem +currently uses shell globs ("find /proc/[0-9]*"). +With a million processes (and/or a less capable shell), +that may result in "Argument list too long". + +Replace with find /proc -path "/proc/[0-9]*" instead. +While at it, also fix the non-posix -or to be -o, +and add explicit grouping parentheses \( \) and explicit -print. 
+ +Add a comment to not include "interesting" characters in mount point names. +--- + heartbeat/Filesystem | 23 ++++++++++++++++++++--- + 1 file changed, 20 insertions(+), 3 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 99bddaf62..3405e2c26 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -669,9 +669,26 @@ get_pids() + $FUSER -Mm $dir 2>/dev/null + fi + elif [ "$FORCE_UNMOUNT" = "safe" ]; then +- procs=$(find /proc/[0-9]*/ -type l -lname "${dir}/*" -or -lname "${dir}" 2>/dev/null | awk -F/ '{print $3}') +- mmap_procs=$(grep " ${dir}/" /proc/[0-9]*/maps | awk -F/ '{print $3}') +- printf "${procs}\n${mmap_procs}" | sort | uniq ++ # Yes, in theory, ${dir} could contain "interesting" characters ++ # and would need to be quoted for glob (find) and regex (grep). ++ # Don't do that, then. ++ ++ # Avoid /proc/[0-9]*, it may cause "Argument list too long". ++ # There are several ways to filter for /proc/<pid> ++ # -mindepth 1 -not -path "/proc/[0-9]*" -prune -o ... ++ # -path "/proc/[!0-9]*" -prune -o ... ++ # -path "/proc/[0-9]*" -a ... ++ # the latter seemed to be significantly faster for this one in my naive test. ++ procs=$(exec 2>/dev/null; ++ find /proc -path "/proc/[0-9]*" -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print | ++ awk -F/ '{print $3}' | uniq) ++ ++ # This finds both /proc/<pid>/maps and /proc/<pid>/task/<tid>/maps; ++ # if you don't want the latter, add -maxdepth. 
++ mmap_procs=$(exec 2>/dev/null; ++ find /proc -path "/proc/[0-9]*/maps" -print | ++ xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq) ++ printf "${procs}\n${mmap_procs}" | sort -u + fi + } + diff --git a/ha-cloud-support-aws.patch b/ha-cloud-support-aws.patch deleted file mode 100644 index b05e936..0000000 --- a/ha-cloud-support-aws.patch +++ /dev/null @@ -1,49 +0,0 @@ -diff --color -uNr a/heartbeat/awseip b/heartbeat/awseip ---- a/heartbeat/awseip 2020-12-03 14:31:17.000000000 +0100 -+++ b/heartbeat/awseip 2021-02-15 16:47:36.624610378 +0100 -@@ -43,7 +43,7 @@ - # - # Defaults - # --OCF_RESKEY_awscli_default="/usr/bin/aws" -+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws" - OCF_RESKEY_auth_type_default="key" - OCF_RESKEY_profile_default="default" - OCF_RESKEY_region_default="" - OCF_RESKEY_api_delay_default="3" -diff --color -uNr a/heartbeat/awsvip b/heartbeat/awsvip ---- a/heartbeat/awsvip 2020-12-03 14:31:17.000000000 +0100 -+++ b/heartbeat/awsvip 2021-02-15 16:47:48.960632484 +0100 -@@ -42,7 +42,7 @@ - # - # Defaults - # --OCF_RESKEY_awscli_default="/usr/bin/aws" -+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws" - OCF_RESKEY_auth_type_default="key" - OCF_RESKEY_profile_default="default" - OCF_RESKEY_region_default="" -diff --color -uNr a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip ---- a/heartbeat/aws-vpc-move-ip 2020-12-03 14:31:17.000000000 +0100 -+++ b/heartbeat/aws-vpc-move-ip 2021-02-15 16:47:55.484644118 +0100 -@@ -35,7 +35,7 @@ - . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs - - # Defaults --OCF_RESKEY_awscli_default="/usr/bin/aws" -+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws" - OCF_RESKEY_auth_type_default="key" - OCF_RESKEY_profile_default="default" - OCF_RESKEY_region_default="" -diff --color -uNr a/heartbeat/aws-vpc-route53.in b/heartbeat/aws-vpc-route53.in ---- a/heartbeat/aws-vpc-route53.in 2020-12-03 14:31:17.000000000 +0100 -+++ b/heartbeat/aws-vpc-route53.in 2021-02-15 16:47:59.808651828 +0100 -@@ -45,7 +45,7 @@ - . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs - - # Defaults --OCF_RESKEY_awscli_default="/usr/bin/aws" -+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws" - OCF_RESKEY_auth_type_default="key" - OCF_RESKEY_profile_default="default" - OCF_RESKEY_region_default="" diff --git a/resource-agents.spec b/resource-agents.spec index 03148ea..2c8a315 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.10.0 -Release: 64%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 65%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -134,11 +134,12 @@ Patch81: RHEL-40393-Filesystem-2-update-bsd-logic.patch Patch82: RHEL-32829-db2-fix-OCF_SUCESS-typo.patch Patch83: RHEL-43579-galera-mysql-redis-remove-Unpromoted-monitor-action.patch Patch84: RHEL-22715-LVM-activate-fix-false-positive.patch +Patch85: RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch +Patch86: RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch # bundled ha-cloud-support libs -Patch500: ha-cloud-support-aws.patch -Patch501: ha-cloud-support-aliyun.patch -Patch502: ha-cloud-support-gcloud.patch 
+Patch500: ha-cloud-support-aliyun.patch +Patch501: ha-cloud-support-gcloud.patch Obsoletes: heartbeat-resources <= %{version} Provides: heartbeat-resources = %{version} @@ -342,11 +343,12 @@ exit 1 %patch -p1 -P 82 %patch -p1 -P 83 %patch -p1 -P 84 +%patch -p1 -P 85 +%patch -p1 -P 86 # bundled ha-cloud-support libs %patch -p1 -P 500 %patch -p1 -P 501 -%patch -p1 -P 502 chmod 755 heartbeat/nova-compute-wait chmod 755 heartbeat/NovaEvacuate @@ -663,6 +665,18 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Wed Sep 25 2024 Oyvind Albrigtsen - 4.10.0-65 +- Filesystem: dont sleep during stop-action when there are no + processes to kill, and only use force argument for network + filesystems after sending kill_signals +- Filesystem: try umount first during stop-action, and avoid potential + "Argument list too long" for force_unmount=safe +- AWS agents: use awscli2 + + Resolves: RHEL-58038 + Resolves: RHEL-59576 + Resolves: RHEL-46233 + * Thu Aug 29 2024 Oyvind Albrigtsen - 4.10.0-64 - IPsrcaddr: add IPv6 support