From 5a84bdea6067c4f1ae9e935e8b5f02ddfaa55555 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 25 Sep 2024 16:21:18 +0200
Subject: [PATCH] - Filesystem: don't sleep during stop-action when there are no
 processes to kill, and only use force argument for network filesystems
 after sending kill_signals - Filesystem: try umount first during stop-action,
 and avoid potential "Argument list too long" for force_unmount=safe - AWS
 agents: use awscli2

  Resolves: RHEL-58038
  Resolves: RHEL-59576
  Resolves: RHEL-46233
---
 ...es-only-send-force-net-fs-after-kill.patch | 106 ++++++++++++++++++
 ...-first-avoid-arguments-list-too-long.patch | 100 +++++++++++++++++
 ha-cloud-support-aws.patch                    |  49 --------
 resource-agents.spec                          |  24 +++-
 4 files changed, 225 insertions(+), 54 deletions(-)
 create mode 100644 RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch
 create mode 100644 RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch
 delete mode 100644 ha-cloud-support-aws.patch

diff --git a/RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch b/RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch
new file mode 100644
index 0000000..1ae87c4
--- /dev/null
+++ b/RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch
@@ -0,0 +1,106 @@
+From d66a52cfb25f5436255ecc65a407c0166a720146 Mon Sep 17 00:00:00 2001
+From: Oyvind Albrigtsen <oalbrigt@redhat.com>
+Date: Tue, 3 Sep 2024 12:55:28 +0200
+Subject: [PATCH 1/2] Filesystem: dont sleep during stop-action when there are
+ no processes to kill
+
+Thanks @SatomiOSAWA for the initial code.
+---
+ heartbeat/Filesystem | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
+index 3eb520e0c..f54969f20 100755
+--- a/heartbeat/Filesystem
++++ b/heartbeat/Filesystem
+@@ -685,12 +685,13 @@ signal_processes() {
+ 	pids=$(get_pids "$dir")
+ 	if [ -z "$pids" ]; then
+ 		ocf_log info "No processes on $dir were signalled. force_unmount is set to '$FORCE_UNMOUNT'"
+-		return
++		return 1
+ 	fi
+ 	for pid in $pids; do
+ 		ocf_log info "sending signal $sig to: $(ps -f $pid | tail -1)"
+ 		kill -s $sig $pid
+ 	done
++	return 0
+ }
+ try_umount() {
+ 	local SUB="$1"
+@@ -717,12 +718,13 @@ timeout_child() {
+ 	return $ret
+ }
+ fs_stop_loop() {
+-	local SUB="$1" signals="$2" sig
++	local SUB="$1" signals="$2" sig send_signal
+ 	while true; do
++		send_signal=false
+ 		for sig in $signals; do
+-			signal_processes "$SUB" $sig
++			signal_processes "$SUB" $sig && send_signal=true
+ 		done
+-		sleep $OCF_RESKEY_signal_delay
++		$send_signal && sleep $OCF_RESKEY_signal_delay
+ 		try_umount "$SUB" && return $OCF_SUCCESS
+ 	done
+ }
+
+From cb6aaffc260eea0f0fee6fab44393c6cf12b8a83 Mon Sep 17 00:00:00 2001
+From: Oyvind Albrigtsen <oalbrigt@redhat.com>
+Date: Mon, 9 Sep 2024 10:58:12 +0200
+Subject: [PATCH 2/2] Filesystem: only use $umount_force after sending
+ kill_signals
+
+---
+ heartbeat/Filesystem | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
+index f54969f20..4dd962fd9 100755
+--- a/heartbeat/Filesystem
++++ b/heartbeat/Filesystem
+@@ -694,8 +694,8 @@ signal_processes() {
+ 	return 0
+ }
+ try_umount() {
+-	local SUB="$1"
+-	$UMOUNT $umount_force "$SUB"
++	local force_arg="$1" SUB="$2"
++	$UMOUNT $force_arg "$SUB"
+ 	list_mounts | grep "${TAB}${SUB}${TAB}" >/dev/null 2>&1 || {
+ 		ocf_log info "unmounted $SUB successfully"
+ 		return $OCF_SUCCESS
+@@ -718,14 +718,14 @@ timeout_child() {
+ 	return $ret
+ }
+ fs_stop_loop() {
+-	local SUB="$1" signals="$2" sig send_signal
++	local force_arg="$1" SUB="$2" signals="$3" sig send_signal
+ 	while true; do
+ 		send_signal=false
+ 		for sig in $signals; do
+ 			signal_processes "$SUB" $sig && send_signal=true
+ 		done
+ 		$send_signal && sleep $OCF_RESKEY_signal_delay
+-		try_umount "$SUB" && return $OCF_SUCCESS
++		try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS
+ 	done
+ }
+ fs_stop() {
+@@ -733,13 +733,13 @@ fs_stop() {
+ 	grace_time=$((timeout/2))
+ 
+ 	# try gracefully terminating processes for up to half of the configured timeout
+-	fs_stop_loop "$SUB" "$OCF_RESKEY_term_signals" &
++	fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" &
+ 	timeout_child $! $grace_time
+ 	ret=$?
+ 	[ $ret -eq $OCF_SUCCESS ] && return $ret
+ 
+ 	# try killing them for the rest of the timeout
+-	fs_stop_loop "$SUB" "$OCF_RESKEY_kill_signals" &
++	fs_stop_loop "$umount_force" "$SUB" "$OCF_RESKEY_kill_signals" &
+ 	timeout_child $! $grace_time
+ 	ret=$?
+ 	[ $ret -eq $OCF_SUCCESS ] && return $ret
diff --git a/RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch b/RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch
new file mode 100644
index 0000000..561e29a
--- /dev/null
+++ b/RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch
@@ -0,0 +1,100 @@
+From f02afd0fadb581ca0fc9798beaf28044cf211200 Mon Sep 17 00:00:00 2001
+From: Lars Ellenberg <lars.ellenberg@linbit.com>
+Date: Wed, 18 Sep 2024 11:53:52 +0200
+Subject: [PATCH 1/2] Filesystem: on stop, try umount directly, before scanning
+ for users
+
+48ed6e6d (Filesystem: improve stop-action and allow setting term/kill signals and signal_delay for large filesystems, 2023-07-04)
+changed the logic from
+"try umount; if that fails, find and kill users; repeat" to
+"try to find and kill users; then try umount; repeat"
+
+But even just walking /proc may take "a long time" on busy systems,
+and may still turn up with "no users found".
+
+It will take even longer for "force_unmount=safe"
+(observed 8 to 10 seconds just for get_pids() with "safe" to return nothing)
+than for "force_unmount=yes" (still ~ 2 to 3 seconds),
+but it will take "a long time" in any case.
+(BTW, that may be longer than the hardcoded default of 6 seconds for "fast_stop",
+which is also the default on many systems now)
+
+If the dependencies are properly configured,
+there should be no users left,
+and the umount should just work.
+
+Revert back to "try umount first", and only then try to find "rogue" users.
+---
+ heartbeat/Filesystem | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
+index 4dd962fd9..99bddaf62 100755
+--- a/heartbeat/Filesystem
++++ b/heartbeat/Filesystem
+@@ -732,6 +732,11 @@ fs_stop() {
+ 	local SUB="$1" timeout=$2 grace_time ret
+ 	grace_time=$((timeout/2))
+ 
++	# Just walking /proc may take "a long time", even if we don't find any users of this FS.
++	# If dependencies are properly configured, umount should just work.
++	# Only if that fails, try to find and kill processes that still use it.
++	try_umount "" "$SUB" && return $OCF_SUCCESS
++
+ 	# try gracefully terminating processes for up to half of the configured timeout
+ 	fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" &
+ 	timeout_child $! $grace_time
+
+From b42d698f12aaeb871f4cc6a3c0327a27862b4376 Mon Sep 17 00:00:00 2001
+From: Lars Ellenberg <lars.ellenberg@linbit.com>
+Date: Wed, 18 Sep 2024 13:42:38 +0200
+Subject: [PATCH 2/2] Filesystem: stop/get_pids to be signaled
+
+The "safe" way to get process ids that may be using a particular filesystem
+currently uses shell globs ("find /proc/[0-9]*").
+With a million processes (and/or a less capable shell),
+that may result in "Argument list too long".
+
+Replace with find /proc -path "/proc/[0-9]*" instead.
+While at it, also fix the non-posix -or to be -o,
+and add explicit grouping parentheses \( \) and explicit -print.
+
+Add a comment to not include "interesting" characters in mount point names.
+---
+ heartbeat/Filesystem | 23 ++++++++++++++++++++---
+ 1 file changed, 20 insertions(+), 3 deletions(-)
+
+diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
+index 99bddaf62..3405e2c26 100755
+--- a/heartbeat/Filesystem
++++ b/heartbeat/Filesystem
+@@ -669,9 +669,26 @@ get_pids()
+ 			$FUSER -Mm $dir 2>/dev/null
+ 		fi
+ 	elif [ "$FORCE_UNMOUNT" = "safe" ]; then
+-		procs=$(find /proc/[0-9]*/ -type l -lname "${dir}/*" -or -lname "${dir}" 2>/dev/null | awk -F/ '{print $3}')
+-		mmap_procs=$(grep " ${dir}/" /proc/[0-9]*/maps | awk -F/ '{print $3}')
+-		printf "${procs}\n${mmap_procs}" | sort | uniq
++		# Yes, in theory, ${dir} could contain "interesting" characters
++		# and would need to be quoted for glob (find) and regex (grep).
++		# Don't do that, then.
++
++		# Avoid /proc/[0-9]*, it may cause "Argument list too long".
++		# There are several ways to filter for /proc/<pid>
++		# -mindepth 1 -not -path "/proc/[0-9]*" -prune -o ...
++		# -path "/proc/[!0-9]*" -prune -o ...
++		# -path "/proc/[0-9]*" -a ...
++		# the latter seemed to be significantly faster for this one in my naive test.
++		procs=$(exec 2>/dev/null;
++			find /proc -path "/proc/[0-9]*" -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
++			awk -F/ '{print $3}' | uniq)
++
++		# This finds both /proc/<pid>/maps and /proc/<pid>/task/<tid>/maps;
++		# if you don't want the latter, add -maxdepth.
++		mmap_procs=$(exec 2>/dev/null;
++			find /proc -path "/proc/[0-9]*/maps" -print |
++			xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq)
++		printf "${procs}\n${mmap_procs}" | sort -u
+ 	fi
+ }
+ 
diff --git a/ha-cloud-support-aws.patch b/ha-cloud-support-aws.patch
deleted file mode 100644
index b05e936..0000000
--- a/ha-cloud-support-aws.patch
+++ /dev/null
@@ -1,49 +0,0 @@
-diff --color -uNr a/heartbeat/awseip b/heartbeat/awseip
---- a/heartbeat/awseip	2020-12-03 14:31:17.000000000 +0100
-+++ b/heartbeat/awseip	2021-02-15 16:47:36.624610378 +0100
-@@ -43,7 +43,7 @@
- #
- # Defaults
- #
--OCF_RESKEY_awscli_default="/usr/bin/aws"
-+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
- OCF_RESKEY_auth_type_default="key"
- OCF_RESKEY_profile_default="default"
- OCF_RESKEY_region_default=""
- OCF_RESKEY_api_delay_default="3"
-diff --color -uNr a/heartbeat/awsvip b/heartbeat/awsvip
---- a/heartbeat/awsvip	2020-12-03 14:31:17.000000000 +0100
-+++ b/heartbeat/awsvip	2021-02-15 16:47:48.960632484 +0100
-@@ -42,7 +42,7 @@
- #
- # Defaults
- #
--OCF_RESKEY_awscli_default="/usr/bin/aws"
-+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
- OCF_RESKEY_auth_type_default="key"
- OCF_RESKEY_profile_default="default"
- OCF_RESKEY_region_default=""
-diff --color -uNr a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip
---- a/heartbeat/aws-vpc-move-ip	2020-12-03 14:31:17.000000000 +0100
-+++ b/heartbeat/aws-vpc-move-ip	2021-02-15 16:47:55.484644118 +0100
-@@ -35,7 +35,7 @@
- . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
- 
- # Defaults
--OCF_RESKEY_awscli_default="/usr/bin/aws"
-+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
- OCF_RESKEY_auth_type_default="key"
- OCF_RESKEY_profile_default="default"
- OCF_RESKEY_region_default=""
-diff --color -uNr a/heartbeat/aws-vpc-route53.in b/heartbeat/aws-vpc-route53.in
---- a/heartbeat/aws-vpc-route53.in	2020-12-03 14:31:17.000000000 +0100
-+++ b/heartbeat/aws-vpc-route53.in	2021-02-15 16:47:59.808651828 +0100
-@@ -45,7 +45,7 @@
- . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
- 
- # Defaults
--OCF_RESKEY_awscli_default="/usr/bin/aws"
-+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
- OCF_RESKEY_auth_type_default="key"
- OCF_RESKEY_profile_default="default"
- OCF_RESKEY_region_default=""
diff --git a/resource-agents.spec b/resource-agents.spec
index 03148ea..2c8a315 100644
--- a/resource-agents.spec
+++ b/resource-agents.spec
@@ -45,7 +45,7 @@
 Name:		resource-agents
 Summary:	Open Source HA Reusable Cluster Resource Scripts
 Version:	4.10.0
-Release:	64%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release:	65%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
 License:	GPLv2+ and LGPLv2+
 URL:		https://github.com/ClusterLabs/resource-agents
 Source0:	%{upstream_prefix}-%{upstream_version}.tar.gz
@@ -134,11 +134,12 @@ Patch81:	RHEL-40393-Filesystem-2-update-bsd-logic.patch
 Patch82:	RHEL-32829-db2-fix-OCF_SUCESS-typo.patch
 Patch83:	RHEL-43579-galera-mysql-redis-remove-Unpromoted-monitor-action.patch
 Patch84:	RHEL-22715-LVM-activate-fix-false-positive.patch
+Patch85:	RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch
+Patch86:	RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch
 
 # bundled ha-cloud-support libs
-Patch500:	ha-cloud-support-aws.patch
-Patch501:	ha-cloud-support-aliyun.patch
-Patch502:	ha-cloud-support-gcloud.patch
+Patch500:	ha-cloud-support-aliyun.patch
+Patch501:	ha-cloud-support-gcloud.patch
 
 Obsoletes:	heartbeat-resources <= %{version}
 Provides:	heartbeat-resources = %{version}
@@ -342,11 +343,12 @@ exit 1
 %patch -p1 -P 82
 %patch -p1 -P 83
 %patch -p1 -P 84
+%patch -p1 -P 85
+%patch -p1 -P 86
 
 # bundled ha-cloud-support libs
 %patch -p1 -P 500
 %patch -p1 -P 501
-%patch -p1 -P 502
 
 chmod 755 heartbeat/nova-compute-wait
 chmod 755 heartbeat/NovaEvacuate
@@ -663,6 +665,18 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
 
 %changelog
+* Wed Sep 25 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-65
+- Filesystem: don't sleep during stop-action when there are no
+  processes to kill, and only use force argument for network
+  filesystems after sending kill_signals
+- Filesystem: try umount first during stop-action, and avoid potential
+  "Argument list too long" for force_unmount=safe
+- AWS agents: use awscli2
+
+  Resolves: RHEL-58038
+  Resolves: RHEL-59576
+  Resolves: RHEL-46233
+
 * Thu Aug 29 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-64
 - IPsrcaddr: add IPv6 support