- Filesystem: don't sleep during stop-action when there are no
  processes to kill, and only use force argument for network
  filesystems after sending kill_signals
- Filesystem: try umount first during stop-action, and avoid potential
  "Argument list too long" for force_unmount=safe
- AWS agents: use awscli2

Resolves: RHEL-58038
Resolves: RHEL-59576
Resolves: RHEL-46233
This commit is contained in:
parent
6bff52741d
commit
5a84bdea60
@ -0,0 +1,106 @@
|
||||
From d66a52cfb25f5436255ecc65a407c0166a720146 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Tue, 3 Sep 2024 12:55:28 +0200
|
||||
Subject: [PATCH 1/2] Filesystem: dont sleep during stop-action when there are
|
||||
no processes to kill
|
||||
|
||||
Thanks @SatomiOSAWA for the initial code.
|
||||
---
|
||||
heartbeat/Filesystem | 10 ++++++----
|
||||
1 file changed, 6 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index 3eb520e0c..f54969f20 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -685,12 +685,13 @@ signal_processes() {
|
||||
pids=$(get_pids "$dir")
|
||||
if [ -z "$pids" ]; then
|
||||
ocf_log info "No processes on $dir were signalled. force_unmount is set to '$FORCE_UNMOUNT'"
|
||||
- return
|
||||
+ return 1
|
||||
fi
|
||||
for pid in $pids; do
|
||||
ocf_log info "sending signal $sig to: $(ps -f $pid | tail -1)"
|
||||
kill -s $sig $pid
|
||||
done
|
||||
+ return 0
|
||||
}
|
||||
try_umount() {
|
||||
local SUB="$1"
|
||||
@@ -717,12 +718,13 @@ timeout_child() {
|
||||
return $ret
|
||||
}
|
||||
fs_stop_loop() {
|
||||
- local SUB="$1" signals="$2" sig
|
||||
+ local SUB="$1" signals="$2" sig send_signal
|
||||
while true; do
|
||||
+ send_signal=false
|
||||
for sig in $signals; do
|
||||
- signal_processes "$SUB" $sig
|
||||
+ signal_processes "$SUB" $sig && send_signal=true
|
||||
done
|
||||
- sleep $OCF_RESKEY_signal_delay
|
||||
+ $send_signal && sleep $OCF_RESKEY_signal_delay
|
||||
try_umount "$SUB" && return $OCF_SUCCESS
|
||||
done
|
||||
}
|
||||
|
||||
From cb6aaffc260eea0f0fee6fab44393c6cf12b8a83 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 9 Sep 2024 10:58:12 +0200
|
||||
Subject: [PATCH 2/2] Filesystem: only use $umount_force after sending
|
||||
kill_signals
|
||||
|
||||
---
|
||||
heartbeat/Filesystem | 12 ++++++------
|
||||
1 file changed, 6 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index f54969f20..4dd962fd9 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -694,8 +694,8 @@ signal_processes() {
|
||||
return 0
|
||||
}
|
||||
try_umount() {
|
||||
- local SUB="$1"
|
||||
- $UMOUNT $umount_force "$SUB"
|
||||
+ local force_arg="$1" SUB="$2"
|
||||
+ $UMOUNT $force_arg "$SUB"
|
||||
list_mounts | grep "${TAB}${SUB}${TAB}" >/dev/null 2>&1 || {
|
||||
ocf_log info "unmounted $SUB successfully"
|
||||
return $OCF_SUCCESS
|
||||
@@ -718,14 +718,14 @@ timeout_child() {
|
||||
return $ret
|
||||
}
|
||||
fs_stop_loop() {
|
||||
- local SUB="$1" signals="$2" sig send_signal
|
||||
+ local force_arg="$1" SUB="$2" signals="$3" sig send_signal
|
||||
while true; do
|
||||
send_signal=false
|
||||
for sig in $signals; do
|
||||
signal_processes "$SUB" $sig && send_signal=true
|
||||
done
|
||||
$send_signal && sleep $OCF_RESKEY_signal_delay
|
||||
- try_umount "$SUB" && return $OCF_SUCCESS
|
||||
+ try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS
|
||||
done
|
||||
}
|
||||
fs_stop() {
|
||||
@@ -733,13 +733,13 @@ fs_stop() {
|
||||
grace_time=$((timeout/2))
|
||||
|
||||
# try gracefully terminating processes for up to half of the configured timeout
|
||||
- fs_stop_loop "$SUB" "$OCF_RESKEY_term_signals" &
|
||||
+ fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" &
|
||||
timeout_child $! $grace_time
|
||||
ret=$?
|
||||
[ $ret -eq $OCF_SUCCESS ] && return $ret
|
||||
|
||||
# try killing them for the rest of the timeout
|
||||
- fs_stop_loop "$SUB" "$OCF_RESKEY_kill_signals" &
|
||||
+ fs_stop_loop "$umount_force" "$SUB" "$OCF_RESKEY_kill_signals" &
|
||||
timeout_child $! $grace_time
|
||||
ret=$?
|
||||
[ $ret -eq $OCF_SUCCESS ] && return $ret
|
@ -0,0 +1,100 @@
|
||||
From f02afd0fadb581ca0fc9798beaf28044cf211200 Mon Sep 17 00:00:00 2001
|
||||
From: Lars Ellenberg <lars.ellenberg@linbit.com>
|
||||
Date: Wed, 18 Sep 2024 11:53:52 +0200
|
||||
Subject: [PATCH 1/2] Filesystem: on stop, try umount directly, before scanning
|
||||
for users
|
||||
|
||||
48ed6e6d (Filesystem: improve stop-action and allow setting term/kill signals and signal_delay for large filesystems, 2023-07-04)
|
||||
changed the logic from
|
||||
"try umount; if that fails, find and kill users; repeat" to
|
||||
"try to find and kill users; then try umount; repeat"
|
||||
|
||||
But even just walking /proc may take "a long time" on busy systems,
|
||||
and may still turn up with "no users found".
|
||||
|
||||
It will take even longer for "force_unmount=safe"
|
||||
(observed 8 to 10 seconds just for get_pids() with "safe" to return nothing)
|
||||
than for "force_unmount=yes" (still ~ 2 to 3 seconds),
|
||||
but it will take "a long time" in any case.
|
||||
(BTW, that may be longer than the hardcoded default of 6 seconds for "fast_stop",
|
||||
which is also the default on many systems now)
|
||||
|
||||
If the dependencies are properly configured,
|
||||
there should be no users left,
|
||||
and the umount should just work.
|
||||
|
||||
Revert back to "try umount first", and only then try to find "rogue" users.
|
||||
---
|
||||
heartbeat/Filesystem | 5 +++++
|
||||
1 file changed, 5 insertions(+)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index 4dd962fd9..99bddaf62 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -732,6 +732,11 @@ fs_stop() {
|
||||
local SUB="$1" timeout=$2 grace_time ret
|
||||
grace_time=$((timeout/2))
|
||||
|
||||
+ # Just walking /proc may take "a long time", even if we don't find any users of this FS.
|
||||
+ # If dependencies are properly configured, umount should just work.
|
||||
+ # Only if that fails, try to find and kill processes that still use it.
|
||||
+ try_umount "" "$SUB" && return $OCF_SUCCESS
|
||||
+
|
||||
# try gracefully terminating processes for up to half of the configured timeout
|
||||
fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" &
|
||||
timeout_child $! $grace_time
|
||||
|
||||
From b42d698f12aaeb871f4cc6a3c0327a27862b4376 Mon Sep 17 00:00:00 2001
|
||||
From: Lars Ellenberg <lars.ellenberg@linbit.com>
|
||||
Date: Wed, 18 Sep 2024 13:42:38 +0200
|
||||
Subject: [PATCH 2/2] Filesystem: stop/get_pids to be signaled
|
||||
|
||||
The "safe" way to get process ids that may be using a particular filesystem
|
||||
currently uses shell globs ("find /proc/[0-9]*").
|
||||
With a million processes (and/or a less capable shell),
|
||||
that may result in "Argument list too long".
|
||||
|
||||
Replace with find /proc -path "/proc/[0-9]*" instead.
|
||||
While at it, also fix the non-posix -or to be -o,
|
||||
and add explicit grouping parentheses \( \) and explicit -print.
|
||||
|
||||
Add a comment to not include "interesting" characters in mount point names.
|
||||
---
|
||||
heartbeat/Filesystem | 23 ++++++++++++++++++++---
|
||||
1 file changed, 20 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index 99bddaf62..3405e2c26 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -669,9 +669,26 @@ get_pids()
|
||||
$FUSER -Mm $dir 2>/dev/null
|
||||
fi
|
||||
elif [ "$FORCE_UNMOUNT" = "safe" ]; then
|
||||
- procs=$(find /proc/[0-9]*/ -type l -lname "${dir}/*" -or -lname "${dir}" 2>/dev/null | awk -F/ '{print $3}')
|
||||
- mmap_procs=$(grep " ${dir}/" /proc/[0-9]*/maps | awk -F/ '{print $3}')
|
||||
- printf "${procs}\n${mmap_procs}" | sort | uniq
|
||||
+ # Yes, in theory, ${dir} could contain "interesting" characters
|
||||
+ # and would need to be quoted for glob (find) and regex (grep).
|
||||
+ # Don't do that, then.
|
||||
+
|
||||
+ # Avoid /proc/[0-9]*, it may cause "Argument list too long".
|
||||
+ # There are several ways to filter for /proc/<pid>
|
||||
+ # -mindepth 1 -not -path "/proc/[0-9]*" -prune -o ...
|
||||
+ # -path "/proc/[!0-9]*" -prune -o ...
|
||||
+ # -path "/proc/[0-9]*" -a ...
|
||||
+ # the latter seemed to be significantly faster for this one in my naive test.
|
||||
+ procs=$(exec 2>/dev/null;
|
||||
+ find /proc -path "/proc/[0-9]*" -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
|
||||
+ awk -F/ '{print $3}' | uniq)
|
||||
+
|
||||
+ # This finds both /proc/<pid>/maps and /proc/<pid>/task/<tid>/maps;
|
||||
+ # if you don't want the latter, add -maxdepth.
|
||||
+ mmap_procs=$(exec 2>/dev/null;
|
||||
+ find /proc -path "/proc/[0-9]*/maps" -print |
|
||||
+ xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq)
|
||||
+ printf "${procs}\n${mmap_procs}" | sort -u
|
||||
fi
|
||||
}
|
||||
|
@ -1,49 +0,0 @@
|
||||
diff --color -uNr a/heartbeat/awseip b/heartbeat/awseip
|
||||
--- a/heartbeat/awseip 2020-12-03 14:31:17.000000000 +0100
|
||||
+++ b/heartbeat/awseip 2021-02-15 16:47:36.624610378 +0100
|
||||
@@ -43,7 +43,7 @@
|
||||
#
|
||||
# Defaults
|
||||
#
|
||||
-OCF_RESKEY_awscli_default="/usr/bin/aws"
|
||||
+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
|
||||
OCF_RESKEY_auth_type_default="key"
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_region_default=""
|
||||
OCF_RESKEY_api_delay_default="3"
|
||||
diff --color -uNr a/heartbeat/awsvip b/heartbeat/awsvip
|
||||
--- a/heartbeat/awsvip 2020-12-03 14:31:17.000000000 +0100
|
||||
+++ b/heartbeat/awsvip 2021-02-15 16:47:48.960632484 +0100
|
||||
@@ -42,7 +42,7 @@
|
||||
#
|
||||
# Defaults
|
||||
#
|
||||
-OCF_RESKEY_awscli_default="/usr/bin/aws"
|
||||
+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
|
||||
OCF_RESKEY_auth_type_default="key"
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_region_default=""
|
||||
diff --color -uNr a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip
|
||||
--- a/heartbeat/aws-vpc-move-ip 2020-12-03 14:31:17.000000000 +0100
|
||||
+++ b/heartbeat/aws-vpc-move-ip 2021-02-15 16:47:55.484644118 +0100
|
||||
@@ -35,7 +35,7 @@
|
||||
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
||||
|
||||
# Defaults
|
||||
-OCF_RESKEY_awscli_default="/usr/bin/aws"
|
||||
+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
|
||||
OCF_RESKEY_auth_type_default="key"
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_region_default=""
|
||||
diff --color -uNr a/heartbeat/aws-vpc-route53.in b/heartbeat/aws-vpc-route53.in
|
||||
--- a/heartbeat/aws-vpc-route53.in 2020-12-03 14:31:17.000000000 +0100
|
||||
+++ b/heartbeat/aws-vpc-route53.in 2021-02-15 16:47:59.808651828 +0100
|
||||
@@ -45,7 +45,7 @@
|
||||
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
||||
|
||||
# Defaults
|
||||
-OCF_RESKEY_awscli_default="/usr/bin/aws"
|
||||
+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
|
||||
OCF_RESKEY_auth_type_default="key"
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_region_default=""
|
@ -45,7 +45,7 @@
|
||||
Name: resource-agents
|
||||
Summary: Open Source HA Reusable Cluster Resource Scripts
|
||||
Version: 4.10.0
|
||||
Release: 64%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
Release: 65%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
License: GPLv2+ and LGPLv2+
|
||||
URL: https://github.com/ClusterLabs/resource-agents
|
||||
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
|
||||
@ -134,11 +134,12 @@ Patch81: RHEL-40393-Filesystem-2-update-bsd-logic.patch
|
||||
Patch82: RHEL-32829-db2-fix-OCF_SUCESS-typo.patch
|
||||
Patch83: RHEL-43579-galera-mysql-redis-remove-Unpromoted-monitor-action.patch
|
||||
Patch84: RHEL-22715-LVM-activate-fix-false-positive.patch
|
||||
Patch85: RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch
|
||||
Patch86: RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
Patch500: ha-cloud-support-aws.patch
|
||||
Patch501: ha-cloud-support-aliyun.patch
|
||||
Patch502: ha-cloud-support-gcloud.patch
|
||||
Patch500: ha-cloud-support-aliyun.patch
|
||||
Patch501: ha-cloud-support-gcloud.patch
|
||||
|
||||
Obsoletes: heartbeat-resources <= %{version}
|
||||
Provides: heartbeat-resources = %{version}
|
||||
@ -342,11 +343,12 @@ exit 1
|
||||
%patch -p1 -P 82
|
||||
%patch -p1 -P 83
|
||||
%patch -p1 -P 84
|
||||
%patch -p1 -P 85
|
||||
%patch -p1 -P 86
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
%patch -p1 -P 500
|
||||
%patch -p1 -P 501
|
||||
%patch -p1 -P 502
|
||||
|
||||
chmod 755 heartbeat/nova-compute-wait
|
||||
chmod 755 heartbeat/NovaEvacuate
|
||||
@ -663,6 +665,18 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
|
||||
|
||||
%changelog
|
||||
* Wed Sep 25 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-65
|
||||
- Filesystem: dont sleep during stop-action when there are no
|
||||
processes to kill, and only use force argument for network
|
||||
filesystems after sending kill_signals
|
||||
- Filesystem: try umount first during stop-action, and avoid potential
|
||||
"Argument list too long" for force_unmount=safe
|
||||
- AWS agents: use awscli2
|
||||
|
||||
Resolves: RHEL-58038
|
||||
Resolves: RHEL-59576
|
||||
Resolves: RHEL-46233
|
||||
|
||||
* Thu Aug 29 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-64
|
||||
- IPsrcaddr: add IPv6 support
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user