- pgsqlms: fix validate warnings

- nginx: fix validate warnings
- Filesystem: speed up get PIDs

  Resolves: RHEL-102779, RHEL-112443, RHEL-121985
This commit is contained in:
Oyvind Albrigtsen 2025-11-04 13:31:59 +01:00
parent ec0a35b869
commit 4800c63bd0
4 changed files with 397 additions and 1 deletions

View File

@ -0,0 +1,181 @@
From 443841ea27d61a2eedff4a0c4f18bb5771fb8d5e Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Tue, 8 Jul 2025 15:19:09 +0200
Subject: [PATCH] pgsqlms: improvements and fixes
- add support for promotable variables
- don't fail during validate-all action if notify != true (to avoid
error and future failures during `pcs resource create`)
- report NOT_RUNNING during probe-action when no database has been
created or postgresql is not installed
---
script/pgsqlms | 74 +++++++++++++++++++++++++++++++++-----------------
1 file changed, 49 insertions(+), 25 deletions(-)
diff --git a/heartbeat/pgsqlms b/heartbeat/pgsqlms
index 5ddd67a..1abffeb 100755
--- a/heartbeat/pgsqlms
+++ b/heartbeat/pgsqlms
@@ -485,7 +485,7 @@ sub _pg_isready {
# Add 60s to the timeout or use a 24h timeout fallback to make sure
# Pacemaker will give up before us and take decisions
my $timeout = ( _get_action_timeout() || 60*60*24 ) + 60;
- my $rc = _runas( $PGISREADY, '-h', $pghost, '-p', $pgport, '-d', 'postgres', '-t', $timeout );
+ my $rc = _runas( $PGISREADY, '-q', '-h', $pghost, '-p', $pgport, '-d', 'postgres', '-t', $timeout );
# Possible error codes:
# 1: ping rejected (usually when instance is in startup, in crash
@@ -624,14 +624,18 @@ sub _get_controldata {
and defined $controldata{'redo'}
and defined $controldata{'wal_level'};
- ocf_exit_reason( 'Could not read all datas from controldata file for "%s"',
- $datadir );
+ if ( ! ocf_is_probe() ) {
+ ocf_exit_reason( 'Could not read all datas from controldata file for "%s"',
+ $datadir );
- ocf_log( 'debug',
- "_get_controldata: controldata file: %s",
- Data::Dumper->new( [ \%controldata ] )->Terse(1)->Dump, $ans );
+ ocf_log( 'debug',
+ "_get_controldata: controldata file: %s",
+ Data::Dumper->new( [ \%controldata ] )->Terse(1)->Dump, $ans );
- exit $OCF_ERR_ARGS;
+ exit $OCF_ERR_ARGS;
+ }
+
+ return ();
}
# Read major version from datadir/PG_VERSION and return it as numeric version
@@ -642,8 +646,12 @@ sub _get_pg_version {
# check PG_VERSION
if ( ! -s "$datadir/PG_VERSION" ) {
- ocf_exit_reason( 'PG_VERSION does not exist in "%s"', $datadir );
- exit $OCF_ERR_ARGS;
+ if ( ! ocf_is_probe() ) {
+ ocf_exit_reason( 'PG_VERSION does not exist in "%s"', $datadir );
+ exit $OCF_ERR_ARGS;
+ } else {
+ return -1;
+ }
}
unless ( open( $fh, '<', "$datadir/PG_VERSION" ) ) {
@@ -1324,22 +1332,34 @@ sub pgsql_validate_all {
}
# check notify=true
- unless ( defined $ENV{'OCF_RESKEY_CRM_meta_notify'}
- and lc($ENV{'OCF_RESKEY_CRM_meta_notify'}) =~ /^true$|^on$|^yes$|^y$|^1$/ ) {
+ unless ( $__OCF_ACTION eq 'validate-all'
+ or ( defined $ENV{'OCF_RESKEY_CRM_meta_notify'}
+ and lc($ENV{'OCF_RESKEY_CRM_meta_notify'}) =~ /^true$|^on$|^yes$|^y$|^1$/ ) ) {
ocf_exit_reason(
'You must set meta parameter notify=true for your "master" resource'
);
return $OCF_ERR_INSTALLED;
}
- # check master-max=1
+ # check promoted_max=1/master-max=1
unless (
- defined $ENV{'OCF_RESKEY_CRM_meta_master_max'}
- and $ENV{'OCF_RESKEY_CRM_meta_master_max'} eq '1'
+ $__OCF_ACTION eq 'validate-all'
+ or
+ ( defined $ENV{'OCF_RESKEY_CRM_meta_promoted_max'}
+ and $ENV{'OCF_RESKEY_CRM_meta_promoted_max'} eq '1' )
+ or
+ (defined $ENV{'OCF_RESKEY_CRM_meta_master_max'}
+ and $ENV{'OCF_RESKEY_CRM_meta_master_max'} eq '1')
) {
- ocf_exit_reason(
- 'You must set meta parameter master-max=1 for your "master" resource'
- );
+ if ( ocf_version_cmp( $ENV{"OCF_RESKEY_crm_feature_set"}, '3.1.0' ) =~ /^[21]$/ ) {
+ ocf_exit_reason(
+ 'You must set meta parameter promoted_max=1 for your "promotable" resource'
+ );
+ } else {
+ ocf_exit_reason(
+ 'You must set meta parameter master-max=1 for your "master" resource'
+ );
+ }
return $OCF_ERR_INSTALLED;
}
@@ -1366,14 +1386,14 @@ sub pgsql_validate_all {
}
$guc = qx{ $POSTGRES -C primary_conninfo -D "$pgdata" $start_opts};
- unless ($guc =~ /\bapplication_name='?$nodename'?\b/) {
+ unless ($guc =~ /\bapplication_name='?$nodename'?\b/ or $__OCF_ACTION eq 'validate-all') {
ocf_exit_reason(
q{Parameter "primary_conninfo" MUST contain 'application_name=%s'. }.
q{It is currently set to '%s'}, $nodename, $guc );
return $OCF_ERR_ARGS;
}
}
- else {
+ elsif ($PGVERNUM > -1 ) {
my @content;
# check recovery template
@@ -1428,14 +1448,14 @@ sub pgsql_validate_all {
}
# require 9.3 minimum
- if ( $PGVERNUM < $PGVER_93 ) {
+ if ( $PGVERNUM < $PGVER_93 && $PGVERNUM > -1 ) {
ocf_exit_reason( "Require 9.3 and more" );
return $OCF_ERR_INSTALLED;
}
# check binaries
- unless ( -x $PGCTL and -x $PGPSQL and -x $PGCTRLDATA and -x $PGISREADY
- and ( -x $PGWALDUMP or -x "$bindir/pg_xlogdump")
+ unless ( ( -x $PGCTL and -x $PGPSQL and -x $PGCTRLDATA and -x $PGISREADY
+ and ( -x $PGWALDUMP or -x "$bindir/pg_xlogdump") ) or ocf_is_probe()
) {
ocf_exit_reason(
"Missing one or more binary. Check following path: %s, %s, %s, %s, %s or %s",
@@ -1445,7 +1465,7 @@ sub pgsql_validate_all {
# require wal_level >= hot_standby
%cdata = _get_controldata();
- unless ( $cdata{'wal_level'} =~ m{hot_standby|logical|replica} ) {
+ unless ( (defined $cdata{'wal_level'} and $cdata{'wal_level'} =~ m{hot_standby|logical|replica}) or ocf_is_probe() ) {
ocf_exit_reason(
'wal_level must be one of "hot_standby", "logical" or "replica"' );
return $OCF_ERR_ARGS;
@@ -1599,6 +1619,10 @@ sub pgsql_monitor {
return _confirm_role();
}
+ if ( ocf_is_probe() ) {
+ return $OCF_NOT_RUNNING;
+ }
+
if ( $pgisready_rc == 1 ) {
# The attempt was rejected.
# This could happen in several cases:
@@ -2254,13 +2278,13 @@ chdir File::Spec->tmpdir();
# mandatory sanity checks
# check pgdata
-if ( ! -d $pgdata ) {
+if ( ! -d $pgdata and ! ocf_is_probe() ) {
ocf_exit_reason( 'PGDATA "%s" does not exist', $pgdata );
exit $OCF_ERR_ARGS;
}
# check datadir
-if ( ! -d $datadir ) {
+if ( ! -d $datadir and ! ocf_is_probe() ) {
ocf_exit_reason( 'data_directory "%s" does not exist', $datadir );
exit $OCF_ERR_ARGS;
}

View File

@ -0,0 +1,66 @@
From 10d61eb3d8d8adcd0356fd855cbba4589027bfcb Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Tue, 4 Nov 2025 12:58:18 +0100
Subject: [PATCH] nginx: fix ls-redirection, mute non-errors in validate-all,
and set unique intervals for monitor actions
---
heartbeat/nginx | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/heartbeat/nginx b/heartbeat/nginx
index cb1c6ec27a..0f856175de 100755
--- a/heartbeat/nginx
+++ b/heartbeat/nginx
@@ -251,7 +251,7 @@ nginxcat() {
close(cmd);
}
function listfiles(pattern, cmd,f) {
- cmd="ls "pattern" 2>/dev/null";
+ cmd="ls "pattern;
while( ( cmd | getline f ) > 0 ) {
printfile(f);
}
@@ -271,7 +271,7 @@ nginxcat() {
return !system("test -d \""s"\"");
}
{ procline(); }
- ' $1 |
+ ' $1 2> /dev/null |
sed 's/#.*//;s/[[:blank:]]*$//;s/^[[:blank:]]*//' |
grep -v '^$'
}
@@ -800,8 +800,8 @@ Extra options to apply when starting nginx.
<action name="status" timeout="30s" />
<action name="monitor" timeout="30s" depth="0" interval="10s" />
<action name="monitor" timeout="30s" depth="10" interval="30s" />
-<action name="monitor" timeout="45s" depth="20" />
-<action name="monitor" timeout="60s" depth="30" />
+<action name="monitor" timeout="45s" depth="20" interval="60s" />
+<action name="monitor" timeout="60s" depth="30" interval="61s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="5s" />
</actions>
@@ -847,11 +847,11 @@ validate_all_nginx() {
exit $OCF_ERR_CONFIGURED
fi
if
- ocf_run $NGINXD $OPTIONS -t -c $CONFIGFILE
+ ocf_run $NGINXD $OPTIONS -q -t -c $CONFIGFILE
then
: Cool $NGINXD likes $CONFIGFILE
else
- ocf_exit_reason "$NGINXD $OPTIONS -t -c $CONFIGFILE reported a configuration error."
+ ocf_exit_reason "$NGINXD $OPTIONS -q -t -c $CONFIGFILE reported a configuration error."
return $OCF_ERR_CONFIGURED
fi
return $OCF_SUCCESS
@@ -908,7 +908,7 @@ then
if
[ ! -z "$OCF_RESKEY_httpd" ]
then
- ocf_log info "Using $NGINXD as nginx"
+ ocf_log debug "Using $NGINXD as nginx"
fi
fi

View File

@ -0,0 +1,135 @@
From 93729d83fa5bf15f4ec694e08e9777bde858fb41 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 16 Oct 2025 10:58:37 +0200
Subject: [PATCH 1/2] Filesystem: speed up get_pids
With force_umount=safe, we "manually" scan the /proc/ file system.
We look for symlinks pointing into the path we are interested in.
Specifically, we are interested in
/proc/<pid>/{root,exe,cwd}
/proc/<pid>/fd/<fd>
We also look for relevant memory mappings in /proc/<pid>/maps
All these are per process, not per "task" or "thread".
see procfs(5) and pthreads(7).
Still, we currently also scan /proc/<pid>/task/<tid>/
for all the same things.
With a large system with many heavily threaded processes,
this can significantly slow down this scanning,
without gaining new information.
Adding -maxdepth to the find command line avoids this useless work,
potentially reducing the scanning time by orders of magnitude
on systems with many heavily threaded processes.
We could also write a dedicated helper in C to do the very same thing,
with the option to "short circuit" and proceed with the next pid
as soon as the first "match" is found for the currently inspected pid.
That could further reduce the scanning time
by about an additional factor of 10.
---
heartbeat/Filesystem | 25 +++++++++++++++++++++----
1 file changed, 21 insertions(+), 4 deletions(-)
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index 6d3960162..f76339fd6 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -680,14 +680,31 @@ get_pids()
# -path "/proc/[!0-9]*" -prune -o ...
# -path "/proc/[0-9]*" -a ...
# the latter seemed to be significantly faster for this one in my naive test.
+
+ # root, cwd, exe, maps, fd: all per process, not per task ("thread").
+ # -maxdepth to avoid repeatedly scanning the same thing
+ # for all threads of a heavily threaded process.
+ #
+ # Adding -maxdepth reduced scanning from > 16 seconds to < 2 seconds
+ # on a mostly idle system that happened to run a few java processes.
+ #
+ # We can also add a dedicated helper in C to do what is done below,
+ # which would reduce the scanning time by an
+ # additional factor of 10 again.
+ #
+ # Or trust that fuser (above) learned something in the last 15 years
+ # and avoids blocking operations meanwhile?
procs=$(exec 2>/dev/null;
- find /proc -path "/proc/[0-9]*" -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
+ find /proc -mindepth 1 -maxdepth 3 \
+ -path "/proc/[0-9]*" \
+ -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
awk -F/ '{print $3}' | uniq)
- # This finds both /proc/<pid>/maps and /proc/<pid>/task/<tid>/maps;
- # if you don't want the latter, add -maxdepth.
+ # memory mappings are also per process, not per task.
+ # This finds only /proc/<pid>/maps, and not /proc/<pid>/task/<tid>/maps;
+ # if you also want the latter, drop -maxdepth.
mmap_procs=$(exec 2>/dev/null;
- find /proc -path "/proc/[0-9]*/maps" -print |
+ find /proc -mindepth 2 -maxdepth 2 -path "/proc/[0-9]*/maps" -print |
xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq)
printf "${procs}\n${mmap_procs}" | sort -u
fi
From 3d34db0c60a125126361b45ff8303358b6275298 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 16 Oct 2025 11:31:00 +0200
Subject: [PATCH 2/2] Filesystem: further speed up get_pids
If we have /proc/<pid>/map_files/* symlinks,
we don't need to additionally grep /proc/<pid>/maps.
Also don't first collect output of commands into variables
just to pipe them to sort -u later,
just pipe the output of the commands through sort -u directly.
---
heartbeat/Filesystem | 31 +++++++++++++++++++------------
1 file changed, 19 insertions(+), 12 deletions(-)
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index f76339fd6..7021f13da 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -694,19 +694,26 @@ get_pids()
#
# Or trust that fuser (above) learned something in the last 15 years
# and avoids blocking operations meanwhile?
- procs=$(exec 2>/dev/null;
- find /proc -mindepth 1 -maxdepth 3 \
- -path "/proc/[0-9]*" \
- -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
- awk -F/ '{print $3}' | uniq)
-
- # memory mappings are also per process, not per task.
- # This finds only /proc/<pid>/maps, and not /proc/<pid>/task/<tid>/maps;
- # if you also want the latter, drop -maxdepth.
- mmap_procs=$(exec 2>/dev/null;
+ (
+ # If you want to debug this, drop this redirection.
+ # But it produces too much "No such file" noise for kernel
+ # threads or due to races with exiting processes or closing fds.
+ exec 2>/dev/null;
+ find /proc -mindepth 1 -maxdepth 3 \
+ -path "/proc/[0-9]*" \
+ -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
+ awk -F/ '{print $3}' | uniq
+
+ # If we have "map_files/", "find" above already found the
+ # relevant symlinks, and we don't need to grep "maps" below.
+ # Available since kernel 3.3, respectively 4.3.
+ test -d /proc/$$/map_files ||
+ # memory mappings are also per process, not per task.
+ # This finds only /proc/<pid>/maps, and not /proc/<pid>/task/<tid>/maps;
+ # if you also want the latter, drop -maxdepth.
find /proc -mindepth 2 -maxdepth 2 -path "/proc/[0-9]*/maps" -print |
- xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq)
- printf "${procs}\n${mmap_procs}" | sort -u
+ xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq
+ ) | sort -u
fi
}

View File

@ -45,7 +45,7 @@
Name: resource-agents
Summary: Open Source HA Reusable Cluster Resource Scripts
Version: 4.16.0
Release: 38%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
Release: 39%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
License: GPL-2.0-or-later AND LGPL-2.1-or-later
URL: https://github.com/ClusterLabs/resource-agents
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@ -99,6 +99,9 @@ Patch46: RHEL-124881-oracle-improve-monpassword-description.patch
Patch47: RHEL-109486-1-nfsserver-support-non-clustered-kerberized-mounts.patch
Patch48: RHEL-109486-2-nfsserver-fix-error-message.patch
Patch49: RHEL-109013-2-powervs-move-ip-add-iflabel-parameter.patch
Patch50: RHEL-102779-pgsqlms-fix-validate-warnings.patch
Patch51: RHEL-112443-nginx-fix-validate-warnings.patch
Patch52: RHEL-121985-Filesystem-speed-up-get-PIDs.patch
# bundled ha-cloud-support libs
Patch500: ha-cloud-support-aliyun.patch
@ -319,6 +322,9 @@ exit 1
%patch -p1 -P 47
%patch -p1 -P 48
%patch -p1 -P 49
%patch -p1 -P 50
%patch -p1 -P 51
%patch -p1 -P 52
# bundled ha-cloud-support libs
%patch -p1 -P 500
@ -651,6 +657,14 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
%changelog
* Tue Nov 4 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-39
- pgsqlms: fix validate warnings
- nginx: fix validate warnings
- Filesystem: speed up get PIDs
Resolves: RHEL-102779, RHEL-112443, RHEL-121985
* Mon Nov 3 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-38
- powervs-move-ip: new resource agent