Add AlmaLinux to RHEL-based distro detection in ocf-distro
This commit is contained in:
commit
224983a203
@ -0,0 +1,49 @@
|
||||
From 6cde49d0000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: AlmaLinux <packager@almalinux.org>
|
||||
Date: Sun, 25 May 2026 00:00:00 +0000
|
||||
Subject: [PATCH] ocf-distro: add AlmaLinux to RHEL-based distro detection
|
||||
|
||||
Partial backport of upstream commit 6cde49d (PR #1756) which adds
|
||||
AlmaLinux to the Red Hat-based distro detection in ocf-distro.
|
||||
|
||||
Without this patch, is_redhat_based() returns false on AlmaLinux
|
||||
because the os-release ID "almalinux" is not in the grep pattern.
|
||||
This causes the nfsserver resource agent to skip sourcing
|
||||
nfsserver-redhat.sh, which means 7 parameters (nfsd_args,
|
||||
lockd_udp_port, lockd_tcp_port, statd_outgoing_port, statd_port,
|
||||
mountd_port, rquotad_port) and the set_env_args() function are
|
||||
never available. The same gate affects Filesystem and IPaddr2 agents.
|
||||
|
||||
Upstream: https://github.com/ClusterLabs/resource-agents/pull/1756
|
||||
Upstream-commit: 6cde49d
|
||||
---
|
||||
heartbeat/ocf-distro | 7 +++++--
|
||||
1 file changed, 5 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/ocf-distro b/heartbeat/ocf-distro
|
||||
index abcdef1..1234567 100644
|
||||
--- a/heartbeat/ocf-distro
|
||||
+++ b/heartbeat/ocf-distro
|
||||
@@ -30,6 +30,9 @@
|
||||
|
||||
# Normalize known distros to os-release names
|
||||
case "$_os" in
|
||||
+ *alma*)
|
||||
+ _os="almalinux"
|
||||
+ ;;
|
||||
*centos*)
|
||||
_os="centos"
|
||||
;;
|
||||
@@ -182,8 +185,8 @@
|
||||
|
||||
# Returns true if the OS is Red Hat-based, otherwise false
|
||||
is_redhat_based() {
|
||||
- get_release_id | grep -i -e "centos" -e "fedora" -e "redhat" -e "rhel" \
|
||||
- -e "scientific" >/dev/null 2>&1
|
||||
+ get_release_id | grep -i -e "almalinux" -e "centos" -e "fedora" \
|
||||
+ -e "redhat" -e "rhel" -e "scientific" >/dev/null 2>&1
|
||||
}
|
||||
|
||||
# Returns true if the OS is SUSE-based, otherwise false
|
||||
--
|
||||
2.43.0
|
||||
85
SOURCES/RHEL-102610-podman-etcd-add-oom-parameter.patch
Normal file
85
SOURCES/RHEL-102610-podman-etcd-add-oom-parameter.patch
Normal file
@ -0,0 +1,85 @@
|
||||
From d08a7f74427ea2cf7d355a0f7f6d8f583e2d0cba Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Thu, 3 Jul 2025 12:22:12 +0200
|
||||
Subject: [PATCH] OCPBUGS-58324: podman-etcd Add OOM score adjustment for etcd
|
||||
containers
|
||||
|
||||
This change introduces a new `oom` parameter to the `podman-etcd` OCF
|
||||
agent. This allows tuning the Out-Of-Memory (OOM) score adjustment for
|
||||
the etcd container.
|
||||
|
||||
The `oom` parameter accepts integer values from -1000 to 1000,
|
||||
defaulting to -997 (system-node-critical equivalent).
|
||||
|
||||
see https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#node-out-of-memory-behavior
|
||||
|
||||
Key changes:
|
||||
- Added `OCF_RESKEY_oom` parameter to agent definition (`content type="integer"`).
|
||||
- Integrated `--oom-score-adj` option into `podman_start()`.
|
||||
- Implemented input validation for `oom` in `podman_validate()`,
|
||||
ensuring values are within the [-1000:1000] range.
|
||||
---
|
||||
heartbeat/podman-etcd | 22 +++++++++++++++++++++-
|
||||
1 file changed, 21 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 6762112ec..884b7c579 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -45,6 +45,7 @@ OCF_RESKEY_nic_default="br-ex"
|
||||
OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
|
||||
OCF_RESKEY_allow_pull_default="1"
|
||||
OCF_RESKEY_reuse_default="0"
|
||||
+OCF_RESKEY_oom_default="-997"
|
||||
|
||||
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
|
||||
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
|
||||
@@ -53,6 +54,7 @@ OCF_RESKEY_reuse_default="0"
|
||||
: ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}}
|
||||
: ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}}
|
||||
: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}}
|
||||
+: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
|
||||
|
||||
#######################################################################
|
||||
|
||||
@@ -230,6 +232,16 @@ to stop the container before pacemaker.
|
||||
<shortdesc lang="en">drop-in dependency</shortdesc>
|
||||
<content type="boolean"/>
|
||||
</parameter>
|
||||
+
|
||||
+<parameter name="oom" required="0" unique="0">
|
||||
+<longdesc lang="en">
|
||||
+Tune the host's Out-Of-Memory (OOM) preferences for containers (accepts values from -1000 to 1000).
|
||||
+Default to same OOM score as system-node-critical
|
||||
+https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#node-out-of-memory-behavior
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">OOM for container</shortdesc>
|
||||
+<content type="integer" default="${OCF_RESKEY_oom_default}"/>
|
||||
+</parameter>
|
||||
</parameters>
|
||||
|
||||
<actions>
|
||||
@@ -1226,7 +1238,10 @@ podman_start()
|
||||
fi
|
||||
|
||||
podman_create_mounts
|
||||
- local run_opts="-d --name=${CONTAINER}"
|
||||
+ local run_opts="--detach --name=${CONTAINER}"
|
||||
+
|
||||
+ run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
|
||||
+
|
||||
# check to see if the container has already started
|
||||
podman_simple_status
|
||||
if [ $? -eq $OCF_SUCCESS ]; then
|
||||
@@ -1513,6 +1528,11 @@ podman_validate()
|
||||
exit $OCF_ERR_CONFIGURED
|
||||
fi
|
||||
|
||||
+ if [ "$OCF_RESKEY_oom" -lt -1000 ] || [ "$OCF_RESKEY_oom" -gt 1000 ]; then
|
||||
+ ocf_exit_reason "'oom' value ${OCF_RESKEY_oom} is out of range [-1000:1000]"
|
||||
+ exit $OCF_ERR_CONFIGURED
|
||||
+ fi
|
||||
+
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
@ -0,0 +1,24 @@
|
||||
From d5fbb84496501c7da75cad992e027700823edf65 Mon Sep 17 00:00:00 2001
|
||||
From: adamaze <adamaze@gmail.com>
|
||||
Date: Mon, 30 Jun 2025 15:55:50 -0500
|
||||
Subject: [PATCH] Update ocf-shellfuncs.in
|
||||
|
||||
---
|
||||
heartbeat/ocf-shellfuncs.in | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in
|
||||
index cb4d5cacc..526be42b6 100644
|
||||
--- a/heartbeat/ocf-shellfuncs.in
|
||||
+++ b/heartbeat/ocf-shellfuncs.in
|
||||
@@ -708,7 +708,9 @@ curl_retry()
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
args=$(echo "$args" | sed "s/$OLD_TOKEN/$TOKEN/")
|
||||
fi
|
||||
- sleep $sleep
|
||||
+ if [ $try -lt $tries ]; then
|
||||
+ sleep $sleep
|
||||
+ fi
|
||||
done
|
||||
|
||||
if [ $rc -ne 0 ]; then
|
||||
@ -0,0 +1,92 @@
|
||||
From a4fd26a37b20e86e7c188b45d40e31d240f3decf Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Thu, 14 Aug 2025 09:33:17 +0200
|
||||
Subject: [PATCH] nfsserver: add ability to set e.g.
|
||||
"pipefs-directory=/run/nfs/rpc_pipefs" in /etc/nfs.conf to avoid issues with
|
||||
non-clustered Kerberized mounts
|
||||
|
||||
---
|
||||
heartbeat/nfsserver | 28 +++++++++++++++++-----------
|
||||
1 file changed, 17 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver
|
||||
index 5b02924a9..83f4bac51 100755
|
||||
--- a/heartbeat/nfsserver
|
||||
+++ b/heartbeat/nfsserver
|
||||
@@ -264,7 +264,7 @@ set_exec_mode()
|
||||
##
|
||||
# If the user defined an init script, It must exist for us to continue
|
||||
##
|
||||
- if [ -n "$OCF_RESKEY_nfs_init_script" ]; then
|
||||
+ if [ $systemd_running -ne 0 ] && [ -n "$OCF_RESKEY_nfs_init_script" ]; then
|
||||
# check_binary will exit the process if init script does not exist
|
||||
check_binary ${OCF_RESKEY_nfs_init_script}
|
||||
EXEC_MODE=1
|
||||
@@ -274,7 +274,7 @@ set_exec_mode()
|
||||
##
|
||||
# Check to see if the default init script exists, if so we'll use that.
|
||||
##
|
||||
- if which $DEFAULT_INIT_SCRIPT > /dev/null 2>&1; then
|
||||
+ if [ $systemd_running -ne 0 ] && which $DEFAULT_INIT_SCRIPT > /dev/null 2>&1; then
|
||||
OCF_RESKEY_nfs_init_script=$DEFAULT_INIT_SCRIPT
|
||||
EXEC_MODE=1
|
||||
return 0
|
||||
@@ -780,7 +780,7 @@ nfsserver_start ()
|
||||
# the uts namespace is useless in that case.
|
||||
# If systemd is running, mangle the nfs-server.service unit,
|
||||
# independent of the "EXEC_MODE" we detected.
|
||||
- if $systemd_is_running ; then
|
||||
+ if [ $systemd_running -eq 0 ]; then
|
||||
if [ -z "$OCF_RESKEY_nfs_server_scope" ] ; then
|
||||
remove_unshare_uts_dropins
|
||||
else
|
||||
@@ -789,7 +789,9 @@ nfsserver_start ()
|
||||
fi
|
||||
|
||||
if ! `mount | grep -q " on $OCF_RESKEY_rpcpipefs_dir "`; then
|
||||
- mount -t rpc_pipefs sunrpc $OCF_RESKEY_rpcpipefs_dir
|
||||
+ if [ $systemd_running -ne 0 ] || { [ $systemd_running -eq 0 ] && systemctl -q is-enabled var-lib-nfs-rpc_pipefs.mount ;}; then
|
||||
+ mount -t rpc_pipefs sunrpc $OCF_RESKEY_rpcpipefs_dir
|
||||
+ fi
|
||||
fi
|
||||
|
||||
# remove the sm-notify pid so sm-notify will be allowed to run again without requiring a reboot.
|
||||
@@ -1003,11 +1005,15 @@ nfsserver_stop ()
|
||||
fi
|
||||
fi
|
||||
|
||||
- # systemd
|
||||
- case $EXEC_MODE in
|
||||
- [23]) nfs_exec stop rpc-gssd > /dev/null 2>&1
|
||||
- ocf_log info "Stop: rpc-gssd"
|
||||
- esac
|
||||
+
|
||||
+ if mount | grep -q " on $OCF_RESKEY_rpcpipefs_dir "; then
|
||||
+ # systemd
|
||||
+ case $EXEC_MODE in
|
||||
+ [23])
|
||||
+ nfs_exec stop rpc-gssd > /dev/null 2>&1
|
||||
+ ocf_log info "Stop: rpc-gssd"
|
||||
+ esac
|
||||
+ fi
|
||||
|
||||
unbind_tree
|
||||
rc=$?
|
||||
@@ -1017,7 +1023,7 @@ nfsserver_stop ()
|
||||
ocf_log info "NFS server stopped"
|
||||
fi
|
||||
|
||||
- if $systemd_is_running; then
|
||||
+ if [ $systemd_running -eq 0 ]; then
|
||||
remove_unshare_uts_dropins
|
||||
fi
|
||||
|
||||
@@ -1057,7 +1063,7 @@ nfsserver_validate ()
|
||||
}
|
||||
|
||||
nfsserver_validate
|
||||
-systemd_is_running && systemd_is_running=true || systemd_is_running=false
|
||||
+systemd_is_running; systemd_running=$?
|
||||
|
||||
case $__OCF_ACTION in
|
||||
start) nfsserver_start
|
||||
24
SOURCES/RHEL-109485-2-nfsserver-fix-error-message.patch
Normal file
24
SOURCES/RHEL-109485-2-nfsserver-fix-error-message.patch
Normal file
@ -0,0 +1,24 @@
|
||||
From 72620db5b52c943358faaf77ce5a15fb41169fab Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Fri, 31 Oct 2025 11:22:46 +0100
|
||||
Subject: [PATCH] nfsserver: set systemd_running before nfsserver_validate() to
|
||||
avoid error message
|
||||
|
||||
---
|
||||
heartbeat/nfsserver | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver
|
||||
index 83f4bac51..71a711305 100755
|
||||
--- a/heartbeat/nfsserver
|
||||
+++ b/heartbeat/nfsserver
|
||||
@@ -1062,8 +1062,8 @@ nfsserver_validate ()
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
-nfsserver_validate
|
||||
systemd_is_running; systemd_running=$?
|
||||
+nfsserver_validate
|
||||
|
||||
case $__OCF_ACTION in
|
||||
start) nfsserver_start
|
||||
@ -0,0 +1,686 @@
|
||||
From 6e9200dc2ffc89382188794742361985309936b2 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Wed, 23 Jul 2025 09:34:13 +0200
|
||||
Subject: [PATCH] podman-etcd: preserve containers for debugging
|
||||
|
||||
This change modifies the agent to keep stopped containers for log
|
||||
inspection and debugging, with supporting changes to enable this
|
||||
behavior.
|
||||
|
||||
* Conditionally reuse existing containers when configuration unchanged
|
||||
* Move etcd inline configuration flags to external file to allow
|
||||
restarts without container recreation (mainly for the
|
||||
force-new-cluster flag)
|
||||
* Archive the previous container by renaming it to *-previous, and its
|
||||
configuration files into /var/lib/etcd/config-previous.tar.gz archive.
|
||||
The tar.gz archive consists of:
|
||||
* the pod manifest created by CEO, used to generate the Etcd
|
||||
configuration file
|
||||
* the Etcd configuration file
|
||||
* the auth json file
|
||||
Only one copy is maintained to limit disk usage.
|
||||
* The locations of both configuration and backup files are configurable with 2
|
||||
new input arguments.
|
||||
|
||||
Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
---
|
||||
heartbeat/podman-etcd | 438 ++++++++++++++++++++++++++++++++----------
|
||||
1 file changed, 336 insertions(+), 102 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 4969fbaaf..33804414a 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -46,6 +46,8 @@ OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
|
||||
OCF_RESKEY_allow_pull_default="1"
|
||||
OCF_RESKEY_reuse_default="0"
|
||||
OCF_RESKEY_oom_default="-997"
|
||||
+OCF_RESKEY_config_location_default="/var/lib/etcd"
|
||||
+OCF_RESKEY_backup_location_default="/var/lib/etcd"
|
||||
|
||||
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
|
||||
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
|
||||
@@ -55,6 +57,9 @@ OCF_RESKEY_oom_default="-997"
|
||||
: ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}}
|
||||
: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}}
|
||||
: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
|
||||
+: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
|
||||
+: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
|
||||
+
|
||||
|
||||
#######################################################################
|
||||
|
||||
@@ -242,6 +247,23 @@ https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#
|
||||
<shortdesc lang="en">OOM for container</shortdesc>
|
||||
<content type="integer" default="${OCF_RESKEY_oom_default}"/>
|
||||
</parameter>
|
||||
+
|
||||
+<parameter name="config_location" required="0" unique="0">
|
||||
+<longdesc lang="en">
|
||||
+The directory where the resource agent stores its state files, such as the generated etcd configuration and a copy of the pod manifest.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Resource agent state directory</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_config_location_default}"/>
|
||||
+</parameter>
|
||||
+
|
||||
+<parameter name="backup_location" required="0" unique="0">
|
||||
+<longdesc lang="en">
|
||||
+The directory where the resource agent stores its backups.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Resource agent backup directory</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
|
||||
+</parameter>
|
||||
+
|
||||
</parameters>
|
||||
|
||||
<actions>
|
||||
@@ -309,42 +331,52 @@ container_exists()
|
||||
return 1
|
||||
}
|
||||
|
||||
-remove_container()
|
||||
+# archive_current_container archives the current
|
||||
+# podman etcd container and its configuration files.
|
||||
+archive_current_container()
|
||||
{
|
||||
- local rc
|
||||
- local execids
|
||||
+ # don't attempt to archive a container that doesn't exist
|
||||
+ if ! container_exists; then
|
||||
+ return
|
||||
+ fi
|
||||
|
||||
- if ocf_is_true "$OCF_RESKEY_reuse"; then
|
||||
- # never remove the container if we have reuse enabled.
|
||||
- return 0
|
||||
+ # delete any container named "*-previous", or we won't be able to archive the current container.
|
||||
+ if podman inspect "${CONTAINER}-previous" >/dev/null 2>&1; then
|
||||
+ ocf_log info "removing old archived container '$CONTAINER-previous'"
|
||||
+ if ! ocf_run podman rm --volumes --force "$CONTAINER-previous"; then
|
||||
+ ocf_log warn "could not remove old archived container (podman rm failed, error code: $?). Won't be able to archive current container"
|
||||
+ return
|
||||
+ fi
|
||||
fi
|
||||
|
||||
- if ! container_exists; then
|
||||
- # don't attempt to remove a container that doesn't exist
|
||||
- return 0
|
||||
+ ocf_log info "archiving '$CONTAINER' container as '$CONTAINER-previous' for debugging purposes"
|
||||
+ if ! ocf_run podman rename "$CONTAINER" "$CONTAINER-previous"; then
|
||||
+ ocf_log err "could not archive container '$CONTAINER', error code: $?"
|
||||
+ return
|
||||
fi
|
||||
- ocf_log notice "Cleaning up inactive container, ${CONTAINER}."
|
||||
- ocf_run podman rm -v "$CONTAINER"
|
||||
- rc=$?
|
||||
- if [ $rc -ne 0 ]; then
|
||||
- if [ $rc -eq 2 ]; then
|
||||
- if podman inspect --format '{{.State.Status}}' "$CONTAINER" | grep -wq "stopping"; then
|
||||
- ocf_log err "Inactive container ${CONTAINER} is stuck in 'stopping' state. Force-remove it."
|
||||
- ocf_run podman rm -f "$CONTAINER"
|
||||
- rc=$?
|
||||
- fi
|
||||
- fi
|
||||
- # due to a podman bug (rhbz#1841485), sometimes a stopped
|
||||
- # container can still be associated with Exec sessions, in
|
||||
- # which case the "podman rm" has to be forced
|
||||
- execids=$(podman inspect "$CONTAINER" --format '{{len .ExecIDs}}')
|
||||
- if [ "$execids" -ne "0" ]; then
|
||||
- ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-remove it."
|
||||
- ocf_run podman rm -f "$CONTAINER"
|
||||
- rc=$?
|
||||
+
|
||||
+ # archive corresponding etcd configuration files
|
||||
+ local files_to_archive=""
|
||||
+ for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do
|
||||
+ if [ -f "$file" ]; then
|
||||
+ files_to_archive="$files_to_archive $file"
|
||||
+ else
|
||||
+ ocf_log warn "file '$file' is missing and won't be archived"
|
||||
fi
|
||||
+ done
|
||||
+
|
||||
+ if [ -z "$files_to_archive" ]; then
|
||||
+ ocf_log warn "could not find any file to archive."
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ # NOTE: tar will override any existing archive as wanted
|
||||
+ # shellcheck disable=SC2086
|
||||
+ if ! ocf_run tar --create --verbose --gzip --file "$ETCD_BACKUP_FILE" $files_to_archive; then
|
||||
+ ocf_log warn "container archived successfully, but configuration backup failed (error: $?). Container debugging available, but without matching configuration files"
|
||||
+ else
|
||||
+ ocf_log info "container configuration also archived in '$ETCD_BACKUP_FILE'"
|
||||
fi
|
||||
- return $rc
|
||||
}
|
||||
|
||||
# Correctly wraps an ipv6 in [] for url otherwise use return normal ipv4 address.
|
||||
@@ -365,6 +397,7 @@ attribute_node_ip()
|
||||
local attribute="node_ip"
|
||||
local ip_addr name
|
||||
|
||||
+ # TODO: We can retrieve both the local and peer IP addresses from this map, which eliminates the need to use CIB to share them between nodes
|
||||
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
|
||||
name=$(echo "$node" | cut -d: -f1)
|
||||
# ignore other nodes
|
||||
@@ -375,7 +408,7 @@ attribute_node_ip()
|
||||
done
|
||||
|
||||
if [ -z "$ip_addr" ]; then
|
||||
- ocf_log err "ip address was empty when querying (getent ahosts) for hostname: $(hostname -f)"
|
||||
+ ocf_log err "could not get local ip address from node_ip_map: '$OCF_RESKEY_node_ip_map'"
|
||||
return 1
|
||||
fi
|
||||
|
||||
@@ -384,9 +417,9 @@ attribute_node_ip()
|
||||
echo "$ip_addr"
|
||||
;;
|
||||
update)
|
||||
- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
|
||||
+ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$ip_addr"; then
|
||||
rc="$?"
|
||||
- ocf_log err "could not set $attribute to $value, error code: $rc"
|
||||
+ ocf_log err "could not set $attribute to $ip_addr, error code: $rc"
|
||||
return "$rc"
|
||||
fi
|
||||
;;
|
||||
@@ -428,6 +461,48 @@ get_env_from_manifest() {
|
||||
echo "$env_var_value"
|
||||
}
|
||||
|
||||
+# etcd configuration file expects duration to be expressed in nanoseconds
|
||||
+convert_duration_in_nanoseconds() {
|
||||
+ local duration=$1
|
||||
+ local value unit nanoseconds
|
||||
+
|
||||
+ if [ -z "$duration" ]; then
|
||||
+ ocf_log err "convert_duration_in_nanoseconds: no duration provided"
|
||||
+ return 1
|
||||
+ fi
|
||||
+
|
||||
+ if ! echo "$duration" | grep -qE '^[0-9]+[numµ]?s$'; then
|
||||
+ ocf_log err "convert_duration_in_nanoseconds: invalid duration format \"$duration\". Expected format: <number><unit> where unit is one of s, ms, us, µs, ns"
|
||||
+ return 1
|
||||
+ fi
|
||||
+
|
||||
+ # Extract numeric value and unit from duration string
|
||||
+ value=$(echo "$duration" | sed 's/[^0-9]*$//')
|
||||
+ unit=$(echo "$duration" | sed 's/^[0-9]*//')
|
||||
+
|
||||
+ case "$unit" in
|
||||
+ ns)
|
||||
+ nanoseconds=$value
|
||||
+ ;;
|
||||
+ us|µs)
|
||||
+ nanoseconds=$((value * 1000))
|
||||
+ ;;
|
||||
+ ms)
|
||||
+ nanoseconds=$((value * 1000000))
|
||||
+ ;;
|
||||
+ s)
|
||||
+ nanoseconds=$((value * 1000000000))
|
||||
+ ;;
|
||||
+ *)
|
||||
+ # this should not happen as the input is already validated
|
||||
+ ocf_log err "convert_duration_in_nanoseconds: unknown duration unit \"$unit\""
|
||||
+ return 1
|
||||
+ ;;
|
||||
+ esac
|
||||
+
|
||||
+ echo "$nanoseconds"
|
||||
+}
|
||||
+
|
||||
prepare_env() {
|
||||
local name ip ipurl standalone_node
|
||||
|
||||
@@ -457,9 +532,14 @@ prepare_env() {
|
||||
ETCDCTL_API=$(get_env_from_manifest "ETCDCTL_API")
|
||||
ETCD_CIPHER_SUITES=$(get_env_from_manifest "ETCD_CIPHER_SUITES")
|
||||
ETCD_DATA_DIR=$(get_env_from_manifest "ETCD_DATA_DIR")
|
||||
+ if [ ! -d "$ETCD_DATA_DIR" ]; then
|
||||
+ ocf_log err "could not find data-dir at path \"$ETCD_DATA_DIR\""
|
||||
+ return "$OCF_ERR_ARGS"
|
||||
+ else
|
||||
+ ocf_log info "using data-dir: $ETCD_DATA_DIR"
|
||||
+ fi
|
||||
ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT")
|
||||
ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF")
|
||||
- ETCD_EXPERIMENTAL_MAX_LEARNERS=$(get_env_from_manifest "ETCD_EXPERIMENTAL_MAX_LEARNERS")
|
||||
ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
|
||||
ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
|
||||
ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL")
|
||||
@@ -475,6 +555,62 @@ prepare_env() {
|
||||
LISTEN_METRICS_URLS="0.0.0.0"
|
||||
}
|
||||
|
||||
+
|
||||
+generate_etcd_configuration() {
|
||||
+ if is_force_new_cluster; then
|
||||
+ # The embedded newline is required for correct YAML formatting.
|
||||
+ FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true
|
||||
+force-new-cluster-bump-amount: 1000000000"
|
||||
+ else
|
||||
+ FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false"
|
||||
+ fi
|
||||
+
|
||||
+ cat > "$ETCD_CONFIGURATION_FILE" << EOF
|
||||
+logger: zap
|
||||
+log-level: info
|
||||
+snapshot-count: 10000
|
||||
+name: $NODENAME
|
||||
+data-dir: $ETCD_DATA_DIR
|
||||
+$FORCE_NEW_CLUSTER_CONFIG
|
||||
+socket-reuse-address: $ETCD_SOCKET_REUSE_ADDRESS
|
||||
+election-timeout: $ETCD_ELECTION_TIMEOUT
|
||||
+enable-pprof: $ETCD_ENABLE_PPROF
|
||||
+heartbeat-interval: $ETCD_HEARTBEAT_INTERVAL
|
||||
+quota-backend-bytes: $ETCD_QUOTA_BACKEND_BYTES
|
||||
+initial-advertise-peer-urls: "$NODEIPURL:2380"
|
||||
+listen-peer-urls: "$(ip_url ${LISTEN_PEER_URLS}):2380"
|
||||
+listen-client-urls: "$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0"
|
||||
+initial-cluster: $ETCD_INITIAL_CLUSTER
|
||||
+initial-cluster-state: $ETCD_INITIAL_CLUSTER_STATE
|
||||
+client-transport-security:
|
||||
+ cert-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt
|
||||
+ key-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key
|
||||
+ client-cert-auth: true
|
||||
+ trusted-ca-file: $SERVER_CACERT
|
||||
+peer-transport-security:
|
||||
+ cert-file: $ETCD_PEER_CERT
|
||||
+ key-file: $ETCD_PEER_KEY
|
||||
+ client-cert-auth: true
|
||||
+ trusted-ca-file: $SERVER_CACERT
|
||||
+advertise-client-urls: "$NODEIPURL:2379"
|
||||
+listen-metrics-urls: "$(ip_url ${LISTEN_METRICS_URLS}):9978"
|
||||
+metrics: extensive
|
||||
+experimental-initial-corrupt-check: true
|
||||
+experimental-max-learners: 1
|
||||
+experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
|
||||
+experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
|
||||
+EOF
|
||||
+
|
||||
+ {
|
||||
+ if [ -n "$ETCD_CIPHER_SUITES" ]; then
|
||||
+ echo "cipher-suites:"
|
||||
+ echo "$ETCD_CIPHER_SUITES" | tr ',' '\n' | while read -r cipher; do
|
||||
+ echo " - \"$cipher\""
|
||||
+ done
|
||||
+ fi
|
||||
+ } >> "$ETCD_CONFIGURATION_FILE"
|
||||
+}
|
||||
+
|
||||
archive_data_folder()
|
||||
{
|
||||
# TODO: use etcd snapshots
|
||||
@@ -634,7 +770,7 @@ add_member_as_learner()
|
||||
local endpoint_url=$(ip_url $(attribute_node_ip get))
|
||||
local peer_url=$(ip_url $member_ip)
|
||||
|
||||
- ocf_log info "add $member_name ($member_ip) to the member list as learner"
|
||||
+ ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner"
|
||||
out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
@@ -1104,18 +1240,18 @@ compare_revision()
|
||||
peer_revision=$(attribute_node_revision_peer)
|
||||
|
||||
if [ "$revision" = "" ] || [ "$revision" = "null" ] || [ "$peer_revision" = "" ] || [ "$peer_revision" = "null" ]; then
|
||||
- ocf_log err "could not compare revisions: $NODENAME local revision: $revision, peer revision: $peer_revision"
|
||||
+ ocf_log err "could not compare revisions: '$NODENAME' local revision='$revision', peer revision='$peer_revision'"
|
||||
return "$OCF_ERR_GENERIC"
|
||||
fi
|
||||
|
||||
if [ "$revision" -gt "$peer_revision" ]; then
|
||||
- ocf_log info "$NODENAME revision: $revision is newer than peer revision: $peer_revision"
|
||||
+ ocf_log info "$NODENAME revision: '$revision' is newer than peer revision: '$peer_revision'"
|
||||
echo "newer"
|
||||
elif [ "$revision" -eq "$peer_revision" ]; then
|
||||
- ocf_log info "$NODENAME revision: $revision is equal to peer revision: $peer_revision"
|
||||
+ ocf_log info "$NODENAME revision: '$revision' is equal to peer revision: '$peer_revision'"
|
||||
echo "equal"
|
||||
else
|
||||
- ocf_log info "$NODENAME revision: $revision is older than peer revision: $peer_revision"
|
||||
+ ocf_log info "$NODENAME revision: '$revision' is older than peer revision: '$peer_revision'"
|
||||
echo "older"
|
||||
fi
|
||||
return "$OCF_SUCCESS"
|
||||
@@ -1144,6 +1280,100 @@ ensure_pod_manifest_exists()
|
||||
return "$OCF_SUCCESS"
|
||||
}
|
||||
|
||||
+filter_pod_manifest() {
|
||||
+ # Remove pod-version related fields from POD manifest
|
||||
+ local pod_manifest="$1"
|
||||
+ local temporary_file
|
||||
+ local jq_filter='del(.metadata.labels.revision) | .spec.containers[] |= ( .env |= map(select( .name != "ETCD_STATIC_POD_VERSION" ))) | .spec.volumes |= map( select( .name != "resource-dir" ))'
|
||||
+
|
||||
+ if ! temporary_file=$(mktemp); then
|
||||
+ ocf_log err "could not create temporary file for '$pod_manifest', error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ if ! jq "$jq_filter" "$pod_manifest" > "$temporary_file"; then
|
||||
+ ocf_log err "could not remove pod version related data from '$pod_manifest', error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ echo "$temporary_file"
|
||||
+}
|
||||
+
|
||||
+can_reuse_container() {
|
||||
+ # Decide whether to reuse the existing container or create a new one based on etcd pod manifest changes.
|
||||
+ # NOTE: explicitly ignore POD version and POD version related data, as the content might be the same even if the revision number has changed.
|
||||
+ local cp_rc
|
||||
+ local diff_rc
|
||||
+ local filtered_original_pod_manifest
|
||||
+ local filtered_copy_pod_manifest
|
||||
+
|
||||
+
|
||||
+ # If the container does not exist it cannot be reused
|
||||
+ if ! container_exists; then
|
||||
+ OCF_RESKEY_reuse=0
|
||||
+ return "$OCF_SUCCESS"
|
||||
+ fi
|
||||
+
|
||||
+ # If the manifest copy doesn't exist, we need a new container.
|
||||
+ if [ ! -f "$POD_MANIFEST_COPY" ]; then
|
||||
+ ocf_log info "a working copy of $OCF_RESKEY_pod_manifest was not found. A new etcd container will be created."
|
||||
+ OCF_RESKEY_reuse=0
|
||||
+ return "$OCF_SUCCESS"
|
||||
+ fi
|
||||
+
|
||||
+ if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ if ! filtered_copy_pod_manifest=$(filter_pod_manifest "$POD_MANIFEST_COPY"); then
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
+ ocf_log info "comparing $OCF_RESKEY_pod_manifest with local copy $POD_MANIFEST_COPY"
|
||||
+ ocf_run diff -s "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest"
|
||||
+ diff_rc="$?"
|
||||
+ # clean up temporary files
|
||||
+ rm -f "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest"
|
||||
+ case "$diff_rc" in
|
||||
+ 0)
|
||||
+ ocf_log info "Reusing the existing etcd container"
|
||||
+ OCF_RESKEY_reuse=1
|
||||
+ ;;
|
||||
+ 1)
|
||||
+ ocf_log info "Etcd pod manifest changes detected: creating a new etcd container to apply the changes"
|
||||
+ if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then
|
||||
+ cp_rc="$?"
|
||||
+ ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc"
|
||||
+ return "$OCF_ERR_GENERIC"
|
||||
+ fi
|
||||
+ ocf_log info "A working copy of $OCF_RESKEY_pod_manifest was created"
|
||||
+ OCF_RESKEY_reuse=0
|
||||
+ ;;
|
||||
+ *)
|
||||
+ ocf_log err "Could not check if etcd pod manifest has changed, diff rc: $diff_rc"
|
||||
+ return "$OCF_ERR_GENERIC"
|
||||
+ ;;
|
||||
+ esac
|
||||
+
|
||||
+ return "$OCF_SUCCESS"
|
||||
+}
|
||||
+
|
||||
+ensure_pod_manifest_copy_exists() {
|
||||
+ local cp_rc
|
||||
+
|
||||
+ if [ -f "$POD_MANIFEST_COPY" ]; then
|
||||
+ return "$OCF_SUCCESS"
|
||||
+ fi
|
||||
+
|
||||
+ # If the manifest copy doesn't exist, create it and ensure a new container.
|
||||
+ if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then
|
||||
+ cp_rc="$?"
|
||||
+ ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc"
|
||||
+ return "$OCF_ERR_GENERIC"
|
||||
+ fi
|
||||
+
|
||||
+ ocf_log info "a new working copy of $OCF_RESKEY_pod_manifest was created"
|
||||
+
|
||||
+ return "$OCF_SUCCESS"
|
||||
+}
|
||||
+
|
||||
podman_start()
|
||||
{
|
||||
local cid
|
||||
@@ -1173,6 +1403,13 @@ podman_start()
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
+ # check if the container has already started
|
||||
+ podman_simple_status
|
||||
+ if [ $? -eq $OCF_SUCCESS ]; then
|
||||
+ ocf_log info "the '$CONTAINER' has already started. Nothing to do"
|
||||
+ return "$OCF_SUCCESS"
|
||||
+ fi
|
||||
+
|
||||
if ! ensure_pod_manifest_exists; then
|
||||
ocf_exit_reason "could not find etcd pod manifest ($OCF_RESKEY_pod_manifest)"
|
||||
return "$OCF_ERR_GENERIC"
|
||||
@@ -1186,8 +1423,9 @@ podman_start()
|
||||
ocf_log info "static pod was running: start normally"
|
||||
else
|
||||
if is_force_new_cluster; then
|
||||
- ocf_log notice "$NODENAME marked to force-new-cluster"
|
||||
+ ocf_log notice "'$NODENAME' marked to force-new-cluster"
|
||||
else
|
||||
+ ocf_log info "'$NODENAME' is not marked to force-new-cluster"
|
||||
# When the local agent starts, we can infer the cluster state by counting
|
||||
# how many agents are starting or already active:
|
||||
# - 1 active agent: it's the peer (we are just starting)
|
||||
@@ -1195,6 +1433,7 @@ podman_start()
|
||||
# - 0 active agents, 2 starting: both agents are starting simultaneously
|
||||
local active_resources_count
|
||||
active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
|
||||
+ ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
|
||||
case "$active_resources_count" in
|
||||
1)
|
||||
if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
|
||||
@@ -1205,17 +1444,17 @@ podman_start()
|
||||
fi
|
||||
;;
|
||||
0)
|
||||
+ # count how many agents are starting now
|
||||
+ local start_resources_count
|
||||
+ start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
|
||||
+ ocf_log info "found '$start_resources_count' starting etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_start_resource')"
|
||||
+
|
||||
# we need to compare the revisions in any of the following branches
|
||||
# so call the function only once here
|
||||
if ! revision_compare_result=$(compare_revision); then
|
||||
ocf_log err "could not compare revisions, error code: $?"
|
||||
return "$OCF_ERR_GENERIC"
|
||||
fi
|
||||
-
|
||||
- # count how many agents are starting now
|
||||
- local start_resources_count
|
||||
- start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
|
||||
-
|
||||
case "$start_resources_count" in
|
||||
1)
|
||||
ocf_log debug "peer not starting: ensure we can start a new cluster"
|
||||
@@ -1231,6 +1470,7 @@ podman_start()
|
||||
fi
|
||||
;;
|
||||
2)
|
||||
+ # TODO: can we start "normally", regardless of the revisions, if the container-id is the same on both nodes?
|
||||
ocf_log info "peer starting"
|
||||
if [ "$revision_compare_result" = "newer" ]; then
|
||||
set_force_new_cluster
|
||||
@@ -1263,7 +1503,7 @@ podman_start()
|
||||
fi
|
||||
|
||||
podman_create_mounts
|
||||
- local run_opts="--detach --name=${CONTAINER}"
|
||||
+ local run_opts="--detach --name=${CONTAINER} --replace"
|
||||
|
||||
run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
|
||||
|
||||
@@ -1297,61 +1537,59 @@ podman_start()
|
||||
archive_data_folder
|
||||
fi
|
||||
|
||||
- prepare_env
|
||||
+	ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
|
||||
+	can_reuse_container || {
|
||||
+		rc="$?"	# real exit status; with "if ! cmd; then" $? is always 0 and the return below would report success
|
||||
+		ocf_log err "could not determine etcd container reuse strategy, rc: $rc"
|
||||
+		return "$rc"
|
||||
+	}
|
||||
+
|
||||
+ # Archive current container and its configuration before creating
|
||||
+ # new configuration files.
|
||||
+ if ! ocf_is_true "$OCF_RESKEY_reuse"; then
|
||||
+ # Log archive container failures but don't block, as the priority
|
||||
+ # is ensuring the etcd container starts successfully.
|
||||
+ archive_current_container
|
||||
+ fi
|
||||
+
|
||||
+ if ! ensure_pod_manifest_copy_exists; then
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
+	prepare_env || {	# "||" leaves prepare_env's own status in $?; after "if ! cmd; then" it is always 0
|
||||
+		ocf_log err "Could not prepare environment for podman, error code: $?"
|
||||
+		return $OCF_ERR_GENERIC
|
||||
+	}
|
||||
+
|
||||
+	generate_etcd_configuration || {	# same pattern, so the logged error code is the real one
|
||||
+		ocf_log err "Could not generate etcd configuration, error code: $?"
|
||||
+		return $OCF_ERR_GENERIC
|
||||
+	}
|
||||
|
||||
- # add etcd-specific opts
|
||||
run_opts="$run_opts \
|
||||
- --network=host \
|
||||
- -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \
|
||||
- -v /var/lib/etcd:/var/lib/etcd \
|
||||
- --env ALL_ETCD_ENDPOINTS=$ALL_ETCD_ENDPOINTS \
|
||||
- --env ETCD_CIPHER_SUITES=$ETCD_CIPHER_SUITES \
|
||||
- --env ETCD_DATA_DIR=$ETCD_DATA_DIR \
|
||||
- --env ETCD_ELECTION_TIMEOUT=$ETCD_ELECTION_TIMEOUT \
|
||||
- --env ETCD_ENABLE_PPROF=$ETCD_ENABLE_PPROF \
|
||||
- --env ETCD_EXPERIMENTAL_MAX_LEARNERS=$ETCD_EXPERIMENTAL_MAX_LEARNERS \
|
||||
- --env ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION \
|
||||
- --env ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL \
|
||||
- --env ETCD_HEARTBEAT_INTERVAL=$ETCD_HEARTBEAT_INTERVAL \
|
||||
- --env ETCD_INITIAL_CLUSTER=$ETCD_INITIAL_CLUSTER \
|
||||
- --env ETCD_INITIAL_CLUSTER_STATE=$ETCD_INITIAL_CLUSTER_STATE \
|
||||
- --env ETCD_NAME=$NODENAME \
|
||||
- --env ETCD_QUOTA_BACKEND_BYTES=$ETCD_QUOTA_BACKEND_BYTES \
|
||||
- --env ETCD_SOCKET_REUSE_ADDRESS=$ETCD_SOCKET_REUSE_ADDRESS \
|
||||
- --env ETCDCTL_API=$ETCDCTL_API \
|
||||
- --env ETCDCTL_CACERT=$SERVER_CACERT \
|
||||
- --env ETCDCTL_CERT=$ETCD_PEER_CERT \
|
||||
- --env ETCDCTL_KEY=$ETCD_PEER_KEY \
|
||||
- --authfile=$OCF_RESKEY_authfile \
|
||||
- --security-opt label=disable"
|
||||
+ --network=host \
|
||||
+ -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \
|
||||
+ -v /var/lib/etcd:/var/lib/etcd \
|
||||
+ --env ETCDCTL_API=$ETCDCTL_API \
|
||||
+ --env ETCDCTL_CACERT=$SERVER_CACERT \
|
||||
+ --env ETCDCTL_CERT=$ETCD_PEER_CERT \
|
||||
+ --env ETCDCTL_KEY=$ETCD_PEER_KEY \
|
||||
+ --authfile=$OCF_RESKEY_authfile \
|
||||
+ --security-opt label=disable"
|
||||
if [ -n "$OCF_RESKEY_run_opts" ]; then
|
||||
run_opts="$run_opts $OCF_RESKEY_run_opts"
|
||||
fi
|
||||
|
||||
- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --logger=zap \
|
||||
- --log-level=info \
|
||||
- --experimental-initial-corrupt-check=true \
|
||||
- --snapshot-count=10000 \
|
||||
- --initial-advertise-peer-urls=$NODEIPURL:2380 \
|
||||
- --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \
|
||||
- --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \
|
||||
- --trusted-ca-file=$SERVER_CACERT \
|
||||
- --client-cert-auth=true \
|
||||
- --peer-cert-file=$ETCD_PEER_CERT \
|
||||
- --peer-key-file=$ETCD_PEER_KEY \
|
||||
- --peer-trusted-ca-file=$SERVER_CACERT \
|
||||
- --peer-client-cert-auth=true \
|
||||
- --advertise-client-urls=$NODEIPURL:2379 \
|
||||
- --listen-client-urls=$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0 \
|
||||
- --listen-peer-urls=$(ip_url ${LISTEN_PEER_URLS}):2380 \
|
||||
- --metrics=extensive \
|
||||
- --listen-metrics-urls=$(ip_url ${LISTEN_METRICS_URLS}):9978"
|
||||
- if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then
|
||||
- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts"
|
||||
+ if [ -f "$ETCD_CONFIGURATION_FILE" ]; then
|
||||
+ ocf_log info "using etcd configuration file: $ETCD_CONFIGURATION_FILE"
|
||||
+ else
|
||||
+ ocf_log err "could not find $ETCD_CONFIGURATION_FILE"
|
||||
+ return "$OCF_ERR_GENERIC"
|
||||
fi
|
||||
|
||||
- if is_force_new_cluster; then
|
||||
- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --force-new-cluster"
|
||||
+ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --config-file=$ETCD_CONFIGURATION_FILE"
|
||||
+ if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then
|
||||
+ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts"
|
||||
fi
|
||||
|
||||
if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then
|
||||
@@ -1377,9 +1615,7 @@ podman_start()
|
||||
ocf_log info "starting existing container $CONTAINER."
|
||||
ocf_run podman start "$CONTAINER"
|
||||
else
|
||||
- # make sure any previous container matching our container name is cleaned up first.
|
||||
- # we already know at this point it wouldn't be running
|
||||
- remove_container
|
||||
+ ocf_log info "starting new container $CONTAINER."
|
||||
run_new_container "$run_opts" "$OCF_RESKEY_image" "$OCF_RESKEY_run_cmd"
|
||||
if [ $? -eq 125 ]; then
|
||||
return $OCF_ERR_GENERIC
|
||||
@@ -1439,7 +1675,6 @@ podman_stop()
|
||||
local rc
|
||||
podman_simple_status
|
||||
if [ $? -eq $OCF_NOT_RUNNING ]; then
|
||||
- remove_container
|
||||
ocf_log info "could not leave members list: etcd container not running"
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
@@ -1475,7 +1710,7 @@ podman_stop()
|
||||
ocf_run podman kill "$CONTAINER"
|
||||
rc=$?
|
||||
else
|
||||
- ocf_log debug "waiting $timeout second[s] before killing container"
|
||||
+ ocf_log info "waiting $timeout second[s] before killing container"
|
||||
ocf_run podman stop -t="$timeout" "$CONTAINER"
|
||||
rc=$?
|
||||
# on stop, systemd will automatically delete any transient
|
||||
@@ -1496,11 +1731,6 @@ podman_stop()
|
||||
fi
|
||||
fi
|
||||
|
||||
- if ! remove_container; then
|
||||
- ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
|
||||
- return $OCF_ERR_GENERIC
|
||||
- fi
|
||||
-
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
@@ -1532,6 +1762,7 @@ podman_validate()
|
||||
check_binary oc
|
||||
check_binary podman
|
||||
check_binary jq
|
||||
+ check_binary tar
|
||||
|
||||
if [ -z "$OCF_RESKEY_node_ip_map" ]; then
|
||||
ocf_exit_reason "'node_ip_map' option is required"
|
||||
@@ -1589,6 +1820,9 @@ else
|
||||
fi
|
||||
|
||||
CONTAINER=$OCF_RESKEY_name
|
||||
+POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
|
||||
+ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
|
||||
+ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
|
||||
|
||||
# Note: we currently monitor podman containers by with the "podman exec"
|
||||
# command, so make sure that invocation is always valid by enforcing the
|
||||
@ -0,0 +1,193 @@
|
||||
From 11cdff8c886c72c83c26e48e46a8620c06e4c2f0 Mon Sep 17 00:00:00 2001
|
||||
From: E Hila <ehila@redhat.com>
|
||||
Date: Tue, 9 Sep 2025 06:06:12 -0400
|
||||
Subject: [PATCH] OCPBUGS-60977: podman-etcd: wrap ipv6 address in brackets for
|
||||
attribute_node_ip (#2068)
|
||||
|
||||
When trying to determine the node ip address we need to make sure we account for ipv6 and dualstack deployments, and accordingly wrap ipv6 in brackets so it correctly resolves. Since the node ip mapping is provided by the controller, we parse out the IP address of the node from there and use a helper function for building URLs with ports to correctly use brackets for ipv6 ip addresses.
|
||||
|
||||
Signed-off-by: ehila <ehila@redhat.com>
|
||||
---
|
||||
heartbeat/podman-etcd | 77 ++++++++++++++++++++++++++++---------------
|
||||
1 file changed, 51 insertions(+), 26 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 884b7c579..4969fbaaf 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -347,21 +347,41 @@ remove_container()
|
||||
return $rc
|
||||
}
|
||||
|
||||
+# Print an https:// URL prefix for the given IP address; an address containing ":" (IPv6) is wrapped in [].
|
||||
+ip_url() {
|
||||
+	local addr=$1
|
||||
+	# Start from the address as given (IPv4 case); only an address with ":" needs brackets.
|
||||
+	local host="$addr"
|
||||
+
|
||||
+	if echo "$addr" | grep -q ":" ; then
|
||||
+		host="[$addr]"
|
||||
+	fi
|
||||
+	echo "https://$host"
|
||||
+}
|
||||
+
|
||||
attribute_node_ip()
|
||||
{
|
||||
local action="$1"
|
||||
local attribute="node_ip"
|
||||
- local value
|
||||
+ local ip_addr name
|
||||
|
||||
- if ! value=$(ip -brief addr show "$OCF_RESKEY_nic" | awk '{gsub("/.*", "", $3); print $3}'); then
|
||||
- rc=$?
|
||||
- ocf_log err "could not get node ip, error code: $rc"
|
||||
- return "$rc"
|
||||
+ for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
|
||||
+ name=$(echo "$node" | cut -d: -f1)
|
||||
+ # ignore other nodes
|
||||
+ if [ "$name" != "$NODENAME" ]; then
|
||||
+ continue
|
||||
+ fi
|
||||
+ ip_addr=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
|
||||
+ done
|
||||
+
|
||||
+ if [ -z "$ip_addr" ]; then
|
||||
+ ocf_log err "ip address was empty when querying (getent ahosts) for hostname: $(hostname -f)"
|
||||
+ return 1
|
||||
fi
|
||||
|
||||
case "$action" in
|
||||
get)
|
||||
- echo "$value"
|
||||
+ echo "$ip_addr"
|
||||
;;
|
||||
update)
|
||||
if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
|
||||
@@ -409,26 +429,28 @@ get_env_from_manifest() {
|
||||
}
|
||||
|
||||
prepare_env() {
|
||||
- local name ip standalone_node
|
||||
+ local name ip ipurl standalone_node
|
||||
|
||||
NODEIP="$(attribute_node_ip get)"
|
||||
+ NODEIPURL=$(ip_url $NODEIP)
|
||||
|
||||
if is_force_new_cluster; then
|
||||
- ALL_ETCD_ENDPOINTS="https://$NODEIP:2379"
|
||||
+ ALL_ETCD_ENDPOINTS="$NODEIPURL:2379"
|
||||
ETCD_INITIAL_CLUSTER_STATE="new"
|
||||
- ETCD_INITIAL_CLUSTER="$NODENAME=https://$NODEIP:2380"
|
||||
+ ETCD_INITIAL_CLUSTER="$NODENAME=$NODEIPURL:2380"
|
||||
else
|
||||
ETCD_INITIAL_CLUSTER_STATE="existing"
|
||||
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
|
||||
- name=$(echo "$node" | awk -F":" '{print $1}')
|
||||
- ip=$(echo "$node" | awk -F":" '{print $2}')
|
||||
+ name=$(echo "$node" | cut -d: -f1)
|
||||
+ ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
|
||||
+ ipurl="$(ip_url $ip)"
|
||||
if [ -z "$name" ] || [ -z "$ip" ]; then
|
||||
ocf_exit_reason "name or ip missing for 1 or more nodes"
|
||||
exit $OCF_ERR_CONFIGURED
|
||||
fi
|
||||
|
||||
- [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379"
|
||||
- [ -z "$ETCD_INITIAL_CLUSTER" ] && ETCD_INITIAL_CLUSTER="$name=https://$ip:2380" || ETCD_INITIAL_CLUSTER="$ETCD_INITIAL_CLUSTER,$name=https://$ip:2380"
|
||||
+ [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="$ipurl:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,$ipurl:2379"
|
||||
+ [ -z "$ETCD_INITIAL_CLUSTER" ] && ETCD_INITIAL_CLUSTER="$name=$ipurl:2380" || ETCD_INITIAL_CLUSTER="$ETCD_INITIAL_CLUSTER,$name=$ipurl:2380"
|
||||
done
|
||||
fi
|
||||
|
||||
@@ -609,9 +631,11 @@ add_member_as_learner()
|
||||
local rc
|
||||
local member_name=$1
|
||||
local member_ip=$2
|
||||
+ local endpoint_url=$(ip_url $(attribute_node_ip get))
|
||||
+ local peer_url=$(ip_url $member_ip)
|
||||
|
||||
ocf_log info "add $member_name ($member_ip) to the member list as learner"
|
||||
- out=$(podman exec "${CONTAINER}" etcdctl --endpoints="https://$(attribute_node_ip get):2379" member add "$member_name" --peer-urls="https://$member_ip:2380" --learner)
|
||||
+ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
ocf_log err "could not add $member_name as learner, error code: $rc"
|
||||
@@ -806,14 +830,15 @@ get_peer_node_name() {
|
||||
|
||||
get_all_etcd_endpoints() {
|
||||
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
|
||||
- name=$(echo "$node" | awk -F":" '{print $1}')
|
||||
- ip=$(echo "$node" | awk -F":" '{print $2}')
|
||||
+ name=$(echo "$node" | cut -d: -f1)
|
||||
+ ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
|
||||
+ ipurl="$(ip_url $ip)"
|
||||
if [ -z "$name" ] || [ -z "$ip" ]; then
|
||||
ocf_exit_reason "name or ip missing for 1 or more nodes"
|
||||
exit $OCF_ERR_CONFIGURED
|
||||
fi
|
||||
|
||||
- [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379"
|
||||
+ [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="$ipurl:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,$ipurl:2379"
|
||||
done
|
||||
echo "$ALL_ETCD_ENDPOINTS"
|
||||
}
|
||||
@@ -831,7 +856,7 @@ get_member_list_json() {
|
||||
# Get the list of members visible to the current node
|
||||
local this_node_endpoint
|
||||
|
||||
- this_node_endpoint="https://$(attribute_node_ip get):2379"
|
||||
+ this_node_endpoint="$(ip_url $(attribute_node_ip get)):2379"
|
||||
podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json
|
||||
}
|
||||
|
||||
@@ -886,14 +911,14 @@ check_peers()
|
||||
# ]
|
||||
# }
|
||||
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
|
||||
- name=$(echo "$node" | awk -F":" '{print $1}')
|
||||
+ name=$(echo "$node" | cut -d: -f1)
|
||||
# do not check itself
|
||||
if [ "$name" = "$NODENAME" ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
# Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name.
|
||||
- ip=$(echo "$node" | awk -F":" '{print $2}')
|
||||
+ ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
|
||||
id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
|
||||
if [ -z "$id" ]; then
|
||||
ocf_log info "$name is not in the members list"
|
||||
@@ -1307,7 +1332,7 @@ podman_start()
|
||||
--log-level=info \
|
||||
--experimental-initial-corrupt-check=true \
|
||||
--snapshot-count=10000 \
|
||||
- --initial-advertise-peer-urls=https://${NODEIP}:2380 \
|
||||
+ --initial-advertise-peer-urls=$NODEIPURL:2380 \
|
||||
--cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \
|
||||
--key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \
|
||||
--trusted-ca-file=$SERVER_CACERT \
|
||||
@@ -1316,11 +1341,11 @@ podman_start()
|
||||
--peer-key-file=$ETCD_PEER_KEY \
|
||||
--peer-trusted-ca-file=$SERVER_CACERT \
|
||||
--peer-client-cert-auth=true \
|
||||
- --advertise-client-urls=https://${NODEIP}:2379 \
|
||||
- --listen-client-urls=https://${LISTEN_CLIENT_URLS}:2379,unixs://${NODEIP}:0 \
|
||||
- --listen-peer-urls=https://${LISTEN_PEER_URLS}:2380 \
|
||||
+ --advertise-client-urls=$NODEIPURL:2379 \
|
||||
+ --listen-client-urls=$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0 \
|
||||
+ --listen-peer-urls=$(ip_url ${LISTEN_PEER_URLS}):2380 \
|
||||
--metrics=extensive \
|
||||
- --listen-metrics-urls=https://${LISTEN_METRICS_URLS}:9978"
|
||||
+ --listen-metrics-urls=$(ip_url ${LISTEN_METRICS_URLS}):9978"
|
||||
if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then
|
||||
OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts"
|
||||
fi
|
||||
@@ -1430,7 +1455,7 @@ podman_stop()
|
||||
ocf_log info "last member. Not leaving the member list"
|
||||
else
|
||||
ocf_log info "leaving members list as member with ID $member_id"
|
||||
- endpoint="https://$(attribute_node_ip get):2379"
|
||||
+ endpoint="$(ip_url $(attribute_node_ip get)):2379"
|
||||
if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
|
||||
rc=$?
|
||||
ocf_log err "error leaving members list, error code: $rc"
|
||||
1127
SOURCES/RHEL-114489-1-powervs-move-ip-new-ra.patch
Normal file
1127
SOURCES/RHEL-114489-1-powervs-move-ip-new-ra.patch
Normal file
File diff suppressed because it is too large
Load Diff
19
SOURCES/RHEL-114489-2-powervs-move-ip-set-bundled-path.patch
Normal file
19
SOURCES/RHEL-114489-2-powervs-move-ip-set-bundled-path.patch
Normal file
@ -0,0 +1,19 @@
|
||||
--- a/heartbeat/powervs-move-ip.in 2025-09-15 16:13:34.225046827 +0200
|
||||
+++ b/heartbeat/powervs-move-ip.in 2025-09-15 17:39:02.746258434 +0200
|
||||
@@ -33,9 +33,13 @@
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
-import requests
|
||||
-import requests.adapters
|
||||
-import urllib3.util
|
||||
+try:
|
||||
+ sys.path.insert(0, '/usr/lib/fence-agents/support/ibm')
|
||||
+ import requests
|
||||
+ import requests.adapters
|
||||
+ import urllib3.util
|
||||
+except ImportError:
|
||||
+ pass
|
||||
|
||||
# Constants
|
||||
OCF_FUNCTIONS_DIR = os.environ.get(
|
||||
@ -0,0 +1,197 @@
|
||||
From a4e496e5e6d9abde1b071fa2dfa1c6e7ba899cf1 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Edmund=20H=C3=A4fele?= <edmund.haefele@de.ibm.com>
|
||||
Date: Thu, 30 Oct 2025 13:03:22 +0100
|
||||
Subject: [PATCH] Update powervs-move-ip
|
||||
|
||||
- Add `iflabel` argument.
|
||||
- Increase maximum number of retries for HTTP requests to four.
|
||||
---
|
||||
heartbeat/powervs-move-ip.in | 66 +++++++++++++++++++++++++-----------
|
||||
1 file changed, 47 insertions(+), 19 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/powervs-move-ip.in b/heartbeat/powervs-move-ip.in
|
||||
index d55979e52..e2250c998 100755
|
||||
--- a/heartbeat/powervs-move-ip.in
|
||||
+++ b/heartbeat/powervs-move-ip.in
|
||||
@@ -50,11 +50,13 @@ RESOURCE_OPTIONS = (
|
||||
"use_token_cache",
|
||||
"monitor_api",
|
||||
"device",
|
||||
+ "iflabel",
|
||||
"proxy",
|
||||
)
|
||||
IP_CMD = "/usr/sbin/ip"
|
||||
+IFLABEL_MAX_LEN = 15 # Maximum character limit for interface labels
|
||||
REQUESTS_TIMEOUT = 5 # Timeout for requests calls
|
||||
-HTTP_MAX_RETRIES = 3 # Maximum number of retries for HTTP requests
|
||||
+HTTP_MAX_RETRIES = 4 # Maximum number of retries for HTTP requests
|
||||
HTTP_BACKOFF_FACTOR = 0.3 # Sleep (factor * (2^number of previous retries)) secs
|
||||
HTTP_STATUS_FORCE_RETRIES = (500, 502, 503, 504) # HTTP status codes to retry on
|
||||
HTTP_RETRY_ALLOWED_METHODS = frozenset({"GET", "POST", "PUT", "DELETE"})
|
||||
@@ -154,13 +156,13 @@ def ip_check_device(device):
|
||||
return False
|
||||
|
||||
|
||||
-def ip_alias_add(ip, device):
|
||||
+def ip_alias_add(ip, device, label=None):
|
||||
"""Add an IP alias to the given device."""
|
||||
ip_cidr = f"{ip}/{CIDR_NETMASK}"
|
||||
ocf.logger.debug(
|
||||
- f"[ip_alias_add]: adding IP alias '{ip_cidr}' to interface '{device}'"
|
||||
+ f"[ip_alias_add]: adding IP alias '{ip_cidr}' with label '{label}' to interface '{device}'"
|
||||
)
|
||||
- _ = ip_address_add(ip_cidr, device)
|
||||
+ _ = ip_address_add(ip_cidr, device, label)
|
||||
|
||||
|
||||
def ip_alias_remove(ip):
|
||||
@@ -522,6 +524,7 @@ class PowerCloudRoute(PowerCloudAPI):
|
||||
region="",
|
||||
route_host_map="",
|
||||
device="",
|
||||
+ iflabel="",
|
||||
proxy="",
|
||||
monitor_api="",
|
||||
use_token_cache="",
|
||||
@@ -543,6 +546,7 @@ class PowerCloudRoute(PowerCloudAPI):
|
||||
self.route_info = self._get_route_info()
|
||||
self.route_name = self.route_info["name"]
|
||||
self.device = self._get_device_name(device)
|
||||
+ self.iflabel = self._make_iflabel(iflabel)
|
||||
|
||||
def _get_ip_info(self, ip):
|
||||
"""Validate the given IP address and return its standard form."""
|
||||
@@ -588,7 +592,7 @@ class PowerCloudRoute(PowerCloudAPI):
|
||||
nodename = (
|
||||
hostname
|
||||
if not self._is_remote_route
|
||||
- else next((h for h in route_map if h != hostname), None)
|
||||
+ else next((host for host in route_map if host != hostname), None)
|
||||
)
|
||||
|
||||
if not nodename or nodename not in route_map:
|
||||
@@ -646,6 +650,21 @@ class PowerCloudRoute(PowerCloudAPI):
|
||||
ocf.OCF_ERR_CONFIGURED,
|
||||
)
|
||||
|
||||
+    def _make_iflabel(self, label=None):
|
||||
+        """Return the 'device:label' interface label, or None when no label is given or the route is remote."""
|
||||
+        if label and not self._is_remote_route:
|
||||
+            iflabel = f"{self.device}:{label}"
|
||||
+
|
||||
+            # A label longer than IFLABEL_MAX_LEN is reported as a configuration error.
|
||||
+            if len(iflabel) > IFLABEL_MAX_LEN:
|
||||
+                raise PowerCloudRouteError(
|
||||
+                    f"_make_iflabel: interface label '{iflabel}' exceeds limit of {IFLABEL_MAX_LEN} characters",
|
||||
+                    ocf.OCF_ERR_CONFIGURED,
|
||||
+                )
|
||||
+            return iflabel
|
||||
+
|
||||
+        return None
|
||||
+
|
||||
def _set_route_enabled(self, enabled: bool):
|
||||
"""Enable or disable the PowerVS network route."""
|
||||
resource = f"/v1/routes/{self.route_id}"
|
||||
@@ -706,6 +725,7 @@ def start_action(
|
||||
use_token_cache="",
|
||||
monitor_api="",
|
||||
device="",
|
||||
+ iflabel="",
|
||||
proxy="",
|
||||
):
|
||||
"""Assign the service IP.
|
||||
@@ -730,7 +750,7 @@ def start_action(
|
||||
local_route = create_route_instance(resource_options)
|
||||
|
||||
# Add IP alias
|
||||
- ip_alias_add(ip, local_route.device)
|
||||
+ ip_alias_add(ip, local_route.device, local_route.iflabel)
|
||||
|
||||
# Enable local route
|
||||
ocf.logger.debug(f"[start_action]: enabling local route '{local_route.route_name}'")
|
||||
@@ -758,6 +778,7 @@ def stop_action(
|
||||
use_token_cache="",
|
||||
monitor_api="",
|
||||
device="",
|
||||
+ iflabel="",
|
||||
proxy="",
|
||||
):
|
||||
"""Remove the service IP.
|
||||
@@ -810,6 +831,7 @@ def monitor_action(
|
||||
use_token_cache="",
|
||||
monitor_api="",
|
||||
device="",
|
||||
+ iflabel="",
|
||||
proxy="",
|
||||
):
|
||||
"""Monitor the service IP.
|
||||
@@ -829,15 +851,11 @@ def monitor_action(
|
||||
interface_name = ip_find_device(ip)
|
||||
|
||||
if not use_extended_monitor:
|
||||
- if interface_name:
|
||||
- ocf.logger.debug(
|
||||
- f"[monitor_action]: IP alias '{ip}' is active'"
|
||||
- )
|
||||
+ if interface_name:
|
||||
+ ocf.logger.debug(f"[monitor_action]: IP alias '{ip}' is active'")
|
||||
return ocf.OCF_SUCCESS
|
||||
- else:
|
||||
- ocf.logger.debug(
|
||||
- f"[monitor_action]: IP alias '{ip}' is not active"
|
||||
- )
|
||||
+ else:
|
||||
+ ocf.logger.debug(f"[monitor_action]: IP alias '{ip}' is not active")
|
||||
return ocf.OCF_NOT_RUNNING
|
||||
|
||||
remote_route = create_route_instance(
|
||||
@@ -893,6 +911,7 @@ def validate_all_action(
|
||||
use_token_cache="",
|
||||
monitor_api="",
|
||||
device="",
|
||||
+ iflabel="",
|
||||
proxy="",
|
||||
):
|
||||
"""Validate resource agent parameters.
|
||||
@@ -914,12 +933,10 @@ def main():
|
||||
Resource Agent to move an IP address from one Power Virtual Server instance to another.
|
||||
|
||||
Prerequisites:
|
||||
- 1. Red Hat Enterprise Linux 9.4 or higher
|
||||
-
|
||||
- 2. Two-node cluster
|
||||
+ 1. Two-node cluster
|
||||
- Distributed across two PowerVS workspaces in separate data centers within the same region.
|
||||
|
||||
- 3. IBM Cloud API Key:
|
||||
+ 2. IBM Cloud API Key:
|
||||
- Create a service API key with privileges for both workspaces.
|
||||
- Save the key in a file and copy it to both cluster nodes using the same path and filename.
|
||||
- Reference the key file path in the resource definition.
|
||||
@@ -932,7 +949,7 @@ def main():
|
||||
"powervs-move-ip",
|
||||
shortdesc="Manages Power Virtual Server overlay IP routes.",
|
||||
longdesc=agent_description,
|
||||
- version=1.00,
|
||||
+ version=1.01,
|
||||
)
|
||||
|
||||
agent.add_parameter(
|
||||
@@ -1011,6 +1028,17 @@ def main():
|
||||
default="",
|
||||
required=False,
|
||||
)
|
||||
+ agent.add_parameter(
|
||||
+ "iflabel",
|
||||
+ shortdesc="Network interface label",
|
||||
+ longdesc=(
|
||||
+ "A custom suffix for the IP address label. "
|
||||
+ "It is appended to the interface name in the format device:label. "
|
||||
+ "The full label must not exceed 15 characters. "
|
||||
+ ),
|
||||
+ content_type="string",
|
||||
+ required=False,
|
||||
+ )
|
||||
agent.add_parameter(
|
||||
"proxy",
|
||||
shortdesc="Proxy",
|
||||
@ -0,0 +1,258 @@
|
||||
From fc240bdff60aae7133a532c7752c6253ce8f65ca Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 4 Aug 2025 16:53:09 +0200
|
||||
Subject: [PATCH 1/2] db2: add "skip_basic_sql_health_check" parameter to avoid
|
||||
failing on systems with high load
|
||||
|
||||
---
|
||||
heartbeat/db2 | 63 +++++++++++++++++++++++++++++++--------------------
|
||||
1 file changed, 38 insertions(+), 25 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/db2 b/heartbeat/db2
|
||||
index 1cd66f15a..da6c9d5f1 100755
|
||||
--- a/heartbeat/db2
|
||||
+++ b/heartbeat/db2
|
||||
@@ -40,10 +40,12 @@
|
||||
# Parameter defaults
|
||||
|
||||
OCF_RESKEY_instance_default=""
|
||||
+OCF_RESKEY_skip_basic_sql_health_check_default="false"
|
||||
OCF_RESKEY_admin_default=""
|
||||
OCF_RESKEY_dbpartitionnum_default="0"
|
||||
|
||||
: ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}}
|
||||
+: ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}}
|
||||
: ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}}
|
||||
: ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}}
|
||||
|
||||
@@ -102,6 +104,15 @@ Defaults to all databases in the instance. Specify one db for HADR mode.
|
||||
<shortdesc lang="en">List of databases to be managed</shortdesc>
|
||||
<content type="string"/>
|
||||
</parameter>
|
||||
+<parameter name="skip_basic_sql_health_check" unique="0" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Skip basic health check SQL query.
|
||||
+
|
||||
+Only set to "true" to avoid issues during high load.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Skip basic health check SQL query</shortdesc>
|
||||
+<content type="boolean" default="${OCF_RESKEY_skip_basic_sql_health_check_default}" />
|
||||
+</parameter>
|
||||
<parameter name="admin" unique="0" required="0">
|
||||
<longdesc lang="en">
|
||||
DEPRECATED: The admin user of the instance.
|
||||
@@ -695,31 +706,33 @@ db2_monitor() {
|
||||
# set master preference accordingly
|
||||
case "$hadr" in
|
||||
PRIMARY/*|Primary/*|Standard/*)
|
||||
- # perform a basic health check
|
||||
- CMD="if db2 connect to $db;
|
||||
- then
|
||||
- db2 select \* from sysibm.sysversions ; rc=\$?;
|
||||
- db2 terminate;
|
||||
- else
|
||||
- rc=\$?;
|
||||
- fi;
|
||||
- exit \$rc"
|
||||
-
|
||||
- if ! output=$(runasdb2 $CMD)
|
||||
- then
|
||||
- case "$output" in
|
||||
- SQL1776N*)
|
||||
- # can't connect/select on standby, may be spurious turing takeover
|
||||
- ;;
|
||||
-
|
||||
- *)
|
||||
- ocf_log err "DB2 database $instance($db2node)/$db is not working"
|
||||
- ocf_log err "DB2 message: $output"
|
||||
-
|
||||
- # dead primary, remove master score
|
||||
- master_score -D -l reboot
|
||||
- return $OCF_ERR_GENERIC
|
||||
- esac
|
||||
+ if ! ocf_is_true "$OCF_RESKEY_skip_basic_sql_health_check"; then
|
||||
+ # perform a basic health check
|
||||
+ CMD="if db2 connect to $db;
|
||||
+ then
|
||||
+ db2 select \* from sysibm.sysversions ; rc=\$?;
|
||||
+ db2 terminate;
|
||||
+ else
|
||||
+ rc=\$?;
|
||||
+ fi;
|
||||
+ exit \$rc"
|
||||
+
|
||||
+ if ! output=$(runasdb2 $CMD)
|
||||
+ then
|
||||
+ case "$output" in
|
||||
+ SQL1776N*)
|
||||
+ # can't connect/select on standby, may be spurious during takeover
|
||||
+ ;;
|
||||
+
|
||||
+ *)
|
||||
+ ocf_log err "DB2 database $instance($db2node)/$db is not working"
|
||||
+ ocf_log err "DB2 message: $output"
|
||||
+
|
||||
+ # dead primary, remove master score
|
||||
+ master_score -D -l reboot
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ esac
|
||||
+ fi
|
||||
fi
|
||||
|
||||
ocf_log debug "DB2 database $instance($db2node)/$db appears to be working"
|
||||
|
||||
From ded016f84d3fb77dc0542e3f4226774526910d97 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Thu, 7 Aug 2025 13:55:11 +0200
|
||||
Subject: [PATCH 2/2] db2: add "monitor_retries", "monitor_sleep", and
|
||||
"monitor_retry_all_errors" parameters to be able to avoid failing on first
|
||||
try
|
||||
|
||||
---
|
||||
heartbeat/db2 | 80 +++++++++++++++++++++++++++++++++++++++++++++------
|
||||
1 file changed, 72 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/db2 b/heartbeat/db2
|
||||
index da6c9d5f1..fe1d9b892 100755
|
||||
--- a/heartbeat/db2
|
||||
+++ b/heartbeat/db2
|
||||
@@ -41,11 +41,17 @@
|
||||
|
||||
OCF_RESKEY_instance_default=""
|
||||
OCF_RESKEY_skip_basic_sql_health_check_default="false"
|
||||
+OCF_RESKEY_monitor_retries_default="1"
|
||||
+OCF_RESKEY_monitor_sleep_default="1"
|
||||
+OCF_RESKEY_monitor_retry_all_errors_default="false"
|
||||
OCF_RESKEY_admin_default=""
|
||||
OCF_RESKEY_dbpartitionnum_default="0"
|
||||
|
||||
: ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}}
|
||||
: ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}}
|
||||
+: ${OCF_RESKEY_monitor_retries=${OCF_RESKEY_monitor_retries_default}}
|
||||
+: ${OCF_RESKEY_monitor_sleep=${OCF_RESKEY_monitor_sleep_default}}
|
||||
+: ${OCF_RESKEY_monitor_retry_all_errors=${OCF_RESKEY_monitor_retry_all_errors_default}}
|
||||
: ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}}
|
||||
: ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}}
|
||||
|
||||
@@ -108,11 +114,33 @@ Defaults to all databases in the instance. Specify one db for HADR mode.
|
||||
<longdesc lang="en">
|
||||
Skip basic health check SQL query.
|
||||
|
||||
-Only set to "true" to avoid issues during high load.
|
||||
+Only set to "true" when the "monitor_retries" and "monitor_retry_all_errors" parameters arent
|
||||
+enough to avoid issues under high load.
|
||||
</longdesc>
|
||||
<shortdesc lang="en">Skip basic health check SQL query</shortdesc>
|
||||
<content type="boolean" default="${OCF_RESKEY_skip_basic_sql_health_check_default}" />
|
||||
</parameter>
|
||||
+<parameter name="monitor_retries" unique="0" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Monitor retries before failing.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Monitor retries</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_monitor_retries_default}" />
|
||||
+</parameter>
|
||||
+<parameter name="monitor_retries_sleep" unique="0" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Monitor sleep between tries.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Monitor sleep</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_monitor_sleep_default}" />
|
||||
+</parameter>
|
||||
+<parameter name="monitor_retry_all_errors" unique="0" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Set to true to retry monitor-action for all errors instead of the default "db2pd" race conditions.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Retry monitor for all errors</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_monitor_retry_all_errors_default}" />
|
||||
+</parameter>
|
||||
<parameter name="admin" unique="0" required="0">
|
||||
<longdesc lang="en">
|
||||
DEPRECATED: The admin user of the instance.
|
||||
@@ -666,6 +694,7 @@ db2_hadr_status() {
|
||||
local output
|
||||
|
||||
output=$(runasdb2 db2pd -hadr -db $db)
|
||||
+ ocf_log debug "db2_hadr_status: $output"
|
||||
if [ $? != 0 ]
|
||||
then
|
||||
echo "Down/Off"
|
||||
@@ -676,7 +705,34 @@ db2_hadr_status() {
|
||||
awk '/^\s+HADR_(ROLE|STATE) =/ {printf $3"/"}
|
||||
/^\s+HADR_CONNECT_STATUS =/ {print $3; exit; }
|
||||
/^HADR is not active/ {print "Standard/Standalone"; exit; }
|
||||
- /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }'
|
||||
+ /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }
|
||||
+ /^Option -hadr requires -db <database> or -alldbs option and active database./ { exit 255 }
|
||||
+ /^Another possibility of this failure is the Virtual Address Space Randomization is currently enabled on this system./ { exit 255 }
|
||||
+ /^Changing data structure forced command termination./ { exit 255 }'
|
||||
+}
|
||||
+
|
||||
+db2_monitor_retry() {
|
||||
+ local tries=$(($OCF_RESKEY_monitor_retries + 1))
|
||||
+
|
||||
+ for try in $(seq $tries); do
|
||||
+ ocf_log debug "monitor try $try of $tries"
|
||||
+ db2_monitor
|
||||
+ rc=$?
|
||||
+ [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ] && [ $rc -ne $OCF_NOT_RUNNING ] && ocf_log warn "Monitor failed with rc $rc."
|
||||
+ if [ $rc -eq $OCF_SUCCESS ] || [ $rc -eq $OCF_RUNNING_MASTER ] || [ $rc -eq $OCF_NOT_RUNNING ] || { [ $rc -ne 255 ] && ! ocf_is_true "$OCF_RESKEY_monitor_retry_all_errors" ;} ;then
|
||||
+ break
|
||||
+ fi
|
||||
+ [ $try -lt $tries ] && sleep $OCF_RESKEY_monitor_sleep
|
||||
+ done
|
||||
+
|
||||
+ [ $rc -eq 255 ] && rc=$OCF_ERR_GENERIC
|
||||
+
|
||||
+ if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ]; then
|
||||
+ # instance is dead remove master score
|
||||
+ master_score -D -l reboot
|
||||
+ fi
|
||||
+
|
||||
+ return $rc
|
||||
}
|
||||
|
||||
#
|
||||
@@ -690,9 +746,7 @@ db2_monitor() {
|
||||
db2_instance_status
|
||||
rc=$?
|
||||
if [ $rc -ne $OCF_SUCCESS ]; then
|
||||
- # instance is dead remove master score
|
||||
- master_score -D -l reboot
|
||||
- exit $rc
|
||||
+ return $rc
|
||||
fi
|
||||
|
||||
[ $db2node = 0 ] || return 0
|
||||
@@ -700,8 +754,18 @@ db2_monitor() {
|
||||
|
||||
for db in $dblist
|
||||
do
|
||||
- hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC
|
||||
+ hadr=$(db2_hadr_status $db)
|
||||
+ rc=$?
|
||||
ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr"
|
||||
+ if [ "$rc" -eq 255 ]; then
|
||||
+ if [ "$__OCF_ACTION" = "monitor" ]; then
|
||||
+ return $rc
|
||||
+ else
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ elif [ "$rc" -ne 0 ]; then
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
|
||||
# set master preference accordingly
|
||||
case "$hadr" in
|
||||
@@ -915,9 +979,9 @@ case "$__OCF_ACTION" in
|
||||
exit $?
|
||||
;;
|
||||
|
||||
- monitor)
|
||||
+ monitor)
|
||||
db2_validate
|
||||
- db2_monitor
|
||||
+ db2_monitor_retry
|
||||
exit $?
|
||||
;;
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From 54714646c6e2c4ba851e366e63316adb1092af61 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Tue, 28 Oct 2025 16:34:54 +0100
|
||||
Subject: [PATCH] db2: fix monitor_retries_sleep variable name
|
||||
|
||||
---
|
||||
heartbeat/db2 | 8 ++++----
|
||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/db2 b/heartbeat/db2
|
||||
index 83020fc70..82f2f82c3 100755
|
||||
--- a/heartbeat/db2
|
||||
+++ b/heartbeat/db2
|
||||
@@ -49,7 +49,7 @@ fi
|
||||
OCF_RESKEY_instance_default=""
|
||||
OCF_RESKEY_skip_basic_sql_health_check_default="false"
|
||||
OCF_RESKEY_monitor_retries_default="1"
|
||||
-OCF_RESKEY_monitor_sleep_default="1"
|
||||
+OCF_RESKEY_monitor_retries_sleep_default="1"
|
||||
OCF_RESKEY_monitor_retry_all_errors_default="false"
|
||||
OCF_RESKEY_admin_default=""
|
||||
OCF_RESKEY_dbpartitionnum_default="0"
|
||||
@@ -57,7 +57,7 @@ OCF_RESKEY_dbpartitionnum_default="0"
|
||||
: ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}}
|
||||
: ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}}
|
||||
: ${OCF_RESKEY_monitor_retries=${OCF_RESKEY_monitor_retries_default}}
|
||||
-: ${OCF_RESKEY_monitor_sleep=${OCF_RESKEY_monitor_sleep_default}}
|
||||
+: ${OCF_RESKEY_monitor_retries_sleep=${OCF_RESKEY_monitor_retries_sleep_default}}
|
||||
: ${OCF_RESKEY_monitor_retry_all_errors=${OCF_RESKEY_monitor_retry_all_errors_default}}
|
||||
: ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}}
|
||||
: ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}}
|
||||
@@ -140,7 +140,7 @@ Monitor retries before failing.
|
||||
Monitor sleep between tries.
|
||||
</longdesc>
|
||||
<shortdesc lang="en">Monitor sleep</shortdesc>
|
||||
-<content type="string" default="${OCF_RESKEY_monitor_sleep_default}" />
|
||||
+<content type="string" default="${OCF_RESKEY_monitor_retries_sleep_default}" />
|
||||
</parameter>
|
||||
<parameter name="monitor_retry_all_errors" unique="0" required="0">
|
||||
<longdesc lang="en">
|
||||
@@ -776,7 +776,7 @@ db2_monitor_retry() {
|
||||
if [ $rc -eq $OCF_SUCCESS ] || [ $rc -eq $OCF_RUNNING_MASTER ] || [ $rc -eq $OCF_NOT_RUNNING ] || { [ $rc -ne 255 ] && ! ocf_is_true "$OCF_RESKEY_monitor_retry_all_errors" ;} ;then
|
||||
break
|
||||
fi
|
||||
- [ $try -lt $tries ] && sleep $OCF_RESKEY_monitor_sleep
|
||||
+ [ $try -lt $tries ] && sleep $OCF_RESKEY_monitor_retries_sleep
|
||||
done
|
||||
|
||||
[ $rc -eq 255 ] && rc=$OCF_ERR_GENERIC
|
||||
@ -0,0 +1,19 @@
|
||||
--- a/heartbeat/ocf-shellfuncs.in 2025-09-29 14:01:55.762931795 +0200
|
||||
+++ b/heartbeat/ocf-shellfuncs.in 2025-09-29 14:09:28.651731793 +0200
|
||||
@@ -1093,6 +1093,16 @@
|
||||
echo $1
|
||||
}
|
||||
|
||||
+ocf_promotion_score() {
|
||||
+ ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.10.0"
|
||||
+ res=$?
|
||||
+ if [ $res -eq 2 ] || [ $res -eq 1 ] || ! have_binary "crm_master"; then
|
||||
+ ${HA_SBIN_DIR}/crm_attribute -p ${OCF_RESOURCE_INSTANCE} $@
|
||||
+ else
|
||||
+ ${HA_SBIN_DIR}/crm_master -l reboot $@
|
||||
+ fi
|
||||
+}
|
||||
+
|
||||
__ocf_set_defaults "$@"
|
||||
|
||||
: ${OCF_TRACE_RA:=$OCF_RESKEY_trace_ra}
|
||||
362
SOURCES/RHEL-116151-2-portblock-add-promotable-support.patch
Normal file
362
SOURCES/RHEL-116151-2-portblock-add-promotable-support.patch
Normal file
@ -0,0 +1,362 @@
|
||||
--- a/heartbeat/portblock 2025-09-30 09:52:13.967530030 +0200
|
||||
+++ b/heartbeat/portblock 2025-09-30 09:52:49.018382542 +0200
|
||||
@@ -4,6 +4,7 @@
|
||||
#
|
||||
# Author: Sun Jiang Dong (initial version)
|
||||
# Philipp Reisner (per-IP filtering)
|
||||
+# Sebastian Baszczyj (nftables code)
|
||||
#
|
||||
# License: GNU General Public License (GPL)
|
||||
#
|
||||
@@ -43,11 +44,15 @@
|
||||
#######################################################################
|
||||
CMD=`basename $0`
|
||||
TICKLETCP=$HA_BIN/tickle_tcp
|
||||
+TABLE="portblock"
|
||||
+# Promotion scores
|
||||
+SCORE_UNPROMOTED=5
|
||||
+SCORE_PROMOTED=10
|
||||
|
||||
usage()
|
||||
{
|
||||
cat <<END >&2
|
||||
- usage: $CMD {start|stop|status|monitor|meta-data|validate-all}
|
||||
+ usage: $CMD {start|stop|promote|demote|status|monitor|meta-data|validate-all}
|
||||
|
||||
$CMD is used to temporarily block ports using iptables.
|
||||
|
||||
@@ -86,8 +91,8 @@
|
||||
NOTE: iptables is Linux-specific.
|
||||
|
||||
An additional feature in the portblock RA is the tickle ACK function
|
||||
- enabled by specifying the tickle_dir parameter. The tickle ACK
|
||||
- triggers the clients to faster reconnect their TCP connections to the
|
||||
+ enabled by specifying the tickle_dir parameter. The tickle ACK
|
||||
+ triggers the clients to faster reconnect their TCP connections to the
|
||||
fail-overed server.
|
||||
|
||||
Please note that this feature is often used for the floating IP fail-
|
||||
@@ -95,7 +100,7 @@
|
||||
It doesn't support the cluster alias IP scenario.
|
||||
|
||||
When using the tickle ACK function, in addition to the normal usage
|
||||
- of portblock RA, the parameter tickle_dir must be specified in the
|
||||
+ of portblock RA, the parameter tickle_dir must be specified in the
|
||||
action=unblock instance of the portblock resources.
|
||||
For example, you may stack resources like below:
|
||||
portblock action=block
|
||||
@@ -103,18 +108,18 @@
|
||||
portblock action=unblock tickle_dir=/tickle/state/dir
|
||||
|
||||
If you want to tickle all the TCP connections which connected to _one_
|
||||
- floating IP but different ports, no matter how many portblock resources
|
||||
- you have defined, you should enable tickles for _one_ portblock
|
||||
+ floating IP but different ports, no matter how many portblock resources
|
||||
+ you have defined, you should enable tickles for _one_ portblock
|
||||
resource(action=unblock) only.
|
||||
-
|
||||
- The tickle_dir is a location which stores the established TCP
|
||||
- connections. It can be a shared directory(which is cluster-visible to
|
||||
+
|
||||
+ The tickle_dir is a location which stores the established TCP
|
||||
+ connections. It can be a shared directory(which is cluster-visible to
|
||||
all nodes) or a local directory.
|
||||
If you use the shared directory, you needn't do any other things.
|
||||
If you use the local directory, you must also specify the sync_script
|
||||
paramater. We recommend you to use csync2 as the sync_script.
|
||||
- For example, if you use the local directory /tmp/tickle as tickle_dir,
|
||||
- you could setup the csync2 as the csync2 documentation says and
|
||||
+ For example, if you use the local directory /tmp/tickle as tickle_dir,
|
||||
+ you could setup the csync2 as the csync2 documentation says and
|
||||
configure your /etc/csync2/csync2.cfg like:
|
||||
group ticklegroup {
|
||||
host node1;
|
||||
@@ -137,15 +142,19 @@
|
||||
<version>1.0</version>
|
||||
|
||||
<longdesc lang="en">
|
||||
-Resource script for portblock. It is used to temporarily block ports
|
||||
+Resource script for portblock. It is used to block ports
|
||||
using iptables. In addition, it may allow for faster TCP reconnects
|
||||
for clients on failover. Use that if there are long lived TCP
|
||||
connections to an HA service. This feature is enabled by setting the
|
||||
tickle_dir parameter and only in concert with action set to unblock.
|
||||
Note that the tickle ACK function is new as of version 3.0.2 and
|
||||
hasn't yet seen widespread use.
|
||||
+
|
||||
+In Promotable mode, the promote action unblocks the port(s) on the Promoted node
|
||||
+and blocks the port(s) on the Unpromoted node(s) when action=unblock, and vice versa
|
||||
+when action=block.
|
||||
</longdesc>
|
||||
-<shortdesc lang="en">Block and unblocks access to TCP and UDP ports</shortdesc>
|
||||
+<shortdesc lang="en">Blocks and unblocks access to TCP and UDP ports</shortdesc>
|
||||
|
||||
<parameters>
|
||||
<parameter name="protocol" unique="0" required="1">
|
||||
@@ -167,6 +176,10 @@
|
||||
<parameter name="action" unique="0" required="1">
|
||||
<longdesc lang="en">
|
||||
The action (block/unblock) to be done on the protocol::portno.
|
||||
+
|
||||
+In Promotable mode it is the action for the promote action,
|
||||
+and the opposite action will be used for the start and demote
|
||||
+actions.
|
||||
</longdesc>
|
||||
<shortdesc lang="en">action</shortdesc>
|
||||
<content type="string" default="${OCF_RESKEY_action_default}" />
|
||||
@@ -202,7 +215,7 @@
|
||||
|
||||
<parameter name="tickle_dir" unique="0" required="0">
|
||||
<longdesc lang="en">
|
||||
-The shared or local directory (_must_ be absolute path) which
|
||||
+The shared or local directory (_must_ be absolute path) which
|
||||
stores the established TCP connections.
|
||||
</longdesc>
|
||||
<shortdesc lang="en">Tickle directory</shortdesc>
|
||||
@@ -236,6 +249,8 @@
|
||||
<actions>
|
||||
<action name="start" timeout="20s" />
|
||||
<action name="stop" timeout="20s" />
|
||||
+<action name="promote" timeout="10s"/>
|
||||
+<action name="demote" timeout="10s"/>
|
||||
<action name="status" depth="0" timeout="10s" interval="10s" />
|
||||
<action name="monitor" depth="0" timeout="10s" interval="10s" />
|
||||
<action name="meta-data" timeout="5s" />
|
||||
@@ -269,9 +284,9 @@
|
||||
# iptables 1.8.9 briefly broke the output format, returning the
|
||||
# numeric protocol value instead of a string. Support both variants.
|
||||
if [ "$1" = "tcp" ]; then
|
||||
- local prot="(tcp|6)"
|
||||
+ local prot="\(tcp\|6\)"
|
||||
else
|
||||
- local prot="(udp|17)"
|
||||
+ local prot="\(udp\|17\)"
|
||||
fi
|
||||
echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$"
|
||||
}
|
||||
@@ -281,7 +296,7 @@
|
||||
{
|
||||
[ "$4" = "OUTPUT" ] && ds="s" || ds="d"
|
||||
PAT=$(active_grep_pat "$1" "$2" "$3" "$ds")
|
||||
- $IPTABLES $wait -n -L "$4" | grep -qE "$PAT"
|
||||
+ $IPTABLES $wait -n -L "$4" | grep -q "$PAT"
|
||||
}
|
||||
|
||||
# netstat -tn and ss -Htn, split on whitespace and colon,
|
||||
@@ -397,6 +412,17 @@
|
||||
rc=$OCF_NOT_RUNNING
|
||||
;;
|
||||
esac
|
||||
+ elif ocf_is_ms; then
|
||||
+ case $5 in
|
||||
+ block)
|
||||
+ SayInactive $*
|
||||
+ rc=$OCF_NOT_RUNNING
|
||||
+ ;;
|
||||
+ *)
|
||||
+ SayActive $*
|
||||
+ rc=$OCF_SUCCESS
|
||||
+ ;;
|
||||
+ esac
|
||||
else
|
||||
case $5 in
|
||||
block)
|
||||
@@ -493,18 +519,21 @@
|
||||
{
|
||||
ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start
|
||||
case $5 in
|
||||
- block) IptablesBLOCK "$@";;
|
||||
+ block) IptablesBLOCK "$@"
|
||||
+ rc=$?
|
||||
+ ;;
|
||||
unblock)
|
||||
IptablesUNBLOCK "$@"
|
||||
rc=$?
|
||||
tickle_remote
|
||||
#ignore run_tickle_tcp exit code!
|
||||
- return $rc
|
||||
;;
|
||||
- *) usage; return 1;
|
||||
+ *) usage; return $OCF_ERR_CONFIGURED ;
|
||||
esac
|
||||
|
||||
- return $?
|
||||
+ ocf_is_ms && ocf_promotion_score -v $SCORE_UNPROMOTED -N $nodename
|
||||
+
|
||||
+ return $rc
|
||||
}
|
||||
|
||||
#IptablesStop {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
|
||||
@@ -512,17 +541,73 @@
|
||||
{
|
||||
ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop
|
||||
case $5 in
|
||||
- block) IptablesUNBLOCK "$@";;
|
||||
+ block) IptablesUNBLOCK "$@"
|
||||
+ rc=$?
|
||||
+ ;;
|
||||
unblock)
|
||||
save_tcp_connections
|
||||
IptablesBLOCK "$@"
|
||||
+ rc=$?
|
||||
;;
|
||||
- *) usage; return 1;;
|
||||
+ *) usage; return $OCF_ERR_CONFIGURED ;;
|
||||
esac
|
||||
|
||||
+ ocf_is_ms && ocf_promotion_score -D -N $nodename
|
||||
+
|
||||
+ return $rc
|
||||
+}
|
||||
+
|
||||
+IptablesPromote() {
|
||||
+ IptablesStatus "$@"
|
||||
+ rc=$?
|
||||
+ if [ $rc -eq $OCF_SUCCESS ] && [ $promotion_score -eq $SCORE_PROMOTED ]; then
|
||||
+ ocf_log info "Promote: resource already promoted."
|
||||
+ return $rc
|
||||
+ elif [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then
|
||||
+ ocf_exit_reason "Promote: IptablesStatus failed with rc: $rc."
|
||||
+ return $rc
|
||||
+ fi
|
||||
+ case $5 in
|
||||
+ block) IptablesBLOCK "$@"
|
||||
+ rc=$?
|
||||
+ ;;
|
||||
+ unblock)
|
||||
+ IptablesUNBLOCK "$@"
|
||||
+ rc=$?
|
||||
+ tickle_remote
|
||||
+ #ignore run_tickle_tcp exit code!
|
||||
+ ;;
|
||||
+ *) usage; return $OCF_ERR_CONFIGURED ;
|
||||
+ esac
|
||||
+ ocf_promotion_score -v $SCORE_PROMOTED -N $nodename
|
||||
return $?
|
||||
}
|
||||
|
||||
+IptablesDemote() {
|
||||
+ IptablesStatus "$@"
|
||||
+ rc=$?
|
||||
+ if [ $rc -eq $OCF_SUCCESS ] && [ $promotion_score -eq $SCORE_UNPROMOTED ]; then
|
||||
+ ocf_log info "Demote: resource already demoted."
|
||||
+ return $rc
|
||||
+ elif [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then
|
||||
+ ocf_exit_reason "Demote: IptablesStatus failed with rc: $rc."
|
||||
+ return $rc
|
||||
+ fi
|
||||
+ case $5 in
|
||||
+ block)
|
||||
+ save_tcp_connections
|
||||
+ IptablesBLOCK "$@"
|
||||
+ rc=$?
|
||||
+ ;;
|
||||
+ unblock) IptablesUNBLOCK "$@"
|
||||
+ rc=$?
|
||||
+ ;;
|
||||
+ *) usage; return $OCF_ERR_CONFIGURED ;;
|
||||
+ esac
|
||||
+ ocf_promotion_score -v $SCORE_UNPROMOTED -N $nodename
|
||||
+ return $rc
|
||||
+}
|
||||
+
|
||||
#
|
||||
# Check if the port is valid, this function code is not decent, but works
|
||||
#
|
||||
@@ -558,17 +643,17 @@
|
||||
fi
|
||||
if [ ! -d "$OCF_RESKEY_tickle_dir" ]; then
|
||||
ocf_log err "The tickle dir doesn't exist!"
|
||||
- exit $OCF_ERR_INSTALLED
|
||||
+ exit $OCF_ERR_INSTALLED
|
||||
fi
|
||||
fi
|
||||
|
||||
case $action in
|
||||
- block|unblock)
|
||||
+ block|unblock)
|
||||
;;
|
||||
- *)
|
||||
+ *)
|
||||
ocf_log err "Invalid action $action!"
|
||||
exit $OCF_ERR_CONFIGURED
|
||||
- ;;
|
||||
+ ;;
|
||||
esac
|
||||
|
||||
if ocf_is_true $reset_local_on_unblock_stop; then
|
||||
@@ -591,7 +676,7 @@
|
||||
exit $OCF_ERR_ARGS
|
||||
fi
|
||||
|
||||
-case $1 in
|
||||
+case $__OCF_ACTION in
|
||||
meta-data) meta_data
|
||||
exit $OCF_SUCCESS
|
||||
;;
|
||||
@@ -605,12 +690,12 @@
|
||||
if [ -z "$OCF_RESKEY_protocol" ]; then
|
||||
ocf_log err "Please set OCF_RESKEY_protocol"
|
||||
exit $OCF_ERR_CONFIGURED
|
||||
-fi
|
||||
+fi
|
||||
|
||||
if [ -z "$OCF_RESKEY_portno" ]; then
|
||||
ocf_log err "Please set OCF_RESKEY_portno"
|
||||
exit $OCF_ERR_CONFIGURED
|
||||
-fi
|
||||
+fi
|
||||
|
||||
if [ -z "$OCF_RESKEY_action" ]; then
|
||||
ocf_log err "Please set OCF_RESKEY_action"
|
||||
@@ -632,6 +717,7 @@
|
||||
action=$OCF_RESKEY_action
|
||||
ip=$OCF_RESKEY_ip
|
||||
reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop
|
||||
+nodename=$(ocf_local_nodename)
|
||||
|
||||
|
||||
# If "tickle" is enabled, we need to record the list of currently established
|
||||
@@ -647,17 +733,35 @@
|
||||
fi
|
||||
fi
|
||||
|
||||
-case $1 in
|
||||
- start)
|
||||
- IptablesStart $protocol $portno $ip $direction $action
|
||||
+if ocf_is_ms; then
|
||||
+ promotion_score=$(ocf_promotion_score -G -N $nodename -q 2> /dev/null)
|
||||
+ if { [ "$__OCF_ACTION" = "monitor" ] && [ "$promotion_score" = "$SCORE_UNPROMOTED" ]; } || [ "$__OCF_ACTION" = "demote" ] || [ "$__OCF_ACTION" = "start" ]; then
|
||||
+ case $action in
|
||||
+ block) action="unblock" ;;
|
||||
+ unblock) action="block" ;;
|
||||
+ esac
|
||||
+ fi
|
||||
+fi
|
||||
+
|
||||
+case $__OCF_ACTION in
|
||||
+ start)
|
||||
+ IptablesStart "$protocol" "$portno" "$ip" "$direction" "$action"
|
||||
+ ;;
|
||||
+
|
||||
+ stop)
|
||||
+ IptablesStop "$protocol" "$portno" "$ip" "$direction" "$action"
|
||||
+ ;;
|
||||
+
|
||||
+ promote)
|
||||
+ IptablesPromote "$protocol" "$portno" "$ip" "$direction" "$action"
|
||||
;;
|
||||
|
||||
- stop)
|
||||
- IptablesStop $protocol $portno $ip $direction $action
|
||||
+ demote)
|
||||
+ IptablesDemote "$protocol" "$portno" "$ip" "$direction" "$action"
|
||||
;;
|
||||
|
||||
- status|monitor)
|
||||
- IptablesStatus $protocol $portno $ip $direction $action
|
||||
+ status|monitor)
|
||||
+ IptablesStatus "$protocol" "$portno" "$ip" "$direction" "$action"
|
||||
;;
|
||||
|
||||
validate-all)
|
||||
@ -0,0 +1,180 @@
|
||||
--- a/heartbeat/portblock 2025-10-21 09:27:41.753028260 +0200
|
||||
+++ b/heartbeat/portblock 2025-10-21 09:28:55.573855995 +0200
|
||||
@@ -28,6 +28,8 @@
|
||||
OCF_RESKEY_portno_default=""
|
||||
OCF_RESKEY_direction_default="in"
|
||||
OCF_RESKEY_action_default=""
|
||||
+OCF_RESKEY_method_default="drop"
|
||||
+OCF_RESKEY_status_check_default="rule"
|
||||
OCF_RESKEY_ip_default="0.0.0.0/0"
|
||||
OCF_RESKEY_reset_local_on_unblock_stop_default="false"
|
||||
OCF_RESKEY_tickle_dir_default=""
|
||||
@@ -37,6 +39,8 @@
|
||||
: ${OCF_RESKEY_portno=${OCF_RESKEY_portno_default}}
|
||||
: ${OCF_RESKEY_direction=${OCF_RESKEY_direction_default}}
|
||||
: ${OCF_RESKEY_action=${OCF_RESKEY_action_default}}
|
||||
+: ${OCF_RESKEY_method=${OCF_RESKEY_method_default}}
|
||||
+: ${OCF_RESKEY_status_check=${OCF_RESKEY_status_check_default}}
|
||||
: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}}
|
||||
: ${OCF_RESKEY_reset_local_on_unblock_stop=${OCF_RESKEY_reset_local_on_unblock_stop_default}}
|
||||
: ${OCF_RESKEY_tickle_dir=${OCF_RESKEY_tickle_dir_default}}
|
||||
@@ -185,6 +189,26 @@
|
||||
<content type="string" default="${OCF_RESKEY_action_default}" />
|
||||
</parameter>
|
||||
|
||||
+<parameter name="method" unique="0" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Block method:
|
||||
+drop: Use DROP rule.
|
||||
+reject: Use REJECT rule w/conntrack to clear connections when blocking.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Block method</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_method_default}" />
|
||||
+</parameter>
|
||||
+
|
||||
+<parameter name="status_check" unique="0" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Status check:
|
||||
+rule: Check rule.
|
||||
+pseudo: Check pseudo status when rule is absent.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Status check</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_status_check_default}" />
|
||||
+</parameter>
|
||||
+
|
||||
<parameter name="reset_local_on_unblock_stop" unique="0" required="0">
|
||||
<longdesc lang="en">
|
||||
If for some reason the long lived server side TCP sessions won't be cleaned up
|
||||
@@ -253,6 +277,7 @@
|
||||
<action name="demote" timeout="10s"/>
|
||||
<action name="status" depth="0" timeout="10s" interval="10s" />
|
||||
<action name="monitor" depth="0" timeout="10s" interval="10s" />
|
||||
+<action name="monitor" depth="0" timeout="10s" interval="9s" role="Promoted" />
|
||||
<action name="meta-data" timeout="5s" />
|
||||
<action name="validate-all" timeout="5s" />
|
||||
</actions>
|
||||
@@ -288,7 +313,11 @@
|
||||
else
|
||||
local prot="\(udp\|17\)"
|
||||
fi
|
||||
- echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$"
|
||||
+ if [ "$method" = "DROP" ]; then
|
||||
+ echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$"
|
||||
+ else
|
||||
+ echo "^REJECT${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}${w}ctstate${w}NEW,RELATED,ESTABLISHED${w}reject-with${w}tcp-reset$"
|
||||
+ fi
|
||||
}
|
||||
|
||||
#chain_isactive {udp|tcp} portno,portno ip chain
|
||||
@@ -374,17 +403,17 @@
|
||||
|
||||
SayActive()
|
||||
{
|
||||
- ocf_log debug "$CMD DROP rule [$*] is running (OK)"
|
||||
+ ocf_log debug "$CMD $method rule [$*] is running (OK)"
|
||||
}
|
||||
|
||||
SayConsideredActive()
|
||||
{
|
||||
- ocf_log debug "$CMD DROP rule [$*] considered to be running (OK)"
|
||||
+ ocf_log debug "$CMD $method rule [$*] considered to be running (OK)"
|
||||
}
|
||||
|
||||
SayInactive()
|
||||
{
|
||||
- ocf_log debug "$CMD DROP rule [$*] is inactive"
|
||||
+ ocf_log debug "$CMD $method rule [$*] is inactive"
|
||||
}
|
||||
|
||||
#IptablesStatus {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
|
||||
@@ -405,14 +434,18 @@
|
||||
case $5 in
|
||||
block)
|
||||
SayActive $*
|
||||
- rc=$OCF_SUCCESS
|
||||
+ if [ "$__OCF_ACTION" = "monitor" ] && [ "$promotion_score" = "$SCORE_PROMOTED" ]; then
|
||||
+ rc=$OCF_RUNNING_MASTER
|
||||
+ else
|
||||
+ rc=$OCF_SUCCESS
|
||||
+ fi
|
||||
;;
|
||||
*)
|
||||
SayInactive $*
|
||||
rc=$OCF_NOT_RUNNING
|
||||
;;
|
||||
esac
|
||||
- elif ocf_is_ms; then
|
||||
+ elif [ "$OCF_RESKEY_status_check" = "rule" ]; then
|
||||
case $5 in
|
||||
block)
|
||||
SayInactive $*
|
||||
@@ -420,7 +453,11 @@
|
||||
;;
|
||||
*)
|
||||
SayActive $*
|
||||
- rc=$OCF_SUCCESS
|
||||
+ if [ "$__OCF_ACTION" = "monitor" ] && [ "$promotion_score" = "$SCORE_PROMOTED" ]; then
|
||||
+ rc=$OCF_RUNNING_MASTER
|
||||
+ else
|
||||
+ rc=$OCF_SUCCESS
|
||||
+ fi
|
||||
;;
|
||||
esac
|
||||
else
|
||||
@@ -461,7 +498,11 @@
|
||||
: Chain already in desired state
|
||||
else
|
||||
[ "$chain" = "OUTPUT" ] && ds="s" || ds="d"
|
||||
- $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" -m multiport --${ds}ports "$ports" -j DROP
|
||||
+ if [ "$method" = "DROP" ]; then
|
||||
+ $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" -m multiport --${ds}ports "$ports" -j DROP
|
||||
+ else
|
||||
+ $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" -m multiport --${ds}ports "$ports" -m conntrack --ctstate NEW,ESTABLISHED,RELATED -j REJECT --reject-with tcp-reset
|
||||
+ fi
|
||||
fi
|
||||
}
|
||||
|
||||
@@ -486,7 +527,11 @@
|
||||
$IPTABLES $wait -I OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset
|
||||
tickle_local
|
||||
fi
|
||||
- $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP
|
||||
+ if [ "$method" = "DROP" ]; then
|
||||
+ $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP
|
||||
+ else
|
||||
+ $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -m conntrack --ctstate NEW,ESTABLISHED,RELATED -j REJECT --reject-with tcp-reset
|
||||
+ fi
|
||||
rc_in=$?
|
||||
if $try_reset ; then
|
||||
$IPTABLES $wait -D OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset
|
||||
@@ -718,6 +763,13 @@
|
||||
ip=$OCF_RESKEY_ip
|
||||
reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop
|
||||
nodename=$(ocf_local_nodename)
|
||||
+case "$OCF_RESKEY_method" in
|
||||
+ drop) method="DROP" ;;
|
||||
+ reject) method="REJECT" ;;
|
||||
+ *) ocf_log err "method: $OCF_RESKEY_method not supported"
|
||||
+ exit $OCF_ERR_CONFIGURED
|
||||
+ ;;
|
||||
+esac
|
||||
|
||||
|
||||
# If "tickle" is enabled, we need to record the list of currently established
|
||||
@@ -743,6 +795,8 @@
|
||||
fi
|
||||
fi
|
||||
|
||||
+IptablesValidateAll
|
||||
+
|
||||
case $__OCF_ACTION in
|
||||
start)
|
||||
IptablesStart "$protocol" "$portno" "$ip" "$direction" "$action"
|
||||
@@ -765,7 +819,6 @@
|
||||
;;
|
||||
|
||||
validate-all)
|
||||
- IptablesValidateAll
|
||||
;;
|
||||
|
||||
*) usage
|
||||
@ -0,0 +1,186 @@
|
||||
From 1afdd91b2961061937fc802c575304ede8d79286 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Wed, 10 Sep 2025 16:56:56 +0200
|
||||
Subject: [PATCH] podman-etcd: Add cluster-wide force_new_cluster attribute
|
||||
checking
|
||||
|
||||
Implement cluster-wide validation of force_new_cluster attribute to resolve
|
||||
race conditions during automated cluster recovery. The enhancement ensures
|
||||
agents check for the cluster-wide attribute before falling back to local
|
||||
etcd revision comparison.
|
||||
|
||||
Key changes:
|
||||
- Enhanced get_force_new_cluster() to query all cluster nodes
|
||||
- Ensure force_new_cluster is not set on both nodes to prevent
|
||||
conflicting recovery attempts
|
||||
- Updated startup logic to prioritize cluster-wide attribute checking
|
||||
|
||||
fixes OCPBUGS-61117
|
||||
---
|
||||
heartbeat/podman-etcd | 107 ++++++++++++++++++++++++++++--------------
|
||||
1 file changed, 72 insertions(+), 35 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 33804414a..f3a6da5e2 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -794,54 +794,72 @@ set_force_new_cluster()
|
||||
return $rc
|
||||
}
|
||||
|
||||
+# get_force_new_cluster returns a space-separated list of nodes that have the force_new_cluster attribute set.
|
||||
+# Return values:
|
||||
+# - Exit code 0 with non-empty output: One or more nodes have the force_new_cluster attribute set
|
||||
+# - Exit code 0 with empty output: No nodes have the force_new_cluster attribute set
|
||||
+# - Exit code 1 with empty output: Error occurred while querying the cluster nodes
|
||||
get_force_new_cluster()
|
||||
{
|
||||
- crm_attribute --lifetime reboot --query --name "force_new_cluster" | awk -F"value=" '{print $2}'
|
||||
+ local node nodes value
|
||||
+ local holders=""
|
||||
+
|
||||
+ if ! nodes=$(crm_node -l | awk '{print $2}'); then
|
||||
+ ocf_log err "could not get force_new_cluster attribute, crm_node error code: $?"
|
||||
+ return 1
|
||||
+ fi
|
||||
+ if [ -z "$nodes" ]; then
|
||||
+ ocf_log err "could not get force_new_cluster attribute, the list of nodes is empty"
|
||||
+ return 1
|
||||
+ fi
|
||||
+
|
||||
+ for node in $nodes; do
|
||||
+ if ! value=$(crm_attribute --query --lifetime reboot --name "force_new_cluster" --node "$node" 2>/dev/null | awk -F'value=' '{print $2}' | tr -d "'"); then
|
||||
+ ocf_log err "could not get force_new_cluster attribute, crm_attribut error code: $?"
|
||||
+ return 1
|
||||
+ fi
|
||||
+ if [ -n "$value" ]; then
|
||||
+ holders="$holders$node "
|
||||
+ fi
|
||||
+ done
|
||||
+ echo "$holders"
|
||||
}
|
||||
|
||||
+
|
||||
clear_force_new_cluster()
|
||||
{
|
||||
- local force_new_cluster_node
|
||||
-
|
||||
- force_new_cluster_node=$(get_force_new_cluster)
|
||||
- if [ -z "$force_new_cluster_node" ]; then
|
||||
- ocf_log info "$NODENAME: force_new_cluster attribute not set"
|
||||
+ # only the holder of "force_new_cluster" attribute can delete it
|
||||
+ if ! is_force_new_cluster; then
|
||||
+ ocf_log info "force_new_cluster unset or not owned by $NODENAME"
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
- # only the holder of "force_new_cluster" attribute can delete it
|
||||
- if [ "$NODENAME" = "$force_new_cluster_node" ]; then
|
||||
- crm_attribute --lifetime reboot --name "force_new_cluster" --delete
|
||||
- rc=$?
|
||||
- if [ $rc -ne 0 ]; then
|
||||
- ocf_log err "could not clear force_new_cluster attribute, error code: $rc"
|
||||
- else
|
||||
- ocf_log info "$NODENAME: force_new_cluster attribute cleared"
|
||||
- fi
|
||||
- return $rc
|
||||
- else
|
||||
- ocf_log info "$NODENAME does not hold force_new_cluster ($force_new_cluster_node has it)"
|
||||
- return $OCF_SUCCESS
|
||||
+ if ! crm_attribute --delete --lifetime reboot --node "$NODENAME" --name "force_new_cluster"; then
|
||||
+ ocf_log err "could not clear force_new_cluster attribute, error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
fi
|
||||
+
|
||||
+ ocf_log info "$NODENAME: force_new_cluster attribute cleared"
|
||||
+ return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
+
|
||||
is_force_new_cluster()
|
||||
{
|
||||
- # Return 0 if 'force_new_cluster' is set and the value matches the current node name, 1 otherwise.
|
||||
- local value
|
||||
+ # Return 0 if 'force_new_cluster' is set on the current node, 1 otherwise.
|
||||
+ local fnc_holders
|
||||
|
||||
- value=$(get_force_new_cluster)
|
||||
- if [ -z "$value" ]; then
|
||||
- ocf_log debug "force_new_cluster attribute is not set"
|
||||
- return 1
|
||||
+ if ! fnc_holders=$(get_force_new_cluster); then
|
||||
+ ocf_exit_reason "is_force_new_cluster: Failed to get force_new_cluster node holders"
|
||||
+ exit $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
- if [ "$value" = "$NODENAME" ]; then
|
||||
+ if echo "$fnc_holders" | grep -q -w "$NODENAME"; then
|
||||
ocf_log debug "$NODENAME has force_new_cluster set"
|
||||
return 0
|
||||
fi
|
||||
|
||||
- ocf_log info "force_new_cluster attribute set on peer node $value"
|
||||
+ ocf_log debug "force_new_cluster attribute is not set on $NODENAME"
|
||||
return 1
|
||||
}
|
||||
|
||||
@@ -1415,17 +1433,34 @@ podman_start()
|
||||
return "$OCF_ERR_GENERIC"
|
||||
fi
|
||||
|
||||
- # force-new-cluster property is a runtime-scoped flag that instructs the agent to force a new cluster-of-1.
|
||||
- # Since this attribute is configured with a reboot-lifetime, it is automatically cleared when the machine reboots.
|
||||
- # If the agent detects during its start that this property is set, it indicates that the flag was explicitly set
|
||||
- # during the current node boot session, implying a deliberate request to recover the cluster.
|
||||
if ocf_is_true "$pod_was_running"; then
|
||||
ocf_log info "static pod was running: start normally"
|
||||
else
|
||||
- if is_force_new_cluster; then
|
||||
- ocf_log notice "'$NODENAME' marked to force-new-cluster"
|
||||
+ local fnc_holders
|
||||
+ if ! fnc_holders=$(get_force_new_cluster); then
|
||||
+ ocf_exit_reason "Failed to get force_new_cluster node holders"
|
||||
+ return "$OCF_ERR_GENERIC"
|
||||
+ fi
|
||||
+
|
||||
+ local fnc_holder_count
|
||||
+ fnc_holder_count=$(echo "$fnc_holders" | wc -w)
|
||||
+ if [ "$fnc_holder_count" -gt 1 ]; then
|
||||
+ ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)"
|
||||
+ return "$OCF_ERR_GENERIC"
|
||||
+ fi
|
||||
+
|
||||
+ if [ "$fnc_holder_count" -eq 1 ]; then
|
||||
+ if echo "$fnc_holders" | grep -q -w "$NODENAME"; then
|
||||
+ # Attribute is set on the local node.
|
||||
+ ocf_log notice "$NODENAME marked to force-new-cluster"
|
||||
+ JOIN_AS_LEARNER=false
|
||||
+ else
|
||||
+ # Attribute is set on a peer node.
|
||||
+ ocf_log info "$NODENAME shall join as learner because force_new_cluster is set on peer $fnc_holders"
|
||||
+ JOIN_AS_LEARNER=true
|
||||
+ fi
|
||||
else
|
||||
- ocf_log info "'$NODENAME' is not marked to force-new-cluster"
|
||||
+ ocf_log info "no node is marked to force-new-cluster"
|
||||
# When the local agent starts, we can infer the cluster state by counting
|
||||
# how many agents are starting or already active:
|
||||
# - 1 active agent: it's the peer (we are just starting)
|
||||
@@ -1522,7 +1557,7 @@ podman_start()
|
||||
for try in $(seq $retries); do
|
||||
learner_node=$(attribute_learner_node get)
|
||||
if [ "$NODENAME" != "$learner_node" ]; then
|
||||
- ocf_log info "$learner_node is not in the member list yet. Retry in $poll_interval_sec seconds."
|
||||
+ ocf_log info "$NODENAME is not in the member list yet. Retry in $poll_interval_sec seconds."
|
||||
sleep $poll_interval_sec
|
||||
continue
|
||||
fi
|
||||
@@ -1673,6 +1708,8 @@ podman_stop()
|
||||
{
|
||||
local timeout=60
|
||||
local rc
|
||||
+
|
||||
+ ocf_log notice "podman-etcd stop"
|
||||
podman_simple_status
|
||||
if [ $? -eq $OCF_NOT_RUNNING ]; then
|
||||
ocf_log info "could not leave members list: etcd container not running"
|
||||
@ -0,0 +1,36 @@
|
||||
From 1e546b85010e5fdbf7a0f31207dce144c14c50ec Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Wed, 29 Oct 2025 15:17:30 +0100
|
||||
Subject: [PATCH] MailTo: add s-nail support for multiple recipients
|
||||
|
||||
---
|
||||
heartbeat/MailTo | 16 ++++++++++------
|
||||
1 file changed, 10 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/MailTo b/heartbeat/MailTo
|
||||
index 56940bafaa..a3ee6a04c8 100755
|
||||
--- a/heartbeat/MailTo
|
||||
+++ b/heartbeat/MailTo
|
||||
@@ -92,12 +92,16 @@ END
|
||||
}
|
||||
|
||||
MailProgram() {
|
||||
- $MAILCMD -s "$1" "$email" <<EOF
|
||||
- $Subject
|
||||
-
|
||||
- Command line was:
|
||||
- $ARGS
|
||||
-EOF
|
||||
+ local body="\
|
||||
+$Subject
|
||||
+
|
||||
+Command line was:
|
||||
+$ARGS"
|
||||
+ if $MAILCMD -V | grep -q "^s-nail"; then
|
||||
+ printf '%s\n' "$body" | $MAILCMD -s "$1" $(echo $email | sed "s/,\s*/ /g")
|
||||
+ else
|
||||
+ printf '%s\n' "$body" | $MAILCMD -s "$1" "$email" # '%s\n': body is data, not a printf format
|
||||
+ fi
|
||||
return $?
|
||||
}
|
||||
|
||||
@ -0,0 +1,481 @@
|
||||
From dbc0d2647d73bed986bf7208df33f092f56e8523 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Thu, 25 Sep 2025 14:23:20 +0200
|
||||
Subject: [PATCH] db2: use reintegration flag to avoid race condition on
|
||||
cluster reintegration, and removed FAL, as it's no longer needed
|
||||
|
||||
---
|
||||
heartbeat/db2 | 306 ++++++++++++++++++++++++++++++++------------------
|
||||
1 file changed, 197 insertions(+), 109 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/db2 b/heartbeat/db2
|
||||
index fe1d9b892..83020fc70 100755
|
||||
--- a/heartbeat/db2
|
||||
+++ b/heartbeat/db2
|
||||
@@ -37,6 +37,13 @@
|
||||
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
|
||||
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
||||
|
||||
+# Use runuser if available for SELinux.
|
||||
+if [ -x "/sbin/runuser" ]; then
|
||||
+ SU="runuser"
|
||||
+else
|
||||
+ SU="su"
|
||||
+fi
|
||||
+
|
||||
# Parameter defaults
|
||||
|
||||
OCF_RESKEY_instance_default=""
|
||||
@@ -55,11 +62,12 @@ OCF_RESKEY_dbpartitionnum_default="0"
|
||||
: ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}}
|
||||
: ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}}
|
||||
|
||||
+POSIX_UNICODE_LOCALE="C.UTF-8"
|
||||
#######################################################################
|
||||
|
||||
|
||||
db2_usage() {
|
||||
- echo "db2 start|stop|monitor|promote|demote|notify|validate-all|meta-data"
|
||||
+ echo "db2 start|stop|monitor|promote|demote|validate-all|meta-data"
|
||||
}
|
||||
|
||||
db2_meta_data() {
|
||||
@@ -162,7 +170,6 @@ The number of the partition (DBPARTITIONNUM) to be managed.
|
||||
<action name="stop" timeout="120s"/>
|
||||
<action name="promote" timeout="120s"/>
|
||||
<action name="demote" timeout="120s"/>
|
||||
-<action name="notify" timeout="10s"/>
|
||||
<action name="monitor" depth="0" timeout="60s" interval="20s"/>
|
||||
<action name="monitor" depth="0" timeout="60s" role="Promoted" interval="22s"/>
|
||||
<action name="validate-all" timeout="5s"/>
|
||||
@@ -273,7 +280,18 @@ master_score()
|
||||
# Run the given command as db2 instance user
|
||||
#
|
||||
runasdb2() {
|
||||
- su $instance -c ". $db2profile; $*"
|
||||
+ $SU $instance -c ". $db2profile; $*"
|
||||
+}
|
||||
+
|
||||
+#
|
||||
+# Run the given command as db2 instance user using $SU
|
||||
+# We run this function as opposed to runasdb2 whenever we have to issue commands
|
||||
+# that leave processes running on the system, such as db2start
|
||||
+# We do not want these processes to hog the resources as they were run with elevated privileges
|
||||
+#
|
||||
+runasdb2_session() {
|
||||
+ # Overriding the locale set by db2profile with a Unicode locale is required to stay compatible with a Unicode CODEPAGE
|
||||
+ $SU "$instance" -c "ksh -c '. $db2profile; export LC_ALL="$POSIX_UNICODE_LOCALE"; export LANG="$POSIX_UNICODE_LOCALE"; $*'"
|
||||
}
|
||||
|
||||
#
|
||||
@@ -294,48 +312,6 @@ logasdb2() {
|
||||
}
|
||||
|
||||
|
||||
-#
|
||||
-# maintain the fal (first active log) attribute
|
||||
-# db2_fal_attrib DB {set val|get}
|
||||
-#
|
||||
-db2_fal_attrib() {
|
||||
- local db=$1
|
||||
- local attr val rc id node member me
|
||||
-
|
||||
- attr=db2hadr_${instance}_${db}_fal
|
||||
-
|
||||
- case "$2" in
|
||||
- set)
|
||||
- me=$(ocf_local_nodename)
|
||||
-
|
||||
- # loop over all member nodes and set attribute
|
||||
- crm_node -l |
|
||||
- while read id node member
|
||||
- do
|
||||
- [ "$member" = member -a "$node" != "$me" ] || continue
|
||||
- crm_attribute -l forever --node=$node -n $attr -v "$3"
|
||||
- rc=$?
|
||||
- ocf_log info "DB2 instance $instance($db2node/$db: setting attrib for FAL to $FIRST_ACTIVE_LOG @ $node"
|
||||
- [ $rc != 0 ] && break
|
||||
- done
|
||||
- ;;
|
||||
-
|
||||
- get)
|
||||
- crm_attribute -l forever -n $attr -G --quiet 2>&1
|
||||
- rc=$?
|
||||
- if ! ocf_is_true "$OCF_RESKEY_CRM_meta_notify" && [ $rc != 0 ]
|
||||
- then
|
||||
- ocf_log warn "DB2 instance $instance($db2node/$db: can't retrieve attribute $attr, are you sure notifications are enabled ?"
|
||||
- fi
|
||||
- ;;
|
||||
-
|
||||
- *)
|
||||
- exit $OCF_ERR_CONFIGURED
|
||||
- esac
|
||||
-
|
||||
- return $rc
|
||||
-}
|
||||
-
|
||||
#
|
||||
# unfortunately a first connect after a crash may need several minutes
|
||||
# for some internal cleanup stuff in DB2.
|
||||
@@ -429,6 +405,42 @@ db2_check_config_compatibility() {
|
||||
|
||||
}
|
||||
|
||||
+#
|
||||
+# Start HADR as standby.
|
||||
+#
|
||||
+# Parameters
|
||||
+# 1 - Database name
|
||||
+# 2 - Calling function
|
||||
+# 3 - Calling function's line number
|
||||
+# Return codes:
|
||||
+# 0 - Start as standby successful
|
||||
+# 1 - Start as standby failed
|
||||
+#
|
||||
+reintegrateAsStandby() {
|
||||
+ db=$1
|
||||
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: reintegrateAsStandby called by $2 at $3. Attempting to reintegrate $db as standby."
|
||||
+ if output=$(runasdb2_session "db2 start hadr on db $db as standby"); then
|
||||
+ rc=0
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated"
|
||||
+ else
|
||||
+ case $output in
|
||||
+ SQL1777N*)
|
||||
+ # SQL1777N: HADR is already started in given state.
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: $output"
|
||||
+ rc=0
|
||||
+ ;;
|
||||
+
|
||||
+ *)
|
||||
+ rc=1
|
||||
+ ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. Please reintegrate manually: $output, return with rc=$rc"
|
||||
+ ;;
|
||||
+ esac
|
||||
+ fi
|
||||
+ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever
|
||||
+ return $rc
|
||||
+}
|
||||
+
|
||||
#
|
||||
# Start instance and DB.
|
||||
# Standard mode is through "db2 activate" in order to start in previous
|
||||
@@ -478,6 +490,8 @@ db2_start() {
|
||||
|
||||
for db in $dblist
|
||||
do
|
||||
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
|
||||
+
|
||||
# sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG
|
||||
db2_get_cfg $db || return $?
|
||||
|
||||
@@ -488,20 +502,13 @@ db2_start() {
|
||||
|
||||
if [ $HADR_ROLE = PRIMARY ]
|
||||
then
|
||||
- local master_fal
|
||||
-
|
||||
- # communicate our FAL to other nodes the might start concurrently
|
||||
- db2_fal_attrib $db set $FIRST_ACTIVE_LOG
|
||||
-
|
||||
- # ignore false positive:
|
||||
- # error: Can't use > in [ ]. Escape it or use [[..]]. [SC2073]
|
||||
- # see https://github.com/koalaman/shellcheck/issues/691
|
||||
- # shellcheck disable=SC2073
|
||||
- if master_fal=$(db2_fal_attrib $db get) && [ "$master_fal" '>' $FIRST_ACTIVE_LOG ]
|
||||
- then
|
||||
+ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value'"
|
||||
+ if [ "$cib_value" = "1" ]; then
|
||||
ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary"
|
||||
start_cmd="db2 start hadr on db $db as standby"
|
||||
HADR_ROLE=STANDBY
|
||||
+ standby_reintegration=1
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -511,27 +518,65 @@ db2_start() {
|
||||
[ $HADR_ROLE != STANDBY ] && db2_run_connect $db &
|
||||
else
|
||||
case $output in
|
||||
- SQL1490W*|SQL1494W*|SQL1497W*|SQL1777N*)
|
||||
- ocf_log info "DB2 database $instance($db2node)/$db already activated: $output"
|
||||
+ SQL1490W* | SQL1494W* | SQL1497W* | SQL1777N*)
|
||||
+ # SQL1490W Activate database is successful, however, the database has already been activated on one or more nodes.
|
||||
+ # SQL1494W Activate database is successful, however, there is already a connection to the database.
|
||||
+ # SQL1497W Activate/Deactivate database was successful, however, an error occurred on some nodes.
|
||||
+ # SQL1777N HADR is already started.
|
||||
+
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is already activated: $output"
|
||||
;;
|
||||
|
||||
- SQL1768N*"Reason code = \"7\""*)
|
||||
- ocf_log err "DB2 database $instance($db2node)/$db is a Primary and the Standby is down"
|
||||
- ocf_log err "Possible split brain ! Manual intervention required."
|
||||
+ SQL1768N*"Reason code = \"7\""*)
|
||||
+ rc="$OCF_ERR_GENERIC"
|
||||
+
|
||||
+ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is a Primary and the Standby is down"
|
||||
+ ocf_log err "Possible split brain! Manual intervention required."
|
||||
ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\""
|
||||
- ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\""
|
||||
+ ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\". db2_start() exit with rc=$rc."
|
||||
|
||||
- # might be the Standby is not yet there
|
||||
- # might be a timing problem because "First active log" is delayed
|
||||
- # on the next start attempt we might succeed when FAL was advanced
|
||||
- # might be manual intervention is required
|
||||
- # ... so let pacemaker give it another try and we will succeed then
|
||||
- return $OCF_ERR_GENERIC
|
||||
+ # let pacemaker give it another try and we will succeed then
|
||||
+ return "$rc"
|
||||
;;
|
||||
|
||||
- *)
|
||||
- ocf_log err "DB2 database $instance($db2node)/$db didn't start: $output"
|
||||
- return $OCF_ERR_GENERIC
|
||||
+ SQL1776N*"Reason code = \"6\""*)
|
||||
+ # SQL1776N The command cannot be issued on an HADR database.
|
||||
+ # Reason code 6:
|
||||
+ # This database is an old primary database. It cannot be started
|
||||
+ # because the standby has become the new primary through forced
|
||||
+ # takeover.
|
||||
+
|
||||
+ rc="$OCF_ERR_GENERIC"
|
||||
+ ocf_log err "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db didn't start: $output, return with rc=$rc"
|
||||
+ ocf_log err "$__OCF_ACTION: $LINENO: This database is an old primary database. Trying start again as standby"
|
||||
+
|
||||
+ start_cmd="db2 start hadr on db $db as standby"
|
||||
+ if output=$(runasdb2_session "$start_cmd"); then
|
||||
+ rc="$OCF_SUCCESS"
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated"
|
||||
+ else
|
||||
+ case $output in
|
||||
+ SQL1777N*)
|
||||
+ # SQL1777N: HADR is already started.
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: $output"
|
||||
+ rc="$OCF_SUCCESS"
|
||||
+ ;;
|
||||
+
|
||||
+ *)
|
||||
+ rc="$OCF_ERR_GENERIC"
|
||||
+ ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. Please reintegrate manually: $output, return with rc=$rc"
|
||||
+ ;;
|
||||
+ esac
|
||||
+ fi
|
||||
+
|
||||
+ return "$rc"
|
||||
+ ;;
|
||||
+
|
||||
+ *)
|
||||
+ rc="$OCF_ERR_GENERIC"
|
||||
+ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database didn't start: $output, db2_start() exit with rc=$rc."
|
||||
+ return "$rc"
|
||||
+ ;;
|
||||
esac
|
||||
fi
|
||||
done
|
||||
@@ -539,6 +584,15 @@ db2_start() {
|
||||
# come here with success
|
||||
# Even if we are a db2 Primary pacemaker requires start to end up in slave mode
|
||||
echo SLAVE > $STATE_FILE
|
||||
+
|
||||
+ # Unset primary failover attribute as host was successfully reintegrated as standby
|
||||
+ if [ "$standby_reintegration" = "1" ]; then
|
||||
+ for db in $dblist; do
|
||||
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
|
||||
+ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever
|
||||
+ done
|
||||
+ fi
|
||||
+
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
@@ -737,7 +791,7 @@ db2_monitor_retry() {
|
||||
|
||||
#
|
||||
# Monitor the db
|
||||
-# And as side effect set crm_master / FAL attribute
|
||||
+# And as side effect set crm_master
|
||||
#
|
||||
db2_monitor() {
|
||||
local CMD output hadr db
|
||||
@@ -754,6 +808,22 @@ db2_monitor() {
|
||||
|
||||
for db in $dblist
|
||||
do
|
||||
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
|
||||
+
|
||||
+ #Check for the reintegration file, then set the flag if it exists and delete the file
|
||||
+ if [ -e "/tmp/$reint_attr" ] && [ -n "$remote_host" ]; then
|
||||
+ #The file exists, try to set the reintegration attribute
|
||||
+ crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
|
||||
+ cib_value=$(crm_attribute -n "$reint_attr" -N "$remote_host" -G | awk -v FS=' value=' '{print $2}')
|
||||
+
|
||||
+ if [ "$cib_value" = "1" ]; then
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value', reintegration flag file will now be deleted."
|
||||
+ rm -f "/tmp/$reint_attr"
|
||||
+ else
|
||||
+ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The reintegration flag file exists, but its attribute failed to set."
|
||||
+ fi
|
||||
+ fi
|
||||
+
|
||||
hadr=$(db2_hadr_status $db)
|
||||
rc=$?
|
||||
ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr"
|
||||
@@ -804,6 +874,14 @@ db2_monitor() {
|
||||
;;
|
||||
|
||||
STANDBY/*PEER/*|Standby/*Peer)
|
||||
+ # If db is in standby peer, then it has already reintegrated.
|
||||
+ # If the reintegrate flag is still set, remove it
|
||||
+ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
|
||||
+ if [ "$cib_value" = "1" ]; then
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: Reintegrate flag detected for $db, but it has already reintegrated as standby. Removing reintegration flag."
|
||||
+ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever
|
||||
+ fi
|
||||
+
|
||||
master_score -v 8000 -l reboot
|
||||
;;
|
||||
|
||||
@@ -812,6 +890,34 @@ db2_monitor() {
|
||||
master_score -D -l reboot
|
||||
;;
|
||||
|
||||
+ Down/Off)
|
||||
+ # If db is a deactivated primary and it has a reintegration flag, then reintegrate as standby.
|
||||
+ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
|
||||
+ if [ "$cib_value" = "1" ]; then
|
||||
+ output=$(runasdb2 "db2 get db cfg for $db" | grep 'HADR database role' | awk '{print $5}')
|
||||
+ if [ "PRIMARY" = "$output" ]; then
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Database is deactivated with Primary role and the reintegration flag is set. Role: $output, Reintegration flag: $reint_attr = $cib_value"
|
||||
+ # Reintegrate as the standby database.
|
||||
+ if reintegrateAsStandby "$db" 'db2_monitor' $LINENO; then
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration succeeded."
|
||||
+ # Setting slave state here will cause rc to be OCF_SUCCESS below.
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Echoing SLAVE into $STATE_FILE"
|
||||
+ echo SLAVE >"$STATE_FILE"
|
||||
+ # Update master score to reflect standby state.
|
||||
+ master_score -v 8000 -l reboot
|
||||
+ else
|
||||
+ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration failed."
|
||||
+ return "$OCF_ERR_GENERIC"
|
||||
+ fi
|
||||
+ fi
|
||||
+ else
|
||||
+ rc="$OCF_NOT_RUNNING"
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database has HADR status $hadr."
|
||||
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: db2_monitor() exit with rc=$rc."
|
||||
+ return "$rc"
|
||||
+ fi
|
||||
+ ;;
|
||||
+
|
||||
*)
|
||||
return $OCF_ERR_GENERIC
|
||||
esac
|
||||
@@ -875,8 +981,6 @@ db2_promote() {
|
||||
# update pacemaker's view
|
||||
echo MASTER > $STATE_FILE
|
||||
|
||||
- # turn the log so we rapidly get a new FAL
|
||||
- logasdb2 "db2 archive log for db $db"
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
@@ -914,26 +1018,6 @@ db2_demote() {
|
||||
return $?
|
||||
}
|
||||
|
||||
-#
|
||||
-# handle pre start notification
|
||||
-# We record our first active log on the other nodes.
|
||||
-# If two primaries come up after a crash they can safely determine who is
|
||||
-# the outdated one.
|
||||
-#
|
||||
-db2_notify() {
|
||||
- local node
|
||||
-
|
||||
- # only interested in pre-start
|
||||
- [ $OCF_RESKEY_CRM_meta_notify_type = pre \
|
||||
- -a $OCF_RESKEY_CRM_meta_notify_operation = start ] || return $OCF_SUCCESS
|
||||
-
|
||||
- # gets FIRST_ACTIVE_LOG
|
||||
- db2_get_cfg $dblist || return $?
|
||||
-
|
||||
- db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC
|
||||
- exit $OCF_SUCCESS
|
||||
-}
|
||||
-
|
||||
########
|
||||
# Main #
|
||||
########
|
||||
@@ -947,50 +1031,54 @@ case "$__OCF_ACTION" in
|
||||
db2_usage
|
||||
exit $OCF_SUCCESS
|
||||
;;
|
||||
+esac
|
||||
|
||||
+local_host=$(ocf_local_nodename)
|
||||
+inst1=$(echo "$OCF_RESKEY_instance" | cut -d"," -f1)
|
||||
+inst2=$(echo "$OCF_RESKEY_instance" | cut -d"," -f2)
|
||||
+host1=$(crm_node -l | sort | awk '{print $2;}' | sed -n 1p)
|
||||
+
|
||||
+if [ "$host1" = "$local_host" ]; then
|
||||
+ remote_host=$(crm_node -l | sort | awk '{print $2;}' | sed -n 2p)
|
||||
+else
|
||||
+ remote_host="$host1"
|
||||
+fi
|
||||
+
|
||||
+db2_validate; validate_rc=$?
|
||||
+
|
||||
+case "$__OCF_ACTION" in
|
||||
start)
|
||||
- db2_validate
|
||||
db2_start || exit $?
|
||||
db2_monitor
|
||||
- exit $?
|
||||
;;
|
||||
|
||||
stop)
|
||||
- db2_validate
|
||||
db2_stop
|
||||
- exit $?
|
||||
;;
|
||||
|
||||
promote)
|
||||
- db2_validate
|
||||
db2_promote
|
||||
- exit $?
|
||||
;;
|
||||
|
||||
demote)
|
||||
- db2_validate
|
||||
db2_demote
|
||||
- exit $?
|
||||
;;
|
||||
|
||||
notify)
|
||||
- db2_validate
|
||||
- db2_notify
|
||||
- exit $?
|
||||
+ ocf_log debug "notify-action has been DEPRECATED, and should be removed"
|
||||
;;
|
||||
|
||||
monitor)
|
||||
- db2_validate
|
||||
db2_monitor_retry
|
||||
- exit $?
|
||||
;;
|
||||
|
||||
validate-all)
|
||||
- db2_validate
|
||||
- exit $?
|
||||
+ exit $validate_rc
|
||||
;;
|
||||
|
||||
*)
|
||||
db2_usage
|
||||
exit $OCF_ERR_UNIMPLEMENTED
|
||||
esac
|
||||
+
|
||||
+exit $?
|
||||
@ -0,0 +1,321 @@
|
||||
From a31f15104fc712cd25f8a59d49f1bbcdbbbc5434 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Tue, 30 Sep 2025 11:54:44 +0200
|
||||
Subject: [PATCH 1/2] Refactor(podman-etcd): improve peer checking and
|
||||
leadership loss detection
|
||||
|
||||
The check_peers function is broken up into smaller, more manageable
|
||||
functions. This refactoring separates the logic for detecting a loss of
|
||||
cluster leadership from the logic for managing peer membership.
|
||||
|
||||
The main function is renamed to check_peer as there is only 1 peer to
|
||||
check (it was check_peers).
|
||||
---
|
||||
heartbeat/podman-etcd | 78 +++++++++++++++++++++++++------------------
|
||||
1 file changed, 45 insertions(+), 33 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index f3a6da5e2..3d1e4c520 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -1014,42 +1014,35 @@ get_member_list_json() {
|
||||
podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json
|
||||
}
|
||||
|
||||
-check_peers()
|
||||
+detect_cluster_leadership_loss()
|
||||
{
|
||||
- # Check peers endpoint status and locally accessible member list
|
||||
- local member_list_json
|
||||
-
|
||||
- if ! container_exists; then
|
||||
- # we need a running container to execute etcdctl.
|
||||
- return $OCF_SUCCESS
|
||||
+ endpoint_status_json=$(get_endpoint_status_json)
|
||||
+ ocf_log info "endpoint status: $endpoint_status_json"
|
||||
+
|
||||
+ count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
|
||||
+ if [ "$count_endpoints" -eq 1 ]; then
|
||||
+ ocf_log info "one endpoint only: checking status errors"
|
||||
+ endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors")
|
||||
+ if echo "$endpoint_status_errors" | grep -q "no leader"; then
|
||||
+ set_force_new_cluster
|
||||
+ set_standalone_node
|
||||
+ ocf_exit_reason "$NODENAME must force a new cluster"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ if [ "$endpoint_status_errors" != "null" ]; then
|
||||
+ ocf_log err "unmanaged endpoint status error: $endpoint_status_errors"
|
||||
+ fi
|
||||
fi
|
||||
|
||||
- member_list_json=$(get_member_list_json)
|
||||
- rc=$?
|
||||
- ocf_log debug "member list: $member_list_json"
|
||||
- if [ $rc -ne 0 ]; then
|
||||
- ocf_log info "podman failed to get member list, error code: $rc"
|
||||
-
|
||||
- endpoint_status_json=$(get_endpoint_status_json)
|
||||
- ocf_log info "endpoint status: $endpoint_status_json"
|
||||
-
|
||||
- count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
|
||||
- if [ "$count_endpoints" -eq 1 ]; then
|
||||
- ocf_log info "one endpoint only: checking status errors"
|
||||
- endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors")
|
||||
- if echo "$endpoint_status_errors" | grep -q "no leader"; then
|
||||
- set_force_new_cluster
|
||||
- set_standalone_node
|
||||
- ocf_exit_reason "$NODENAME must force a new cluster"
|
||||
- return $OCF_ERR_GENERIC
|
||||
- fi
|
||||
- if [ "$endpoint_status_errors" != "null" ]; then
|
||||
- ocf_log err "unmanaged endpoint status error: $endpoint_status_errors"
|
||||
- fi
|
||||
- fi
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
|
||||
- return $OCF_SUCCESS
|
||||
- fi
|
||||
+manage_peer_membership()
|
||||
+{
|
||||
+ # Read etcd member list to detect the status of the peer member.
|
||||
+ # If the peer is missing from the member list, it will be added back as learner
|
||||
+ # If the peer is back in the member list, we ensure that the related CIB attributes (standalone and learner_node) are reset
|
||||
+ local member_list_json="$1"
|
||||
|
||||
# Example of .members[] instance fields in member list json format:
|
||||
# NOTE that "name" is present in voting members only, while "isLearner" in learner members only
|
||||
@@ -1083,6 +1076,25 @@ check_peers()
|
||||
clear_standalone_and_learner_if_not_learners "$member_list_json"
|
||||
fi
|
||||
done
|
||||
+}
|
||||
+
|
||||
+check_peer()
|
||||
+{
|
||||
+ # Check peers endpoint status and locally accessible member list
|
||||
+ local member_list_json
|
||||
+
|
||||
+ # we need a running container to execute etcdctl.
|
||||
+ if ! container_exists; then
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ member_list_json=$(get_member_list_json) || { # '||' (not 'if !') keeps the real exit status in $?
|
||||
+ ocf_log info "podman failed to get member list, error code: $?"
|
||||
+ detect_cluster_leadership_loss
|
||||
+ return $?
|
||||
+ }
|
||||
+
|
||||
+ manage_peer_membership "$member_list_json"
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
@@ -1124,7 +1136,7 @@ podman_monitor()
|
||||
# monitor operation to fail.
|
||||
# TODO: move this inside check_peers where we already query member list json
|
||||
attribute_node_member_id update
|
||||
- if ! check_peers; then
|
||||
+ if ! check_peer; then
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
|
||||
From de7c73a933cefb8f7b9e810bd23c3d12f6d6f29a Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Tue, 30 Sep 2025 18:38:06 +0200
|
||||
Subject: [PATCH 2/2] OCPBUGS-42808: podman-etcd: add automatic learner member
|
||||
promotion
|
||||
|
||||
Automatically promote etcd learner members to voting members when detected.
|
||||
Includes refactored member management functions and improved validation.
|
||||
---
|
||||
heartbeat/podman-etcd | 108 ++++++++++++++++++++++++++++++------------
|
||||
1 file changed, 79 insertions(+), 29 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 3d1e4c520..e1425ec02 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -712,6 +712,22 @@ attribute_node_revision_peer()
|
||||
crm_attribute --query --type nodes --node "$nodename" --name "revision" | awk -F"value=" '{print $2}'
|
||||
}
|
||||
|
||||
+# Converts a decimal number to hexadecimal format with validation
|
||||
+# Args: $1 - decimal number (must be a positive integer: zero, signs and leading zeros are rejected)
|
||||
+# Returns: 0 on success, OCF_ERR_GENERIC on invalid input
|
||||
+# Outputs: hexadecimal representation to stdout
|
||||
+decimal_to_hex() {
|
||||
+ local dec=$1
|
||||
+
|
||||
+ if ! echo "$dec" | grep -q "^[1-9][0-9]*$"; then
|
||||
+ ocf_log err "Invalid member ID format: '$dec' (expected decimal number)"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
+ printf "%x" "$dec"
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
+
|
||||
attribute_node_member_id()
|
||||
{
|
||||
local action="$1"
|
||||
@@ -737,7 +753,7 @@ attribute_node_member_id()
|
||||
return "$rc"
|
||||
fi
|
||||
|
||||
- local value
|
||||
+ local value value_hex
|
||||
if ! value=$(echo -n "$member_list_json" | jq -r ".header.member_id"); then
|
||||
rc=$?
|
||||
ocf_log err "could not get $attribute from member list JSON, error code: $rc"
|
||||
@@ -745,8 +761,11 @@ attribute_node_member_id()
|
||||
fi
|
||||
|
||||
# JSON member_id is decimal, while etcdctl command needs the hex version
|
||||
- value=$(printf "%x" "$value")
|
||||
- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
|
||||
+ if ! value_hex=$(decimal_to_hex "$value"); then
|
||||
+ ocf_log err "could not convert decimal member_id '$value' to hex, error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"; then
|
||||
rc=$?
|
||||
ocf_log err "could not update etcd $attribute, error code: $rc"
|
||||
return "$rc"
|
||||
@@ -905,42 +924,70 @@ clear_standalone_node()
|
||||
crm_attribute --name "standalone_node" --delete
|
||||
}
|
||||
|
||||
-clear_standalone_and_learner_if_not_learners()
|
||||
+
|
||||
+# Promotes an etcd learner member to a voting member
|
||||
+# Args: $1 - learner member ID in decimal format
|
||||
+# Returns: OCF_SUCCESS (even on expected promotion failures), OCF_ERR_GENERIC on conversion errors
|
||||
+# Note: Promotion failures are expected and logged as info (peer may not be up-to-date)
|
||||
+promote_learner_member()
|
||||
+{
|
||||
+ local learner_member_id=$1
|
||||
+
|
||||
+ # JSON member_id is decimal, while etcdctl command needs the hex version
|
||||
+ if ! learner_member_id_hex=$(decimal_to_hex "$learner_member_id"); then
|
||||
+ ocf_log err "could not convert decimal member_id '$learner_member_id' to hex, error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then
|
||||
+ # promotion is expected to fail if the peer is not yet up-to-date
|
||||
+ ocf_log info "could not promote member $learner_member_id_hex, error code: $?"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+ ocf_log info "successfully promoted member '$learner_member_id_hex'"
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
+
|
||||
+# Reconciles etcd cluster member states
|
||||
+# Promotes learner members or clears standalone/learner attributes as needed
|
||||
+# Args: $1 - member list JSON from etcdctl
|
||||
+# Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors
|
||||
+# Note: Only operates when exactly 2 started members are present
|
||||
+reconcile_member_state()
|
||||
{
|
||||
local rc
|
||||
local member_list_json="$1"
|
||||
|
||||
- number_of_members=$(printf "%s" "$member_list_json" | jq -r ".members[].ID" | wc -l)
|
||||
- if [ "$number_of_members" -ne 2 ]; then
|
||||
- ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_members members, need 2"
|
||||
+ # count only the started members, which have the ".name" JSON field
|
||||
+ number_of_started_members=$(printf "%s" "$member_list_json" | jq -r ".members[].name | select(. != null)" | wc -l)
|
||||
+ if [ "$number_of_started_members" -ne 2 ]; then
|
||||
+ ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_started_members members, need 2"
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
- id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
|
||||
+ learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
ocf_log err "could not get isLearner field from member list, error code: $rc"
|
||||
return $rc
|
||||
fi
|
||||
|
||||
- if [ -z "$id" ]; then
|
||||
- clear_standalone_node
|
||||
- rc=$?
|
||||
- if [ $rc -ne 0 ]; then
|
||||
- ocf_og error "could not clear standalone_node attribute, error code: $rc"
|
||||
- return $rc
|
||||
- fi
|
||||
+ if [ -n "$learner_member_id" ]; then
|
||||
+ promote_learner_member "$learner_member_id"
|
||||
+ return $?
|
||||
fi
|
||||
- if [ -z "$id" ]; then
|
||||
- attribute_learner_node clear
|
||||
- rc=$?
|
||||
- if [ $rc -ne 0 ]; then
|
||||
- ocf_og error "could not clear learner_node attribute, error code: $rc"
|
||||
- return $rc
|
||||
+
|
||||
+ if [ -z "$learner_member_id" ]; then
|
||||
+ if ! clear_standalone_node; then
|
||||
+ ocf_log error "could not clear standalone_node attribute, error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ if ! attribute_learner_node clear; then
|
||||
+ ocf_log error "could not clear learner_node attribute, error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
fi
|
||||
fi
|
||||
|
||||
- return $rc
|
||||
+ return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
attribute_learner_node()
|
||||
@@ -1019,7 +1066,7 @@ detect_cluster_leadership_loss()
|
||||
endpoint_status_json=$(get_endpoint_status_json)
|
||||
ocf_log info "endpoint status: $endpoint_status_json"
|
||||
|
||||
- count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
|
||||
+ count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
|
||||
if [ "$count_endpoints" -eq 1 ]; then
|
||||
ocf_log info "one endpoint only: checking status errors"
|
||||
endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors")
|
||||
@@ -1037,11 +1084,14 @@ detect_cluster_leadership_loss()
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
+
|
||||
+# Manages etcd peer membership by detecting and handling missing or rejoining peers
|
||||
+# Adds missing peers as learners and reconciles member states when peers rejoin
|
||||
+# Args: $1 - member list JSON from etcdctl
|
||||
+# Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors
|
||||
+# Note: Iterates through all peer nodes to ensure proper cluster membership
|
||||
manage_peer_membership()
|
||||
{
|
||||
- # Read etcd member list to detect the status of the peer member.
|
||||
- # If the peer is missing from the member list, it will be added back as learner
|
||||
- # If the peer is back in the member list, we ensure that the related CIB attributes (standalone and learner_node) are reset
|
||||
local member_list_json="$1"
|
||||
|
||||
# Example of .members[] instance fields in member list json format:
|
||||
@@ -1066,14 +1116,14 @@ manage_peer_membership()
|
||||
|
||||
# Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name.
|
||||
ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
|
||||
- id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
|
||||
- if [ -z "$id" ]; then
|
||||
+ peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
|
||||
+ if [ -z "$peer_member_id" ]; then
|
||||
ocf_log info "$name is not in the members list"
|
||||
add_member_as_learner "$name" "$ip"
|
||||
set_standalone_node
|
||||
else
|
||||
ocf_log debug "$name is in the members list by IP: $ip"
|
||||
- clear_standalone_and_learner_if_not_learners "$member_list_json"
|
||||
+ reconcile_member_state "$member_list_json"
|
||||
fi
|
||||
done
|
||||
}
|
||||
135
SOURCES/RHEL-121986-Filesystem-speed-up-get-PIDs.patch
Normal file
135
SOURCES/RHEL-121986-Filesystem-speed-up-get-PIDs.patch
Normal file
@ -0,0 +1,135 @@
|
||||
From 93729d83fa5bf15f4ec694e08e9777bde858fb41 Mon Sep 17 00:00:00 2001
|
||||
From: Lars Ellenberg <lars.ellenberg@linbit.com>
|
||||
Date: Thu, 16 Oct 2025 10:58:37 +0200
|
||||
Subject: [PATCH 1/2] Filesystem: speed up get_pids
|
||||
|
||||
With force_umount=safe, we "manually" scan the /proc/ file system.
|
||||
|
||||
We look for symlinks pointing into the path we are interested in.
|
||||
Specifically, we are interested in
|
||||
/proc/<pid>/{root,exe,cwd}
|
||||
/proc/<pid>/fd/<fd>
|
||||
We also look for relevant memory mappings in /proc/<pid>/maps
|
||||
|
||||
All these are per process, not per "task" or "thread".
|
||||
see procfs(5) and pthreads(7).
|
||||
Still, we currently also scan /proc/<pid>/task/<tid>/
|
||||
for all the same things.
|
||||
|
||||
With a large system with many heavily threaded processes,
|
||||
this can significantly slow down this scanning,
|
||||
without gaining new information.
|
||||
|
||||
Adding -maxdepth to the find command line avoids this useless work,
|
||||
potentially reducing the scanning time by orders of magnitude
|
||||
on systems with many heavily threaded processes.
|
||||
|
||||
We could also write a dedicated helper in C to do the very same thing,
|
||||
with the option to "short circuit" and proceed with the next pid
|
||||
as soon as the first "match" is found for the currently inspected pid.
|
||||
|
||||
That could further reduce the scanning time
|
||||
by about an additional factor of 10.
|
||||
---
|
||||
heartbeat/Filesystem | 25 +++++++++++++++++++++----
|
||||
1 file changed, 21 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index 6d3960162..f76339fd6 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -680,14 +680,31 @@ get_pids()
|
||||
# -path "/proc/[!0-9]*" -prune -o ...
|
||||
# -path "/proc/[0-9]*" -a ...
|
||||
# the latter seemd to be significantly faster for this one in my naive test.
|
||||
+
|
||||
+ # root, cwd, exe, maps, fd: all per process, not per task ("thread").
|
||||
+ # -maxdepth to avoid repeatedly scanning the same thing
|
||||
+ # for all threads of a heavily threaded process.
|
||||
+ #
|
||||
+ # Adding -maxdepth reduced scanning from > 16 seconds to < 2 seconds
|
||||
+ # on a mostly idle system that happened to run a few java processes.
|
||||
+ #
|
||||
+ # We can also add a dedicated helper in C to do what is done below,
|
||||
+ # which would reduce the scanning time by an
|
||||
+ # additional factor of 10 again.
|
||||
+ #
|
||||
+ # Or trust that fuser (above) learned something in the last 15 years
|
||||
+ # and avoids blocking operations meanwhile?
|
||||
procs=$(exec 2>/dev/null;
|
||||
- find /proc -path "/proc/[0-9]*" -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
|
||||
+ find /proc -mindepth 1 -maxdepth 3 \
|
||||
+ -path "/proc/[0-9]*" \
|
||||
+ -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
|
||||
awk -F/ '{print $3}' | uniq)
|
||||
|
||||
- # This finds both /proc/<pid>/maps and /proc/<pid>/task/<tid>/maps;
|
||||
- # if you don't want the latter, add -maxdepth.
|
||||
+ # memory mappings are also per process, not per task.
|
||||
+ # This finds only /proc/<pid>/maps, and not /proc/<pid>/task/<tid>/maps;
|
||||
+ # if you also want the latter, drop -maxdepth.
|
||||
mmap_procs=$(exec 2>/dev/null;
|
||||
- find /proc -path "/proc/[0-9]*/maps" -print |
|
||||
+ find /proc -mindepth 2 -maxdepth 2 -path "/proc/[0-9]*/maps" -print |
|
||||
xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq)
|
||||
printf "${procs}\n${mmap_procs}" | sort -u
|
||||
fi
|
||||
|
||||
From 3d34db0c60a125126361b45ff8303358b6275298 Mon Sep 17 00:00:00 2001
|
||||
From: Lars Ellenberg <lars.ellenberg@linbit.com>
|
||||
Date: Thu, 16 Oct 2025 11:31:00 +0200
|
||||
Subject: [PATCH 2/2] Filesystem: further speed up get_pids
|
||||
|
||||
If we have /proc/<pid>/map_files/* symlinks,
|
||||
we don't need to additionally grep /proc/<pid>/maps.
|
||||
|
||||
Also don't first collect output of commands into variables
|
||||
just to pipe them to sort -u later,
|
||||
just pipe the output of the commands through sort -u directly.
|
||||
---
|
||||
heartbeat/Filesystem | 31 +++++++++++++++++++------------
|
||||
1 file changed, 19 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index f76339fd6..7021f13da 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -694,19 +694,26 @@ get_pids()
|
||||
#
|
||||
# Or trust that fuser (above) learned something in the last 15 years
|
||||
# and avoids blocking operations meanwhile?
|
||||
- procs=$(exec 2>/dev/null;
|
||||
- find /proc -mindepth 1 -maxdepth 3 \
|
||||
- -path "/proc/[0-9]*" \
|
||||
- -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
|
||||
- awk -F/ '{print $3}' | uniq)
|
||||
-
|
||||
- # memory mappings are also per process, not per task.
|
||||
- # This finds only /proc/<pid>/maps, and not /proc/<pid>/task/<tid>/maps;
|
||||
- # if you also want the latter, drop -maxdepth.
|
||||
- mmap_procs=$(exec 2>/dev/null;
|
||||
+ (
|
||||
+ # If you want to debug this, drop this redirection.
|
||||
+ # But it produces too much "No such file" noise for kernel
|
||||
+ # threads or due to races with exiting processes or closing fds.
|
||||
+ exec 2>/dev/null;
|
||||
+ find /proc -mindepth 1 -maxdepth 3 \
|
||||
+ -path "/proc/[0-9]*" \
|
||||
+ -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
|
||||
+ awk -F/ '{print $3}' | uniq
|
||||
+
|
||||
+ # If we have "map_files/", "find" above already found the
|
||||
+ # relevant symlinks, and we don't need to grep "maps" below.
|
||||
+ # Available since kernel 3.3, respectively 4.3.
|
||||
+ test -d /proc/$$/map_files ||
|
||||
+ # memory mappings are also per process, not per task.
|
||||
+ # This finds only /proc/<pid>/maps, and not /proc/<pid>/task/<tid>/maps;
|
||||
+ # if you also want the latter, drop -maxdepth.
|
||||
find /proc -mindepth 2 -maxdepth 2 -path "/proc/[0-9]*/maps" -print |
|
||||
- xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq)
|
||||
- printf "${procs}\n${mmap_procs}" | sort -u
|
||||
+ xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq
|
||||
+ ) | sort -u
|
||||
fi
|
||||
}
|
||||
|
||||
166
SOURCES/RHEL-123887-podman-etcd-certificate-rotation.patch
Normal file
166
SOURCES/RHEL-123887-podman-etcd-certificate-rotation.patch
Normal file
@ -0,0 +1,166 @@
|
||||
From 6bfbe1dc3a0dad234decd77330ca6189e932bb89 Mon Sep 17 00:00:00 2001
|
||||
From: ehila <ehila@redhat.com>
|
||||
Date: Thu, 16 Oct 2025 23:39:32 -0400
|
||||
Subject: [PATCH] feat: add support for podman-etcd cert rotation
|
||||
|
||||
added a cert check function to the monitor call to force a restart of etcd when the certs have been changed
|
||||
|
||||
Signed-off-by: ehila <ehila@redhat.com>
|
||||
---
|
||||
heartbeat/podman-etcd | 87 ++++++++++++++++++++++++++++++++++++++++++-
|
||||
1 file changed, 86 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index e1425ec02..b8dfb2f9e 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -40,6 +40,7 @@
|
||||
# Parameter defaults
|
||||
OCF_RESKEY_image_default="default"
|
||||
OCF_RESKEY_pod_manifest_default="/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml"
|
||||
+OCF_RESKEY_etcd_certs_dir_default="/etc/kubernetes/static-pod-resources/etcd-certs"
|
||||
OCF_RESKEY_name_default="etcd"
|
||||
OCF_RESKEY_nic_default="br-ex"
|
||||
OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
|
||||
@@ -51,6 +52,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
|
||||
|
||||
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
|
||||
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
|
||||
+: ${OCF_RESKEY_etcd_certs_dir=${OCF_RESKEY_etcd_certs_dir_default}}
|
||||
: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}}
|
||||
: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}}
|
||||
: ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}}
|
||||
@@ -88,6 +90,15 @@ The Pod manifest with the configuration for Etcd.
|
||||
<content type="string" default="${OCF_RESKEY_pod_manifest_default}"/>
|
||||
</parameter>
|
||||
|
||||
+<parameter name="etcd_certs_dir" required="0" unique="0">
|
||||
+<longdesc lang="en">
|
||||
+The Etcd certificates directory mounted into the etcd container.
|
||||
+The agent will monitor this directory for changes and restart the etcd container if the certificates have changed.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Etcd certificates directory</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_etcd_certs_dir_default}"/>
|
||||
+</parameter>
|
||||
+
|
||||
<parameter name="image" required="0" unique="0">
|
||||
<longdesc lang="en">
|
||||
The podman image to base this container off of.
|
||||
@@ -289,6 +300,59 @@ Expects to have a fully populated OCF RA-compliant environment set.
|
||||
END
|
||||
}
|
||||
|
||||
+etcd_certificates_hash_manager()
|
||||
+{
|
||||
+ local action="$1"
|
||||
+ local current_hash
|
||||
+ local stored_hash
|
||||
+
|
||||
+ # If the certs directory doesn't exist, consider it unchanged
|
||||
+ if [ ! -d "$OCF_RESKEY_etcd_certs_dir" ]; then
|
||||
+ ocf_log warn "certificates directory $OCF_RESKEY_etcd_certs_dir does not exist, skipping certificate monitoring"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ # Calculate hash of all certificate files, ignore key files to avoid accidental disclosure of sensitive information
|
||||
+ # we only need to monitor the certificate files to detect changes.
|
||||
+ if ! current_hash=$(find "$OCF_RESKEY_etcd_certs_dir" -type f \( -name "*.crt" \) -exec sha256sum {} \; | sort | sha256sum | cut -d' ' -f1); then
|
||||
+ ocf_log err "failed to calculate certificate files hash"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
+ # If no stored hash exists, create one and return success
|
||||
+ if [ ! -f "$ETCD_CERTS_HASH_FILE" ]; then
|
||||
+ echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"
|
||||
+ ocf_log info "created initial certificate hash: $current_hash"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ case "$action" in
|
||||
+ "update")
|
||||
+ if ! echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"; then
|
||||
+ ocf_log err "failed to update certificate hash file $ETCD_CERTS_HASH_FILE"
|
||||
+ fi
|
||||
+ ocf_log info "updated certificate hash: $current_hash"
|
||||
+ ;;
|
||||
+ "check")
|
||||
+ if ! stored_hash=$(cat "$ETCD_CERTS_HASH_FILE"); then
|
||||
+ ocf_log err "failed to read stored certificate hash from $ETCD_CERTS_HASH_FILE"
|
||||
+ # This should not happen but if for some reason we can not read the stored hash,
|
||||
+ # use the current hash and log the error but allow etcd to run as long as possible.
|
||||
+ stored_hash="$current_hash"
|
||||
+ fi
|
||||
+ if [ "$current_hash" != "$stored_hash" ]; then
|
||||
+ ocf_exit_reason "$NODENAME etcd certificate files have changed (stored: $stored_hash, current: $current_hash)"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ ;;
|
||||
+ *)
|
||||
+ ocf_log err "unsupported action: $action"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ ;;
|
||||
+ esac
|
||||
+
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
|
||||
monitor_cmd_exec()
|
||||
{
|
||||
@@ -357,7 +421,7 @@ archive_current_container()
|
||||
|
||||
# archive corresponding etcd configuration files
|
||||
local files_to_archive=""
|
||||
- for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do
|
||||
+ for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE" "$ETCD_CERTS_HASH_FILE"; do
|
||||
if [ -f "$file" ]; then
|
||||
files_to_archive="$files_to_archive $file"
|
||||
else
|
||||
@@ -1178,6 +1242,11 @@ podman_monitor()
|
||||
return $rc
|
||||
fi
|
||||
|
||||
+ # Check if certificate files have changed, if they have, etcd needs to be restarted
|
||||
+ if ! etcd_certificates_hash_manager "check"; then
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
if is_learner; then
|
||||
ocf_log info "$NODENAME is learner. Cannot get member id"
|
||||
return "$OCF_SUCCESS"
|
||||
@@ -1483,6 +1552,14 @@ podman_start()
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
+ # Update the certificate hash after the container has started successfully
|
||||
+ # this is to ensure that the certificate hash is updated after a restart is initiated
|
||||
+ # by a cert rotation event from the monitor command.
|
||||
+ if ! etcd_certificates_hash_manager "update"; then
|
||||
+ ocf_exit_reason "etcd certificate hash manager failed to update the certificate hash"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
# check if the container has already started
|
||||
podman_simple_status
|
||||
if [ $? -eq $OCF_SUCCESS ]; then
|
||||
@@ -1888,6 +1965,13 @@ podman_validate()
|
||||
exit $OCF_ERR_CONFIGURED
|
||||
fi
|
||||
|
||||
+ if ! echo "validation test" > "$ETCD_CERTS_HASH_FILE" \
|
||||
+ || ! cat "$ETCD_CERTS_HASH_FILE" >/dev/null 2>&1 \
|
||||
+ || ! rm "$ETCD_CERTS_HASH_FILE"; then
|
||||
+ ocf_exit_reason "cannot read/write to certificate hash file $ETCD_CERTS_HASH_FILE"
|
||||
+ exit $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
@@ -1922,6 +2006,7 @@ CONTAINER=$OCF_RESKEY_name
|
||||
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
|
||||
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
|
||||
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
|
||||
+ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
|
||||
|
||||
# Note: we currently monitor podman containers by with the "podman exec"
|
||||
# command, so make sure that invocation is always valid by enforcing the
|
||||
@ -0,0 +1,115 @@
|
||||
From 6a5608f02a657cf006b6d44d31200342c4bd19b9 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Tue, 28 Oct 2025 12:47:10 +0100
|
||||
Subject: [PATCH] podman-etcd: compute dynamic revision bump from maxRaftIndex
|
||||
(#2087)
|
||||
|
||||
Replace hardcoded 1 billion revision bump with dynamic calculation based
|
||||
on 20% of the last known maxRaftIndex from revision.json.
|
||||
|
||||
This aligns with the logic used by cluster-etcd-operator's
|
||||
quorum-restore-pod utility and ensures the bump amount is proportional
|
||||
to the cluster's actual revision state.
|
||||
|
||||
The implementation:
|
||||
- Adds compute_bump_revision() function with safe fallback to 1bn
|
||||
default
|
||||
- Extracts magic values to named constants
|
||||
(ETCD_REVISION_BUMP_PERCENTAGE, ETCD_BUMP_REV_DEFAULT,
|
||||
ETCD_REVISION_JSON)
|
||||
- Validates computed values (non-zero, not exceeding default)
|
||||
- Logs computation results for debugging
|
||||
|
||||
Reference:
|
||||
https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da9166
|
||||
22c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
|
||||
---
|
||||
heartbeat/podman-etcd | 38 ++++++++++++++++++++++++++++++++++----
|
||||
1 file changed, 34 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index b8dfb2f9e..551d37a20 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -619,16 +619,43 @@ prepare_env() {
|
||||
LISTEN_METRICS_URLS="0.0.0.0"
|
||||
}
|
||||
|
||||
+compute_bump_revision() {
|
||||
+ # Same logic used by cluster-etcd-operator quorum-restore-pod utility.
|
||||
+ # see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
|
||||
+ # set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days.
|
||||
+ BUMP_REV=$ETCD_BUMP_REV_DEFAULT
|
||||
+ if [ ! -f "${ETCD_REVISION_JSON}" ]; then
|
||||
+ ocf_log err "could not compute bump revision: ${ETCD_REVISION_JSON} not found. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ # this will bump by the amount of 20% of the last known live revision.
|
||||
+ if ! COMPUTED_BUMP=$(jq -r "(.maxRaftIndex*${ETCD_REVISION_BUMP_PERCENTAGE}|floor)" "${ETCD_REVISION_JSON}"); then
|
||||
+ ocf_log err "could not compute maxRaftIndex for bump revision, jq error code: $?. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ if [ -z "${COMPUTED_BUMP}" ] || [ "${COMPUTED_BUMP}" -le 0 ] || [ "${COMPUTED_BUMP}" -gt "${ETCD_BUMP_REV_DEFAULT}" ]; then
|
||||
+ ocf_log err "computed bump revision (${COMPUTED_BUMP}) is invalid. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ BUMP_REV="${COMPUTED_BUMP}"
|
||||
+ ocf_log info "bumping etcd revisions by ${BUMP_REV}"
|
||||
+}
|
||||
|
||||
generate_etcd_configuration() {
|
||||
if is_force_new_cluster; then
|
||||
+ compute_bump_revision
|
||||
# The embedded newline is required for correct YAML formatting.
|
||||
FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true
|
||||
-force-new-cluster-bump-amount: 1000000000"
|
||||
+force-new-cluster-bump-amount: $BUMP_REV"
|
||||
else
|
||||
FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false"
|
||||
fi
|
||||
|
||||
+ # the space indentation for client-transport-security and peer-transport-security
|
||||
+ # is required for correct YAML formatting.
|
||||
cat > "$ETCD_CONFIGURATION_FILE" << EOF
|
||||
logger: zap
|
||||
log-level: info
|
||||
@@ -707,7 +734,7 @@ attribute_node_cluster_id()
|
||||
{
|
||||
local action="$1"
|
||||
local value
|
||||
- if ! value=$(jq -r ".clusterId" /var/lib/etcd/revision.json); then
|
||||
+ if ! value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON"); then
|
||||
rc=$?
|
||||
ocf_log err "could not get cluster_id, error code: $rc"
|
||||
return "$rc"
|
||||
@@ -745,7 +772,7 @@ attribute_node_revision()
|
||||
local value
|
||||
local attribute="revision"
|
||||
|
||||
- if ! value=$(jq -r ".maxRaftIndex" /var/lib/etcd/revision.json); then
|
||||
+ if ! value=$(jq -r ".maxRaftIndex" "$ETCD_REVISION_JSON"); then
|
||||
rc=$?
|
||||
ocf_log err "could not get $attribute, error code: $rc"
|
||||
return "$rc"
|
||||
@@ -1456,7 +1483,7 @@ can_reuse_container() {
|
||||
|
||||
|
||||
# If the container does not exist it cannot be reused
|
||||
- if ! container_exists; then
|
||||
+ if ! container_exists; then
|
||||
OCF_RESKEY_reuse=0
|
||||
return "$OCF_SUCCESS"
|
||||
fi
|
||||
@@ -2006,6 +2033,9 @@ CONTAINER=$OCF_RESKEY_name
|
||||
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
|
||||
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
|
||||
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
|
||||
+ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
|
||||
+ETCD_REVISION_BUMP_PERCENTAGE=0.2
|
||||
+ETCD_BUMP_REV_DEFAULT=1000000000
|
||||
ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
|
||||
|
||||
# Note: we currently monitor podman containers by with the "podman exec"
|
||||
@ -0,0 +1,222 @@
|
||||
From e8fb2ad9cc14e91b74b5cde1e012d92afcddb1a5 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Sat, 25 Oct 2025 17:27:42 +0200
|
||||
Subject: [PATCH] podman-etcd: add container crash detection with coordinated
|
||||
recovery
|
||||
|
||||
This change prevents the agent from starting prematurely when the etcd
|
||||
container has failed. Previously, an early start would cause the agent
|
||||
to block while waiting for peer-initiated recovery. This blocking
|
||||
prevented Pacemaker from allowing the surviving agent to stop and
|
||||
properly recover the cluster.
|
||||
|
||||
The change introduces `container_health_check` function to monitor the
|
||||
container's state and catch etcd failures. This check uses a state file
|
||||
to distinguish between a planned shutdown and an unexpected failure:
|
||||
|
||||
* Container Running: The state file is created or updated with the
|
||||
current epoch (timestamp). Returns: "healthy".
|
||||
* Container Not Running + No State File: It's the first check. Returns:
|
||||
"not-running".
|
||||
* Container Not Running + State File: An unexpected failure is detected.
|
||||
* If force_new_cluster is set, the status is: "failed-restart-now".
|
||||
* Otherwise, the status is: "failed-wait-for-peer".
|
||||
|
||||
The state file is written in a temporary directory (HA_RSCTMP) to ensure
|
||||
automatic cleanup on reboot. It is also explicitly removed in
|
||||
`podman_start` and `podman_stop` to mark planned transitions.
|
||||
|
||||
A new helper function `get_time_since_last_heartbeat()` calculates
|
||||
elapsed time since the last healthy check for diagnostic logging.
|
||||
|
||||
Monitor behavior changes:
|
||||
* failed-wait-for-peer: Returns OCF_SUCCESS to keep resource running
|
||||
while waiting for peer-initiated recovery, as the agent is not able
|
||||
to recover the cluster from a failed state.
|
||||
* failed-restart-now: Returns OCF_ERR_GENERIC to trigger restart once
|
||||
peer has set force_new_cluster
|
||||
---
|
||||
heartbeat/podman-etcd | 133 +++++++++++++++++++++++++++++++++++++++---
|
||||
1 file changed, 124 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index b8dfb2f9e..d596c6f2a 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -1226,22 +1226,122 @@ podman_simple_status()
|
||||
return $rc
|
||||
}
|
||||
|
||||
-podman_monitor()
|
||||
+# get_time_since_last_heartbeat returns the time in seconds since the heartbeat file was last updated.
|
||||
+#
|
||||
+# Returns: time in seconds since last heartbeat, or empty string if file doesn't exist
|
||||
+get_time_since_last_heartbeat()
|
||||
{
|
||||
+ local last_heartbeat
|
||||
+
|
||||
+ if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ last_heartbeat=$(cat "$CONTAINER_HEARTBEAT_FILE")
|
||||
+ echo $(($(date +%s) - last_heartbeat))
|
||||
+}
|
||||
+
|
||||
+# container_health_check performs comprehensive health monitoring for the container.
|
||||
+# This function allows coordinated failure handling where the agent waits for
|
||||
+# peer-initiated cluster recovery in case of container failure.
|
||||
+#
|
||||
+# Uses a state file to track container state:
|
||||
+# - Container running: Update state file with current epoch, return "healthy"
|
||||
+# - Container not running + no state file: Return "not-running" (never checked before)
|
||||
+# - Container not running + state file: Failure detected, check force_new_cluster
|
||||
+# - If force_new_cluster set: Return "failed-restart-now"
|
||||
+# - Otherwise: Return "failed-wait-for-peer"
|
||||
+#
|
||||
+# Returns: healthy, not-running, failed-restart-now, failed-wait-for-peer
|
||||
+
|
||||
+container_health_check()
|
||||
+{
|
||||
+ local rc
|
||||
+
|
||||
# We rely on running podman exec to monitor the container
|
||||
# state because that command seems to be less prone to
|
||||
# performance issue under IO load.
|
||||
#
|
||||
# For probes to work, we expect cmd_exec to be able to report
|
||||
- # when a container is not running. Here, we're not interested
|
||||
- # in distinguishing whether it's stopped or non existing
|
||||
- # (there's function container_exists for that)
|
||||
+ # when a container is not running. Here, we're not interested
|
||||
+ # in distinguishing whether it's stopped or non existing
|
||||
+ # (there's function container_exists for that)
|
||||
+ # For monitor, however, we still need to know if it has stopped
|
||||
+ # recently (i.e. a failure), or not (fresh start)
|
||||
monitor_cmd_exec
|
||||
rc=$?
|
||||
- if [ $rc -ne 0 ]; then
|
||||
- return $rc
|
||||
+ if [ "$rc" -eq 0 ]; then
|
||||
+ # Container is running - update state file with current epoch
|
||||
+ local current_epoch
|
||||
+ current_epoch=$(date +%s)
|
||||
+ if ! echo "$current_epoch" > "$CONTAINER_HEARTBEAT_FILE"; then
|
||||
+ ocf_log warn "Failed to update container heartbeat file, error code: $?"
|
||||
+ # wait for peer to detect any real issue with the etcd cluster or wait for the
|
||||
+ # next monitor interval
|
||||
+ echo "failed-wait-for-peer"
|
||||
+ return
|
||||
+ fi
|
||||
+ echo "healthy"
|
||||
+ return
|
||||
fi
|
||||
|
||||
+ # Check if state file exists (was container running on last check?)
|
||||
+ if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
|
||||
+ # No state file - container was never checked before
|
||||
+ ocf_log debug "Container ${CONTAINER} has no previous state"
|
||||
+ echo "not-running"
|
||||
+ # NOTE: this is where the probe is expected to exit, keeping the logic
|
||||
+ # quick and less prone to performance issue under IO load.
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ # State file exists - the container failed, check recovery status in this lifecycle
|
||||
+ local time_since_heartbeat
|
||||
+ time_since_heartbeat=$(get_time_since_last_heartbeat)
|
||||
+ ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
|
||||
+
|
||||
+ # Check if peer has set force_new_cluster for recovery
|
||||
+ local fnc_holders
|
||||
+ if ! fnc_holders=$(get_force_new_cluster); then
|
||||
+ ocf_log err "Could not detect peer-initiated recovery. Checking again in the next monitor cycle"
|
||||
+ echo "failed-wait-for-peer"
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ if [ -n "$fnc_holders" ]; then
|
||||
+ ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart"
|
||||
+ echo "failed-restart-now"
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ echo "failed-wait-for-peer"
|
||||
+}
|
||||
+
|
||||
+podman_monitor()
|
||||
+{
|
||||
+ local container_health_state
|
||||
+
|
||||
+ container_health_state=$(container_health_check)
|
||||
+ case "$container_health_state" in
|
||||
+ healthy)
|
||||
+ # Continue with normal monitoring
|
||||
+ ;;
|
||||
+ not-running)
|
||||
+ return $OCF_NOT_RUNNING
|
||||
+ ;;
|
||||
+ failed-restart-now)
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ ;;
|
||||
+ failed-wait-for-peer)
|
||||
+ # Continue running, waiting for peer recovery
|
||||
+ return $OCF_SUCCESS
|
||||
+ ;;
|
||||
+ *)
|
||||
+ ocf_log err "Unknown health state: $container_health_state"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ ;;
|
||||
+ esac
|
||||
+
|
||||
# Check if certificate files have changed, if they have, etcd needs to be restarted
|
||||
if ! etcd_certificates_hash_manager "check"; then
|
||||
return $OCF_ERR_GENERIC
|
||||
@@ -1533,6 +1633,12 @@ podman_start()
|
||||
local pod_was_running=false
|
||||
|
||||
ocf_log notice "podman-etcd start"
|
||||
+
|
||||
+ # Clear container health check state file
|
||||
+ if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
|
||||
+ ocf_log err "could not delete container health check state file"
|
||||
+ fi
|
||||
+
|
||||
attribute_node_ip update
|
||||
attribute_node_cluster_id update
|
||||
attribute_node_revision update
|
||||
@@ -1849,15 +1955,21 @@ podman_stop()
|
||||
local rc
|
||||
|
||||
ocf_log notice "podman-etcd stop"
|
||||
+
|
||||
+ # Clear container health check state file
|
||||
+ if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
|
||||
+ ocf_log err "could not delete container health check state file"
|
||||
+ fi
|
||||
+
|
||||
+ attribute_node_revision update
|
||||
+ attribute_node_cluster_id update
|
||||
+
|
||||
podman_simple_status
|
||||
if [ $? -eq $OCF_NOT_RUNNING ]; then
|
||||
ocf_log info "could not leave members list: etcd container not running"
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
- attribute_node_revision update
|
||||
- attribute_node_cluster_id update
|
||||
-
|
||||
if ! member_id=$(attribute_node_member_id get); then
|
||||
ocf_log err "error leaving members list: could not get member-id"
|
||||
else
|
||||
@@ -2007,6 +2119,9 @@ POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
|
||||
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
|
||||
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
|
||||
ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
|
||||
+# State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot.
|
||||
+# This is intentional - reboots are controlled stops, not failures requiring detection.
|
||||
+CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
|
||||
|
||||
# Note: we currently monitor podman containers by with the "podman exec"
|
||||
# command, so make sure that invocation is always valid by enforcing the
|
||||
@ -0,0 +1,47 @@
|
||||
From a155018f6d65edf99493804dad99412b50d13e6c Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Wed, 5 Nov 2025 13:48:38 +0100
|
||||
Subject: [PATCH] podman-etcd: fix count of fnc holders in
|
||||
container_health_check
|
||||
|
||||
The variable `fnc_holders` (a list of nodes that have force_new_cluster
|
||||
CIB attribute set) can contain only whitespace. Because of this, the
|
||||
shell's simple `-n` test is not enough to establish if there are no
|
||||
`fnc_holders`.
|
||||
|
||||
Fixed by counting the number of words inside the variable.
|
||||
|
||||
Moreover
|
||||
* Enhanced comment for clarity.
|
||||
* Log level changed to `info`. We want visibility when the monitor
|
||||
detects the peer node is ready for recovery, and this is rare enough
|
||||
not to flood the logs.
|
||||
---
|
||||
heartbeat/podman-etcd | 7 ++++---
|
||||
1 file changed, 4 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 5bdc6d184..7795130a6 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -1366,7 +1366,7 @@ container_health_check()
|
||||
return
|
||||
fi
|
||||
|
||||
- # State file exists - the container failed, check recovery status in this lifecycle
|
||||
+ # Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
|
||||
local time_since_heartbeat
|
||||
time_since_heartbeat=$(get_time_since_last_heartbeat)
|
||||
ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
|
||||
@@ -1379,8 +1379,9 @@ container_health_check()
|
||||
return
|
||||
fi
|
||||
|
||||
- if [ -n "$fnc_holders" ]; then
|
||||
- ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart"
|
||||
+ local fnc_holder_count=$(echo "$fnc_holders" | wc -w)
|
||||
+ if [ "$fnc_holder_count" -gt 0 ]; then
|
||||
+ ocf_log info "force_new_cluster detected (set by: $fnc_holders), triggering restart"
|
||||
echo "failed-restart-now"
|
||||
return
|
||||
fi
|
||||
@ -0,0 +1,158 @@
|
||||
From 48455cb6cef9c5b849045bc838bc2b5ccd01b0fe Mon Sep 17 00:00:00 2001
|
||||
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
||||
Date: Fri, 7 Nov 2025 17:06:57 +0100
|
||||
Subject: [PATCH 1/3] storage_mon: refactor removing basically duplicate code
|
||||
|
||||
---
|
||||
tools/storage_mon.c | 45 ++++++++++++++++-----------------------------
|
||||
1 file changed, 16 insertions(+), 29 deletions(-)
|
||||
|
||||
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
|
||||
index 27d2ff1d1..fa9bd0cbc 100644
|
||||
--- a/tools/storage_mon.c
|
||||
+++ b/tools/storage_mon.c
|
||||
@@ -119,6 +119,8 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
||||
int device_fd;
|
||||
int res;
|
||||
off_t seek_spot;
|
||||
+ int sec_size = 512;
|
||||
+ void *buffer;
|
||||
|
||||
if (verbose) {
|
||||
printf("Testing device %s\n", device);
|
||||
@@ -164,9 +166,6 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
||||
}
|
||||
|
||||
if (flags & O_DIRECT) {
|
||||
- int sec_size = 0;
|
||||
- void *buffer;
|
||||
-
|
||||
#ifdef __FreeBSD__
|
||||
res = ioctl(device_fd, DIOCGSECTORSIZE, &sec_size);
|
||||
#else
|
||||
@@ -176,33 +175,21 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
||||
PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno));
|
||||
goto error;
|
||||
}
|
||||
+ }
|
||||
|
||||
- if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) {
|
||||
- PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno));
|
||||
- goto error;
|
||||
- }
|
||||
- res = read(device_fd, buffer, sec_size);
|
||||
- free(buffer);
|
||||
- if (res < 0) {
|
||||
- PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno));
|
||||
- goto error;
|
||||
- }
|
||||
- if (res < sec_size) {
|
||||
- PRINT_STORAGE_MON_ERR("Failed to read %d bytes from %s, got %d", sec_size, device, res);
|
||||
- goto error;
|
||||
- }
|
||||
- } else {
|
||||
- char buffer[512];
|
||||
-
|
||||
- res = read(device_fd, buffer, sizeof(buffer));
|
||||
- if (res < 0) {
|
||||
- PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno));
|
||||
- goto error;
|
||||
- }
|
||||
- if (res < (int)sizeof(buffer)) {
|
||||
- PRINT_STORAGE_MON_ERR("Failed to read %ld bytes from %s, got %d", sizeof(buffer), device, res);
|
||||
- goto error;
|
||||
- }
|
||||
+ if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) {
|
||||
+ PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno));
|
||||
+ goto error;
|
||||
+ }
|
||||
+ res = read(device_fd, buffer, sec_size);
|
||||
+ free(buffer);
|
||||
+ if (res < 0) {
|
||||
+ PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno));
|
||||
+ goto error;
|
||||
+ }
|
||||
+ if (res < sec_size) {
|
||||
+ PRINT_STORAGE_MON_ERR("Failed to read %d bytes from %s, got %d", sec_size, device, res);
|
||||
+ goto error;
|
||||
}
|
||||
|
||||
/* Fake an error */
|
||||
|
||||
From 310f224fc7d9a6f4fca234f10696e6049c8f2666 Mon Sep 17 00:00:00 2001
|
||||
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
||||
Date: Fri, 7 Nov 2025 17:14:06 +0100
|
||||
Subject: [PATCH 2/3] storage_mon.c: refactor moving up getting blocksize
|
||||
|
||||
If that fails we can bail out without an unnecessary seek.
|
||||
---
|
||||
tools/storage_mon.c | 24 ++++++++++++------------
|
||||
1 file changed, 12 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
|
||||
index fa9bd0cbc..960266a74 100644
|
||||
--- a/tools/storage_mon.c
|
||||
+++ b/tools/storage_mon.c
|
||||
@@ -152,6 +152,18 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
||||
PRINT_STORAGE_MON_INFO("%s: opened %s O_DIRECT, size=%zu", device, (flags & O_DIRECT)?"with":"without", devsize);
|
||||
}
|
||||
|
||||
+ if (flags & O_DIRECT) {
|
||||
+#ifdef __FreeBSD__
|
||||
+ res = ioctl(device_fd, DIOCGSECTORSIZE, &sec_size);
|
||||
+#else
|
||||
+ res = ioctl(device_fd, BLKSSZGET, &sec_size);
|
||||
+#endif
|
||||
+ if (res < 0) {
|
||||
+ PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno));
|
||||
+ goto error;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
/* Don't fret about real randomness */
|
||||
srand(time(NULL) + getpid());
|
||||
/* Pick a random place on the device - sector aligned */
|
||||
@@ -165,18 +177,6 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
||||
PRINT_STORAGE_MON_INFO("%s: reading from pos %ld", device, seek_spot);
|
||||
}
|
||||
|
||||
- if (flags & O_DIRECT) {
|
||||
-#ifdef __FreeBSD__
|
||||
- res = ioctl(device_fd, DIOCGSECTORSIZE, &sec_size);
|
||||
-#else
|
||||
- res = ioctl(device_fd, BLKSSZGET, &sec_size);
|
||||
-#endif
|
||||
- if (res < 0) {
|
||||
- PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno));
|
||||
- goto error;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) {
|
||||
PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno));
|
||||
goto error;
|
||||
|
||||
From ac19911ce550d5eca42be6cb44632384bdf8e1c9 Mon Sep 17 00:00:00 2001
|
||||
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
||||
Date: Fri, 7 Nov 2025 17:18:45 +0100
|
||||
Subject: [PATCH 3/3] storage_mon.c: fix block-seek mask deriving it from the
|
||||
block-size
|
||||
|
||||
Now this also works for e.g. 4K block devices.
|
||||
---
|
||||
tools/storage_mon.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
|
||||
index 960266a74..6c4555f04 100644
|
||||
--- a/tools/storage_mon.c
|
||||
+++ b/tools/storage_mon.c
|
||||
@@ -167,7 +167,7 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
|
||||
/* Don't fret about real randomness */
|
||||
srand(time(NULL) + getpid());
|
||||
/* Pick a random place on the device - sector aligned */
|
||||
- seek_spot = (rand() % (devsize-1024)) & 0xFFFFFFFFFFFFFE00;
|
||||
+ seek_spot = (rand() % (devsize-sec_size)) & ~(((off_t) sec_size)-1);
|
||||
res = lseek(device_fd, seek_spot, SEEK_SET);
|
||||
if (res < 0) {
|
||||
PRINT_STORAGE_MON_ERR("Failed to seek %s: %s", device, strerror(errno));
|
||||
@ -0,0 +1,106 @@
|
||||
From d5b4428e6cd66fd47680531ff0244d9b56e4e4c2 Mon Sep 17 00:00:00 2001
|
||||
From: Pablo Fontanilla <pfontani@redhat.com>
|
||||
Date: Tue, 14 Oct 2025 11:57:09 +0200
|
||||
Subject: [PATCH 1/2] Redo counting of active_resources
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 46 +++++++++++++++++++++++++++++++++++++++++--
|
||||
1 file changed, 44 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index e1425ec02..dbf16918d 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -1029,6 +1029,48 @@ get_peer_node_name() {
|
||||
crm_node -l | awk '{print $2}' | grep -v "$NODENAME"
|
||||
}
|
||||
|
||||
+# Calculate the count of truly active resources by excluding those being stopped.
|
||||
+# According to Pacemaker documentation, during "Post-notification (stop) /
|
||||
+# Pre-notification (start)" transitions, the true active resource count should be:
|
||||
+# Active resources = $OCF_RESKEY_CRM_meta_notify_active_resource
|
||||
+# minus $OCF_RESKEY_CRM_meta_notify_stop_resource
|
||||
+# This handles the case where a resource appears in both the active and stop lists
|
||||
+# during rapid restart scenarios (e.g., process crash recovery).
|
||||
+get_truly_active_resources_count() {
|
||||
+ local active_list="$OCF_RESKEY_CRM_meta_notify_active_resource"
|
||||
+ local stop_list="$OCF_RESKEY_CRM_meta_notify_stop_resource"
|
||||
+ local truly_active=""
|
||||
+
|
||||
+ # If no active resources, return 0
|
||||
+ if [ -z "$active_list" ]; then
|
||||
+ echo "0"
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ # If no resources being stopped, return count of active resources
|
||||
+ if [ -z "$stop_list" ]; then
|
||||
+ echo "$active_list" | wc -w
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ # Filter out resources that are being stopped from the active list
|
||||
+ for resource in $active_list; do
|
||||
+ local is_stopping=0
|
||||
+ for stop_resource in $stop_list; do
|
||||
+ if [ "$resource" = "$stop_resource" ]; then
|
||||
+ is_stopping=1
|
||||
+ break
|
||||
+ fi
|
||||
+ done
|
||||
+ if [ $is_stopping -eq 0 ]; then
|
||||
+ truly_active="$truly_active $resource"
|
||||
+ fi
|
||||
+ done
|
||||
+
|
||||
+ # Count the truly active resources (trim leading space and count words)
|
||||
+ echo "$truly_active" | wc -w
|
||||
+}
|
||||
+
|
||||
get_all_etcd_endpoints() {
|
||||
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
|
||||
name=$(echo "$node" | cut -d: -f1)
|
||||
@@ -1529,8 +1571,8 @@ podman_start()
|
||||
# - 0 active agents, 1 starting: we are starting; the peer is not starting
|
||||
# - 0 active agents, 2 starting: both agents are starting simultaneously
|
||||
local active_resources_count
|
||||
- active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
|
||||
- ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
|
||||
+ active_resources_count=$(get_truly_active_resources_count)
|
||||
+ ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
|
||||
case "$active_resources_count" in
|
||||
1)
|
||||
if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
|
||||
|
||||
From 0114ddf83c95122a7f9fe9f704f864242cdb284a Mon Sep 17 00:00:00 2001
|
||||
From: Pablo Fontanilla <pfontani@redhat.com>
|
||||
Date: Wed, 29 Oct 2025 12:49:17 +0100
|
||||
Subject: [PATCH 2/2] Update truly active resources count with safer empty
|
||||
calculation
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 6 ++++--
|
||||
1 file changed, 4 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index dbf16918d..8fc92a537 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -1042,13 +1042,15 @@ get_truly_active_resources_count() {
|
||||
local truly_active=""
|
||||
|
||||
# If no active resources, return 0
|
||||
- if [ -z "$active_list" ]; then
|
||||
+ # Use word count to handle whitespace-only values
|
||||
+ if [ "$(echo "$active_list" | wc -w)" -eq 0 ]; then
|
||||
echo "0"
|
||||
return
|
||||
fi
|
||||
|
||||
# If no resources being stopped, return count of active resources
|
||||
- if [ -z "$stop_list" ]; then
|
||||
+ # Use word count to handle whitespace-only values
|
||||
+ if [ "$(echo "$stop_list" | wc -w)" -eq 0 ]; then
|
||||
echo "$active_list" | wc -w
|
||||
return
|
||||
fi
|
||||
@ -0,0 +1,161 @@
|
||||
From 578e6d982e5ab705dac216cecf85c50fe3842af5 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Sun, 16 Nov 2025 19:40:30 +0100
|
||||
Subject: [PATCH] OCPBUGS-60098: podman-etcd: prevent last active member from
|
||||
leaving the etcd member list
|
||||
|
||||
When stopping etcd instances, simultaneous member removal from both
|
||||
nodes can corrupt the etcd Write-Ahead Log (WAL). This change implements
|
||||
a two-part solution:
|
||||
|
||||
1. Concurrent stop protection: When multiple nodes are stopping, the
|
||||
alphabetically second node delays its member removal by 10
|
||||
seconds. This prevents simultaneous member list updates that can
|
||||
corrupt WAL.
|
||||
|
||||
2. Last member detection: Checks active resource count after any
|
||||
delay. If this is the last active member, skips member removal to
|
||||
avoid leaving an empty cluster.
|
||||
|
||||
Additionally, reorders podman_stop() to clear the member_id attribute
|
||||
after leaving the member list, ensuring the attribute reflects actual
|
||||
cluster state during shutdown.
|
||||
---
|
||||
heartbeat/podman-etcd | 86 ++++++++++++++++++++++++++++++++++---------
|
||||
1 file changed, 69 insertions(+), 17 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 7795130a6..7b6e08f11 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -1341,6 +1341,11 @@ container_health_check()
|
||||
# recently (i.e. a failure), or not (fresh start)
|
||||
monitor_cmd_exec
|
||||
rc=$?
|
||||
+ if [ "$rc" -ne 0 ]; then
|
||||
+ ocf_log info "Container ${CONTAINER} not-running"
|
||||
+ echo "not-running"
|
||||
+ return
|
||||
+ fi
|
||||
if [ "$rc" -eq 0 ]; then
|
||||
# Container is running - update state file with current epoch
|
||||
local current_epoch
|
||||
@@ -1639,7 +1644,7 @@ can_reuse_container() {
|
||||
OCF_RESKEY_reuse=0
|
||||
return "$OCF_SUCCESS"
|
||||
fi
|
||||
-
|
||||
+
|
||||
if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
@@ -1866,7 +1871,7 @@ podman_start()
|
||||
fi
|
||||
|
||||
if ocf_is_true "$JOIN_AS_LEARNER"; then
|
||||
- local wait_timeout_sec=$((10*60))
|
||||
+ local wait_timeout_sec=60
|
||||
local poll_interval_sec=5
|
||||
local retries=$(( wait_timeout_sec / poll_interval_sec ))
|
||||
|
||||
@@ -2021,6 +2026,64 @@ podman_start()
|
||||
done
|
||||
}
|
||||
|
||||
+# leave_etcd_member_list removes the current node from the etcd member list during
|
||||
+# shutdown to ensure clean cluster state.
|
||||
+#
|
||||
+# Skips removal if this is the standalone (last) node. When both nodes are stopping
|
||||
+# concurrently, delays the second node to prevent simultaneous member removal that
|
||||
+# could corrupt the etcd WAL.
|
||||
+leave_etcd_member_list()
|
||||
+{
|
||||
+ if ! member_id=$(attribute_node_member_id get); then
|
||||
+ ocf_log err "error leaving members list: could not get member-id"
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ if is_standalone; then
|
||||
+ ocf_log info "last member. Not leaving the member list"
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ local stopping_resources_count
|
||||
+ stopping_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_stop_resource" | wc -w)
|
||||
+ ocf_log info "found '$stopping_resources_count' stopping etcd resources (stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
|
||||
+ if [ "$stopping_resources_count" -gt 1 ]; then
|
||||
+ # Prevent WAL corruption by delaying the alphabetically second node's member
|
||||
+ # removal when both nodes are stopping concurrently.
|
||||
+ local delayed_node
|
||||
+
|
||||
+ node_names_sorted=$(echo "$OCF_RESKEY_node_ip_map" | sed 's/:[^;]*//g; s/;/ /g' | tr ' ' '\n' | sort | tr '\n' ' ')
|
||||
+ delayed_node="$(echo "$node_names_sorted" | cut -d' ' -f2)"
|
||||
+
|
||||
+ if [ -z "$delayed_node" ]; then
|
||||
+ ocf_log warn "could not determine node to be delayed: not leaving the member list"
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ if [ "$NODENAME" = "$delayed_node" ]; then
|
||||
+ ocf_log info "delaying stop for ${DELAY_SECOND_NODE_LEAVE_SEC}s to prevent simultaneous etcd member removal"
|
||||
+ sleep $DELAY_SECOND_NODE_LEAVE_SEC
|
||||
+ fi
|
||||
+ fi
|
||||
+
|
||||
+ # Ensure we're not the last active resource before leaving. The `standalone_node` property
|
||||
+ # may not be set if stop was called before monitor check, or after the delayed node waited.
|
||||
+ local active_resources_count
|
||||
+ active_resources_count=$(get_truly_active_resources_count)
|
||||
+ if [ "$active_resources_count" -lt 1 ]; then
|
||||
+ ocf_log info "last member. Not leaving the member list"
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ ocf_log info "leaving members list as member with ID $member_id"
|
||||
+ local endpoint
|
||||
+ endpoint="$(ip_url $(attribute_node_ip get)):2379"
|
||||
+ if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
|
||||
+ rc=$?
|
||||
+ ocf_log err "error leaving members list, error code: $rc"
|
||||
+ fi
|
||||
+}
|
||||
+
|
||||
podman_stop()
|
||||
{
|
||||
local timeout=60
|
||||
@@ -2039,24 +2102,12 @@ podman_stop()
|
||||
podman_simple_status
|
||||
if [ $? -eq $OCF_NOT_RUNNING ]; then
|
||||
ocf_log info "could not leave members list: etcd container not running"
|
||||
+ attribute_node_member_id clear
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
- if ! member_id=$(attribute_node_member_id get); then
|
||||
- ocf_log err "error leaving members list: could not get member-id"
|
||||
- else
|
||||
- # TODO: is it worth/possible to check the current status instead than relying on cached attributes?
|
||||
- if is_standalone; then
|
||||
- ocf_log info "last member. Not leaving the member list"
|
||||
- else
|
||||
- ocf_log info "leaving members list as member with ID $member_id"
|
||||
- endpoint="$(ip_url $(attribute_node_ip get)):2379"
|
||||
- if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
|
||||
- rc=$?
|
||||
- ocf_log err "error leaving members list, error code: $rc"
|
||||
- fi
|
||||
- fi
|
||||
- fi
|
||||
+ leave_etcd_member_list
|
||||
+ # clear node_member_id CIB attribute only after leaving the member list
|
||||
attribute_node_member_id clear
|
||||
|
||||
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
|
||||
@@ -2197,6 +2248,7 @@ ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
|
||||
# State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot.
|
||||
# This is intentional - reboots are controlled stops, not failures requiring detection.
|
||||
CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
|
||||
+DELAY_SECOND_NODE_LEAVE_SEC=10
|
||||
|
||||
# Note: we currently monitor podman containers by with the "podman exec"
|
||||
# command, so make sure that invocation is always valid by enforcing the
|
||||
42
SOURCES/RHEL-130580-2-podman-etcd-remove-test-code.patch
Normal file
42
SOURCES/RHEL-130580-2-podman-etcd-remove-test-code.patch
Normal file
@ -0,0 +1,42 @@
|
||||
From 29df4255c5f65ea94fb6de997805dca65e31071c Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Mon, 24 Nov 2025 12:21:55 +0100
|
||||
Subject: [PATCH] podman-etcd: remove test code (#2103)
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 8 +-------
|
||||
1 file changed, 1 insertion(+), 7 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 7b6e08f11..b1f52cd5c 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -1341,11 +1341,6 @@ container_health_check()
|
||||
# recently (i.e. a failure), or not (fresh start)
|
||||
monitor_cmd_exec
|
||||
rc=$?
|
||||
- if [ "$rc" -ne 0 ]; then
|
||||
- ocf_log info "Container ${CONTAINER} not-running"
|
||||
- echo "not-running"
|
||||
- return
|
||||
- fi
|
||||
if [ "$rc" -eq 0 ]; then
|
||||
# Container is running - update state file with current epoch
|
||||
local current_epoch
|
||||
@@ -1644,7 +1639,6 @@ can_reuse_container() {
|
||||
OCF_RESKEY_reuse=0
|
||||
return "$OCF_SUCCESS"
|
||||
fi
|
||||
-
|
||||
if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
@@ -1871,7 +1865,7 @@ podman_start()
|
||||
fi
|
||||
|
||||
if ocf_is_true "$JOIN_AS_LEARNER"; then
|
||||
- local wait_timeout_sec=60
|
||||
+ local wait_timeout_sec=$((10*60))
|
||||
local poll_interval_sec=5
|
||||
local retries=$(( wait_timeout_sec / poll_interval_sec ))
|
||||
|
||||
@ -0,0 +1,107 @@
|
||||
From 5cc74acd67c294da36b3f40e44842a82aa7d0957 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Wed, 26 Nov 2025 11:43:25 +0100
|
||||
Subject: [PATCH] OCPEDGE-2213: podman-etcd: fix to prevent learner from
|
||||
starting before cluster is ready (#2098)
|
||||
|
||||
* OCPEDGE-2213: fix(podman-etcd): prevent learner from starting before cluster is ready
|
||||
|
||||
Clear stale learner_node attribute during stop and on restart when no
|
||||
active resources exist, ensuring learner always waits for peer
|
||||
availability.
|
||||
|
||||
* fix: podman-etcd should clean up standalone/learner attributes when promotion succeeds
|
||||
|
||||
* fix: remove misleading endpoint IP from log
|
||||
---
|
||||
heartbeat/podman-etcd | 33 +++++++++++++++++++--------------
|
||||
1 file changed, 19 insertions(+), 14 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index b1f52cd5c..3e3f1d60e 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -880,7 +880,7 @@ add_member_as_learner()
|
||||
local endpoint_url=$(ip_url $(attribute_node_ip get))
|
||||
local peer_url=$(ip_url $member_ip)
|
||||
|
||||
- ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner"
|
||||
+ ocf_log info "add $member_name ($member_ip) to the member list as learner"
|
||||
out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
@@ -1032,7 +1032,7 @@ promote_learner_member()
|
||||
if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then
|
||||
# promotion is expected to fail if the peer is not yet up-to-date
|
||||
ocf_log info "could not promote member $learner_member_id_hex, error code: $?"
|
||||
- return $OCF_SUCCESS
|
||||
+ return $OCF_ERR_GENERIC
|
||||
fi
|
||||
ocf_log info "successfully promoted member '$learner_member_id_hex'"
|
||||
return $OCF_SUCCESS
|
||||
@@ -1063,19 +1063,19 @@ reconcile_member_state()
|
||||
fi
|
||||
|
||||
if [ -n "$learner_member_id" ]; then
|
||||
- promote_learner_member "$learner_member_id"
|
||||
- return $?
|
||||
- fi
|
||||
-
|
||||
- if [ -z "$learner_member_id" ]; then
|
||||
- if ! clear_standalone_node; then
|
||||
- ocf_log error "could not clear standalone_node attribute, error code: $?"
|
||||
- return $OCF_ERR_GENERIC
|
||||
- fi
|
||||
- if ! attribute_learner_node clear; then
|
||||
- ocf_log error "could not clear learner_node attribute, error code: $?"
|
||||
+ if ! promote_learner_member "$learner_member_id"; then
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
+ # promotion succeded: continue to clear standalone_node and learner_node
|
||||
+ fi
|
||||
+
|
||||
+ if ! clear_standalone_node; then
|
||||
+ ocf_log error "could not clear standalone_node attribute, error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ if ! attribute_learner_node clear; then
|
||||
+ ocf_log error "could not clear learner_node attribute, error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
return $OCF_SUCCESS
|
||||
@@ -1258,6 +1258,7 @@ manage_peer_membership()
|
||||
set_standalone_node
|
||||
else
|
||||
ocf_log debug "$name is in the members list by IP: $ip"
|
||||
+ # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
|
||||
reconcile_member_state "$member_list_json"
|
||||
fi
|
||||
done
|
||||
@@ -1369,7 +1370,7 @@ container_health_check()
|
||||
# Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
|
||||
local time_since_heartbeat
|
||||
time_since_heartbeat=$(get_time_since_last_heartbeat)
|
||||
- ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
|
||||
+ ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)"
|
||||
|
||||
# Check if peer has set force_new_cluster for recovery
|
||||
local fnc_holders
|
||||
@@ -1795,6 +1796,9 @@ podman_start()
|
||||
fi
|
||||
;;
|
||||
0)
|
||||
+ # No active resources: clear any stale learner_node attribute from previous failed session
|
||||
+ ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)"
|
||||
+ attribute_learner_node clear
|
||||
# count how many agents are starting now
|
||||
local start_resources_count
|
||||
start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
|
||||
@@ -2090,6 +2094,7 @@ podman_stop()
|
||||
ocf_log err "could not delete container health check state file"
|
||||
fi
|
||||
|
||||
+ attribute_learner_node clear
|
||||
attribute_node_revision update
|
||||
attribute_node_cluster_id update
|
||||
|
||||
@ -0,0 +1,146 @@
|
||||
From 192b0ecbe015e8b8a4d32f8b066ead3a6dba0589 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Tue, 2 Dec 2025 10:01:01 +0100
|
||||
Subject: [PATCH] OCPEDGE-2231: podman-etcd: improve error handling to support
|
||||
retry on start errors (#2105)
|
||||
|
||||
* podman-etcd: improve add_member_as_learner error log
|
||||
|
||||
Improve the add_member_as_learner error log to better debug a rare issue
|
||||
when the podman exec command returns an error, but the etcd member is added
|
||||
to the list anyway. This is critical as the `learner_node` attribute
|
||||
won't be cleaned up anymore.
|
||||
|
||||
Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
|
||||
* podman-etcd: remove duplicated check for container already started
|
||||
|
||||
* podman-etcd: improve error return codes to support start retries
|
||||
|
||||
Improved and/or changed some return codes to allow or forbid retry in
|
||||
case of start errors.
|
||||
|
||||
see: OCPEDGE-2231
|
||||
|
||||
---------
|
||||
|
||||
Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
---
|
||||
heartbeat/podman-etcd | 40 +++++++++++++++++++++++++---------------
|
||||
1 file changed, 25 insertions(+), 15 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 3e3f1d60e..242226bb1 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -617,9 +617,13 @@ prepare_env() {
|
||||
LISTEN_CLIENT_URLS="0.0.0.0"
|
||||
LISTEN_PEER_URLS="0.0.0.0"
|
||||
LISTEN_METRICS_URLS="0.0.0.0"
|
||||
+
|
||||
+ return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
compute_bump_revision() {
|
||||
+ local rc
|
||||
+
|
||||
# Same logic used by cluster-etcd-operator quorum-restore-pod utility.
|
||||
# see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
|
||||
# set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days.
|
||||
@@ -691,7 +695,13 @@ experimental-max-learners: 1
|
||||
experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
|
||||
experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
|
||||
EOF
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
+ ocf_log err "could not create etcd configuration, 'cat' error code: $rc"
|
||||
+ return $OCF_ERR_CONFIGURED
|
||||
+ fi
|
||||
|
||||
+ # Append cipher suites from the env variable where the entries are comma separated.
|
||||
{
|
||||
if [ -n "$ETCD_CIPHER_SUITES" ]; then
|
||||
echo "cipher-suites:"
|
||||
@@ -700,6 +710,13 @@ EOF
|
||||
done
|
||||
fi
|
||||
} >> "$ETCD_CONFIGURATION_FILE"
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
+ ocf_log err "could not append cipher suites to etcd configuration, error code: $rc"
|
||||
+ return $OCF_ERR_CONFIGURED
|
||||
+ fi
|
||||
+
|
||||
+ return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
archive_data_folder()
|
||||
@@ -884,7 +901,7 @@ add_member_as_learner()
|
||||
out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
- ocf_log err "could not add $member_name as learner, error code: $rc"
|
||||
+ ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
|
||||
return $rc
|
||||
fi
|
||||
ocf_log info "$out"
|
||||
@@ -1763,7 +1780,7 @@ podman_start()
|
||||
fnc_holder_count=$(echo "$fnc_holders" | wc -w)
|
||||
if [ "$fnc_holder_count" -gt 1 ]; then
|
||||
ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)"
|
||||
- return "$OCF_ERR_GENERIC"
|
||||
+ return "$OCF_ERR_CONFIGURED"
|
||||
fi
|
||||
|
||||
if [ "$fnc_holder_count" -eq 1 ]; then
|
||||
@@ -1837,7 +1854,7 @@ podman_start()
|
||||
ocf_log info "same cluster_id and revision: start normal"
|
||||
else
|
||||
ocf_exit_reason "same revision but different cluster id"
|
||||
- return "$OCF_ERR_GENERIC"
|
||||
+ return "$OCF_ERR_CONFIGURED"
|
||||
fi
|
||||
fi
|
||||
;;
|
||||
@@ -1862,12 +1879,6 @@ podman_start()
|
||||
|
||||
run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
|
||||
|
||||
- # check to see if the container has already started
|
||||
- podman_simple_status
|
||||
- if [ $? -eq $OCF_SUCCESS ]; then
|
||||
- return "$OCF_SUCCESS"
|
||||
- fi
|
||||
-
|
||||
if ocf_is_true "$JOIN_AS_LEARNER"; then
|
||||
local wait_timeout_sec=$((10*60))
|
||||
local poll_interval_sec=5
|
||||
@@ -1894,9 +1905,8 @@ podman_start()
|
||||
|
||||
ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
|
||||
if ! can_reuse_container ; then
|
||||
- rc="$?"
|
||||
- ocf_log err "could not determine etcd container reuse strategy, rc: $rc"
|
||||
- return "$rc"
|
||||
+ ocf_log err "could not determine etcd container reuse strategy"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
# Archive current container and its configuration before creating
|
||||
@@ -1912,13 +1922,13 @@ podman_start()
|
||||
fi
|
||||
|
||||
if ! prepare_env; then
|
||||
- ocf_log err "Could not prepare environment for podman, error code: $?"
|
||||
+ ocf_log err "Could not prepare environment for podman"
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
if ! generate_etcd_configuration; then
|
||||
- ocf_log err "Could not generate etcd configuration, error code: $?"
|
||||
- return $OCF_ERR_GENERIC
|
||||
+ ocf_log err "Could not generate etcd configuration"
|
||||
+ return $OCF_ERR_CONFIGURED
|
||||
fi
|
||||
|
||||
run_opts="$run_opts \
|
||||
@ -0,0 +1,52 @@
|
||||
From 8b70d5026fee0910a52f0fdefcaf930b2c0a3909 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Wed, 3 Dec 2025 11:38:25 +0100
|
||||
Subject: [PATCH] podman-etcd: sync environment variables with Pod manifest
|
||||
|
||||
The EXPERIMENTAL substring was removed from
|
||||
ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION and
|
||||
ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL in the Pod
|
||||
manifest. This change aligns our config with those updates.
|
||||
|
||||
NOTE: Some Etcd flags deprecated in v3.6 will be replaced in a future
|
||||
change.
|
||||
|
||||
See: https://github.com/openshift/cluster-etcd-operator/pull/1507
|
||||
---
|
||||
heartbeat/podman-etcd | 9 +++++----
|
||||
1 file changed, 5 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 242226bb1..bb2900536 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -604,8 +604,8 @@ prepare_env() {
|
||||
fi
|
||||
ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT")
|
||||
ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF")
|
||||
- ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
|
||||
- ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
|
||||
+ ETCD_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_WARNING_APPLY_DURATION")
|
||||
+ ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL")
|
||||
ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL")
|
||||
ETCD_QUOTA_BACKEND_BYTES=$(get_env_from_manifest "ETCD_QUOTA_BACKEND_BYTES")
|
||||
ETCD_SOCKET_REUSE_ADDRESS=$(get_env_from_manifest "ETCD_SOCKET_REUSE_ADDRESS")
|
||||
@@ -660,6 +660,7 @@ force-new-cluster-bump-amount: $BUMP_REV"
|
||||
|
||||
# the space indentation for client-transport-security and peer-transport-security
|
||||
# is required for correct YAML formatting.
|
||||
+ # TODO: replace flags deprecated in Etcd v3.6
|
||||
cat > "$ETCD_CONFIGURATION_FILE" << EOF
|
||||
logger: zap
|
||||
log-level: info
|
||||
@@ -692,8 +693,8 @@ listen-metrics-urls: "$(ip_url ${LISTEN_METRICS_URLS}):9978"
|
||||
metrics: extensive
|
||||
experimental-initial-corrupt-check: true
|
||||
experimental-max-learners: 1
|
||||
-experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
|
||||
-experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
|
||||
+experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_WARNING_APPLY_DURATION")
|
||||
+experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL")
|
||||
EOF
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
@ -0,0 +1,25 @@
|
||||
From 7449fd88d21650db1eaafdc7ef85bf3553f6ac7f Mon Sep 17 00:00:00 2001
|
||||
From: Pablo Fontanilla <pfontani@redhat.com>
|
||||
Date: Thu, 8 Jan 2026 09:42:42 +0100
|
||||
Subject: [PATCH] OCPBUGS-64765: podman-etcd: add -a option to crictl ps
|
||||
(#2112)
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index bb2900536..591a663bf 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -738,8 +738,8 @@ archive_data_folder()
|
||||
|
||||
etcd_pod_container_exists() {
|
||||
local count_matches
|
||||
- # Check whether the etcd pod exists on the same node (header line included)
|
||||
- count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l)
|
||||
+ # Check whether the etcd pod exists on the same node (including stopped/exited containers)
|
||||
+ count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps -a --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l)
|
||||
if [ "$count_matches" -eq 1 ]; then
|
||||
# etcd pod found
|
||||
return 0
|
||||
@ -0,0 +1,54 @@
|
||||
From 8f5c5a2a472ab404b6fd15ff492e72904dc8ac20 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Thu, 22 Jan 2026 07:37:40 +0100
|
||||
Subject: [PATCH] powervs-move-ip/powervs-subnet: fix error logging
|
||||
|
||||
---
|
||||
heartbeat/powervs-move-ip.in | 4 ++--
|
||||
heartbeat/powervs-subnet.in | 10 ++++++----
|
||||
2 files changed, 8 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/powervs-move-ip.in b/heartbeat/powervs-move-ip.in
|
||||
index e2250c998..0eea89f1d 100755
|
||||
--- a/heartbeat/powervs-move-ip.in
|
||||
+++ b/heartbeat/powervs-move-ip.in
|
||||
@@ -310,9 +310,9 @@ class PowerCloudTokenManager:
|
||||
return json.load(f)
|
||||
finally:
|
||||
fcntl.flock(f, fcntl.LOCK_UN)
|
||||
- except (json.JSONDecodeError, FileNotFoundError, PermissionError):
|
||||
+ except (json.JSONDecodeError, FileNotFoundError, PermissionError) as e:
|
||||
ocf.logger.warning(
|
||||
- "[PowerCloudTokenManager] _read_cache: failed to read token cache read due to missing file or malformed JSON."
|
||||
+ f"[PowerCloudTokenManager] _read_cache: failed to read token cache read due to missing file or malformed JSON: '{e}'"
|
||||
)
|
||||
return {}
|
||||
|
||||
diff --git a/heartbeat/powervs-subnet.in b/heartbeat/powervs-subnet.in
|
||||
index 062b1235e..b8f3864e9 100755
|
||||
--- a/heartbeat/powervs-subnet.in
|
||||
+++ b/heartbeat/powervs-subnet.in
|
||||
@@ -837,8 +837,9 @@ def start_action(
|
||||
if rc != ocf.OCF_SUCCESS:
|
||||
return rc
|
||||
|
||||
- if monitor_action(**res_options) != ocf.OCF_SUCCESS:
|
||||
- raise PowerCloudAPIError(f"start_action: start subnet: {ws.subnet_name} failed")
|
||||
+ rc = monitor_action(**res_options)
|
||||
+ if rc != ocf.OCF_SUCCESS:
|
||||
+ raise PowerCloudAPIError(f"start_action: start subnet: {ws.subnet_name} failed", rc)
|
||||
|
||||
ocf.logger.info(
|
||||
f"start_action: finished, added connection {conn_name} for subnet {ws.subnet_name}"
|
||||
@@ -872,8 +873,9 @@ def stop_action(
|
||||
|
||||
ws.subnet_remove()
|
||||
|
||||
- if monitor_action(**res_options) != ocf.OCF_NOT_RUNNING:
|
||||
- raise PowerCloudAPIError(f"stop_action: stop subnet {ws.subnet_name} failed")
|
||||
+ rc = monitor_action(**res_options)
|
||||
+ if rc != ocf.OCF_NOT_RUNNING:
|
||||
+ raise PowerCloudAPIError(f"stop_action: stop subnet {ws.subnet_name} failed", rc)
|
||||
|
||||
ocf.logger.info(
|
||||
f"stop_action: finished, deleted connection for subnet {ws.subnet_name}"
|
||||
@ -0,0 +1,278 @@
|
||||
From 8df1e4dfdee960b971fb598c043b4ccb2b9fefca Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Mon, 3 Nov 2025 12:34:29 +0100
|
||||
Subject: [PATCH] podman-etcd: enhance etcd data backup with snapshots and
|
||||
retention
|
||||
|
||||
Replace basic data directory backup with proper etcd database snapshot
|
||||
functionality. The new implementation:
|
||||
- Creates timestamped snapshot files instead of moving the entire data directory
|
||||
- Stores backups in a non-volatile location (backup_location parameter) instead
|
||||
of the previous volatile HA_RSCTMP directory
|
||||
- Validates backup file existence and size after creation
|
||||
- Implements configurable retention policy via max_backup_snapshots parameter
|
||||
- Automatically cleans up old snapshots to control storage usage
|
||||
|
||||
Default retention is set to 3 snapshots, with backups stored in /var/lib/etcd
|
||||
by default. This provides better backup reliability, persistence across reboots,
|
||||
and storage management for etcd databases.
|
||||
---
|
||||
heartbeat/podman-etcd | 205 ++++++++++++++++++++++++++++++++++++++++--
|
||||
1 file changed, 196 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index bb2900536..1d717ec00 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0"
|
||||
OCF_RESKEY_oom_default="-997"
|
||||
OCF_RESKEY_config_location_default="/var/lib/etcd"
|
||||
OCF_RESKEY_backup_location_default="/var/lib/etcd"
|
||||
+OCF_RESKEY_max_backup_snapshots_default="3"
|
||||
|
||||
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
|
||||
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
|
||||
@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
|
||||
: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
|
||||
: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
|
||||
: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
|
||||
+: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}}
|
||||
|
||||
|
||||
#######################################################################
|
||||
@@ -275,6 +277,17 @@ The directory where the resource agent stores its backups.
|
||||
<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
|
||||
</parameter>
|
||||
|
||||
+<parameter name="max_backup_snapshots" required="0" unique="0">
|
||||
+<longdesc lang="en">
|
||||
+Maximum number of etcd database snapshots to retain. When a new snapshot is created,
|
||||
+older snapshots will be automatically removed to maintain this limit. This helps
|
||||
+control storage usage while ensuring recent backups are available for recovery.
|
||||
+Set max_backup_snapshots=0 to disable backups.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Maximum number of backup snapshots to retain</shortdesc>
|
||||
+<content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default}"/>
|
||||
+</parameter>
|
||||
+
|
||||
</parameters>
|
||||
|
||||
<actions>
|
||||
@@ -720,20 +733,190 @@ EOF
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
+# Remove etcd member directory to allow the node to rejoin the cluster as a learner.
|
||||
+#
|
||||
+# When a node rejoins an etcd cluster, it must start fresh as a learner to prevent
|
||||
+# data inconsistencies. This function removes the member directory and syncs to disk.
|
||||
+#
|
||||
+# Returns:
|
||||
+# OCF_SUCCESS - Member directory successfully removed
|
||||
+# OCF_ERR_GENERIC - Failed to remove member directory (critical error)
|
||||
+wipe_data_folder_for_learner()
|
||||
+{
|
||||
+ ocf_log info "deleting etcd member directory ($ETCD_MEMBER_DIR) to enable learner rejoin"
|
||||
+ if ! rm -rf "$ETCD_MEMBER_DIR"; then
|
||||
+ ocf_log err "could not delete etcd member directory ($ETCD_MEMBER_DIR), error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ sync
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
+
|
||||
+
|
||||
+# Calculate available disk space in bytes for a given directory.
|
||||
+#
|
||||
+# This function queries the filesystem and returns available space in bytes.
|
||||
+# It converts df output (KB) to bytes for consistent size comparisons.
|
||||
+#
|
||||
+# Arguments:
|
||||
+# $1 - Target directory path to check
|
||||
+#
|
||||
+# Returns:
|
||||
+# OCF_SUCCESS - Available space in bytes (via stdout)
|
||||
+# OCF_ERR_GENERIC - Failed to determine available space (error message via stdout)
|
||||
+get_available_space_in_directory()
|
||||
+{
|
||||
+ local target_dir=$1
|
||||
+ local available_space_kb
|
||||
+ local available_space_bytes
|
||||
+
|
||||
+ available_space_kb=$(df -P "$target_dir" | awk 'NR==2 {print $4}' 2>&1)
|
||||
+
|
||||
+ # Validate output is numeric
|
||||
+ if ! echo "$available_space_kb" | grep -q '^[0-9]\+$'; then
|
||||
+ echo "df command failed or returned invalid value: $available_space_kb"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
+ available_space_bytes=$((available_space_kb*1024))
|
||||
+ echo "$available_space_bytes"
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
+
|
||||
+# Archive etcd database with backup and cleanup
|
||||
+#
|
||||
+# This function creates a backup copy of the etcd database, validates it, and
|
||||
+# removes old backups according to the retention policy. Backups are optional
|
||||
+# and can be disabled by setting max_backup_snapshots=0.
|
||||
+#
|
||||
+# Error handling strategy:
|
||||
+# All backup failures return OCF_SUCCESS to prevent blocking cluster recovery.
|
||||
+# Backups are beneficial but not critical for recovery operations.
|
||||
+#
|
||||
+# NOTE: This function cannot use etcdctl/etcdutl utilities because the etcd
|
||||
+# server is not running when this backup is performed.
|
||||
archive_data_folder()
|
||||
{
|
||||
- # TODO: use etcd snapshots
|
||||
- local dest_dir_name
|
||||
- local data_dir="/var/lib/etcd/member"
|
||||
+ local backup_dir="$OCF_RESKEY_backup_location"
|
||||
+ local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"
|
||||
|
||||
- dest_dir_name="members-snapshot-$(date +%Y%M%d%H%M%S)"
|
||||
- if [ ! -d $data_dir ]; then
|
||||
- ocf_log info "no data dir to backup"
|
||||
+ if [ "$OCF_RESKEY_max_backup_snapshots" -eq 0 ]; then
|
||||
+ ocf_log debug "etcd backup disabled (max_backup_snapshots=0)"
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
- ocf_log info "backing up $data_dir under $HA_RSCTMP/$dest_dir_name"
|
||||
- mv "$data_dir" "$HA_RSCTMP/$dest_dir_name"
|
||||
- sync
|
||||
+
|
||||
+ # Check if the etcd database file exists
|
||||
+ if [ ! -f "$etcd_db_path" ]; then
|
||||
+ ocf_log warn "backup skipped: etcd database file not found at '$etcd_db_path'"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ # Ensure backup directory exists
|
||||
+ if [ ! -d "$backup_dir" ]; then
|
||||
+ ocf_log debug "creating backup directory: '$backup_dir'"
|
||||
+ if ! mkdir -p "$backup_dir"; then
|
||||
+ ocf_log warn "backup skipped: failed to create backup directory '$backup_dir'"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+ fi
|
||||
+
|
||||
+ ocf_log debug "checking disk space: backup_dir=$backup_dir"
|
||||
+ local available_space_bytes
|
||||
+ if ! available_space_bytes=$(get_available_space_in_directory "$backup_dir"); then
|
||||
+ ocf_log warn "backup skipped: could not compute available disk space in '$backup_dir', error msg: $available_space_bytes"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ local required_space_bytes
|
||||
+ required_space_bytes=$(stat -c %s "$etcd_db_path" 2>&1)
|
||||
+ if ! echo "$required_space_bytes" | grep -q '^[0-9]\+$'; then
|
||||
+ ocf_log warn "backup skipped: could not compute etcd database size at '$etcd_db_path', error msg: $required_space_bytes"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ if [ "$required_space_bytes" -gt "$available_space_bytes" ]; then
|
||||
+ ocf_log warn "backup skipped: insufficient disk space (required: ${required_space_bytes}B, available: ${available_space_bytes}B)"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ # Generate timestamp and backup filename
|
||||
+ local timestamp
|
||||
+ timestamp=$(date +%Y%m%d-%H%M%S)
|
||||
+
|
||||
+ local backup_file
|
||||
+ backup_file="$backup_dir/snapshot-$timestamp.db"
|
||||
+
|
||||
+ ocf_log info "creating etcd database backup: '$backup_file'"
|
||||
+
|
||||
+ # Create the backup by copying the database file (enable Copy-on-Write copy)
|
||||
+ if ! cp --reflink=auto "$etcd_db_path" "$backup_file"; then
|
||||
+ ocf_log warn "backup creation failed: could not copy '$etcd_db_path' to '$backup_file', error code: $?"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ # Validate the backup file exists and has the expected size
|
||||
+ if [ ! -f "$backup_file" ]; then
|
||||
+ ocf_log warn "backup validation failed: snapshot file '$backup_file' does not exist"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ local backup_size_bytes
|
||||
+ backup_size_bytes=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
|
||||
+ if [ "$backup_size_bytes" -ne "$required_space_bytes" ]; then
|
||||
+ ocf_log warn "backup validation failed: size mismatch (expected: ${required_space_bytes}B, got: ${backup_size_bytes}B)"
|
||||
+ rm -f "$backup_file"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ ocf_log info "backup created successfully: $backup_file (${backup_size_bytes}B)"
|
||||
+
|
||||
+ # Cleanup old backups based on retention policy
|
||||
+ cleanup_old_backups "$backup_dir"
|
||||
+
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
+
|
||||
+cleanup_old_backups()
|
||||
+{
|
||||
+ local backup_dir="$1"
|
||||
+ local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
|
||||
+ local backup_count
|
||||
+ local backups_to_remove
|
||||
+ local old_backups
|
||||
+
|
||||
+ # Validate max_snapshots is a positive integer
|
||||
+ if ! echo "$max_snapshots" | grep -q '^[1-9][0-9]*$'; then
|
||||
+ ocf_log warn "invalid max_backup_snapshots value. Positive integer expected, got '$max_snapshots' instead, skipping cleanup"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ # Count existing backup files
|
||||
+ backup_count=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f 2>/dev/null | wc -l)
|
||||
+
|
||||
+ if [ "$backup_count" -le "$max_snapshots" ]; then
|
||||
+ ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ # Calculate how many backups to remove
|
||||
+ backups_to_remove=$((backup_count - max_snapshots))
|
||||
+ ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"
|
||||
+
|
||||
+ # Find oldest backups sorted by modification time
|
||||
+ # -t sorts by modification time, -r reverses (oldest first)
|
||||
+ # -print0 and -0 handle filenames with spaces/special characters
|
||||
+ old_backups=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f -print0 2>/dev/null | \
|
||||
+ xargs -0 -r ls -tr | \
|
||||
+ head -n "$backups_to_remove")
|
||||
+
|
||||
+ if [ -n "$old_backups" ]; then
|
||||
+ ocf_log info "removing old backups: $old_backups"
|
||||
+ if ! echo "$old_backups" | xargs -r rm -f; then
|
||||
+ ocf_log warn "failed to remove some old backups, error code: $?"
|
||||
+ fi
|
||||
+ fi
|
||||
+
|
||||
+ return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
etcd_pod_container_exists() {
|
||||
@@ -1902,6 +2085,9 @@ podman_start()
|
||||
fi
|
||||
|
||||
archive_data_folder
|
||||
+ if ! wipe_data_folder_for_learner; then
|
||||
+ return "$OCF_ERR_GENERIC"
|
||||
+ fi
|
||||
fi
|
||||
|
||||
ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
|
||||
@@ -2251,6 +2437,7 @@ CONTAINER=$OCF_RESKEY_name
|
||||
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
|
||||
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
|
||||
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
|
||||
+ETCD_MEMBER_DIR="/var/lib/etcd/member"
|
||||
ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
|
||||
ETCD_REVISION_BUMP_PERCENTAGE=0.2
|
||||
ETCD_BUMP_REV_DEFAULT=1000000000
|
||||
@ -0,0 +1,111 @@
|
||||
From e4d311b40d8ded2a1921a0e5c01cb49a07c9fb35 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Thu, 5 Feb 2026 19:31:42 +0100
|
||||
Subject: [PATCH] podman-etcd: fix learner node attribute not set after etcdctl
|
||||
failure
|
||||
|
||||
Ensure that learner_node attribute is always set when the member list
|
||||
contains one learner member.
|
||||
|
||||
Moreover:
|
||||
* Ensure set_standalone_node is called after adding a learner member.
|
||||
* Capture stderr from etcdctl for better error logging.
|
||||
---
|
||||
heartbeat/podman-etcd | 61 +++++++++++++++++++++++++++----------------
|
||||
1 file changed, 38 insertions(+), 23 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 77525ddb7..06814ad89 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -1082,7 +1082,7 @@ add_member_as_learner()
|
||||
local peer_url=$(ip_url $member_ip)
|
||||
|
||||
ocf_log info "add $member_name ($member_ip) to the member list as learner"
|
||||
- out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
|
||||
+ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner 2>&1)
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
|
||||
@@ -1429,10 +1429,22 @@ detect_cluster_leadership_loss()
|
||||
manage_peer_membership()
|
||||
{
|
||||
local member_list_json="$1"
|
||||
+ local peer_ip_map_entry
|
||||
+ local peer_member_name
|
||||
+ local peer_member_ip
|
||||
+ local peer_member_id
|
||||
+
|
||||
+ # Get peer node name and IP
|
||||
+ peer_ip_map_entry=$(echo "$OCF_RESKEY_node_ip_map" | tr ';' '\n' | grep -vF "$NODENAME")
|
||||
+ if [ -z "$peer_ip_map_entry" ]; then
|
||||
+ ocf_exit_reason "manage_peer_membership: could not parse node_ip_map: '$OCF_RESKEY_node_ip_map'"
|
||||
+ exit $OCF_ERR_CONFIGURED
|
||||
+ fi
|
||||
+ peer_member_name=$(echo "$peer_ip_map_entry" | cut -d: -f1)
|
||||
+ peer_member_ip=$(echo "$peer_ip_map_entry" | cut -d: -f2-)
|
||||
|
||||
- # Example of .members[] instance fields in member list json format:
|
||||
- # NOTE that "name" is present in voting members only, while "isLearner" in learner members only
|
||||
- # and the value is always true (not a string) in that case.
|
||||
+ # Parsing the member list's json output to find a "learner" member.
|
||||
+ # Example of .members[] instance fields in member list json format:
|
||||
# {
|
||||
# "ID": <member ID>,
|
||||
# "name": "<node hostname>",
|
||||
@@ -1443,26 +1455,28 @@ manage_peer_membership()
|
||||
# "https://<node IP>:2379"
|
||||
# ]
|
||||
# }
|
||||
- for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
|
||||
- name=$(echo "$node" | cut -d: -f1)
|
||||
- # do not check itself
|
||||
- if [ "$name" = "$NODENAME" ]; then
|
||||
- continue
|
||||
- fi
|
||||
+ # NOTE that the "name" field is present in voting members only, while "isLearner"
|
||||
+ # field in learner members only and the value is always true (not a string) in that case.
|
||||
+ peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID")
|
||||
+ if [ -z "$peer_member_id" ]; then
|
||||
+ ocf_log info "$peer_member_name is not in the members list"
|
||||
+ add_member_as_learner "$peer_member_name" "$peer_member_ip"
|
||||
+ set_standalone_node
|
||||
+ return
|
||||
+ fi
|
||||
|
||||
- # Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name.
|
||||
- ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
|
||||
- peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
|
||||
- if [ -z "$peer_member_id" ]; then
|
||||
- ocf_log info "$name is not in the members list"
|
||||
- add_member_as_learner "$name" "$ip"
|
||||
- set_standalone_node
|
||||
- else
|
||||
- ocf_log debug "$name is in the members list by IP: $ip"
|
||||
- # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
|
||||
- reconcile_member_state "$member_list_json"
|
||||
- fi
|
||||
- done
|
||||
+ # Ensure learner_node attribute is always set when we have a learner member
|
||||
+ local learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
|
||||
+ local current_learner_node=$(attribute_learner_node get)
|
||||
+ if [ -n "$learner_member_id" ] && [ -z "$current_learner_node" ]; then
|
||||
+ ocf_log debug "$peer_member_name found as learner in member list, but learner_node attribute was not set. Updating"
|
||||
+ attribute_learner_node update "$peer_member_name"
|
||||
+ return
|
||||
+ fi
|
||||
+
|
||||
+ ocf_log debug "$peer_member_name is in the members list by IP: $peer_member_ip"
|
||||
+ # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
|
||||
+ reconcile_member_state "$member_list_json"
|
||||
}
|
||||
|
||||
check_peer()
|
||||
@@ -2209,6 +2223,7 @@ podman_start()
|
||||
peer_node_ip="$(attribute_node_ip_peer)"
|
||||
if [ -n "$peer_node_name" ] && [ -n "$peer_node_ip" ]; then
|
||||
add_member_as_learner "$peer_node_name" "$peer_node_ip"
|
||||
+ set_standalone_node
|
||||
else
|
||||
ocf_log err "could not add peer as learner (peer node name: ${peer_node_name:-unknown}, peer ip: ${peer_node_ip:-unknown})"
|
||||
fi
|
||||
46
SOURCES/RHEL-22715-LVM-activate-fix-false-positive.patch
Normal file
46
SOURCES/RHEL-22715-LVM-activate-fix-false-positive.patch
Normal file
@ -0,0 +1,46 @@
|
||||
From 65a066cf9066390db65c4875e21c4c391793b9ae Mon Sep 17 00:00:00 2001
|
||||
From: Arslan Ahmad <arslan.ahmad97@googlemail.com>
|
||||
Date: Tue, 16 Jan 2024 09:11:17 +0530
|
||||
Subject: [PATCH] Avoid false positive for VG activation
|
||||
|
||||
When lvm.conf file has `volume_list` parameter configured and the
|
||||
cluster is managing the shared storage using `system_id_source`,
|
||||
then the activation of the LV fails. However, it is
|
||||
reported as a success.
|
||||
|
||||
This fix prevents the `LVM-activate` resource from starting when
|
||||
the cluster is configured with both `system_id_source` and
|
||||
`volume_list`.
|
||||
|
||||
Signed-off-by: Arslan Ahmad <arslan.ahmad97@googlemail.com>
|
||||
---
|
||||
heartbeat/LVM-activate | 9 +++++++++
|
||||
1 file changed, 9 insertions(+)
|
||||
|
||||
diff --git a/heartbeat/LVM-activate b/heartbeat/LVM-activate
|
||||
index f6f24a3b5..3858ed8dc 100755
|
||||
--- a/heartbeat/LVM-activate
|
||||
+++ b/heartbeat/LVM-activate
|
||||
@@ -448,6 +448,10 @@ systemid_check()
|
||||
{
|
||||
# system_id_source is set in lvm.conf
|
||||
source=$(lvmconfig 'global/system_id_source' 2>/dev/null | cut -d"=" -f2)
|
||||
+
|
||||
+ # Is volume_list set in lvm.conf
|
||||
+ vol_list=$(lvmconfig 'activation/volume_list' 2>/dev/null | cut -d"=" -f2)
|
||||
+
|
||||
if [ "$source" = "" ] || [ "$source" = "none" ]; then
|
||||
ocf_exit_reason "system_id_source in lvm.conf is not set correctly!"
|
||||
exit $OCF_ERR_ARGS
|
||||
@@ -458,6 +462,11 @@ systemid_check()
|
||||
exit $OCF_ERR_ARGS
|
||||
fi
|
||||
|
||||
+ if [ -n "$source" ] && [ -n "$vol_list" ]; then
|
||||
+ ocf_exit_reason "Both system_id_source & volume_list cannot be defined!"
|
||||
+ exit $OCF_ERR_ARGS
|
||||
+ fi
|
||||
+
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
@ -0,0 +1,40 @@
|
||||
From 264e38e02cb4c04877e412bac254e42c7f6b2e1c Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Tue, 20 Feb 2024 12:34:42 +0100
|
||||
Subject: [PATCH] Filesystem: fail when leading or trailing whitespace is
|
||||
present in device or directory parameters
|
||||
|
||||
---
|
||||
heartbeat/Filesystem | 12 ++++++++++++
|
||||
1 file changed, 12 insertions(+)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index e1378f781..f88e3b552 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -995,6 +995,12 @@ if [ -n "${OCF_RESKEY_force_unmount}" ]; then
|
||||
fi
|
||||
|
||||
DEVICE="$OCF_RESKEY_device"
|
||||
+case "$DEVICE" in
|
||||
+ [[:space:]]*|*[[:space:]])
|
||||
+ ocf_exit_reason "device parameter does not accept leading or trailing whitespace characters"
|
||||
+ exit $OCF_ERR_CONFIGURED
|
||||
+ ;;
|
||||
+esac
|
||||
FSTYPE=$OCF_RESKEY_fstype
|
||||
if [ ! -z "$OCF_RESKEY_options" ]; then
|
||||
options="-o $OCF_RESKEY_options"
|
||||
@@ -1032,6 +1038,12 @@ if [ -z "$OCF_RESKEY_directory" ]; then
|
||||
else
|
||||
MOUNTPOINT="$(echo "$OCF_RESKEY_directory" | sed 's/\/*$//')"
|
||||
: ${MOUNTPOINT:=/}
|
||||
+ case "$MOUNTPOINT" in
|
||||
+ [[:space:]]*|*[[:space:]])
|
||||
+ ocf_exit_reason "directory parameter does not accept leading or trailing whitespace characters"
|
||||
+ exit $OCF_ERR_CONFIGURED
|
||||
+ ;;
|
||||
+ esac
|
||||
if [ -e "$MOUNTPOINT" ] ; then
|
||||
CANONICALIZED_MOUNTPOINT="$(readlink -f "$MOUNTPOINT")"
|
||||
if [ $? -ne 0 ]; then
|
||||
@ -0,0 +1,30 @@
|
||||
From 1317efc72af6b72d9fb37aea18dc16129c146148 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Tue, 25 Jun 2024 13:33:19 +0200
|
||||
Subject: [PATCH] Filesystem: return success during stop-action when leading or
|
||||
trailing whitespace is present in device or directory parameters
|
||||
|
||||
---
|
||||
heartbeat/Filesystem | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index 8e0127531..3eb520e0c 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -1037,6 +1037,7 @@ fi
|
||||
DEVICE="$OCF_RESKEY_device"
|
||||
case "$DEVICE" in
|
||||
[[:space:]]*|*[[:space:]])
|
||||
+ [ "$__OCF_ACTION" = "stop" ] && exit $OCF_SUCCESS
|
||||
ocf_exit_reason "device parameter does not accept leading or trailing whitespace characters"
|
||||
exit $OCF_ERR_CONFIGURED
|
||||
;;
|
||||
@@ -1080,6 +1081,7 @@ else
|
||||
: ${MOUNTPOINT:=/}
|
||||
case "$MOUNTPOINT" in
|
||||
[[:space:]]*|*[[:space:]])
|
||||
+ [ "$__OCF_ACTION" = "stop" ] && exit $OCF_SUCCESS
|
||||
ocf_exit_reason "directory parameter does not accept leading or trailing whitespace characters"
|
||||
exit $OCF_ERR_CONFIGURED
|
||||
;;
|
||||
25
SOURCES/RHEL-32265-1-findif.sh-fix-corner-cases.patch
Normal file
25
SOURCES/RHEL-32265-1-findif.sh-fix-corner-cases.patch
Normal file
@ -0,0 +1,25 @@
|
||||
From f717b4a3aa83c9124e62716f421b99e314d00233 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Fri, 12 Apr 2024 12:23:21 +0200
|
||||
Subject: [PATCH] findif.sh: fix corner cases
|
||||
|
||||
---
|
||||
heartbeat/findif.sh | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/findif.sh b/heartbeat/findif.sh
|
||||
index 7c766e6e0..13484f827 100644
|
||||
--- a/heartbeat/findif.sh
|
||||
+++ b/heartbeat/findif.sh
|
||||
@@ -215,9 +215,9 @@ findif()
|
||||
fi
|
||||
if [ -n "$nic" ] ; then
|
||||
# NIC supports more than two.
|
||||
- set -- $(ip -o -f $family route list match $match $scope | grep "dev $nic " | awk 'BEGIN{best=0} /\// { mask=$1; sub(".*/", "", mask); if( int(mask)>=best ) { best=int(mask); best_ln=$0; } } END{print best_ln}')
|
||||
+ set -- $(ip -o -f $family route list match $match $scope | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
else
|
||||
- set -- $(ip -o -f $family route list match $match $scope | awk 'BEGIN{best=0} /\// { mask=$1; sub(".*/", "", mask); if( int(mask)>=best ) { best=int(mask); best_ln=$0; } } END{print best_ln}')
|
||||
+ set -- $(ip -o -f $family route list match $match $scope | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
fi
|
||||
if [ $# = 0 ] ; then
|
||||
case $OCF_RESKEY_ip in
|
||||
365
SOURCES/RHEL-32265-2-IPsrcaddr-add-IPv6-support.patch
Normal file
365
SOURCES/RHEL-32265-2-IPsrcaddr-add-IPv6-support.patch
Normal file
@ -0,0 +1,365 @@
|
||||
From 12d73d53026d219be67c0d5353010ba08ab49e98 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Tue, 28 May 2024 09:45:55 +0200
|
||||
Subject: [PATCH 1/3] findif.sh: add metric for IPv6 support and fail when
|
||||
matching more than 1 route
|
||||
|
||||
---
|
||||
heartbeat/findif.sh | 19 ++++++++++++++++---
|
||||
1 file changed, 16 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/findif.sh b/heartbeat/findif.sh
|
||||
index 13484f827..ca5d1a5c1 100644
|
||||
--- a/heartbeat/findif.sh
|
||||
+++ b/heartbeat/findif.sh
|
||||
@@ -196,10 +196,13 @@ findif()
|
||||
{
|
||||
local match="$OCF_RESKEY_ip"
|
||||
local family
|
||||
+ local proto
|
||||
local scope
|
||||
local nic="$OCF_RESKEY_nic"
|
||||
local netmask="$OCF_RESKEY_cidr_netmask"
|
||||
local brdcast="$OCF_RESKEY_broadcast"
|
||||
+ local metric
|
||||
+ local routematch
|
||||
|
||||
echo $match | grep -qs ":"
|
||||
if [ $? = 0 ] ; then
|
||||
@@ -215,10 +218,19 @@ findif()
|
||||
fi
|
||||
if [ -n "$nic" ] ; then
|
||||
# NIC supports more than two.
|
||||
- set -- $(ip -o -f $family route list match $match $scope | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
+ routematch=$(ip -o -f $family route list match $match $proto $scope | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
else
|
||||
- set -- $(ip -o -f $family route list match $match $scope | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
+ routematch=$(ip -o -f $family route list match $match $proto $scope | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
fi
|
||||
+ if [ "$family" = "inet6" ]; then
|
||||
+ routematch=$(echo "$routematch" | grep -v "^default")
|
||||
+ fi
|
||||
+
|
||||
+ if [ $(echo "$routematch" | wc -l) -gt 1 ]; then
|
||||
+ ocf_exit_reason "More than 1 routes match $match. Unable to decide which route to use."
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ set -- $routematch
|
||||
if [ $# = 0 ] ; then
|
||||
case $OCF_RESKEY_ip in
|
||||
127.*)
|
||||
@@ -255,6 +267,7 @@ findif()
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
fi
|
||||
- echo "$nic netmask $netmask broadcast $brdcast"
|
||||
+ metric=$(echo "$@" | sed "s/.*metric[[:blank:]]\([^ ]\+\).*/\1/")
|
||||
+ echo "$nic netmask $netmask broadcast $brdcast metric $metric"
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
From 488c096d63fe0f7e15938e65483ba20628080198 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Tue, 28 May 2024 09:47:11 +0200
|
||||
Subject: [PATCH 2/3] IPaddr2: use metric for IPv6
|
||||
|
||||
---
|
||||
heartbeat/IPaddr2 | 11 ++++++++---
|
||||
1 file changed, 8 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2
|
||||
index 5f30b8f98..091bea418 100755
|
||||
--- a/heartbeat/IPaddr2
|
||||
+++ b/heartbeat/IPaddr2
|
||||
@@ -561,10 +561,11 @@ ip_init() {
|
||||
if
|
||||
[ $rc -eq 0 ]
|
||||
then
|
||||
- NICINFO=`echo "$NICINFO" | sed -e 's/netmask\ //;s/broadcast\ //'`
|
||||
+ NICINFO=`echo "$NICINFO" | sed -e 's/netmask\ //;s/broadcast\ //;s/metric\ //'`
|
||||
NIC=`echo "$NICINFO" | cut -d" " -f1`
|
||||
NETMASK=`echo "$NICINFO" | cut -d" " -f2`
|
||||
BRDCAST=`echo "$NICINFO" | cut -d" " -f3`
|
||||
+ METRIC=`echo "$NICINFO" | cut -d" " -f4`
|
||||
else
|
||||
# findif couldn't find the interface
|
||||
if ocf_is_probe; then
|
||||
@@ -659,13 +660,14 @@ delete_interface () {
|
||||
# Add an interface
|
||||
#
|
||||
add_interface () {
|
||||
- local cmd msg extra_opts ipaddr netmask broadcast iface label
|
||||
+ local cmd msg extra_opts ipaddr netmask broadcast iface label metric
|
||||
|
||||
ipaddr="$1"
|
||||
netmask="$2"
|
||||
broadcast="$3"
|
||||
iface="$4"
|
||||
label="$5"
|
||||
+ metric="$6"
|
||||
|
||||
if [ "$FAMILY" = "inet" ] && ocf_is_true $OCF_RESKEY_run_arping &&
|
||||
check_binary arping; then
|
||||
@@ -688,6 +690,9 @@ add_interface () {
|
||||
fi
|
||||
|
||||
extra_opts=""
|
||||
+ if [ "$FAMILY" = "inet6" ]; then
|
||||
+ extra_opts="$extra_opts metric $metric"
|
||||
+ fi
|
||||
if [ "$FAMILY" = "inet6" ] && ocf_is_true "${OCF_RESKEY_nodad}"; then
|
||||
extra_opts="$extra_opts nodad"
|
||||
fi
|
||||
@@ -1083,7 +1088,7 @@ ip_start() {
|
||||
done
|
||||
fi
|
||||
|
||||
- add_interface $OCF_RESKEY_ip $NETMASK ${BRDCAST:-none} $NIC $IFLABEL
|
||||
+ add_interface "$OCF_RESKEY_ip" "$NETMASK" "${BRDCAST:-none}" "$NIC" "$IFLABEL" "$METRIC"
|
||||
rc=$?
|
||||
|
||||
if [ $rc -ne $OCF_SUCCESS ]; then
|
||||
|
||||
From d1c4d1969381d3e35cfaaaaae522e5687a9ed88a Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Tue, 28 May 2024 09:47:56 +0200
|
||||
Subject: [PATCH 3/3] IPsrcaddr: add IPv6 support
|
||||
|
||||
---
|
||||
heartbeat/IPsrcaddr | 116 ++++++++++++++++++++++++++++++++------------
|
||||
1 file changed, 85 insertions(+), 31 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr
|
||||
index c732ce8df..1c87d5b7f 100755
|
||||
--- a/heartbeat/IPsrcaddr
|
||||
+++ b/heartbeat/IPsrcaddr
|
||||
@@ -60,6 +60,7 @@ OCF_RESKEY_cidr_netmask_default=""
|
||||
OCF_RESKEY_destination_default="0.0.0.0/0"
|
||||
OCF_RESKEY_proto_default=""
|
||||
OCF_RESKEY_metric_default=""
|
||||
+OCF_RESKEY_pref_default=""
|
||||
OCF_RESKEY_table_default=""
|
||||
|
||||
: ${OCF_RESKEY_ipaddress=${OCF_RESKEY_ipaddress_default}}
|
||||
@@ -67,6 +68,7 @@ OCF_RESKEY_table_default=""
|
||||
: ${OCF_RESKEY_destination=${OCF_RESKEY_destination_default}}
|
||||
: ${OCF_RESKEY_proto=${OCF_RESKEY_proto_default}}
|
||||
: ${OCF_RESKEY_metric=${OCF_RESKEY_metric_default}}
|
||||
+: ${OCF_RESKEY_pref=${OCF_RESKEY_pref_default}}
|
||||
: ${OCF_RESKEY_table=${OCF_RESKEY_table_default}}
|
||||
#######################################################################
|
||||
|
||||
@@ -75,10 +77,13 @@ OCF_RESKEY_table_default=""
|
||||
|
||||
USAGE="usage: $0 {start|stop|status|monitor|validate-all|meta-data}";
|
||||
|
||||
- CMDSHOW="$IP2UTIL route show $TABLE to exact $OCF_RESKEY_destination"
|
||||
-CMDCHANGE="$IP2UTIL route change to "
|
||||
+echo "$OCF_RESKEY_ipaddress" | grep -q ":" && FAMILY="inet6" || FAMILY="inet"
|
||||
+[ "$FAMILY" = "inet6" ] && [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] && OCF_RESKEY_destination="::/0"
|
||||
|
||||
-if [ "$OCF_RESKEY_destination" != "0.0.0.0/0" ]; then
|
||||
+ CMDSHOW="$IP2UTIL -f $FAMILY route show $TABLE to exact $OCF_RESKEY_destination"
|
||||
+CMDCHANGE="$IP2UTIL -f $FAMILY route change to "
|
||||
+
|
||||
+if [ "$OCF_RESKEY_destination" != "0.0.0.0/0" ] && [ "$OCF_RESKEY_destination" != "::/0" ]; then
|
||||
CMDSHOW="$CMDSHOW src $OCF_RESKEY_ipaddress"
|
||||
fi
|
||||
|
||||
@@ -153,6 +158,14 @@ Metric. Only needed if incorrect metric value is used.
|
||||
<content type="string" default="${OCF_RESKEY_metric_default}" />
|
||||
</parameter>
|
||||
|
||||
+<parameter name="pref">
|
||||
+<longdesc lang="en">
|
||||
+IPv6 route preference (low, medium or high). Only needed if incorrect pref value is used.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">IPv6 route preference.</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_pref_default}" />
|
||||
+</parameter>
|
||||
+
|
||||
<parameter name="table">
|
||||
<longdesc lang="en">
|
||||
Table to modify and use for interface lookup. E.g. "local".
|
||||
@@ -196,12 +209,21 @@ errorexit() {
|
||||
# where the src clause "src Y.Y.Y.Y" may or may not be present
|
||||
|
||||
WS="[[:blank:]]"
|
||||
-OCTET="[0-9]\{1,3\}"
|
||||
-IPADDR="\($OCTET\.\)\{3\}$OCTET"
|
||||
+case "$FAMILY" in
|
||||
+ inet)
|
||||
+ GROUP="[0-9]\{1,3\}"
|
||||
+ IPADDR="\($GROUP\.\)\{3\}$GROUP"
|
||||
+ ;;
|
||||
+ inet6)
|
||||
+ GROUP="[0-9a-f]\{0,4\}"
|
||||
+ IPADDR="\($GROUP\:\)\{0,\}$GROUP"
|
||||
+ ;;
|
||||
+esac
|
||||
SRCCLAUSE="src$WS$WS*\($IPADDR\)"
|
||||
-MATCHROUTE="\(.*${WS}\)\($SRCCLAUSE\)\($WS.*\|$\)"
|
||||
-METRICCLAUSE=".*\(metric$WS[^ ]\+\)"
|
||||
+MATCHROUTE="\(.*${WS}\)proto [^ ]\+\(.*${WS}\)\($SRCCLAUSE\)\($WS.*\|$\)"
|
||||
+METRICCLAUSE=".*\(metric$WS[^ ]\+\).*"
|
||||
PROTOCLAUSE=".*\(proto$WS[^ ]\+\).*"
|
||||
+PREFCLAUSE=".*\(pref$WS[^ ]\+\).*"
|
||||
FINDIF=findif
|
||||
|
||||
# findif needs that to be set
|
||||
@@ -216,17 +238,17 @@ srca_read() {
|
||||
errorexit "more than 1 matching route exists"
|
||||
|
||||
# But there might still be no matching route
|
||||
- [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] && [ -z "$ROUTE" ] && \
|
||||
+ ([ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ]) && [ -z "$ROUTE" ] && \
|
||||
! ocf_is_probe && [ "$__OCF_ACTION" != stop ] && errorexit "no matching route exists"
|
||||
|
||||
# Sed out the source ip address if it exists
|
||||
- SRCIP=`echo $ROUTE | sed -n "s/$MATCHROUTE/\3/p"`
|
||||
+ SRCIP=`echo $ROUTE | sed -n "s/$MATCHROUTE/\4/p"`
|
||||
|
||||
# and what remains after stripping out the source ip address clause
|
||||
- ROUTE_WO_SRC=`echo $ROUTE | sed "s/$MATCHROUTE/\1\5/"`
|
||||
+ ROUTE_WO_SRC=`echo $ROUTE | sed "s/$MATCHROUTE/\1\2\6/"`
|
||||
|
||||
# using "src <ip>" only returns output if there's a match
|
||||
- if [ "$OCF_RESKEY_destination" != "0.0.0.0/0" ]; then
|
||||
+ if [ "$OCF_RESKEY_destination" != "0.0.0.0/0" ] && [ "$OCF_RESKEY_destination" != "::/0" ]; then
|
||||
[ -z "$ROUTE" ] && return 1 || return 0
|
||||
fi
|
||||
|
||||
@@ -249,12 +271,15 @@ srca_start() {
|
||||
rc=$OCF_SUCCESS
|
||||
ocf_log info "The ip route has been already set.($NETWORK, $INTERFACE, $ROUTE_WO_SRC)"
|
||||
else
|
||||
- $IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC || \
|
||||
- errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC' failed"
|
||||
+ # NetworkManager manages routes with proto static/kernel
|
||||
+ [ -z "$OCF_RESKEY_proto" ] && echo "$PROTO" | grep -q "proto \(kernel\|static\)" && PROTO="proto keepalived"
|
||||
|
||||
- if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then
|
||||
- $CMDCHANGE $ROUTE_WO_SRC src $1 || \
|
||||
- errorexit "command '$CMDCHANGE $ROUTE_WO_SRC src $1' failed"
|
||||
+ $IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC $PREF || \
|
||||
+ errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC $PREF' failed"
|
||||
+
|
||||
+ if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ]; then
|
||||
+ $CMDCHANGE $ROUTE_WO_SRC $PROTO src $1 || \
|
||||
+ errorexit "command '$CMDCHANGE $ROUTE_WO_SRC $PROTO src $1' failed"
|
||||
fi
|
||||
rc=$?
|
||||
fi
|
||||
@@ -290,14 +315,15 @@ srca_stop() {
|
||||
fi
|
||||
|
||||
PRIMARY_IP="$($IP2UTIL -4 -o addr show dev $INTERFACE primary | awk '{split($4,a,"/");print a[1]}')"
|
||||
- OPTS="proto kernel scope $SCOPE src $PRIMARY_IP"
|
||||
+ OPTS="proto kernel scope $SCOPE"
|
||||
+ [ "$FAMILY" = "inet" ] && OPTS="$OPTS src $PRIMARY_IP"
|
||||
|
||||
- $IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC || \
|
||||
- errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC' failed"
|
||||
+ $IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC $PREF || \
|
||||
+ errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC $PREF' failed"
|
||||
|
||||
- if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then
|
||||
- $CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP || \
|
||||
- errorexit "command '$CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP' failed"
|
||||
+ if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ]; then
|
||||
+ $CMDCHANGE $ROUTE_WO_SRC proto static || \
|
||||
+ errorexit "command '$CMDCHANGE $ROUTE_WO_SRC proto static' failed"
|
||||
fi
|
||||
|
||||
return $?
|
||||
@@ -330,7 +356,7 @@ CheckIP() {
|
||||
case $ip in
|
||||
*[!0-9.]*) #got invalid char
|
||||
false;;
|
||||
- .*|*.) #begin or end by ".", which is invalid
|
||||
+ .*|*.) #begin or end with ".", which is invalid
|
||||
false;;
|
||||
*..*) #consecutive ".", which is invalid
|
||||
false;;
|
||||
@@ -356,6 +382,18 @@ CheckIP() {
|
||||
return $? # This return is unnecessary, this comment too :)
|
||||
}
|
||||
|
||||
+CheckIP6() {
|
||||
+ ip="$1"
|
||||
+ case $ip in
|
||||
+ *[!0-9a-f:]*) #got invalid char
|
||||
+ false;;
|
||||
+ *:::*) # more than 2 consecutive ":", which is invalid
|
||||
+ false;;
|
||||
+ *::*::*) # more than 1 "::", which is invalid
|
||||
+ false;;
|
||||
+ esac
|
||||
+}
|
||||
+
|
||||
#
|
||||
# Find out which interface or alias serves the given IP address
|
||||
# The argument is an IP address, and its output
|
||||
@@ -396,8 +434,7 @@ find_interface_solaris() {
|
||||
# is an (aliased) interface name (e.g., "eth0" and "eth0:0").
|
||||
#
|
||||
find_interface_generic() {
|
||||
-
|
||||
- local iface=`$IP2UTIL -o -f inet addr show | grep "\ $BASEIP" \
|
||||
+ local iface=`$IP2UTIL -o -f $FAMILY addr show | grep "\ $BASEIP" \
|
||||
| cut -d ' ' -f2 | grep -v '^ipsec[0-9][0-9]*$'`
|
||||
if [ -z "$iface" ]; then
|
||||
return $OCF_ERR_GENERIC
|
||||
@@ -502,7 +539,9 @@ srca_validate_all() {
|
||||
|
||||
# The IP address should be in good shape
|
||||
if CheckIP "$ipaddress"; then
|
||||
- :
|
||||
+ :
|
||||
+ elif CheckIP6 "$ipaddress"; then
|
||||
+ :
|
||||
else
|
||||
ocf_exit_reason "Invalid IP address [$ipaddress]"
|
||||
return $OCF_ERR_CONFIGURED
|
||||
@@ -570,21 +609,36 @@ rc=$?
|
||||
}
|
||||
|
||||
INTERFACE=`echo $findif_out | awk '{print $1}'`
|
||||
-LISTROUTE=`$IP2UTIL route list dev $INTERFACE scope link $PROTO match $ipaddress`
|
||||
+case "$FAMILY" in
|
||||
+ inet)
|
||||
+ LISTCMD="$IP2UTIL -f $FAMILY route list dev $INTERFACE scope link $PROTO match $ipaddress"
|
||||
+ ;;
|
||||
+ inet6)
|
||||
+ LISTCMD="$IP2UTIL -f $FAMILY route list dev $INTERFACE $PROTO match $ipaddress"
|
||||
+ ;;
|
||||
+esac
|
||||
+LISTROUTE=`$LISTCMD`
|
||||
+
|
||||
[ -z "$PROTO" ] && PROTO=`echo $LISTROUTE | sed -n "s/$PROTOCLAUSE/\1/p"`
|
||||
if [ -n "$OCF_RESKEY_metric" ]; then
|
||||
METRIC="metric $OCF_RESKEY_metric"
|
||||
-elif [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ]; then
|
||||
+elif [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ] || [ "$FAMILY" = "inet6" ]; then
|
||||
METRIC=`echo $LISTROUTE | sed -n "s/$METRICCLAUSE/\1/p"`
|
||||
else
|
||||
METRIC=""
|
||||
fi
|
||||
-if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then
|
||||
+if [ "$FAMILY" = "inet6" ]; then
|
||||
+ if [ -z "$OCF_RESKEY_pref" ]; then
|
||||
+ PREF=`echo $LISTROUTE | sed -n "s/$PREFCLAUSE/\1/p"`
|
||||
+ else
|
||||
+ PREF="pref $OCF_RESKEY_pref"
|
||||
+ fi
|
||||
+fi
|
||||
+if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ] ;then
|
||||
NETWORK=`echo $LISTROUTE | grep -m 1 -o '^[^ ]*'`
|
||||
|
||||
if [ -z "$NETWORK" ]; then
|
||||
- err_str="command '$IP2UTIL route list dev $INTERFACE scope link $PROTO"
|
||||
- err_str="$err_str match $ipaddress' failed to find a matching route"
|
||||
+ err_str="command '$LISTCMD' failed to find a matching route"
|
||||
|
||||
if [ "$__OCF_ACTION" = "start" ]; then
|
||||
ocf_exit_reason "$err_str"
|
||||
@ -0,0 +1,22 @@
|
||||
From 4075aff88776e2811ebc83b735b2a70bcf46247f Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 24 Jun 2024 09:45:29 +0200
|
||||
Subject: [PATCH] IPaddr2: only set metric value for IPv6 when detected
|
||||
|
||||
---
|
||||
heartbeat/IPaddr2 | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2
|
||||
index 091bea418..3bc5abec1 100755
|
||||
--- a/heartbeat/IPaddr2
|
||||
+++ b/heartbeat/IPaddr2
|
||||
@@ -690,7 +690,7 @@ add_interface () {
|
||||
fi
|
||||
|
||||
extra_opts=""
|
||||
- if [ "$FAMILY" = "inet6" ]; then
|
||||
+ if [ "$FAMILY" = "inet6" ] && [ -n "$metric" ]; then
|
||||
extra_opts="$extra_opts metric $metric"
|
||||
fi
|
||||
if [ "$FAMILY" = "inet6" ] && ocf_is_true "${OCF_RESKEY_nodad}"; then
|
||||
@ -0,0 +1,25 @@
|
||||
From f561e272e9b7fe94ba598b70c6d2f44d034446ed Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Wed, 14 Aug 2024 12:05:54 +0200
|
||||
Subject: [PATCH] findif.sh: ignore unreachable, blackhole, and prohibit routes
|
||||
|
||||
---
|
||||
heartbeat/findif.sh | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/findif.sh b/heartbeat/findif.sh
|
||||
index ca5d1a5c1..7b817f75c 100644
|
||||
--- a/heartbeat/findif.sh
|
||||
+++ b/heartbeat/findif.sh
|
||||
@@ -218,9 +218,9 @@ findif()
|
||||
fi
|
||||
if [ -n "$nic" ] ; then
|
||||
# NIC supports more than two.
|
||||
- routematch=$(ip -o -f $family route list match $match $proto $scope | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
+ routematch=$(ip -o -f $family route list match $match $proto $scope | grep -v "^\(unreachable\|prohibit\|blackhole\)" | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
else
|
||||
- routematch=$(ip -o -f $family route list match $match $proto $scope | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
+ routematch=$(ip -o -f $family route list match $match $proto $scope | grep -v "^\(unreachable\|prohibit\|blackhole\)" | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
fi
|
||||
if [ "$family" = "inet6" ]; then
|
||||
routematch=$(echo "$routematch" | grep -v "^default")
|
||||
@ -0,0 +1,36 @@
|
||||
From f23ae9c1e9ff9a44a053c7c2378975ac5b807478 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Thu, 29 Aug 2024 16:24:02 +0200
|
||||
Subject: [PATCH] IPsrcaddr: specify dev for default route, as e.g. fe80::
|
||||
routes can be present on multiple interfaces
|
||||
|
||||
---
|
||||
heartbeat/IPsrcaddr | 8 ++++----
|
||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr
|
||||
index 1c87d5b7f..58d89a280 100755
|
||||
--- a/heartbeat/IPsrcaddr
|
||||
+++ b/heartbeat/IPsrcaddr
|
||||
@@ -278,8 +278,8 @@ srca_start() {
|
||||
errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC $PREF' failed"
|
||||
|
||||
if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ]; then
|
||||
- $CMDCHANGE $ROUTE_WO_SRC $PROTO src $1 || \
|
||||
- errorexit "command '$CMDCHANGE $ROUTE_WO_SRC $PROTO src $1' failed"
|
||||
+ $CMDCHANGE $ROUTE_WO_SRC dev $INTERFACE $PROTO src $1 || \
|
||||
+ errorexit "command '$CMDCHANGE $ROUTE_WO_SRC dev $INTERFACE $PROTO src $1' failed"
|
||||
fi
|
||||
rc=$?
|
||||
fi
|
||||
@@ -322,8 +322,8 @@ srca_stop() {
|
||||
errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC $PREF' failed"
|
||||
|
||||
if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ]; then
|
||||
- $CMDCHANGE $ROUTE_WO_SRC proto static || \
|
||||
- errorexit "command '$CMDCHANGE $ROUTE_WO_SRC proto static' failed"
|
||||
+ $CMDCHANGE $ROUTE_WO_SRC dev $INTERFACE proto static || \
|
||||
+ errorexit "command '$CMDCHANGE $ROUTE_WO_SRC dev $INTERFACE proto static' failed"
|
||||
fi
|
||||
|
||||
return $?
|
||||
23
SOURCES/RHEL-32829-db2-fix-OCF_SUCESS-typo.patch
Normal file
23
SOURCES/RHEL-32829-db2-fix-OCF_SUCESS-typo.patch
Normal file
@ -0,0 +1,23 @@
|
||||
From a9c4aeb971e9f4963345d0e215b729def62dd27c Mon Sep 17 00:00:00 2001
|
||||
From: pepadelic <162310096+pepadelic@users.noreply.github.com>
|
||||
Date: Mon, 15 Apr 2024 13:52:54 +0200
|
||||
Subject: [PATCH] Update db2: fix OCF_SUCESS name in db2_notify
|
||||
|
||||
fix OCF_SUCESS to OCF_SUCCESS in db2_notify
|
||||
---
|
||||
heartbeat/db2 | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/db2 b/heartbeat/db2
|
||||
index 95447ab6cb..1cd66f15af 100755
|
||||
--- a/heartbeat/db2
|
||||
+++ b/heartbeat/db2
|
||||
@@ -848,7 +848,7 @@ db2_notify() {
|
||||
|
||||
# only interested in pre-start
|
||||
[ $OCF_RESKEY_CRM_meta_notify_type = pre \
|
||||
- -a $OCF_RESKEY_CRM_meta_notify_operation = start ] || return $OCF_SUCESS
|
||||
+ -a $OCF_RESKEY_CRM_meta_notify_operation = start ] || return $OCF_SUCCESS
|
||||
|
||||
# gets FIRST_ACTIVE_LOG
|
||||
db2_get_cfg $dblist || return $?
|
||||
@ -0,0 +1,110 @@
|
||||
From 66a5308d2e8f61093716a076f4386416dc18045c Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 22 Apr 2024 11:26:09 +0200
|
||||
Subject: [PATCH] Filesystem: fail when incorrect device mounted on mountpoint,
|
||||
and dont unmount the mountpoint in this case, or if mountpoint set to "/"
|
||||
|
||||
---
|
||||
heartbeat/Filesystem | 71 ++++++++++++++++++++++++++++++++++++--------
|
||||
1 file changed, 58 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index e1378f781..cec71f1a6 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -582,10 +582,16 @@ Filesystem_start()
|
||||
fi
|
||||
|
||||
# See if the device is already mounted.
|
||||
- if Filesystem_status >/dev/null 2>&1 ; then
|
||||
- ocf_log info "Filesystem $MOUNTPOINT is already mounted."
|
||||
- return $OCF_SUCCESS
|
||||
- fi
|
||||
+ Filesystem_status
|
||||
+ case "$?" in
|
||||
+ $OCF_SUCCESS)
|
||||
+ ocf_log info "Filesystem $MOUNTPOINT is already mounted."
|
||||
+ return $OCF_SUCCESS
|
||||
+ ;;
|
||||
+ $OCF_ERR_CONFIGURED)
|
||||
+ return $OCF_ERR_CONFIGURED
|
||||
+ ;;
|
||||
+ esac
|
||||
|
||||
fstype_supported || exit $OCF_ERR_INSTALLED
|
||||
|
||||
@@ -801,10 +807,42 @@ Filesystem_stop()
|
||||
#
|
||||
Filesystem_status()
|
||||
{
|
||||
- match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}"
|
||||
- if list_mounts | grep "$match_string" >/dev/null 2>&1; then
|
||||
- rc=$OCF_SUCCESS
|
||||
- msg="$MOUNTPOINT is mounted (running)"
|
||||
+ local match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}"
|
||||
+ local mounted_device=$(list_mounts | grep "$match_string" | awk '{print $1}')
|
||||
+
|
||||
+ if [ -n "$mounted_device" ]; then
|
||||
+ if [ "X$blockdevice" = "Xyes" ]; then
|
||||
+ if [ -e "$DEVICE" ] ; then
|
||||
+ local canonicalized_device="$(readlink -f "$DEVICE")"
|
||||
+ if [ $? -ne 0 ]; then
|
||||
+ ocf_exit_reason "Could not canonicalize $DEVICE because readlink failed"
|
||||
+ exit $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ else
|
||||
+ local canonicalized_device="$DEVICE"
|
||||
+ fi
|
||||
+ if [ -e "$mounted_device" ] ; then
|
||||
+ local canonicalized_mounted_device="$(readlink -f "$mounted_device")"
|
||||
+ if [ $? -ne 0 ]; then
|
||||
+ ocf_exit_reason "Could not canonicalize $mounted_device because readlink failed"
|
||||
+ exit $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ else
|
||||
+ local canonicalized_mounted_device="$mounted_device"
|
||||
+ fi
|
||||
+ if [ "$canonicalized_device" != "$canonicalized_mounted_device" ]; then
|
||||
+ if ocf_is_probe || [ "$__OCF_ACTION" = "stop" ]; then
|
||||
+ ocf_log debug "Another device ($mounted_device) is already mounted on $MOUNTPOINT"
|
||||
+ rc=$OCF_NOT_RUNNING
|
||||
+ else
|
||||
+ ocf_exit_reason "Another device ($mounted_device) is already mounted on $MOUNTPOINT"
|
||||
+ rc=$OCF_ERR_CONFIGURED
|
||||
+ fi
|
||||
+ fi
|
||||
+ else
|
||||
+ rc=$OCF_SUCCESS
|
||||
+ msg="$MOUNTPOINT is mounted (running)"
|
||||
+ fi
|
||||
else
|
||||
rc=$OCF_NOT_RUNNING
|
||||
msg="$MOUNTPOINT is unmounted (stopped)"
|
||||
@@ -1041,9 +1079,18 @@ else
|
||||
else
|
||||
CANONICALIZED_MOUNTPOINT="$MOUNTPOINT"
|
||||
fi
|
||||
- # At this stage, $MOUNTPOINT does not contain trailing "/" unless it is "/"
|
||||
- # TODO: / mounted via Filesystem sounds dangerous. On stop, we'll
|
||||
- # kill the whole system. Is that a good idea?
|
||||
+
|
||||
+ if echo "$CANONICALIZED_MOUNTPOINT" | grep -q "^\s*/\s*$"; then
|
||||
+ if ocf_is_probe; then
|
||||
+ ocf_log debug "/ cannot be managed in a cluster"
|
||||
+ exit $OCF_NOT_RUNNING
|
||||
+ elif [ "$__OCF_ACTION" = "start" ] || [ "$__OCF_ACTION" = "monitor" ] || [ "$__OCF_ACTION" = "status" ]; then
|
||||
+ ocf_exit_reason "/ cannot be managed in a cluster"
|
||||
+ exit $OCF_ERR_CONFIGURED
|
||||
+ elif [ "$__OCF_ACTION" = "stop" ]; then
|
||||
+ exit $OCF_SUCCESS
|
||||
+ fi
|
||||
+ fi
|
||||
fi
|
||||
|
||||
# Check to make sure the utilites are found
|
||||
@@ -1124,5 +1171,3 @@ case $OP in
|
||||
;;
|
||||
esac
|
||||
exit $?
|
||||
-
|
||||
-
|
||||
@ -0,0 +1,22 @@
|
||||
From 4b09b3e467a7f8076bbf20f5b027efecf16303e7 Mon Sep 17 00:00:00 2001
|
||||
From: Gianluca Piccolo <gianluca.piccolo@wuerth-phoenix.com>
|
||||
Date: Thu, 6 Jun 2024 17:34:41 +0200
|
||||
Subject: [PATCH] Fix #1944
|
||||
|
||||
---
|
||||
heartbeat/Filesystem | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index a445349b9..59b6c1b51 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -664,7 +664,7 @@ get_pids()
|
||||
if [ "X${HOSTOS}" = "XOpenBSD" ];then
|
||||
fstat | grep $dir | awk '{print $3}'
|
||||
else
|
||||
- $FUSER -m $dir 2>/dev/null
|
||||
+ $FUSER -Mm $dir 2>/dev/null
|
||||
fi
|
||||
elif [ "$FORCE_UNMOUNT" = "safe" ]; then
|
||||
procs=$(find /proc/[0-9]*/ -type l -lname "${dir}/*" -or -lname "${dir}" 2>/dev/null | awk -F/ '{print $3}')
|
||||
26
SOURCES/RHEL-40393-Filesystem-2-update-bsd-logic.patch
Normal file
26
SOURCES/RHEL-40393-Filesystem-2-update-bsd-logic.patch
Normal file
@ -0,0 +1,26 @@
|
||||
From c9ba6ac66ee27a70c69e1156f17aa6beac277bc5 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Fri, 7 Jun 2024 14:23:28 +0200
|
||||
Subject: [PATCH] Filesystem: use fuser -c on FreeBSD, as -m and -M are used
|
||||
for other functionality
|
||||
|
||||
---
|
||||
heartbeat/Filesystem | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index 59b6c1b51..88fe2e2eb 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -661,8 +661,10 @@ get_pids()
|
||||
fi
|
||||
|
||||
if ocf_is_true "$FORCE_UNMOUNT"; then
|
||||
- if [ "X${HOSTOS}" = "XOpenBSD" ];then
|
||||
+ if [ "X${HOSTOS}" = "XOpenBSD" ]; then
|
||||
fstat | grep $dir | awk '{print $3}'
|
||||
+ elif [ "X${HOSTOS}" = "XFreeBSD" ]; then
|
||||
+ $FUSER -c $dir 2>/dev/null
|
||||
else
|
||||
$FUSER -Mm $dir 2>/dev/null
|
||||
fi
|
||||
@ -0,0 +1,333 @@
|
||||
From 7739c2a802c1dddb6757ff75cf7f6582a89bd518 Mon Sep 17 00:00:00 2001
|
||||
From: id <happytobi@tscoding.de>
|
||||
Date: Fri, 31 May 2024 09:00:18 +0200
|
||||
Subject: [PATCH] azure-events-az: update to API versions, add retry
|
||||
functionality for metadata requests, update tests
|
||||
|
||||
---
|
||||
heartbeat/azure-events-az.in | 117 ++++++++++++++++++++++++-----------
|
||||
heartbeat/ocf.py | 50 +++++++++++++--
|
||||
2 files changed, 126 insertions(+), 41 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in
|
||||
index 46d4d1f3d9..6d31e5abae 100644
|
||||
--- a/heartbeat/azure-events-az.in
|
||||
+++ b/heartbeat/azure-events-az.in
|
||||
@@ -27,7 +27,7 @@ import ocf
|
||||
##############################################################################
|
||||
|
||||
|
||||
-VERSION = "0.10"
|
||||
+VERSION = "0.20"
|
||||
USER_AGENT = "Pacemaker-ResourceAgent/%s %s" % (VERSION, ocf.distro())
|
||||
|
||||
attr_globalPullState = "azure-events-az_globalPullState"
|
||||
@@ -39,9 +39,6 @@ attr_healthstate = "#health-azure"
|
||||
default_loglevel = ocf.logging.INFO
|
||||
default_relevantEventTypes = set(["Reboot", "Redeploy"])
|
||||
|
||||
-global_pullMaxAttempts = 3
|
||||
-global_pullDelaySecs = 1
|
||||
-
|
||||
##############################################################################
|
||||
|
||||
class attrDict(defaultdict):
|
||||
@@ -71,16 +68,22 @@ class azHelper:
|
||||
metadata_host = "http://169.254.169.254/metadata"
|
||||
instance_api = "instance"
|
||||
events_api = "scheduledevents"
|
||||
- api_version = "2019-08-01"
|
||||
+ events_api_version = "2020-07-01"
|
||||
+ instance_api_version = "2021-12-13"
|
||||
|
||||
@staticmethod
|
||||
- def _sendMetadataRequest(endpoint, postData=None):
|
||||
+ def _sendMetadataRequest(endpoint, postData=None, api_version="2019-08-01"):
|
||||
"""
|
||||
Send a request to Azure's Azure Metadata Service API
|
||||
"""
|
||||
- url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, azHelper.api_version)
|
||||
+
|
||||
+ retryCount = int(ocf.get_parameter("retry_count",3))
|
||||
+ retryWaitTime = int(ocf.get_parameter("retry_wait",20))
|
||||
+ requestTimeout = int(ocf.get_parameter("request_timeout",15))
|
||||
+
|
||||
+ url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, api_version)
|
||||
data = ""
|
||||
- ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s" % (endpoint, postData))
|
||||
+ ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s, retry_count = %s, retry_wait time = %s, request_timeout = %s" % (endpoint, postData, retryCount, retryWaitTime, requestTimeout))
|
||||
ocf.logger.debug("_sendMetadataRequest: url = %s" % url)
|
||||
|
||||
if postData and type(postData) != bytes:
|
||||
@@ -89,18 +92,37 @@ class azHelper:
|
||||
req = urllib2.Request(url, postData)
|
||||
req.add_header("Metadata", "true")
|
||||
req.add_header("User-Agent", USER_AGENT)
|
||||
- try:
|
||||
- resp = urllib2.urlopen(req)
|
||||
- except URLError as e:
|
||||
- if hasattr(e, 'reason'):
|
||||
- ocf.logger.warning("Failed to reach the server: %s" % e.reason)
|
||||
- clusterHelper.setAttr(attr_globalPullState, "IDLE")
|
||||
- elif hasattr(e, 'code'):
|
||||
- ocf.logger.warning("The server couldn\'t fulfill the request. Error code: %s" % e.code)
|
||||
- clusterHelper.setAttr(attr_globalPullState, "IDLE")
|
||||
- else:
|
||||
- data = resp.read()
|
||||
- ocf.logger.debug("_sendMetadataRequest: response = %s" % data)
|
||||
+
|
||||
+ if retryCount > 0:
|
||||
+ ocf.logger.debug("_sendMetadataRequest: retry enabled")
|
||||
+
|
||||
+ successful = None
|
||||
+ for retry in range(retryCount+1):
|
||||
+ try:
|
||||
+ resp = urllib2.urlopen(req, timeout=requestTimeout)
|
||||
+ except Exception as e:
|
||||
+ excType = e.__class__.__name__
|
||||
+ if excType == TimeoutError.__name__:
|
||||
+ ocf.logger.warning("Request timed out after %s seconds Error: %s" % (requestTimeout, e))
|
||||
+ if excType == URLError.__name__:
|
||||
+ if hasattr(e, 'reason'):
|
||||
+ ocf.logger.warning("Failed to reach the server: %s" % e.reason)
|
||||
+ elif hasattr(e, 'code'):
|
||||
+ ocf.logger.warning("The server couldn\'t fulfill the request. Error code: %s" % e.code)
|
||||
+
|
||||
+ if retryCount > 1 and retry != retryCount:
|
||||
+ ocf.logger.warning("Request failed, retry (%s/%s) wait %s seconds before retry (wait time)" % (retry + 1,retryCount,retryWaitTime))
|
||||
+ time.sleep(retryWaitTime)
|
||||
+
|
||||
+ else:
|
||||
+ data = resp.read()
|
||||
+ ocf.logger.debug("_sendMetadataRequest: response = %s" % data)
|
||||
+ successful = 1
|
||||
+ break
|
||||
+
|
||||
+ # When no request was successful also with retry enabled, set the cluster to idle
|
||||
+ if successful is None:
|
||||
+ clusterHelper.setAttr(attr_globalPullState, "IDLE")
|
||||
|
||||
if data:
|
||||
data = json.loads(data)
|
||||
@@ -115,14 +137,15 @@ class azHelper:
|
||||
"""
|
||||
ocf.logger.debug("getInstanceInfo: begin")
|
||||
|
||||
- jsondata = azHelper._sendMetadataRequest(azHelper.instance_api)
|
||||
+ jsondata = azHelper._sendMetadataRequest(azHelper.instance_api, None, azHelper.instance_api_version)
|
||||
ocf.logger.debug("getInstanceInfo: json = %s" % jsondata)
|
||||
|
||||
if jsondata:
|
||||
ocf.logger.debug("getInstanceInfo: finished, returning {}".format(jsondata["compute"]))
|
||||
return attrDict(jsondata["compute"])
|
||||
else:
|
||||
- ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info")
|
||||
+ apiCall = "%s/%s?api-version=%s" % (azHelper.metadata_host, azHelper.instance_api, azHelper.instance_api_version)
|
||||
+ ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info - call: %s" % apiCall)
|
||||
sys.exit(ocf.OCF_ERR_GENERIC)
|
||||
|
||||
@staticmethod
|
||||
@@ -132,11 +155,17 @@ class azHelper:
|
||||
"""
|
||||
ocf.logger.debug("pullScheduledEvents: begin")
|
||||
|
||||
- jsondata = azHelper._sendMetadataRequest(azHelper.events_api)
|
||||
+ jsondata = azHelper._sendMetadataRequest(azHelper.events_api, None, azHelper.events_api_version)
|
||||
ocf.logger.debug("pullScheduledEvents: json = %s" % jsondata)
|
||||
|
||||
- ocf.logger.debug("pullScheduledEvents: finished")
|
||||
- return attrDict(jsondata)
|
||||
+ if jsondata:
|
||||
+ ocf.logger.debug("pullScheduledEvents: finished")
|
||||
+ return attrDict(jsondata)
|
||||
+ else:
|
||||
+ apiCall = "%s/%s?api-version=%s" % (azHelper.metadata_host, azHelper.events_api, azHelper.events_api_version)
|
||||
+ ocf.ocf_exit_reason("pullScheduledEvents: Unable to get scheduledevents info - call: %s" % apiCall)
|
||||
+ sys.exit(ocf.OCF_ERR_GENERIC)
|
||||
+
|
||||
|
||||
@staticmethod
|
||||
def forceEvents(eventIDs):
|
||||
@@ -534,7 +563,7 @@ class Node:
|
||||
except ValueError:
|
||||
# Handle the exception
|
||||
ocf.logger.warn("Health attribute %s on node %s cannot be converted to an integer value" % (healthAttributeStr, node))
|
||||
-
|
||||
+
|
||||
ocf.logger.debug("isNodeInStandby: finished - result %s" % isInStandy)
|
||||
return isInStandy
|
||||
|
||||
@@ -584,7 +613,7 @@ class raAzEvents:
|
||||
|
||||
def monitor(self):
|
||||
ocf.logger.debug("monitor: begin")
|
||||
-
|
||||
+
|
||||
events = azHelper.pullScheduledEvents()
|
||||
|
||||
# get current document version
|
||||
@@ -600,21 +629,21 @@ class raAzEvents:
|
||||
ocf.logger.info("monitor: already handled curDocVersion, skip")
|
||||
return ocf.OCF_SUCCESS
|
||||
|
||||
- localAzEventIDs = set()
|
||||
+ localAzEventIds = dict()
|
||||
for e in localEvents:
|
||||
- localAzEventIDs.add(e.EventId)
|
||||
+ localAzEventIds[e.EventId] = json.dumps(e)
|
||||
|
||||
curState = self.node.getState()
|
||||
clusterEventIDs = self.node.getEventIDs()
|
||||
|
||||
ocf.logger.debug("monitor: curDocVersion has not been handled yet")
|
||||
-
|
||||
+
|
||||
if clusterEventIDs:
|
||||
# there are pending events set, so our state must be STOPPING or IN_EVENT
|
||||
i = 0; touchedEventIDs = False
|
||||
while i < len(clusterEventIDs):
|
||||
# clean up pending events that are already finished according to AZ
|
||||
- if clusterEventIDs[i] not in localAzEventIDs:
|
||||
+ if clusterEventIDs[i] not in localAzEventIds.keys():
|
||||
ocf.logger.info("monitor: remove finished local clusterEvent %s" % (clusterEventIDs[i]))
|
||||
clusterEventIDs.pop(i)
|
||||
touchedEventIDs = True
|
||||
@@ -644,12 +673,12 @@ class raAzEvents:
|
||||
ocf.logger.info("monitor: all local events finished, but some resources have not completed startup yet -> wait")
|
||||
else:
|
||||
if curState == AVAILABLE:
|
||||
- if len(localAzEventIDs) > 0:
|
||||
+ if len(localAzEventIds) > 0:
|
||||
if clusterHelper.otherNodesAvailable(self.node):
|
||||
- ocf.logger.info("monitor: can handle local events %s -> set state STOPPING" % (str(localAzEventIDs)))
|
||||
- curState = self.node.updateNodeStateAndEvents(STOPPING, localAzEventIDs)
|
||||
+ ocf.logger.info("monitor: can handle local events %s -> set state STOPPING - %s" % (str(list(localAzEventIds.keys())), str(list(localAzEventIds.values()))))
|
||||
+ curState = self.node.updateNodeStateAndEvents(STOPPING, localAzEventIds.keys())
|
||||
else:
|
||||
- ocf.logger.info("monitor: cannot handle azEvents %s (only node available) -> set state ON_HOLD" % str(localAzEventIDs))
|
||||
+ ocf.logger.info("monitor: cannot handle azEvents %s (only node available) -> set state ON_HOLD - %s" % (str(list(localAzEventIds.keys())), str(list(localAzEventIds.values()))))
|
||||
self.node.setState(ON_HOLD)
|
||||
else:
|
||||
ocf.logger.debug("monitor: no local azEvents to handle")
|
||||
@@ -761,6 +790,24 @@ def main():
|
||||
longdesc="Set to true to enable verbose logging",
|
||||
content_type="boolean",
|
||||
default="false")
|
||||
+ agent.add_parameter(
|
||||
+ "retry_count",
|
||||
+ shortdesc="Azure IMDS webservice retry count",
|
||||
+ longdesc="Set to any number bigger than zero to enable retry count",
|
||||
+ content_type="integer",
|
||||
+ default="3")
|
||||
+ agent.add_parameter(
|
||||
+ "retry_wait",
|
||||
+ shortdesc="Configure a retry wait time",
|
||||
+ longdesc="Set retry wait time in seconds",
|
||||
+ content_type="integer",
|
||||
+ default="20")
|
||||
+ agent.add_parameter(
|
||||
+ "request_timeout",
|
||||
+ shortdesc="Configure a request timeout",
|
||||
+ longdesc="Set request timeout in seconds",
|
||||
+ content_type="integer",
|
||||
+ default="15")
|
||||
agent.add_action("start", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
|
||||
agent.add_action("stop", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
|
||||
agent.add_action("validate-all", timeout=20, handler=validate_action)
|
||||
diff --git a/heartbeat/ocf.py b/heartbeat/ocf.py
|
||||
index dda2fed4bb..571cd19664 100644
|
||||
--- a/heartbeat/ocf.py
|
||||
+++ b/heartbeat/ocf.py
|
||||
@@ -16,7 +16,7 @@
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
-#
|
||||
+#
|
||||
|
||||
import sys, os, logging, syslog
|
||||
|
||||
@@ -42,19 +42,19 @@
|
||||
# OCF does not include the concept of master/slave resources so we
|
||||
# need to extend it so we can discover a resource's complete state.
|
||||
#
|
||||
-# OCF_RUNNING_MASTER:
|
||||
+# OCF_RUNNING_MASTER:
|
||||
# The resource is in "master" mode and fully operational
|
||||
# OCF_FAILED_MASTER:
|
||||
# The resource is in "master" mode but in a failed state
|
||||
-#
|
||||
+#
|
||||
# The extra two values should only be used during a probe.
|
||||
#
|
||||
# Probes are used to discover resources that were started outside of
|
||||
# the CRM and/or left behind if the LRM fails.
|
||||
-#
|
||||
+#
|
||||
# They can be identified in RA scripts by checking for:
|
||||
# [ "${__OCF_ACTION}" = "monitor" -a "${OCF_RESKEY_CRM_meta_interval}" = "0" ]
|
||||
-#
|
||||
+#
|
||||
# Failed "slaves" should continue to use: OCF_ERR_GENERIC
|
||||
# Fully operational "slaves" should continue to use: OCF_SUCCESS
|
||||
#
|
||||
@@ -451,15 +451,17 @@ def value_for_parameter(param):
|
||||
sys.exit(OCF_ERR_UNIMPLEMENTED)
|
||||
|
||||
|
||||
+
|
||||
if __name__ == "__main__":
|
||||
import unittest
|
||||
+ import logging
|
||||
|
||||
class TestMetadata(unittest.TestCase):
|
||||
def test_noparams_noactions(self):
|
||||
m = Agent("foo", shortdesc="shortdesc", longdesc="longdesc")
|
||||
self.assertEqual("""<?xml version="1.0"?>
|
||||
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
|
||||
-<resource-agent name="foo">
|
||||
+<resource-agent name="foo" version="1.0">
|
||||
<version>1.0</version>
|
||||
<longdesc lang="en">
|
||||
longdesc
|
||||
@@ -483,4 +485,40 @@ def test_params_actions(self):
|
||||
m.add_action("start")
|
||||
self.assertEqual(str(m.actions[0]), '<action name="start" />\n')
|
||||
|
||||
+ def test_retry_params_actions(self):
|
||||
+ log= logging.getLogger( "test_retry_params_actions" )
|
||||
+
|
||||
+ m = Agent("foo", shortdesc="shortdesc", longdesc="longdesc")
|
||||
+ m.add_parameter(
|
||||
+ "retry_count",
|
||||
+ shortdesc="Azure ims webservice retry count",
|
||||
+ longdesc="Set to any number bigger than zero to enable retry count",
|
||||
+ content_type="integer",
|
||||
+ default="0")
|
||||
+ m.add_parameter(
|
||||
+ "retry_wait",
|
||||
+ shortdesc="Configure a retry wait time",
|
||||
+ longdesc="Set retry wait time in seconds",
|
||||
+ content_type="integer",
|
||||
+ default="20")
|
||||
+ m.add_parameter(
|
||||
+ "request_timeout",
|
||||
+ shortdesc="Configure a request timeout",
|
||||
+ longdesc="Set request timeout in seconds",
|
||||
+ content_type="integer",
|
||||
+ default="15")
|
||||
+
|
||||
+ m.add_action("start")
|
||||
+
|
||||
+ log.debug( "actions= %s", str(m.actions[0] ))
|
||||
+ self.assertEqual(str(m.actions[0]), '<action name="start" />\n')
|
||||
+
|
||||
+ log.debug( "parameters= %s", str(m.parameters[0] ))
|
||||
+ log.debug( "parameters= %s", str(m.parameters[1] ))
|
||||
+ log.debug( "parameters= %s", str(m.parameters[2] ))
|
||||
+ self.assertEqual(str(m.parameters[0]), '<parameter name="retry_count">\n<longdesc lang="en">Set to any number bigger than zero to enable retry count</longdesc>\n<shortdesc lang="en">Azure ims webservice retry count</shortdesc>\n<content type="integer" default="0" />\n</parameter>\n')
|
||||
+ self.assertEqual(str(m.parameters[1]), '<parameter name="retry_wait">\n<longdesc lang="en">Set retry wait time in seconds</longdesc>\n<shortdesc lang="en">Configure a retry wait time</shortdesc>\n<content type="integer" default="20" />\n</parameter>\n')
|
||||
+ self.assertEqual(str(m.parameters[2]), '<parameter name="request_timeout">\n<longdesc lang="en">Set request timeout in seconds</longdesc>\n<shortdesc lang="en">Configure a request timeout</shortdesc>\n<content type="integer" default="15" />\n</parameter>\n')
|
||||
+
|
||||
+ logging.basicConfig( stream=sys.stderr )
|
||||
unittest.main()
|
||||
1165
SOURCES/RHEL-42513-1-powervs-subnet-new-ra.patch
Normal file
1165
SOURCES/RHEL-42513-1-powervs-subnet-new-ra.patch
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,84 @@
|
||||
From 277370f569b34e1cfb49637f9a00afc20bcd4c54 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Wed, 17 Jul 2024 10:43:29 +0200
|
||||
Subject: [PATCH] build: dont build powervs-subnet if dependencies are missing
|
||||
|
||||
---
|
||||
configure.ac | 9 +++++++++
|
||||
doc/man/Makefile.am | 5 ++++-
|
||||
heartbeat/Makefile.am | 5 ++++-
|
||||
3 files changed, 17 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index b785e2c2c..21ce27423 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -519,6 +519,8 @@ fi
|
||||
|
||||
AC_PYTHON_MODULE(json)
|
||||
AC_PYTHON_MODULE(pyroute2)
|
||||
+AC_PYTHON_MODULE(requests)
|
||||
+AC_PYTHON_MODULE(urllib3)
|
||||
|
||||
AS_VERSION_COMPARE([$PYTHON_VERSION], [3.6], [BUILD_OCF_PY=0], [BUILD_OCF_PY=1], [BUILD_OCF_PY=1])
|
||||
|
||||
@@ -557,6 +559,13 @@ if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then
|
||||
fi
|
||||
AM_CONDITIONAL(BUILD_GCP_VPC_MOVE_VIP, test $BUILD_GCP_VPC_MOVE_VIP -eq 1)
|
||||
|
||||
+BUILD_POWERVS_SUBNET=1
|
||||
+if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0 || test "x${HAVE_PYMOD_REQUESTS}" != xyes || test "x${HAVE_PYMOD_URLLIB3}" != xyes; then
|
||||
+ BUILD_POWERVS_SUBNET=0
|
||||
+ AC_MSG_WARN("Not building powervs-subnet")
|
||||
+fi
|
||||
+AM_CONDITIONAL(BUILD_POWERVS_SUBNET, test $BUILD_POWERVS_SUBNET -eq 1)
|
||||
+
|
||||
AC_PATH_PROGS(ROUTE, route)
|
||||
AC_DEFINE_UNQUOTED(ROUTE, "$ROUTE", path to route command)
|
||||
|
||||
diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
|
||||
index e577e6357..ef7639bff 100644
|
||||
--- a/doc/man/Makefile.am
|
||||
+++ b/doc/man/Makefile.am
|
||||
@@ -190,7 +190,6 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \
|
||||
ocf_heartbeat_portblock.7 \
|
||||
ocf_heartbeat_postfix.7 \
|
||||
ocf_heartbeat_pound.7 \
|
||||
- ocf_heartbeat_powervs-subnet.7 \
|
||||
ocf_heartbeat_proftpd.7 \
|
||||
ocf_heartbeat_rabbitmq-cluster.7 \
|
||||
ocf_heartbeat_rabbitmq-server-ha.7 \
|
||||
@@ -238,6 +237,10 @@ if BUILD_GCP_VPC_MOVE_VIP
|
||||
man_MANS += ocf_heartbeat_gcp-vpc-move-vip.7
|
||||
endif
|
||||
|
||||
+if BUILD_POWERVS_SUBNET
|
||||
+man_MANS += ocf_heartbeat_powervs-subnet.7
|
||||
+endif
|
||||
+
|
||||
xmlfiles = $(man_MANS:.7=.xml)
|
||||
|
||||
%.1 %.5 %.7 %.8: %.xml
|
||||
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
|
||||
index ff73a15aa..409847970 100644
|
||||
--- a/heartbeat/Makefile.am
|
||||
+++ b/heartbeat/Makefile.am
|
||||
@@ -162,7 +162,6 @@ ocf_SCRIPTS = AoEtarget \
|
||||
portblock \
|
||||
postfix \
|
||||
pound \
|
||||
- powervs-subnet \
|
||||
proftpd \
|
||||
rabbitmq-cluster \
|
||||
rabbitmq-server-ha \
|
||||
@@ -207,6 +206,10 @@ if BUILD_GCP_VPC_MOVE_VIP
|
||||
ocf_SCRIPTS += gcp-vpc-move-vip
|
||||
endif
|
||||
|
||||
+if BUILD_POWERVS_SUBNET
|
||||
+ocf_SCRIPTS += powervs-subnet
|
||||
+endif
|
||||
+
|
||||
ocfcommondir = $(OCF_LIB_DIR_PREFIX)/heartbeat
|
||||
ocfcommon_DATA = ocf-shellfuncs \
|
||||
ocf-binaries \
|
||||
43
SOURCES/RHEL-42513-powervs-subnet-wait-for-IP.patch
Normal file
43
SOURCES/RHEL-42513-powervs-subnet-wait-for-IP.patch
Normal file
@ -0,0 +1,43 @@
|
||||
From 0b4bf9c23eb60455da6c6a16c1df19282ab2a8b5 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Fri, 9 Jan 2026 12:56:14 +0100
|
||||
Subject: [PATCH] powervs-subnet: wait until IP is activated before running
|
||||
monitor-check
|
||||
|
||||
---
|
||||
heartbeat/powervs-subnet.in | 15 +++++++++++++--
|
||||
1 file changed, 13 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/powervs-subnet.in b/heartbeat/powervs-subnet.in
|
||||
index 84e86c0c4..062b1235e 100755
|
||||
--- a/heartbeat/powervs-subnet.in
|
||||
+++ b/heartbeat/powervs-subnet.in
|
||||
@@ -243,7 +243,16 @@ class nmcli:
|
||||
|
||||
@staticmethod
|
||||
def up(name, **kwargs):
|
||||
- return nmcli._nmcli_cmd("connection", "up", name, **kwargs)
|
||||
+ nmcli._nmcli_cmd("connection", "up", name, **kwargs)
|
||||
+
|
||||
+ for i in range(1, 10):
|
||||
+ time.sleep(1)
|
||||
+ status = nmcli._nmcli_cmd("connection", "show", name, **kwargs)
|
||||
+ if len(status.get("IP4.ADDRESS[1]", "")) > 0:
|
||||
+ return ocf.OCF_SUCCESS
|
||||
+ ocf.logger.warning(f"nmcli.connection.up: check {i} of 10: IP not yet available.")
|
||||
+
|
||||
+ return ocf.OCF_ERR_GENERIC
|
||||
|
||||
@staticmethod
|
||||
def find(match_key, match_value):
|
||||
@@ -824,7 +833,9 @@ def start_action(
|
||||
conn_options.update({"802-3-ethernet.mtu": "9000", "ethtool.feature-tso": "on"})
|
||||
|
||||
nmcli.connection.add(conn_name, options=conn_options)
|
||||
- nmcli.connection.up(conn_name)
|
||||
+ rc = nmcli.connection.up(conn_name)
|
||||
+ if rc != ocf.OCF_SUCCESS:
|
||||
+ return rc
|
||||
|
||||
if monitor_action(**res_options) != ocf.OCF_SUCCESS:
|
||||
raise PowerCloudAPIError(f"start_action: start subnet: {ws.subnet_name} failed")
|
||||
@ -0,0 +1,61 @@
|
||||
From 481672f73d05666ab20a883cf8fc746cb1f3050f Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Thu, 20 Jun 2024 09:29:21 +0200
|
||||
Subject: [PATCH] galera/mariadb/mysql/redis: remove Unpromoted monitor-action,
|
||||
as it's covered by the regular monitor-action
|
||||
|
||||
---
|
||||
heartbeat/galera.in | 1 -
|
||||
heartbeat/mariadb.in | 1 -
|
||||
heartbeat/mysql | 1 -
|
||||
heartbeat/redis.in | 1 -
|
||||
4 files changed, 4 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/galera.in b/heartbeat/galera.in
|
||||
index b518595cb0..b29d68bf73 100755
|
||||
--- a/heartbeat/galera.in
|
||||
+++ b/heartbeat/galera.in
|
||||
@@ -299,7 +299,6 @@ Use it with caution! (and fencing)
|
||||
<action name="status" timeout="60s" />
|
||||
<action name="monitor" depth="0" timeout="30s" interval="20s" />
|
||||
<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" />
|
||||
-<action name="monitor" role="Unpromoted" depth="0" timeout="30s" interval="30s" />
|
||||
<action name="promote" timeout="300s" />
|
||||
<action name="demote" timeout="120s" />
|
||||
<action name="validate-all" timeout="5s" />
|
||||
diff --git a/heartbeat/mariadb.in b/heartbeat/mariadb.in
|
||||
index e0f1f3c9f1..1dca98ba68 100644
|
||||
--- a/heartbeat/mariadb.in
|
||||
+++ b/heartbeat/mariadb.in
|
||||
@@ -255,7 +255,6 @@ The port on which the Promoted MariaDB instance is listening.
|
||||
<action name="status" timeout="60s" />
|
||||
<action name="monitor" depth="0" timeout="30s" interval="20s" />
|
||||
<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" />
|
||||
-<action name="monitor" role="Unpromoted" depth="0" timeout="30s" interval="30s" />
|
||||
<action name="promote" timeout="120s" />
|
||||
<action name="demote" timeout="120s" />
|
||||
<action name="notify" timeout="90s" />
|
||||
diff --git a/heartbeat/mysql b/heartbeat/mysql
|
||||
index 1df2fc0f28..6b00889ff4 100755
|
||||
--- a/heartbeat/mysql
|
||||
+++ b/heartbeat/mysql
|
||||
@@ -322,7 +322,6 @@ whether a node is usable for clients to read from.</shortdesc>
|
||||
<action name="status" timeout="60s" />
|
||||
<action name="monitor" depth="0" timeout="30s" interval="20s" />
|
||||
<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" />
|
||||
-<action name="monitor" role="Unpromoted" depth="0" timeout="30s" interval="30s" />
|
||||
<action name="promote" timeout="120s" />
|
||||
<action name="demote" timeout="120s" />
|
||||
<action name="notify" timeout="90s" />
|
||||
diff --git a/heartbeat/redis.in b/heartbeat/redis.in
|
||||
index 6429477e11..1e541f13d5 100755
|
||||
--- a/heartbeat/redis.in
|
||||
+++ b/heartbeat/redis.in
|
||||
@@ -221,7 +221,6 @@ is in use.
|
||||
<action name="status" timeout="60s" />
|
||||
<action name="monitor" depth="0" timeout="60s" interval="45s" />
|
||||
<action name="monitor" role="Promoted" depth="0" timeout="60s" interval="20s" />
|
||||
-<action name="monitor" role="Unpromoted" depth="0" timeout="60s" interval="60s" />
|
||||
<action name="promote" timeout="120s" />
|
||||
<action name="demote" timeout="120s" />
|
||||
<action name="notify" timeout="90s" />
|
||||
@ -0,0 +1,43 @@
|
||||
From 2ab2c832180dacb2e66d38541beae0957416eb96 Mon Sep 17 00:00:00 2001
|
||||
From: Antonio Romito <aromito@redhat.com>
|
||||
Date: Mon, 9 Sep 2024 17:30:38 +0200
|
||||
Subject: [PATCH] Improve handling of "stopping" container removal in
|
||||
remove_container()
|
||||
|
||||
- Added handling for containers in a stopping state by checking the state and force-removing if necessary.
|
||||
- Improved log messages to provide clearer information when force removal is needed.
|
||||
|
||||
Related: https://issues.redhat.com/browse/RHEL-58008
|
||||
---
|
||||
heartbeat/podman | 11 +++++++++--
|
||||
1 file changed, 9 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman b/heartbeat/podman
|
||||
index 53867bff20..643ec4d894 100755
|
||||
--- a/heartbeat/podman
|
||||
+++ b/heartbeat/podman
|
||||
@@ -254,6 +254,13 @@ remove_container()
|
||||
ocf_run podman rm -v $CONTAINER
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
+ if [ $rc -eq 2 ]; then
|
||||
+ if podman inspect --format '{{.State.Status}}' $CONTAINER | grep -wq "stopping"; then
|
||||
+ ocf_log err "Inactive container ${CONTAINER} is stuck in 'stopping' state. Force-remove it."
|
||||
+ ocf_run podman rm -f $CONTAINER
|
||||
+ rc=$?
|
||||
+ fi
|
||||
+ fi
|
||||
# due to a podman bug (rhbz#1841485), sometimes a stopped
|
||||
# container can still be associated with Exec sessions, in
|
||||
# which case the "podman rm" has to be forced
|
||||
@@ -517,8 +524,8 @@ podman_stop()
|
||||
# but the associated container exit code is -1. If that's the case,
|
||||
# assume there's no failure and continue with the rm as usual.
|
||||
if [ $rc -eq 125 ] && \
|
||||
- podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -wq "stopped:-1"; then
|
||||
- ocf_log warn "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway."
|
||||
+ podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -Eq '^(exited|stopped):-1$'; then
|
||||
+ ocf_log err "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway."
|
||||
else
|
||||
ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
|
||||
return $OCF_ERR_GENERIC
|
||||
@ -0,0 +1,106 @@
|
||||
From d66a52cfb25f5436255ecc65a407c0166a720146 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Tue, 3 Sep 2024 12:55:28 +0200
|
||||
Subject: [PATCH 1/2] Filesystem: dont sleep during stop-action when there are
|
||||
no processes to kill
|
||||
|
||||
Thanks @SatomiOSAWA for the initial code.
|
||||
---
|
||||
heartbeat/Filesystem | 10 ++++++----
|
||||
1 file changed, 6 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index 3eb520e0c..f54969f20 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -685,12 +685,13 @@ signal_processes() {
|
||||
pids=$(get_pids "$dir")
|
||||
if [ -z "$pids" ]; then
|
||||
ocf_log info "No processes on $dir were signalled. force_unmount is set to '$FORCE_UNMOUNT'"
|
||||
- return
|
||||
+ return 1
|
||||
fi
|
||||
for pid in $pids; do
|
||||
ocf_log info "sending signal $sig to: $(ps -f $pid | tail -1)"
|
||||
kill -s $sig $pid
|
||||
done
|
||||
+ return 0
|
||||
}
|
||||
try_umount() {
|
||||
local SUB="$1"
|
||||
@@ -717,12 +718,13 @@ timeout_child() {
|
||||
return $ret
|
||||
}
|
||||
fs_stop_loop() {
|
||||
- local SUB="$1" signals="$2" sig
|
||||
+ local SUB="$1" signals="$2" sig send_signal
|
||||
while true; do
|
||||
+ send_signal=false
|
||||
for sig in $signals; do
|
||||
- signal_processes "$SUB" $sig
|
||||
+ signal_processes "$SUB" $sig && send_signal=true
|
||||
done
|
||||
- sleep $OCF_RESKEY_signal_delay
|
||||
+ $send_signal && sleep $OCF_RESKEY_signal_delay
|
||||
try_umount "$SUB" && return $OCF_SUCCESS
|
||||
done
|
||||
}
|
||||
|
||||
From cb6aaffc260eea0f0fee6fab44393c6cf12b8a83 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 9 Sep 2024 10:58:12 +0200
|
||||
Subject: [PATCH 2/2] Filesystem: only use $umount_force after sending
|
||||
kill_signals
|
||||
|
||||
---
|
||||
heartbeat/Filesystem | 12 ++++++------
|
||||
1 file changed, 6 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index f54969f20..4dd962fd9 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -694,8 +694,8 @@ signal_processes() {
|
||||
return 0
|
||||
}
|
||||
try_umount() {
|
||||
- local SUB="$1"
|
||||
- $UMOUNT $umount_force "$SUB"
|
||||
+ local force_arg="$1" SUB="$2"
|
||||
+ $UMOUNT $force_arg "$SUB"
|
||||
list_mounts | grep "${TAB}${SUB}${TAB}" >/dev/null 2>&1 || {
|
||||
ocf_log info "unmounted $SUB successfully"
|
||||
return $OCF_SUCCESS
|
||||
@@ -718,14 +718,14 @@ timeout_child() {
|
||||
return $ret
|
||||
}
|
||||
fs_stop_loop() {
|
||||
- local SUB="$1" signals="$2" sig send_signal
|
||||
+ local force_arg="$1" SUB="$2" signals="$3" sig send_signal
|
||||
while true; do
|
||||
send_signal=false
|
||||
for sig in $signals; do
|
||||
signal_processes "$SUB" $sig && send_signal=true
|
||||
done
|
||||
$send_signal && sleep $OCF_RESKEY_signal_delay
|
||||
- try_umount "$SUB" && return $OCF_SUCCESS
|
||||
+ try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS
|
||||
done
|
||||
}
|
||||
fs_stop() {
|
||||
@@ -733,13 +733,13 @@ fs_stop() {
|
||||
grace_time=$((timeout/2))
|
||||
|
||||
# try gracefully terminating processes for up to half of the configured timeout
|
||||
- fs_stop_loop "$SUB" "$OCF_RESKEY_term_signals" &
|
||||
+ fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" &
|
||||
timeout_child $! $grace_time
|
||||
ret=$?
|
||||
[ $ret -eq $OCF_SUCCESS ] && return $ret
|
||||
|
||||
# try killing them for the rest of the timeout
|
||||
- fs_stop_loop "$SUB" "$OCF_RESKEY_kill_signals" &
|
||||
+ fs_stop_loop "$umount_force" "$SUB" "$OCF_RESKEY_kill_signals" &
|
||||
timeout_child $! $grace_time
|
||||
ret=$?
|
||||
[ $ret -eq $OCF_SUCCESS ] && return $ret
|
||||
@ -0,0 +1,37 @@
|
||||
From c72dc2f2e502486d93aeec26abc12e720b14a0a7 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Thu, 10 Oct 2024 16:41:03 +0200
|
||||
Subject: [PATCH] azure-events*: use node name from cluster instead of hostname
|
||||
to avoid failing if they're not the same
|
||||
|
||||
---
|
||||
heartbeat/azure-events-az.in | 2 +-
|
||||
heartbeat/azure-events.in | 2 +-
|
||||
2 files changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in
|
||||
index 6d31e5aba..0ed001037 100644
|
||||
--- a/heartbeat/azure-events-az.in
|
||||
+++ b/heartbeat/azure-events-az.in
|
||||
@@ -441,7 +441,7 @@ class Node:
|
||||
self.raOwner = ra
|
||||
self.azInfo = azHelper.getInstanceInfo()
|
||||
self.azName = self.azInfo.name
|
||||
- self.hostName = socket.gethostname()
|
||||
+ self.hostName = clusterHelper._exec("crm_node", "-n")
|
||||
self.setAttr("azName", self.azName)
|
||||
clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName)
|
||||
|
||||
diff --git a/heartbeat/azure-events.in b/heartbeat/azure-events.in
|
||||
index 90acaba62..32f71ee26 100644
|
||||
--- a/heartbeat/azure-events.in
|
||||
+++ b/heartbeat/azure-events.in
|
||||
@@ -411,7 +411,7 @@ class Node:
|
||||
self.raOwner = ra
|
||||
self.azInfo = azHelper.getInstanceInfo()
|
||||
self.azName = self.azInfo.name
|
||||
- self.hostName = socket.gethostname()
|
||||
+ self.hostName = clusterHelper._exec("crm_node", "-n")
|
||||
self.setAttr("azName", self.azName)
|
||||
clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName)
|
||||
|
||||
@ -0,0 +1,38 @@
|
||||
From 38eaf00bc81af7530c56eba282918762a47a9326 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Thu, 19 Sep 2024 13:01:53 +0200
|
||||
Subject: [PATCH] nfsserver: also stop rpc-statd for nfsv4_only to avoid stop
|
||||
failing in some cases
|
||||
|
||||
E.g. nfs_no_notify=true nfsv4_only=true nfs_shared_infodir=/nfsmq/nfsinfo would cause a "Failed to unmount a bind mount" error
|
||||
---
|
||||
heartbeat/nfsserver | 16 +++++++---------
|
||||
1 file changed, 7 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver
|
||||
index 5793d7a70..fd9268afc 100755
|
||||
--- a/heartbeat/nfsserver
|
||||
+++ b/heartbeat/nfsserver
|
||||
@@ -947,15 +947,13 @@ nfsserver_stop ()
|
||||
sleep 1
|
||||
done
|
||||
|
||||
- if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then
|
||||
- nfs_exec stop rpc-statd > /dev/null 2>&1
|
||||
- ocf_log info "Stop: rpc-statd"
|
||||
- rpcinfo -t localhost 100024 > /dev/null 2>&1
|
||||
- rc=$?
|
||||
- if [ "$rc" -eq "0" ]; then
|
||||
- ocf_exit_reason "Failed to stop rpc-statd"
|
||||
- return $OCF_ERR_GENERIC
|
||||
- fi
|
||||
+ nfs_exec stop rpc-statd > /dev/null 2>&1
|
||||
+ ocf_log info "Stop: rpc-statd"
|
||||
+ rpcinfo -t localhost 100024 > /dev/null 2>&1
|
||||
+ rc=$?
|
||||
+ if [ "$rc" -eq "0" ]; then
|
||||
+ ocf_exit_reason "Failed to stop rpc-statd"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
nfs_exec stop nfs-idmapd > /dev/null 2>&1
|
||||
@ -0,0 +1,100 @@
|
||||
From f02afd0fadb581ca0fc9798beaf28044cf211200 Mon Sep 17 00:00:00 2001
|
||||
From: Lars Ellenberg <lars.ellenberg@linbit.com>
|
||||
Date: Wed, 18 Sep 2024 11:53:52 +0200
|
||||
Subject: [PATCH 1/2] Filesystem: on stop, try umount directly, before scanning
|
||||
for users
|
||||
|
||||
48ed6e6d (Filesystem: improve stop-action and allow setting term/kill signals and signal_delay for large filesystems, 2023-07-04)
|
||||
changed the logic from
|
||||
"try umount; if that fails, find and kill users; repeat" to
|
||||
"try to find and kill users; then try umount; repeat"
|
||||
|
||||
But even just walking /proc may take "a long time" on busy systems,
|
||||
and may still turn up with "no users found".
|
||||
|
||||
It will take even longer for "force_umount=safe"
|
||||
(observed 8 to 10 seconds just for get_pids() with "safe" to return nothing)
|
||||
than for "force_umount=yes" (still ~ 2 to 3 seconds),
|
||||
but it will take "a long time" in any case.
|
||||
(BTW, that may be longer than the hardcoded default of 6 seconds for "fast_stop",
|
||||
which is also the default on many systems now)
|
||||
|
||||
If the dependencies are properly configured,
|
||||
there should be no users left,
|
||||
and the umount should just work.
|
||||
|
||||
Revert back to "try umount first", and only then try to find "rogue" users.
|
||||
---
|
||||
heartbeat/Filesystem | 5 +++++
|
||||
1 file changed, 5 insertions(+)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index 4dd962fd9..99bddaf62 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -732,6 +732,11 @@ fs_stop() {
|
||||
local SUB="$1" timeout=$2 grace_time ret
|
||||
grace_time=$((timeout/2))
|
||||
|
||||
+ # Just walking /proc may take "a long time", even if we don't find any users of this FS.
|
||||
+ # If dependencies are properly configured, umount should just work.
|
||||
+ # Only if that fails, try to find and kill processes that still use it.
|
||||
+ try_umount "" "$SUB" && return $OCF_SUCCESS
|
||||
+
|
||||
# try gracefully terminating processes for up to half of the configured timeout
|
||||
fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" &
|
||||
timeout_child $! $grace_time
|
||||
|
||||
From b42d698f12aaeb871f4cc6a3c0327a27862b4376 Mon Sep 17 00:00:00 2001
|
||||
From: Lars Ellenberg <lars.ellenberg@linbit.com>
|
||||
Date: Wed, 18 Sep 2024 13:42:38 +0200
|
||||
Subject: [PATCH 2/2] Filesystem: stop/get_pids to be signaled
|
||||
|
||||
The "safe" way to get process ids that may be using a particular filesystem
|
||||
currently uses shell globs ("find /proc/[0-9]*").
|
||||
With a million processes (and/or a less capable shell),
|
||||
that may result in "Argument list too long".
|
||||
|
||||
Replace with find /proc -path "/proc/[0-9]*" instead.
|
||||
While at it, also fix the non-posix -or to be -o,
|
||||
and add explicit grouping parentheses \( \) and explicit -print.
|
||||
|
||||
Add a comment to not include "interesting" characters in mount point names.
|
||||
---
|
||||
heartbeat/Filesystem | 23 ++++++++++++++++++++---
|
||||
1 file changed, 20 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index 99bddaf62..3405e2c26 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -669,9 +669,26 @@ get_pids()
|
||||
$FUSER -Mm $dir 2>/dev/null
|
||||
fi
|
||||
elif [ "$FORCE_UNMOUNT" = "safe" ]; then
|
||||
- procs=$(find /proc/[0-9]*/ -type l -lname "${dir}/*" -or -lname "${dir}" 2>/dev/null | awk -F/ '{print $3}')
|
||||
- mmap_procs=$(grep " ${dir}/" /proc/[0-9]*/maps | awk -F/ '{print $3}')
|
||||
- printf "${procs}\n${mmap_procs}" | sort | uniq
|
||||
+ # Yes, in theory, ${dir} could contain "intersting" characters
|
||||
+ # and would need to be quoted for glob (find) and regex (grep).
|
||||
+ # Don't do that, then.
|
||||
+
|
||||
+ # Avoid /proc/[0-9]*, it may cause "Argument list too long".
|
||||
+ # There are several ways to filter for /proc/<pid>
|
||||
+ # -mindepth 1 -not -path "/proc/[0-9]*" -prune -o ...
|
||||
+ # -path "/proc/[!0-9]*" -prune -o ...
|
||||
+ # -path "/proc/[0-9]*" -a ...
|
||||
+ # the latter seemd to be significantly faster for this one in my naive test.
|
||||
+ procs=$(exec 2>/dev/null;
|
||||
+ find /proc -path "/proc/[0-9]*" -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
|
||||
+ awk -F/ '{print $3}' | uniq)
|
||||
+
|
||||
+ # This finds both /proc/<pid>/maps and /proc/<pid>/task/<tid>/maps;
|
||||
+ # if you don't want the latter, add -maxdepth.
|
||||
+ mmap_procs=$(exec 2>/dev/null;
|
||||
+ find /proc -path "/proc/[0-9]*/maps" -print |
|
||||
+ xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq)
|
||||
+ printf "${procs}\n${mmap_procs}" | sort -u
|
||||
fi
|
||||
}
|
||||
|
||||
@ -0,0 +1,48 @@
|
||||
From 82958dc115c47232ae0468b1ddf64e728ec325e4 Mon Sep 17 00:00:00 2001
|
||||
From: Georg Pfuetzenreuter <mail@georg-pfuetzenreuter.net>
|
||||
Date: Wed, 9 Oct 2024 00:16:44 +0200
|
||||
Subject: [PATCH] ocf-shellfuncs: systemd_drop_in only if needed
|
||||
|
||||
Avoid dbus overload upon many simultaneous "daemon-reload" invocations
|
||||
(when a resource agent using systemd_drop_in() is called multiple times
|
||||
as part of parallel resource operations in Pacemaker) by skipping the
|
||||
file creation and reload if the expected data already exists.
|
||||
|
||||
Whilst at it, align the indentation of the heredoc with the other parts
|
||||
of the function.
|
||||
|
||||
Signed-off-by: Georg Pfuetzenreuter <mail@georg-pfuetzenreuter.net>
|
||||
---
|
||||
heartbeat/ocf-shellfuncs.in | 19 +++++++++++--------
|
||||
1 file changed, 11 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in
|
||||
index 9335cbf00..5c4bb3264 100644
|
||||
--- a/heartbeat/ocf-shellfuncs.in
|
||||
+++ b/heartbeat/ocf-shellfuncs.in
|
||||
@@ -662,14 +662,17 @@ systemd_drop_in()
|
||||
systemdrundir="/run/systemd/system/resource-agents-deps.target.d"
|
||||
mkdir -p "$systemdrundir"
|
||||
conf_file="$systemdrundir/$1.conf"
|
||||
- cat >"$conf_file" <<EOF
|
||||
-[Unit]
|
||||
-$2=$3
|
||||
-EOF
|
||||
- # The information is accessible through systemd API and systemd would
|
||||
- # complain about improper permissions.
|
||||
- chmod o+r "$conf_file"
|
||||
- systemctl daemon-reload
|
||||
+ conf_line="$2=$3"
|
||||
+ if ! { [ -f "$conf_file" ] && grep -q "^$conf_line$" "$conf_file" ; } ; then
|
||||
+ cat > "$conf_file" <<-EOF
|
||||
+ [Unit]
|
||||
+ $conf_line
|
||||
+ EOF
|
||||
+ # The information is accessible through systemd API and systemd would
|
||||
+ # complain about improper permissions.
|
||||
+ chmod o+r "$conf_file"
|
||||
+ systemctl daemon-reload
|
||||
+ fi
|
||||
}
|
||||
|
||||
# usage: curl_retry RETRIES SLEEP ARGS URL
|
||||
@ -0,0 +1,132 @@
|
||||
From 6fab544e702a7601714cd017aecc00193f23ae72 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Fri, 11 Oct 2024 13:13:10 +0200
|
||||
Subject: [PATCH] IPaddr2: improve fail logic and check ip_status after adding
|
||||
IP
|
||||
|
||||
* check that the label got applied
|
||||
* return OCF_ERR_GENERIC to avoid false-positive when IP was manually added before starting the resource
|
||||
* check ip_status after adding IP to fail without having to wait for the first monitor-action
|
||||
|
||||
Co-authored-by: Evan J. Felix <evan.felix@pnnl.gov>
|
||||
---
|
||||
heartbeat/IPaddr2 | 35 ++++++++++++++++++++++++++---------
|
||||
1 file changed, 26 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2
|
||||
index e325aa574..27cae2d11 100755
|
||||
--- a/heartbeat/IPaddr2
|
||||
+++ b/heartbeat/IPaddr2
|
||||
@@ -586,7 +586,7 @@ ip_init() {
|
||||
exit $rc
|
||||
fi
|
||||
fi
|
||||
-
|
||||
+
|
||||
SENDARPPIDFILE="$SENDARPPIDDIR/send_arp-$OCF_RESKEY_ip"
|
||||
|
||||
if [ -n "$IFLABEL" ]; then
|
||||
@@ -985,6 +985,7 @@ run_send_ua() {
|
||||
# ok = served (for CIP: + hash bucket)
|
||||
# partial = served and no hash bucket (CIP only)
|
||||
# partial2 = served and no CIP iptables rule
|
||||
+# partial3 = served with no label
|
||||
# no = nothing
|
||||
#
|
||||
ip_served() {
|
||||
@@ -1002,6 +1003,11 @@ ip_served() {
|
||||
|
||||
if [ -z "$IP_CIP" ]; then
|
||||
for i in $cur_nic; do
|
||||
+ # check address label
|
||||
+ if [ -n "$IFLABEL" ] && [ -z "`$IP2UTIL -o -f $FAMILY addr show $nic label $IFLABEL`" ]; then
|
||||
+ echo partial3
|
||||
+ return 0
|
||||
+ fi
|
||||
# only mark as served when on the same interfaces as $NIC
|
||||
[ "$i" = "$NIC" ] || continue
|
||||
echo "ok"
|
||||
@@ -1065,7 +1071,12 @@ ip_start() {
|
||||
if [ "$ip_status" = "ok" ]; then
|
||||
exit $OCF_SUCCESS
|
||||
fi
|
||||
-
|
||||
+
|
||||
+ if [ "$ip_status" = "partial3" ]; then
|
||||
+ ocf_exit_reason "IP $OCF_RESKEY_ip available, but label missing"
|
||||
+ exit $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
if [ -n "$IP_CIP" ] && ([ $ip_status = "no" ] || [ $ip_status = "partial2" ]); then
|
||||
$MODPROBE ip_conntrack
|
||||
$IPADDR2_CIP_IPTABLES -I INPUT -d $OCF_RESKEY_ip -i $NIC -j CLUSTERIP \
|
||||
@@ -1083,7 +1094,7 @@ ip_start() {
|
||||
if [ -n "$IP_CIP" ] && [ $ip_status = "partial" ]; then
|
||||
echo "+$IP_INC_NO" >$IP_CIP_FILE
|
||||
fi
|
||||
-
|
||||
+
|
||||
if [ "$ip_status" = "no" ]; then
|
||||
if ocf_is_true ${OCF_RESKEY_lvs_support}; then
|
||||
for i in `find_interface $OCF_RESKEY_ip 32`; do
|
||||
@@ -1094,7 +1105,7 @@ ip_start() {
|
||||
esac
|
||||
done
|
||||
fi
|
||||
-
|
||||
+
|
||||
add_interface "$OCF_RESKEY_ip" "$NETMASK" "${BRDCAST:-none}" "$NIC" "$IFLABEL" "$METRIC"
|
||||
rc=$?
|
||||
|
||||
@@ -1102,6 +1113,12 @@ ip_start() {
|
||||
ocf_exit_reason "Failed to add $OCF_RESKEY_ip"
|
||||
exit $rc
|
||||
fi
|
||||
+
|
||||
+ ip_status=`ip_served`
|
||||
+ if [ "$ip_status" != "ok" ]; then
|
||||
+ ocf_exit_reason "Failed to add $OCF_RESKEY_ip with error $ip_status"
|
||||
+ exit $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
fi
|
||||
|
||||
case $NIC in
|
||||
@@ -1134,7 +1151,7 @@ ip_stop() {
|
||||
ocf_take_lock $CIP_lockfile
|
||||
ocf_release_lock_on_exit $CIP_lockfile
|
||||
fi
|
||||
-
|
||||
+
|
||||
if [ -f "$SENDARPPIDFILE" ] ; then
|
||||
kill `cat "$SENDARPPIDFILE"`
|
||||
if [ $? -ne 0 ]; then
|
||||
@@ -1171,17 +1188,17 @@ ip_stop() {
|
||||
i=`expr $i + 1`
|
||||
done
|
||||
else
|
||||
- ip_del_if="no"
|
||||
+ ip_del_if="no"
|
||||
fi
|
||||
fi
|
||||
-
|
||||
+
|
||||
if [ "$ip_del_if" = "yes" ]; then
|
||||
delete_interface $OCF_RESKEY_ip $NIC $NETMASK
|
||||
if [ $? -ne 0 ]; then
|
||||
ocf_exit_reason "Unable to remove IP [${OCF_RESKEY_ip} from interface [ $NIC ]"
|
||||
exit $OCF_ERR_GENERIC
|
||||
fi
|
||||
-
|
||||
+
|
||||
if ocf_is_true ${OCF_RESKEY_lvs_support}; then
|
||||
restore_loopback "$OCF_RESKEY_ip"
|
||||
fi
|
||||
@@ -1200,7 +1217,7 @@ ip_monitor() {
|
||||
run_arp_sender refresh
|
||||
return $OCF_SUCCESS
|
||||
;;
|
||||
- partial|no|partial2)
|
||||
+ no)
|
||||
exit $OCF_NOT_RUNNING
|
||||
;;
|
||||
*)
|
||||
@ -0,0 +1,23 @@
|
||||
From eac983c14f4695f491fe430a78d8d18a1481c60c Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Wed, 29 Oct 2025 15:15:54 +0100
|
||||
Subject: [PATCH] oracle: improve monpassword description
|
||||
|
||||
---
|
||||
heartbeat/oracle | 3 +--
|
||||
1 file changed, 1 insertion(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/oracle b/heartbeat/oracle
|
||||
index 8cf4e3649c..c85e499833 100755
|
||||
--- a/heartbeat/oracle
|
||||
+++ b/heartbeat/oracle
|
||||
@@ -132,8 +132,7 @@ that the password for this user does not expire.
|
||||
<longdesc lang="en">
|
||||
Password for the monitoring user. Make sure
|
||||
that the password for this user does not expire.
|
||||
-Need to explicitly set a password to a new monitor
|
||||
-user for the security reason.
|
||||
+Set to avoid using the agents default password for "monuser".
|
||||
</longdesc>
|
||||
<shortdesc lang="en">monpassword</shortdesc>
|
||||
<content type="string" default="$OCF_RESKEY_monpassword_default" />
|
||||
@ -0,0 +1,455 @@
|
||||
From 61cec34a754017537c61e79cd1212f2688c32429 Mon Sep 17 00:00:00 2001
|
||||
From: harshkiprofile <83770157+harshkiprofile@users.noreply.github.com>
|
||||
Date: Mon, 4 Nov 2024 12:19:10 +0530
|
||||
Subject: [PATCH 1/7] Introduce a new shell function to reuse IMDS token
|
||||
|
||||
---
|
||||
heartbeat/ocf-shellfuncs.in | 31 +++++++++++++++++++++++++++++++
|
||||
1 file changed, 31 insertions(+)
|
||||
|
||||
diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in
|
||||
index 5c4bb3264..0c4632cf9 100644
|
||||
--- a/heartbeat/ocf-shellfuncs.in
|
||||
+++ b/heartbeat/ocf-shellfuncs.in
|
||||
@@ -1111,3 +1111,34 @@ ocf_is_true "$OCF_TRACE_RA" && ocf_start_trace
|
||||
if ocf_is_true "$HA_use_logd"; then
|
||||
: ${HA_LOGD:=yes}
|
||||
fi
|
||||
+
|
||||
+# File to store the token and timestamp
|
||||
+TOKEN_FILE="/tmp/.imds_token"
|
||||
+TOKEN_LIFETIME=21600 # Token lifetime in seconds (6 hours)
|
||||
+TOKEN_EXPIRY_THRESHOLD=3600 # Renew token if less than 60 minutes (1 hour) remaining
|
||||
+
|
||||
+# Function to fetch a new token
|
||||
+fetch_new_token() {
|
||||
+ TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: $TOKEN_LIFETIME")
|
||||
+ echo "$TOKEN $(date +%s)" > "$TOKEN_FILE"
|
||||
+ echo "$TOKEN"
|
||||
+}
|
||||
+
|
||||
+# Function to retrieve or renew the token
|
||||
+get_token() {
|
||||
+ if [[ -f "$TOKEN_FILE" ]]; then
|
||||
+ read -r STORED_TOKEN STORED_TIMESTAMP < "$TOKEN_FILE"
|
||||
+ CURRENT_TIME=$(date +%s)
|
||||
+ ELAPSED_TIME=$((CURRENT_TIME - STORED_TIMESTAMP))
|
||||
+
|
||||
+ if (( ELAPSED_TIME < (TOKEN_LIFETIME - TOKEN_EXPIRY_THRESHOLD) )); then
|
||||
+ # Token is still valid
|
||||
+ echo "$STORED_TOKEN"
|
||||
+ return
|
||||
+ fi
|
||||
+ fi
|
||||
+ # Fetch a new token if not valid
|
||||
+ fetch_new_token
|
||||
+}
|
||||
+
|
||||
+
|
||||
|
||||
From 00629fa44cb7a8dd1045fc8cad755e1d0c808476 Mon Sep 17 00:00:00 2001
|
||||
From: harshkiprofile <83770157+harshkiprofile@users.noreply.github.com>
|
||||
Date: Mon, 4 Nov 2024 12:21:18 +0530
|
||||
Subject: [PATCH 2/7] Utilize the get_token function to reuse the token
|
||||
|
||||
---
|
||||
heartbeat/aws-vpc-move-ip | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip
|
||||
index 6115e5ba8..fbeb2ee64 100755
|
||||
--- a/heartbeat/aws-vpc-move-ip
|
||||
+++ b/heartbeat/aws-vpc-move-ip
|
||||
@@ -270,7 +270,7 @@ ec2ip_validate() {
|
||||
fi
|
||||
fi
|
||||
|
||||
- TOKEN=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -sX PUT -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600'" "http://169.254.169.254/latest/api/token")
|
||||
+ TOKEN=$(get_token)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
EC2_INSTANCE_ID=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/instance-id")
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
|
||||
From 36126cdcb90ad617ecfce03d986550907732aa4f Mon Sep 17 00:00:00 2001
|
||||
From: harshkiprofile <83770157+harshkiprofile@users.noreply.github.com>
|
||||
Date: Mon, 4 Nov 2024 12:22:16 +0530
|
||||
Subject: [PATCH 3/7] Utilize the get_token function to reuse the token
|
||||
|
||||
---
|
||||
heartbeat/awsvip | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/awsvip b/heartbeat/awsvip
|
||||
index f2b238a0f..ca19ac086 100755
|
||||
--- a/heartbeat/awsvip
|
||||
+++ b/heartbeat/awsvip
|
||||
@@ -266,7 +266,7 @@ if [ -n "${OCF_RESKEY_region}" ]; then
|
||||
AWSCLI_CMD="$AWSCLI_CMD --region ${OCF_RESKEY_region}"
|
||||
fi
|
||||
SECONDARY_PRIVATE_IP="${OCF_RESKEY_secondary_private_ip}"
|
||||
-TOKEN=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -sX PUT -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600'" "http://169.254.169.254/latest/api/token")
|
||||
+TOKEN=$(get_token)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
INSTANCE_ID=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/instance-id")
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
|
||||
From dcd0050df5ba94905bc71d38b05cbb93f5687b61 Mon Sep 17 00:00:00 2001
|
||||
From: harshkiprofile <beer18317@gmail.com>
|
||||
Date: Mon, 4 Nov 2024 20:05:33 +0530
|
||||
Subject: [PATCH 4/7] Move token renewal function to aws.sh for reuse in AWS
|
||||
agent scripts
|
||||
|
||||
---
|
||||
heartbeat/Makefile.am | 1 +
|
||||
heartbeat/aws-vpc-move-ip | 1 +
|
||||
heartbeat/aws-vpc-route53.in | 3 ++-
|
||||
heartbeat/aws.sh | 46 ++++++++++++++++++++++++++++++++++++
|
||||
heartbeat/awseip | 3 ++-
|
||||
heartbeat/awsvip | 1 +
|
||||
heartbeat/ocf-shellfuncs.in | 33 +-------------------------
|
||||
7 files changed, 54 insertions(+), 34 deletions(-)
|
||||
create mode 100644 heartbeat/aws.sh
|
||||
|
||||
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
|
||||
index 409847970..655740f14 100644
|
||||
--- a/heartbeat/Makefile.am
|
||||
+++ b/heartbeat/Makefile.am
|
||||
@@ -218,6 +218,7 @@ ocfcommon_DATA = ocf-shellfuncs \
|
||||
ocf-rarun \
|
||||
ocf-distro \
|
||||
apache-conf.sh \
|
||||
+ aws.sh \
|
||||
http-mon.sh \
|
||||
sapdb-nosha.sh \
|
||||
sapdb.sh \
|
||||
diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip
|
||||
index fbeb2ee64..f4b0492f2 100755
|
||||
--- a/heartbeat/aws-vpc-move-ip
|
||||
+++ b/heartbeat/aws-vpc-move-ip
|
||||
@@ -33,6 +33,7 @@
|
||||
|
||||
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
|
||||
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
||||
+. ${OCF_FUNCTIONS_DIR}/aws.sh
|
||||
|
||||
# Defaults
|
||||
OCF_RESKEY_awscli_default="/usr/bin/aws"
|
||||
diff --git a/heartbeat/aws-vpc-route53.in b/heartbeat/aws-vpc-route53.in
|
||||
index eba2ed95c..f7e756782 100644
|
||||
--- a/heartbeat/aws-vpc-route53.in
|
||||
+++ b/heartbeat/aws-vpc-route53.in
|
||||
@@ -43,6 +43,7 @@
|
||||
|
||||
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
|
||||
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
||||
+. ${OCF_FUNCTIONS_DIR}/aws.sh
|
||||
|
||||
# Defaults
|
||||
OCF_RESKEY_awscli_default="/usr/bin/aws"
|
||||
@@ -377,7 +378,7 @@ r53_monitor() {
|
||||
_get_ip() {
|
||||
case $OCF_RESKEY_ip in
|
||||
local|public)
|
||||
- TOKEN=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -sX PUT -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600'" "http://169.254.169.254/latest/api/token")
|
||||
+ TOKEN=$(get_token)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
IPADDRESS=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/${OCF_RESKEY_ip}-ipv4")
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
diff --git a/heartbeat/aws.sh b/heartbeat/aws.sh
|
||||
new file mode 100644
|
||||
index 000000000..fc557109c
|
||||
--- /dev/null
|
||||
+++ b/heartbeat/aws.sh
|
||||
@@ -0,0 +1,46 @@
|
||||
+#!/bin/sh
|
||||
+#
|
||||
+#
|
||||
+# AWS Helper Scripts
|
||||
+#
|
||||
+#
|
||||
+
|
||||
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
|
||||
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
||||
+
|
||||
+# Defaults
|
||||
+OCF_RESKEY_curl_retries_default="3"
|
||||
+OCF_RESKEY_curl_sleep_default="1"
|
||||
+
|
||||
+: ${OCF_RESKEY_curl_retries=${OCF_RESKEY_curl_retries_default}}
|
||||
+: ${OCF_RESKEY_curl_sleep=${OCF_RESKEY_curl_sleep_default}}
|
||||
+
|
||||
+# Function to enable reusable IMDS token retrieval for efficient repeated access
|
||||
+# File to store the token and timestamp
|
||||
+TOKEN_FILE="/tmp/.imds_token"
|
||||
+TOKEN_LIFETIME=21600 # Token lifetime in seconds (6 hours)
|
||||
+TOKEN_EXPIRY_THRESHOLD=3600 # Renew token if less than 60 minutes (1 hour) remaining
|
||||
+
|
||||
+# Function to fetch a new token
|
||||
+fetch_new_token() {
|
||||
+ TOKEN=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -sX PUT -H 'X-aws-ec2-metadata-token-ttl-seconds: $TOKEN_LIFETIME'" "http://169.254.169.254/latest/api/token")
|
||||
+ echo "$TOKEN $(date +%s)" > "$TOKEN_FILE"
|
||||
+ echo "$TOKEN"
|
||||
+}
|
||||
+
|
||||
+# Function to retrieve or renew the token
|
||||
+get_token() {
|
||||
+ if [ -f "$TOKEN_FILE" ]; then
|
||||
+ read -r STORED_TOKEN STORED_TIMESTAMP < "$TOKEN_FILE"
|
||||
+ CURRENT_TIME=$(date +%s)
|
||||
+ ELAPSED_TIME=$((CURRENT_TIME - STORED_TIMESTAMP))
|
||||
+
|
||||
+ if (( ELAPSED_TIME < (TOKEN_LIFETIME - TOKEN_EXPIRY_THRESHOLD) )); then
|
||||
+ # Token is still valid
|
||||
+ echo "$STORED_TOKEN"
|
||||
+ return
|
||||
+ fi
|
||||
+ fi
|
||||
+ # Fetch a new token if not valid
|
||||
+ fetch_new_token
|
||||
+}
|
||||
\ No newline at end of file
|
||||
diff --git a/heartbeat/awseip b/heartbeat/awseip
|
||||
index ffb6223a1..049c2e566 100755
|
||||
--- a/heartbeat/awseip
|
||||
+++ b/heartbeat/awseip
|
||||
@@ -38,6 +38,7 @@
|
||||
|
||||
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
|
||||
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
||||
+. ${OCF_FUNCTIONS_DIR}/aws.sh
|
||||
|
||||
#######################################################################
|
||||
|
||||
@@ -306,7 +307,7 @@ fi
|
||||
ELASTIC_IP="${OCF_RESKEY_elastic_ip}"
|
||||
ALLOCATION_ID="${OCF_RESKEY_allocation_id}"
|
||||
PRIVATE_IP_ADDRESS="${OCF_RESKEY_private_ip_address}"
|
||||
-TOKEN=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -sX PUT -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600'" "http://169.254.169.254/latest/api/token")
|
||||
+TOKEN=$(get_token)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
INSTANCE_ID=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/instance-id")
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
diff --git a/heartbeat/awsvip b/heartbeat/awsvip
|
||||
index ca19ac086..de67981d8 100755
|
||||
--- a/heartbeat/awsvip
|
||||
+++ b/heartbeat/awsvip
|
||||
@@ -37,6 +37,7 @@
|
||||
|
||||
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
|
||||
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
||||
+. ${OCF_FUNCTIONS_DIR}/aws.sh
|
||||
|
||||
#######################################################################
|
||||
|
||||
diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in
|
||||
index 0c4632cf9..922c6ea45 100644
|
||||
--- a/heartbeat/ocf-shellfuncs.in
|
||||
+++ b/heartbeat/ocf-shellfuncs.in
|
||||
@@ -1110,35 +1110,4 @@ ocf_is_true "$OCF_TRACE_RA" && ocf_start_trace
|
||||
# pacemaker sets HA_use_logd, some others use HA_LOGD :/
|
||||
if ocf_is_true "$HA_use_logd"; then
|
||||
: ${HA_LOGD:=yes}
|
||||
-fi
|
||||
-
|
||||
-# File to store the token and timestamp
|
||||
-TOKEN_FILE="/tmp/.imds_token"
|
||||
-TOKEN_LIFETIME=21600 # Token lifetime in seconds (6 hours)
|
||||
-TOKEN_EXPIRY_THRESHOLD=3600 # Renew token if less than 60 minutes (1 hour) remaining
|
||||
-
|
||||
-# Function to fetch a new token
|
||||
-fetch_new_token() {
|
||||
- TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: $TOKEN_LIFETIME")
|
||||
- echo "$TOKEN $(date +%s)" > "$TOKEN_FILE"
|
||||
- echo "$TOKEN"
|
||||
-}
|
||||
-
|
||||
-# Function to retrieve or renew the token
|
||||
-get_token() {
|
||||
- if [[ -f "$TOKEN_FILE" ]]; then
|
||||
- read -r STORED_TOKEN STORED_TIMESTAMP < "$TOKEN_FILE"
|
||||
- CURRENT_TIME=$(date +%s)
|
||||
- ELAPSED_TIME=$((CURRENT_TIME - STORED_TIMESTAMP))
|
||||
-
|
||||
- if (( ELAPSED_TIME < (TOKEN_LIFETIME - TOKEN_EXPIRY_THRESHOLD) )); then
|
||||
- # Token is still valid
|
||||
- echo "$STORED_TOKEN"
|
||||
- return
|
||||
- fi
|
||||
- fi
|
||||
- # Fetch a new token if not valid
|
||||
- fetch_new_token
|
||||
-}
|
||||
-
|
||||
-
|
||||
+fi
|
||||
\ No newline at end of file
|
||||
|
||||
From 9f7be201923c8eab1b121f2067ed74a69841cf8a Mon Sep 17 00:00:00 2001
|
||||
From: harshkiprofile <beer18317@gmail.com>
|
||||
Date: Tue, 5 Nov 2024 19:12:34 +0530
|
||||
Subject: [PATCH 5/7] Refactor to use common temp path and update shell syntax
|
||||
|
||||
---
|
||||
heartbeat/Makefile.am | 2 +-
|
||||
heartbeat/aws.sh | 4 ++--
|
||||
2 files changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
|
||||
index 655740f14..8352f3a3d 100644
|
||||
--- a/heartbeat/Makefile.am
|
||||
+++ b/heartbeat/Makefile.am
|
||||
@@ -218,7 +218,7 @@ ocfcommon_DATA = ocf-shellfuncs \
|
||||
ocf-rarun \
|
||||
ocf-distro \
|
||||
apache-conf.sh \
|
||||
- aws.sh \
|
||||
+ aws.sh \
|
||||
http-mon.sh \
|
||||
sapdb-nosha.sh \
|
||||
sapdb.sh \
|
||||
diff --git a/heartbeat/aws.sh b/heartbeat/aws.sh
|
||||
index fc557109c..c77f93b91 100644
|
||||
--- a/heartbeat/aws.sh
|
||||
+++ b/heartbeat/aws.sh
|
||||
@@ -17,7 +17,7 @@ OCF_RESKEY_curl_sleep_default="1"
|
||||
|
||||
# Function to enable reusable IMDS token retrieval for efficient repeated access
|
||||
# File to store the token and timestamp
|
||||
-TOKEN_FILE="/tmp/.imds_token"
|
||||
+TOKEN_FILE="${HA_RSCTMP}/.aws_imds_token"
|
||||
TOKEN_LIFETIME=21600 # Token lifetime in seconds (6 hours)
|
||||
TOKEN_EXPIRY_THRESHOLD=3600 # Renew token if less than 60 minutes (1 hour) remaining
|
||||
|
||||
@@ -35,7 +35,7 @@ get_token() {
|
||||
CURRENT_TIME=$(date +%s)
|
||||
ELAPSED_TIME=$((CURRENT_TIME - STORED_TIMESTAMP))
|
||||
|
||||
- if (( ELAPSED_TIME < (TOKEN_LIFETIME - TOKEN_EXPIRY_THRESHOLD) )); then
|
||||
+ if [ "$ELAPSED_TIME" -lt "$((TOKEN_LIFETIME - TOKEN_EXPIRY_THRESHOLD))" ]; then
|
||||
# Token is still valid
|
||||
echo "$STORED_TOKEN"
|
||||
return
|
||||
|
||||
From 4f61048064d1df3bebdb5c1441cf0020f213c01b Mon Sep 17 00:00:00 2001
|
||||
From: harshkiprofile <beer18317@gmail.com>
|
||||
Date: Tue, 5 Nov 2024 19:30:15 +0530
|
||||
Subject: [PATCH 6/7] Consolidate curl_retry and curl_sleep variable to a
|
||||
single location in aws.sh
|
||||
|
||||
---
|
||||
heartbeat/aws-vpc-move-ip | 4 ----
|
||||
heartbeat/aws-vpc-route53.in | 4 ----
|
||||
heartbeat/awseip | 4 ----
|
||||
heartbeat/awsvip | 4 ----
|
||||
4 files changed, 16 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip
|
||||
index f4b0492f2..3aa9ceb02 100755
|
||||
--- a/heartbeat/aws-vpc-move-ip
|
||||
+++ b/heartbeat/aws-vpc-move-ip
|
||||
@@ -48,8 +48,6 @@ OCF_RESKEY_interface_default="eth0"
|
||||
OCF_RESKEY_iflabel_default=""
|
||||
OCF_RESKEY_monapi_default="false"
|
||||
OCF_RESKEY_lookup_type_default="InstanceId"
|
||||
-OCF_RESKEY_curl_retries_default="3"
|
||||
-OCF_RESKEY_curl_sleep_default="1"
|
||||
|
||||
: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}}
|
||||
: ${OCF_RESKEY_auth_type=${OCF_RESKEY_auth_type_default}}
|
||||
@@ -63,8 +61,6 @@ OCF_RESKEY_curl_sleep_default="1"
|
||||
: ${OCF_RESKEY_iflabel=${OCF_RESKEY_iflabel_default}}
|
||||
: ${OCF_RESKEY_monapi=${OCF_RESKEY_monapi_default}}
|
||||
: ${OCF_RESKEY_lookup_type=${OCF_RESKEY_lookup_type_default}}
|
||||
-: ${OCF_RESKEY_curl_retries=${OCF_RESKEY_curl_retries_default}}
|
||||
-: ${OCF_RESKEY_curl_sleep=${OCF_RESKEY_curl_sleep_default}}
|
||||
#######################################################################
|
||||
|
||||
|
||||
diff --git a/heartbeat/aws-vpc-route53.in b/heartbeat/aws-vpc-route53.in
|
||||
index f7e756782..85c8de3c1 100644
|
||||
--- a/heartbeat/aws-vpc-route53.in
|
||||
+++ b/heartbeat/aws-vpc-route53.in
|
||||
@@ -54,8 +54,6 @@ OCF_RESKEY_hostedzoneid_default=""
|
||||
OCF_RESKEY_fullname_default=""
|
||||
OCF_RESKEY_ip_default="local"
|
||||
OCF_RESKEY_ttl_default=10
|
||||
-OCF_RESKEY_curl_retries_default="3"
|
||||
-OCF_RESKEY_curl_sleep_default="1"
|
||||
|
||||
: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}}
|
||||
: ${OCF_RESKEY_auth_type=${OCF_RESKEY_auth_type_default}}
|
||||
@@ -65,8 +63,6 @@ OCF_RESKEY_curl_sleep_default="1"
|
||||
: ${OCF_RESKEY_fullname:=${OCF_RESKEY_fullname_default}}
|
||||
: ${OCF_RESKEY_ip:=${OCF_RESKEY_ip_default}}
|
||||
: ${OCF_RESKEY_ttl:=${OCF_RESKEY_ttl_default}}
|
||||
-: ${OCF_RESKEY_curl_retries=${OCF_RESKEY_curl_retries_default}}
|
||||
-: ${OCF_RESKEY_curl_sleep=${OCF_RESKEY_curl_sleep_default}}
|
||||
|
||||
usage() {
|
||||
cat <<-EOT
|
||||
diff --git a/heartbeat/awseip b/heartbeat/awseip
|
||||
index 049c2e566..4b1c3bc6a 100755
|
||||
--- a/heartbeat/awseip
|
||||
+++ b/heartbeat/awseip
|
||||
@@ -50,16 +50,12 @@ OCF_RESKEY_auth_type_default="key"
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_region_default=""
|
||||
OCF_RESKEY_api_delay_default="3"
|
||||
-OCF_RESKEY_curl_retries_default="3"
|
||||
-OCF_RESKEY_curl_sleep_default="1"
|
||||
|
||||
: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}}
|
||||
: ${OCF_RESKEY_auth_type=${OCF_RESKEY_auth_type_default}}
|
||||
: ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}}
|
||||
: ${OCF_RESKEY_region=${OCF_RESKEY_region_default}}
|
||||
: ${OCF_RESKEY_api_delay=${OCF_RESKEY_api_delay_default}}
|
||||
-: ${OCF_RESKEY_curl_retries=${OCF_RESKEY_curl_retries_default}}
|
||||
-: ${OCF_RESKEY_curl_sleep=${OCF_RESKEY_curl_sleep_default}}
|
||||
|
||||
meta_data() {
|
||||
cat <<END
|
||||
diff --git a/heartbeat/awsvip b/heartbeat/awsvip
|
||||
index de67981d8..8c71e7fac 100755
|
||||
--- a/heartbeat/awsvip
|
||||
+++ b/heartbeat/awsvip
|
||||
@@ -49,16 +49,12 @@ OCF_RESKEY_auth_type_default="key"
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_region_default=""
|
||||
OCF_RESKEY_api_delay_default="3"
|
||||
-OCF_RESKEY_curl_retries_default="3"
|
||||
-OCF_RESKEY_curl_sleep_default="1"
|
||||
|
||||
: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}}
|
||||
: ${OCF_RESKEY_auth_type=${OCF_RESKEY_auth_type_default}}
|
||||
: ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}}
|
||||
: ${OCF_RESKEY_region=${OCF_RESKEY_region_default}}
|
||||
: ${OCF_RESKEY_api_delay=${OCF_RESKEY_api_delay_default}}
|
||||
-: ${OCF_RESKEY_curl_retries=${OCF_RESKEY_curl_retries_default}}
|
||||
-: ${OCF_RESKEY_curl_sleep=${OCF_RESKEY_curl_sleep_default}}
|
||||
|
||||
meta_data() {
|
||||
cat <<END
|
||||
|
||||
From d451c5c595b08685f84ec85da96ae9cb4fc076fe Mon Sep 17 00:00:00 2001
|
||||
From: harshkiprofile <beer18317@gmail.com>
|
||||
Date: Tue, 5 Nov 2024 20:50:24 +0530
|
||||
Subject: [PATCH 7/7] aws.sh needs to be added to symlinkstargets in
|
||||
doc/man/Makefile.am
|
||||
|
||||
---
|
||||
doc/man/Makefile.am | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
|
||||
index ef7639bff..447f5cba3 100644
|
||||
--- a/doc/man/Makefile.am
|
||||
+++ b/doc/man/Makefile.am
|
||||
@@ -42,7 +42,7 @@ radir = $(abs_top_builddir)/heartbeat
|
||||
# required for out-of-tree build
|
||||
symlinkstargets = \
|
||||
ocf-distro ocf.py ocf-rarun ocf-returncodes \
|
||||
- findif.sh apache-conf.sh http-mon.sh mysql-common.sh \
|
||||
+ findif.sh apache-conf.sh aws.sh http-mon.sh mysql-common.sh \
|
||||
nfsserver-redhat.sh ora-common.sh
|
||||
|
||||
preptree:
|
||||
@ -0,0 +1,161 @@
|
||||
From cc5ffa5e599c974c426e93faa821b342e96b916d Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 11 Nov 2024 12:46:27 +0100
|
||||
Subject: [PATCH 1/2] aws.sh: chmod 600 $TOKEN_FILE, add get_instance_id() with
|
||||
DMI support, and use get_instance_id() in AWS agents
|
||||
|
||||
---
|
||||
heartbeat/aws-vpc-move-ip | 2 +-
|
||||
heartbeat/aws.sh | 30 +++++++++++++++++++++++++++---
|
||||
heartbeat/awseip | 2 +-
|
||||
heartbeat/awsvip | 2 +-
|
||||
4 files changed, 30 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip
|
||||
index 3aa9ceb02..09ae68b57 100755
|
||||
--- a/heartbeat/aws-vpc-move-ip
|
||||
+++ b/heartbeat/aws-vpc-move-ip
|
||||
@@ -269,7 +269,7 @@ ec2ip_validate() {
|
||||
|
||||
TOKEN=$(get_token)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
- EC2_INSTANCE_ID=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/instance-id")
|
||||
+ EC2_INSTANCE_ID=$(get_instance_id)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
|
||||
if [ -z "${EC2_INSTANCE_ID}" ]; then
|
||||
diff --git a/heartbeat/aws.sh b/heartbeat/aws.sh
|
||||
index c77f93b91..9cd343c16 100644
|
||||
--- a/heartbeat/aws.sh
|
||||
+++ b/heartbeat/aws.sh
|
||||
@@ -9,8 +9,8 @@
|
||||
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
||||
|
||||
# Defaults
|
||||
-OCF_RESKEY_curl_retries_default="3"
|
||||
-OCF_RESKEY_curl_sleep_default="1"
|
||||
+OCF_RESKEY_curl_retries_default="4"
|
||||
+OCF_RESKEY_curl_sleep_default="3"
|
||||
|
||||
: ${OCF_RESKEY_curl_retries=${OCF_RESKEY_curl_retries_default}}
|
||||
: ${OCF_RESKEY_curl_sleep=${OCF_RESKEY_curl_sleep_default}}
|
||||
@@ -20,11 +20,13 @@ OCF_RESKEY_curl_sleep_default="1"
|
||||
TOKEN_FILE="${HA_RSCTMP}/.aws_imds_token"
|
||||
TOKEN_LIFETIME=21600 # Token lifetime in seconds (6 hours)
|
||||
TOKEN_EXPIRY_THRESHOLD=3600 # Renew token if less than 60 minutes (1 hour) remaining
|
||||
+DMI_FILE="/sys/devices/virtual/dmi/id/board_asset_tag" # Only supported on nitro-based instances.
|
||||
|
||||
# Function to fetch a new token
|
||||
fetch_new_token() {
|
||||
TOKEN=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -sX PUT -H 'X-aws-ec2-metadata-token-ttl-seconds: $TOKEN_LIFETIME'" "http://169.254.169.254/latest/api/token")
|
||||
echo "$TOKEN $(date +%s)" > "$TOKEN_FILE"
|
||||
+ chmod 600 "$TOKEN_FILE"
|
||||
echo "$TOKEN"
|
||||
}
|
||||
|
||||
@@ -43,4 +45,26 @@ get_token() {
|
||||
fi
|
||||
# Fetch a new token if not valid
|
||||
fetch_new_token
|
||||
-}
|
||||
\ No newline at end of file
|
||||
+}
|
||||
+
|
||||
+get_instance_id() {
|
||||
+ local INSTANCE_ID
|
||||
+
|
||||
+ # Try to get the EC2 instance ID from DMI first before falling back to IMDS.
|
||||
+ ocf_log debug "EC2: Attempt to get EC2 Instance ID from local file."
|
||||
+ if [ -r "$DMI_FILE" ] && [ -s "$DMI_FILE" ]; then
|
||||
+ INSTANCE_ID="$(cat "$DMI_FILE")"
|
||||
+ case "$INSTANCE_ID" in
|
||||
+ i-0*) echo "$INSTANCE_ID"; return "$OCF_SUCCESS" ;;
|
||||
+ esac
|
||||
+ fi
|
||||
+
|
||||
+ INSTANCE_ID=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/instance-id")
|
||||
+ if [ $? -ne 0 ]; then
|
||||
+ ocf_exit_reason "Failed to get EC2 Instance ID"
|
||||
+ exit $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
+ echo "$INSTANCE_ID"
|
||||
+ return "$OCF_SUCCESS"
|
||||
+}
|
||||
diff --git a/heartbeat/awseip b/heartbeat/awseip
|
||||
index 4b1c3bc6a..7f38376dc 100755
|
||||
--- a/heartbeat/awseip
|
||||
+++ b/heartbeat/awseip
|
||||
@@ -305,7 +305,7 @@ ALLOCATION_ID="${OCF_RESKEY_allocation_id}"
|
||||
PRIVATE_IP_ADDRESS="${OCF_RESKEY_private_ip_address}"
|
||||
TOKEN=$(get_token)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
-INSTANCE_ID=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/instance-id")
|
||||
+INSTANCE_ID=$(get_instance_id)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
|
||||
case $__OCF_ACTION in
|
||||
diff --git a/heartbeat/awsvip b/heartbeat/awsvip
|
||||
index 8c71e7fac..0856ac5e4 100755
|
||||
--- a/heartbeat/awsvip
|
||||
+++ b/heartbeat/awsvip
|
||||
@@ -265,7 +265,7 @@ fi
|
||||
SECONDARY_PRIVATE_IP="${OCF_RESKEY_secondary_private_ip}"
|
||||
TOKEN=$(get_token)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
-INSTANCE_ID=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/instance-id")
|
||||
+INSTANCE_ID=$(get_instance_id)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
MAC_ADDRESS=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/mac")
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
|
||||
From b8d3ecc6a8ce4baf4b28d02978dd573728ccf5fa Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 18 Nov 2024 11:10:42 +0100
|
||||
Subject: [PATCH 2/2] aws.sh/ocf-shellfuncs: add ability to refresh token if it's
|
||||
invalid
|
||||
|
||||
---
|
||||
heartbeat/aws.sh | 1 +
|
||||
heartbeat/ocf-shellfuncs.in | 11 ++++++++++-
|
||||
2 files changed, 11 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/aws.sh b/heartbeat/aws.sh
|
||||
index 9cd343c16..64f2e13a7 100644
|
||||
--- a/heartbeat/aws.sh
|
||||
+++ b/heartbeat/aws.sh
|
||||
@@ -18,6 +18,7 @@ OCF_RESKEY_curl_sleep_default="3"
|
||||
# Function to enable reusable IMDS token retrieval for efficient repeated access
|
||||
# File to store the token and timestamp
|
||||
TOKEN_FILE="${HA_RSCTMP}/.aws_imds_token"
|
||||
+TOKEN_FUNC="fetch_new_token" # Used by curl_retry() if saved token is invalid
|
||||
TOKEN_LIFETIME=21600 # Token lifetime in seconds (6 hours)
|
||||
TOKEN_EXPIRY_THRESHOLD=3600 # Renew token if less than 60 minutes (1 hour) remaining
|
||||
DMI_FILE="/sys/devices/virtual/dmi/id/board_asset_tag" # Only supported on nitro-based instances.
|
||||
diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in
|
||||
index 922c6ea45..8e51fa3c8 100644
|
||||
--- a/heartbeat/ocf-shellfuncs.in
|
||||
+++ b/heartbeat/ocf-shellfuncs.in
|
||||
@@ -697,6 +697,15 @@ curl_retry()
|
||||
|
||||
ocf_log debug "result: $result"
|
||||
[ $rc -eq 0 ] && break
|
||||
+ if [ -n "$TOKEN" ] && [ -n "$TOKEN_FILE" ] && \
|
||||
+ [ -f "$TOKEN_FILE" ] && [ -n "$TOKEN_FUNC" ] && \
|
||||
+ echo "$result" | grep -q "The requested URL returned error: 401$"; then
|
||||
+ local OLD_TOKEN="$TOKEN"
|
||||
+ ocf_log err "Token invalid. Getting new token."
|
||||
+ TOKEN=$($TOKEN_FUNC)
|
||||
+ [ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
+ args=$(echo "$args" | sed "s/$OLD_TOKEN/$TOKEN/")
|
||||
+ fi
|
||||
sleep $sleep
|
||||
done
|
||||
|
||||
@@ -1110,4 +1119,4 @@ ocf_is_true "$OCF_TRACE_RA" && ocf_start_trace
|
||||
# pacemaker sets HA_use_logd, some others use HA_LOGD :/
|
||||
if ocf_is_true "$HA_use_logd"; then
|
||||
: ${HA_LOGD:=yes}
|
||||
-fi
|
||||
\ No newline at end of file
|
||||
+fi
|
||||
184
SOURCES/RHEL-68739-awsvip-add-interface-parameter.patch
Normal file
184
SOURCES/RHEL-68739-awsvip-add-interface-parameter.patch
Normal file
@ -0,0 +1,184 @@
|
||||
From 392d40048a25d7cb73ec5b5e9f7a5862f7a3fd48 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 11 Nov 2024 12:22:27 +0100
|
||||
Subject: [PATCH 1/2] aws.sh: add get_interface_mac()
|
||||
|
||||
---
|
||||
heartbeat/aws.sh | 21 +++++++++++++++++++++
|
||||
1 file changed, 21 insertions(+)
|
||||
|
||||
diff --git a/heartbeat/aws.sh b/heartbeat/aws.sh
|
||||
index 64f2e13a7..ebb4eb1f4 100644
|
||||
--- a/heartbeat/aws.sh
|
||||
+++ b/heartbeat/aws.sh
|
||||
@@ -69,3 +69,24 @@ get_instance_id() {
|
||||
echo "$INSTANCE_ID"
|
||||
return "$OCF_SUCCESS"
|
||||
}
|
||||
+
|
||||
+get_interface_mac() {
|
||||
+ local MAC_FILE MAC_ADDR rc
|
||||
+ MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address"
|
||||
+ if [ -f "$MAC_FILE" ]; then
|
||||
+ cmd="cat ${MAC_FILE}"
|
||||
+ else
|
||||
+ cmd="ip -br link show dev ${OCF_RESKEY_interface} | tr -s ' ' | cut -d' ' -f3"
|
||||
+ fi
|
||||
+ ocf_log debug "executing command: $cmd"
|
||||
+ MAC_ADDR="$(eval $cmd)"
|
||||
+ rc=$?
|
||||
+ if [ $rc != 0 ]; then
|
||||
+ ocf_log warn "command failed, rc: $rc"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ ocf_log debug "MAC address associated with interface ${OCF_RESKEY_interface}: ${MAC_ADDR}"
|
||||
+
|
||||
+ echo $MAC_ADDR
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
|
||||
From 87337ac4da931d5a53c83d53d4bab17ee123ba9f Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 11 Nov 2024 12:26:38 +0100
|
||||
Subject: [PATCH 2/2] awsvip: let user specify which interface to use, and make
|
||||
the parameter optional in aws-vpc-move-ip
|
||||
|
||||
---
|
||||
heartbeat/aws-vpc-move-ip | 20 ++++----------------
|
||||
heartbeat/aws.sh | 4 +++-
|
||||
heartbeat/awsvip | 24 +++++++++++++++++-------
|
||||
3 files changed, 24 insertions(+), 24 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip
|
||||
index 09ae68b57..2afc0ba53 100755
|
||||
--- a/heartbeat/aws-vpc-move-ip
|
||||
+++ b/heartbeat/aws-vpc-move-ip
|
||||
@@ -157,7 +157,7 @@ Role to use to query/update the route table
|
||||
<content type="string" default="${OCF_RESKEY_routing_table_role_default}" />
|
||||
</parameter>
|
||||
|
||||
-<parameter name="interface" required="1">
|
||||
+<parameter name="interface" required="0">
|
||||
<longdesc lang="en">
|
||||
Name of the network interface, i.e. eth0
|
||||
</longdesc>
|
||||
@@ -321,7 +321,7 @@ ec2ip_monitor() {
|
||||
ocf_log debug "monitor: Enhanced Monitoring disabled - omitting API call"
|
||||
fi
|
||||
|
||||
- cmd="ip addr show to $OCF_RESKEY_ip up"
|
||||
+ cmd="ip addr show dev $OCF_RESKEY_interface to $OCF_RESKEY_ip up"
|
||||
ocf_log debug "executing command: $cmd"
|
||||
RESULT=$($cmd | grep "$OCF_RESKEY_ip")
|
||||
if [ -z "$RESULT" ]; then
|
||||
@@ -331,7 +331,7 @@ ec2ip_monitor() {
|
||||
level="info"
|
||||
fi
|
||||
|
||||
- ocf_log "$level" "IP $OCF_RESKEY_ip not assigned to running interface"
|
||||
+ ocf_log "$level" "IP $OCF_RESKEY_ip not assigned to interface $OCF_RESKEY_interface"
|
||||
return $OCF_NOT_RUNNING
|
||||
fi
|
||||
|
||||
@@ -369,19 +369,7 @@ ec2ip_drop() {
|
||||
}
|
||||
|
||||
ec2ip_get_instance_eni() {
|
||||
- MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address"
|
||||
- if [ -f $MAC_FILE ]; then
|
||||
- cmd="cat ${MAC_FILE}"
|
||||
- else
|
||||
- cmd="ip -br link show dev ${OCF_RESKEY_interface} | tr -s ' ' | cut -d' ' -f3"
|
||||
- fi
|
||||
- ocf_log debug "executing command: $cmd"
|
||||
- MAC_ADDR="$(eval $cmd)"
|
||||
- rc=$?
|
||||
- if [ $rc != 0 ]; then
|
||||
- ocf_log warn "command failed, rc: $rc"
|
||||
- return $OCF_ERR_GENERIC
|
||||
- fi
|
||||
+ MAC_ADDR=$(get_interface_mac)
|
||||
ocf_log debug "MAC address associated with interface ${OCF_RESKEY_interface}: ${MAC_ADDR}"
|
||||
|
||||
cmd="curl_retry \"$OCF_RESKEY_curl_retries\" \"$OCF_RESKEY_curl_sleep\" \"--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'\" \"http://169.254.169.254/latest/meta-data/network/interfaces/macs/${MAC_ADDR}/interface-id\""
|
||||
diff --git a/heartbeat/aws.sh b/heartbeat/aws.sh
|
||||
index ebb4eb1f4..216033afe 100644
|
||||
--- a/heartbeat/aws.sh
|
||||
+++ b/heartbeat/aws.sh
|
||||
@@ -73,7 +73,9 @@ get_instance_id() {
|
||||
get_interface_mac() {
|
||||
local MAC_FILE MAC_ADDR rc
|
||||
MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address"
|
||||
- if [ -f "$MAC_FILE" ]; then
|
||||
+ if [ -z "$OCF_RESKEY_interface" ]; then
|
||||
+ cmd="curl_retry \"$OCF_RESKEY_curl_retries\" \"$OCF_RESKEY_curl_sleep\" \"--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'\" \"http://169.254.169.254/latest/meta-data/mac\""
|
||||
+ elif [ -f "$MAC_FILE" ]; then
|
||||
cmd="cat ${MAC_FILE}"
|
||||
else
|
||||
cmd="ip -br link show dev ${OCF_RESKEY_interface} | tr -s ' ' | cut -d' ' -f3"
|
||||
diff --git a/heartbeat/awsvip b/heartbeat/awsvip
|
||||
index 0856ac5e4..015180d5a 100755
|
||||
--- a/heartbeat/awsvip
|
||||
+++ b/heartbeat/awsvip
|
||||
@@ -49,12 +49,14 @@ OCF_RESKEY_auth_type_default="key"
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_region_default=""
|
||||
OCF_RESKEY_api_delay_default="3"
|
||||
+OCF_RESKEY_interface_default=""
|
||||
|
||||
: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}}
|
||||
: ${OCF_RESKEY_auth_type=${OCF_RESKEY_auth_type_default}}
|
||||
: ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}}
|
||||
: ${OCF_RESKEY_region=${OCF_RESKEY_region_default}}
|
||||
: ${OCF_RESKEY_api_delay=${OCF_RESKEY_api_delay_default}}
|
||||
+: ${OCF_RESKEY_interface=${OCF_RESKEY_interface_default}}
|
||||
|
||||
meta_data() {
|
||||
cat <<END
|
||||
@@ -125,6 +127,14 @@ a short delay between API calls, to avoid sending API too quick
|
||||
<content type="integer" default="${OCF_RESKEY_api_delay_default}" />
|
||||
</parameter>
|
||||
|
||||
+<parameter name="interface" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Name of the network interface, i.e. eth0
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">network interface name</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_interface_default}" />
|
||||
+</parameter>
|
||||
+
|
||||
<parameter name="curl_retries" unique="0">
|
||||
<longdesc lang="en">
|
||||
curl retries before failing
|
||||
@@ -207,16 +217,16 @@ awsvip_stop() {
|
||||
}
|
||||
|
||||
awsvip_monitor() {
|
||||
- $AWSCLI_CMD ec2 describe-instances \
|
||||
- --instance-id "${INSTANCE_ID}" \
|
||||
- --query 'Reservations[].Instances[].NetworkInterfaces[].PrivateIpAddresses[].PrivateIpAddress[]' \
|
||||
+ $AWSCLI_CMD ec2 describe-network-interfaces \
|
||||
+ --network-interface-ids "${NETWORK_ID}" \
|
||||
+ --query 'NetworkInterfaces[].PrivateIpAddresses[].PrivateIpAddress[]' \
|
||||
--output text | \
|
||||
grep -qE "(^|\s)${SECONDARY_PRIVATE_IP}(\s|$)"
|
||||
- RET=$?
|
||||
-
|
||||
- if [ $RET -ne 0 ]; then
|
||||
+ if [ $? -ne 0 ]; then
|
||||
+ [ "$__OCF_ACTION" = "monitor" ] && ! ocf_is_probe && ocf_log error "IP $SECONDARY_PRIVATE_IP not assigned to interface ${NETWORK_ID}"
|
||||
return $OCF_NOT_RUNNING
|
||||
fi
|
||||
+
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
@@ -267,7 +277,7 @@ TOKEN=$(get_token)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
INSTANCE_ID=$(get_instance_id)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
-MAC_ADDRESS=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/mac")
|
||||
+MAC_ADDRESS=$(get_interface_mac)
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
NETWORK_ID=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/network/interfaces/macs/${MAC_ADDRESS}/interface-id")
|
||||
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
|
||||
@ -0,0 +1,63 @@
|
||||
From 71bc76dc4fa57726e80d0ddcc0bdcfe708af8763 Mon Sep 17 00:00:00 2001
|
||||
From: "Fabio M. Di Nitto" <fdinitto@redhat.com>
|
||||
Date: Thu, 5 Dec 2024 11:02:40 +0100
|
||||
Subject: [PATCH] openstack-cinder-volume: wait for volume to be available
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Monitor the volume until it's attached to the host and avoid a race between
|
||||
openstack APIs receiving the request and completing the operation.
|
||||
|
||||
Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
|
||||
---
|
||||
heartbeat/openstack-cinder-volume | 29 ++++++++++++++++++-----------
|
||||
1 file changed, 18 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/openstack-cinder-volume b/heartbeat/openstack-cinder-volume
|
||||
index 116442c41b..2b64d4d887 100755
|
||||
--- a/heartbeat/openstack-cinder-volume
|
||||
+++ b/heartbeat/openstack-cinder-volume
|
||||
@@ -141,17 +141,19 @@ osvol_monitor() {
|
||||
|
||||
node_id=$(_get_node_id)
|
||||
|
||||
- if [ "$__OCF_ACTION" = "monitor" ] && ocf_is_true $OCF_RESKEY_volume_local_check ; then
|
||||
- #
|
||||
- # Is the volue attached?
|
||||
- # We check the local devices
|
||||
- #
|
||||
- short_volume_id=$(echo $OCF_RESKEY_volume_id | awk '{print substr($0, 0, 20)}')
|
||||
- if lsblk /dev/disk/by-id/virtio-$short_volume_id 1>/dev/null 2>&1; then
|
||||
- return $OCF_SUCCESS
|
||||
- else
|
||||
- ocf_log warn "$OCF_RESKEY_volume_id is not attached to instance $node_id"
|
||||
- return $OCF_NOT_RUNNING
|
||||
+ if ocf_is_true $OCF_RESKEY_volume_local_check ; then
|
||||
+ if [ "$__OCF_ACTION" = "monitor" ] || [ "$__OCF_ACTION" = "start" ] ; then
|
||||
+ #
|
||||
+ # Is the volue attached?
|
||||
+ # We check the local devices
|
||||
+ #
|
||||
+ short_volume_id=$(echo $OCF_RESKEY_volume_id | awk '{print substr($0, 0, 20)}')
|
||||
+ if lsblk /dev/disk/by-id/virtio-$short_volume_id 1>/dev/null 2>&1; then
|
||||
+ return $OCF_SUCCESS
|
||||
+ else
|
||||
+ ocf_log warn "$OCF_RESKEY_volume_id is not attached to instance $node_id"
|
||||
+ return $OCF_NOT_RUNNING
|
||||
+ fi
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -247,6 +249,11 @@ osvol_start() {
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
+ while ! osvol_monitor; do
|
||||
+ ocf_log info "Waiting for cinder volume $OCF_RESKEY_volume_id to appear on $node_id"
|
||||
+ sleep 1
|
||||
+ done
|
||||
+
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
@ -0,0 +1,44 @@
|
||||
From d89b3fb29033c3a60eb0896033af5981c7b9f64a Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Fri, 10 Jan 2025 11:39:48 +0100
|
||||
Subject: [PATCH] openstack-cinder-volume: fix detach not working during
|
||||
start-action after #2000
|
||||
|
||||
---
|
||||
heartbeat/openstack-cinder-volume | 8 ++++----
|
||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/openstack-cinder-volume b/heartbeat/openstack-cinder-volume
|
||||
index 2b64d4d88..5bb1acddd 100755
|
||||
--- a/heartbeat/openstack-cinder-volume
|
||||
+++ b/heartbeat/openstack-cinder-volume
|
||||
@@ -142,9 +142,9 @@ osvol_monitor() {
|
||||
node_id=$(_get_node_id)
|
||||
|
||||
if ocf_is_true $OCF_RESKEY_volume_local_check ; then
|
||||
- if [ "$__OCF_ACTION" = "monitor" ] || [ "$__OCF_ACTION" = "start" ] ; then
|
||||
+ if [ "$__OCF_ACTION" = "monitor" ] || [ "$1" = "quick" ]; then
|
||||
#
|
||||
- # Is the volue attached?
|
||||
+ # Is the volume attached?
|
||||
# We check the local devices
|
||||
#
|
||||
short_volume_id=$(echo $OCF_RESKEY_volume_id | awk '{print substr($0, 0, 20)}')
|
||||
@@ -158,7 +158,7 @@ osvol_monitor() {
|
||||
fi
|
||||
|
||||
#
|
||||
- # Is the volue attached?
|
||||
+ # Is the volume attached?
|
||||
# We use the API
|
||||
#
|
||||
result=$(run_openstackcli "volume show \
|
||||
@@ -249,7 +249,7 @@ osvol_start() {
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
- while ! osvol_monitor; do
|
||||
+ while ! osvol_monitor quick; do
|
||||
ocf_log info "Waiting for cinder volume $OCF_RESKEY_volume_id to appear on $node_id"
|
||||
sleep 1
|
||||
done
|
||||
@ -0,0 +1,37 @@
|
||||
From d0d2a0ff92dd23ee36cb57324c1eeaa3daed65bc Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Tue, 4 Feb 2025 16:13:27 +0100
|
||||
Subject: [PATCH] findif.sh: fix to avoid duplicate route issues
|
||||
|
||||
---
|
||||
heartbeat/findif.sh | 14 +++++---------
|
||||
1 file changed, 5 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/findif.sh b/heartbeat/findif.sh
|
||||
index 2ae91e958..6fb47110c 100644
|
||||
--- a/heartbeat/findif.sh
|
||||
+++ b/heartbeat/findif.sh
|
||||
@@ -217,18 +217,14 @@ findif()
|
||||
fi
|
||||
if [ -n "$nic" ] ; then
|
||||
# NIC supports more than two.
|
||||
- routematch=$(ip -o -f $family route list match $match $proto $scope | grep -v "^\(unreachable\|prohibit\|blackhole\)" | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
+ routematch=$(ip -o -f $family route list match $match $proto $scope | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
else
|
||||
- routematch=$(ip -o -f $family route list match $match $proto $scope | grep -v "^\(unreachable\|prohibit\|blackhole\)" | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
- fi
|
||||
- if [ "$family" = "inet6" ]; then
|
||||
- routematch=$(echo "$routematch" | grep -v "^default")
|
||||
+ routematch=$(ip -o -f $family route list match $match $proto $scope | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
|
||||
fi
|
||||
|
||||
- if [ $(echo "$routematch" | wc -l) -gt 1 ]; then
|
||||
- ocf_exit_reason "More than 1 routes match $match. Unable to decide which route to use."
|
||||
- return $OCF_ERR_GENERIC
|
||||
- fi
|
||||
+ # ignore matches from unrelated tables, and sort by metric to get the route with the lowest metric
|
||||
+ routematch=$(echo "$routematch" | awk '!/^(default|unreachable|prohibit|blackhole)/{match($0, /metric ([^ ]+)/, arr); print arr[1], $0}' | sort -k 1n -u | cut -d" " -f 2- | head -1)
|
||||
+
|
||||
set -- $routematch
|
||||
if [ $# = 0 ] ; then
|
||||
case $OCF_RESKEY_ip in
|
||||
@ -0,0 +1,23 @@
|
||||
From a1e22c5c612f369bac0830588642560dcea92e7c Mon Sep 17 00:00:00 2001
|
||||
From: Fujii Masao <fujii@postgresql.org>
|
||||
Date: Sat, 9 Nov 2024 02:33:37 +0900
|
||||
Subject: [PATCH] Remove unused macro variables from storage_mon.c.
|
||||
|
||||
---
|
||||
tools/storage_mon.c | 3 ---
|
||||
1 file changed, 3 deletions(-)
|
||||
|
||||
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
|
||||
index f94268f6f..2519a9e72 100644
|
||||
--- a/tools/storage_mon.c
|
||||
+++ b/tools/storage_mon.c
|
||||
@@ -33,9 +33,6 @@
|
||||
#define DEFAULT_PIDFILE HA_VARRUNDIR "storage_mon.pid"
|
||||
#define DEFAULT_ATTRNAME "#health-storage_mon"
|
||||
#define SMON_GET_RESULT_COMMAND "get_check_value"
|
||||
-#define SMON_RESULT_OK "green"
|
||||
-#define SMON_RESULT_NG "red"
|
||||
-#define SMON_RESULT_COMMAND_ERROR "unknown command"
|
||||
#define SMON_BUFF_1MEG 1048576
|
||||
#define SMON_MAX_IPCSNAME 256
|
||||
#define SMON_MAX_MSGSIZE 128
|
||||
@ -0,0 +1,79 @@
|
||||
From 46715c638829598d949dffab0898fe4c07074895 Mon Sep 17 00:00:00 2001
|
||||
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
|
||||
Date: Thu, 21 Nov 2024 15:21:19 +0900
|
||||
Subject: [PATCH 1/2] High: storage-mon: Correct the timing of setting
|
||||
notification values to storage-mon(RA) clients.
|
||||
|
||||
---
|
||||
tools/storage_mon.c | 17 ++++++++---------
|
||||
1 file changed, 8 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
|
||||
index 2519a9e72..27d2ff1d1 100644
|
||||
--- a/tools/storage_mon.c
|
||||
+++ b/tools/storage_mon.c
|
||||
@@ -320,7 +320,14 @@ static int32_t sigchld_handler(int32_t sig, void *data)
|
||||
|
||||
finished_count++;
|
||||
test_forks[index] = 0;
|
||||
-
|
||||
+
|
||||
+ /* Update the result value for the client response once all checks have completed. */
|
||||
+ if (device_count == finished_count) {
|
||||
+ response_final_score = final_score;
|
||||
+ if (!daemon_check_first_all_devices) {
|
||||
+ daemon_check_first_all_devices = TRUE;
|
||||
+ }
|
||||
+ }
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -441,15 +448,7 @@ static int test_device_main(gpointer data)
|
||||
if (is_child_runnning()) {
|
||||
device_check = FALSE;
|
||||
}
|
||||
-
|
||||
- if (device_count == finished_count && device_check) {
|
||||
- /* Update the result value for the client response once all checks have completed. */
|
||||
- response_final_score = final_score;
|
||||
|
||||
- if (!daemon_check_first_all_devices) {
|
||||
- daemon_check_first_all_devices = TRUE;
|
||||
- }
|
||||
- }
|
||||
}
|
||||
|
||||
if (device_check) {
|
||||
|
||||
From 1201390fb219d1b566c5d31463daacef60c31ab4 Mon Sep 17 00:00:00 2001
|
||||
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
|
||||
Date: Thu, 21 Nov 2024 15:43:33 +0900
|
||||
Subject: [PATCH 2/2] Mid: storage-mon RA: Wait until monitor confirms the
|
||||
startup pid according to the OCF resource specification.
|
||||
|
||||
---
|
||||
heartbeat/storage-mon.in | 11 +++++++++++
|
||||
1 file changed, 11 insertions(+)
|
||||
|
||||
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
|
||||
index 284dec30f..7c9943d4f 100644
|
||||
--- a/heartbeat/storage-mon.in
|
||||
+++ b/heartbeat/storage-mon.in
|
||||
@@ -325,6 +325,17 @@ storage-mon_start() {
|
||||
if [ "$?" -ne 0 ]; then
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
+
|
||||
+ #Wait until monitor confirms the startup pid according to the ocf resource specification.
|
||||
+ while true; do
|
||||
+ storage-mon_monitor pid_check_only
|
||||
+ rc="$?"
|
||||
+ if [ $rc -eq $OCF_SUCCESS ]; then
|
||||
+ break
|
||||
+ fi
|
||||
+ sleep 1
|
||||
+ ocf_log debug "storage-mon daemon still hasn't started yet. Waiting..."
|
||||
+ done
|
||||
fi
|
||||
}
|
||||
|
||||
@ -0,0 +1,148 @@
|
||||
From b72b329a45c058fda720c6739f881b9597fc8b30 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Thu, 23 Jan 2025 16:18:20 +0100
|
||||
Subject: [PATCH] storage-mon: replace dashes with underscores in functions
|
||||
|
||||
Dashes in function names produce "`storage-mon_usage': not a valid identifier"
|
||||
error when run with sh -x.
|
||||
---
|
||||
heartbeat/storage-mon.in | 44 ++++++++++++++++++++--------------------
|
||||
1 file changed, 22 insertions(+), 22 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
|
||||
index 7c9943d4f..5edb96979 100644
|
||||
--- a/heartbeat/storage-mon.in
|
||||
+++ b/heartbeat/storage-mon.in
|
||||
@@ -152,7 +152,7 @@ END
|
||||
|
||||
#######################################################################
|
||||
|
||||
-storage-mon_usage() {
|
||||
+storage_mon_usage() {
|
||||
cat <<END
|
||||
usage: $0 {start|stop|monitor|validate-all|meta-data}
|
||||
|
||||
@@ -161,7 +161,7 @@ END
|
||||
return $1
|
||||
}
|
||||
|
||||
-storage-mon_init() {
|
||||
+storage_mon_init() {
|
||||
#Test for presence of storage_mon helper
|
||||
if [ ! -x "$STORAGEMON" ] ; then
|
||||
ocf_log err "${STORAGEMON} not installed."
|
||||
@@ -205,7 +205,7 @@ storage-mon_init() {
|
||||
fi
|
||||
}
|
||||
|
||||
-storage-mon_update_attribute() {
|
||||
+storage_mon_update_attribute() {
|
||||
|
||||
while :
|
||||
do
|
||||
@@ -224,9 +224,9 @@ storage-mon_update_attribute() {
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
-storage-mon_monitor() {
|
||||
+storage_mon_monitor() {
|
||||
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
|
||||
- storage-mon_init
|
||||
+ storage_mon_init
|
||||
|
||||
# Monitor _MUST!_ differentiate correctly between running
|
||||
# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
|
||||
@@ -252,7 +252,7 @@ storage-mon_monitor() {
|
||||
status="green"
|
||||
fi
|
||||
|
||||
- storage-mon_update_attribute $status
|
||||
+ storage_mon_update_attribute $status
|
||||
return "$?"
|
||||
else
|
||||
ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1
|
||||
@@ -298,20 +298,20 @@ storage-mon_monitor() {
|
||||
esac
|
||||
done
|
||||
|
||||
- storage-mon_update_attribute $status
|
||||
+ storage_mon_update_attribute $status
|
||||
return "$?"
|
||||
fi
|
||||
}
|
||||
|
||||
-storage-mon_start() {
|
||||
+storage_mon_start() {
|
||||
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
|
||||
- storage-mon_monitor
|
||||
+ storage_mon_monitor
|
||||
if [ $? -eq $OCF_SUCCESS ]; then
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
touch "${OCF_RESKEY_state_file}"
|
||||
else
|
||||
- storage-mon_init
|
||||
+ storage_mon_init
|
||||
# generate command line
|
||||
cmdline=""
|
||||
for DRIVE in ${OCF_RESKEY_drives}; do
|
||||
@@ -328,7 +328,7 @@ storage-mon_start() {
|
||||
|
||||
#Wait until monitor confirms the startup pid according to the ocf resource specification.
|
||||
while true; do
|
||||
- storage-mon_monitor pid_check_only
|
||||
+ storage_mon_monitor pid_check_only
|
||||
rc="$?"
|
||||
if [ $rc -eq $OCF_SUCCESS ]; then
|
||||
break
|
||||
@@ -339,8 +339,8 @@ storage-mon_start() {
|
||||
fi
|
||||
}
|
||||
|
||||
-storage-mon_stop() {
|
||||
- storage-mon_monitor
|
||||
+storage_mon_stop() {
|
||||
+ storage_mon_monitor
|
||||
rc=$?
|
||||
|
||||
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
|
||||
@@ -363,7 +363,7 @@ storage-mon_stop() {
|
||||
fi
|
||||
|
||||
while true; do
|
||||
- storage-mon_monitor pid_check_only
|
||||
+ storage_mon_monitor pid_check_only
|
||||
rc="$?"
|
||||
case "$rc" in
|
||||
$OCF_SUCCESS)
|
||||
@@ -379,8 +379,8 @@ storage-mon_stop() {
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
-storage-mon_validate() {
|
||||
- storage-mon_init
|
||||
+storage_mon_validate() {
|
||||
+ storage_mon_init
|
||||
|
||||
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
|
||||
# Is the state directory writable?
|
||||
@@ -396,13 +396,13 @@ storage-mon_validate() {
|
||||
}
|
||||
|
||||
case "$__OCF_ACTION" in
|
||||
- start) storage-mon_start;;
|
||||
- stop) storage-mon_stop;;
|
||||
- monitor) storage-mon_monitor;;
|
||||
- validate-all) storage-mon_validate;;
|
||||
+ start) storage_mon_start;;
|
||||
+ stop) storage_mon_stop;;
|
||||
+ monitor) storage_mon_monitor;;
|
||||
+ validate-all) storage_mon_validate;;
|
||||
meta-data) meta_data;;
|
||||
- usage|help) storage-mon_usage $OCF_SUCCESS;;
|
||||
- *) storage-mon_usage $OCF_ERR_UNIMPLEMENTED;;
|
||||
+ usage|help) storage_mon_usage $OCF_SUCCESS;;
|
||||
+ *) storage_mon_usage $OCF_ERR_UNIMPLEMENTED;;
|
||||
esac
|
||||
rc=$?
|
||||
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
|
||||
@ -0,0 +1,25 @@
|
||||
From c6f520344e830a7c946b2222f9f251be038b1b28 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Fri, 24 Jan 2025 10:01:30 +0100
|
||||
Subject: [PATCH] storage-mon: check if daemon is already running during
|
||||
start-action
|
||||
|
||||
---
|
||||
heartbeat/storage-mon.in | 4 ++++
|
||||
1 file changed, 4 insertions(+)
|
||||
|
||||
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
|
||||
index 5edb96979..00e42f68d 100644
|
||||
--- a/heartbeat/storage-mon.in
|
||||
+++ b/heartbeat/storage-mon.in
|
||||
@@ -311,6 +311,10 @@ storage_mon_start() {
|
||||
fi
|
||||
touch "${OCF_RESKEY_state_file}"
|
||||
else
|
||||
+ storage_mon_monitor pid_check_only
|
||||
+ if [ $? -eq $OCF_SUCCESS ]; then
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
storage_mon_init
|
||||
# generate command line
|
||||
cmdline=""
|
||||
@ -0,0 +1,22 @@
|
||||
From de51a1705ce761f1fb5f1b2294cfc1153af70c1c Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 27 Jan 2025 09:54:06 +0100
|
||||
Subject: [PATCH] storage-mon: log "storage_mon is already running" in
|
||||
start-action
|
||||
|
||||
---
|
||||
heartbeat/storage-mon.in | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
|
||||
index 00e42f68d..d60db4ad4 100644
|
||||
--- a/heartbeat/storage-mon.in
|
||||
+++ b/heartbeat/storage-mon.in
|
||||
@@ -313,6 +313,7 @@ storage_mon_start() {
|
||||
else
|
||||
storage_mon_monitor pid_check_only
|
||||
if [ $? -eq $OCF_SUCCESS ]; then
|
||||
+ ocf_log info "storage_mon is already running. PID=`cat $PIDFILE`"
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
storage_mon_init
|
||||
@ -0,0 +1,118 @@
|
||||
From 4a228f3d8212368124134c01f958ac43e32cec08 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 7 Apr 2025 09:19:37 +0200
|
||||
Subject: [PATCH] IPaddr2: add link status DOWN/LOWERLAYERDOWN check
|
||||
|
||||
---
|
||||
heartbeat/IPaddr2 | 42 +++++++++++++++++++++++++++++++++++++++++-
|
||||
1 file changed, 41 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2
|
||||
index cf03e4426..230ac853c 100755
|
||||
--- a/heartbeat/IPaddr2
|
||||
+++ b/heartbeat/IPaddr2
|
||||
@@ -92,6 +92,19 @@ OCF_RESKEY_nodad_default=false
|
||||
OCF_RESKEY_noprefixroute_default="false"
|
||||
OCF_RESKEY_preferred_lft_default="forever"
|
||||
OCF_RESKEY_network_namespace_default=""
|
||||
+OCF_RESKEY_check_link_status_default="true"
|
||||
+
|
||||
+# RHEL specific defaults
|
||||
+if is_redhat_based; then
|
||||
+ get_os_ver
|
||||
+ ocf_version_cmp "$VER" "10.1" 2>/dev/null
|
||||
+
|
||||
+ case "$?" in
|
||||
+ # RHEL < 10.1
|
||||
+ 0)
|
||||
+ OCF_RESKEY_check_link_status_default="false";;
|
||||
+ esac
|
||||
+fi
|
||||
|
||||
: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}}
|
||||
: ${OCF_RESKEY_cidr_netmask=${OCF_RESKEY_cidr_netmask_default}}
|
||||
@@ -116,6 +129,7 @@ OCF_RESKEY_network_namespace_default=""
|
||||
: ${OCF_RESKEY_noprefixroute=${OCF_RESKEY_noprefixroute_default}}
|
||||
: ${OCF_RESKEY_preferred_lft=${OCF_RESKEY_preferred_lft_default}}
|
||||
: ${OCF_RESKEY_network_namespace=${OCF_RESKEY_network_namespace_default}}
|
||||
+: ${OCF_RESKEY_check_link_status=${OCF_RESKEY_check_link_status_default}}
|
||||
|
||||
#######################################################################
|
||||
|
||||
@@ -449,6 +463,14 @@ the namespace.
|
||||
<shortdesc lang="en">Network namespace to use</shortdesc>
|
||||
<content type="string" default="${OCF_RESKEY_network_namespace_default}"/>
|
||||
</parameter>
|
||||
+
|
||||
+<parameter name="check_link_status">
|
||||
+<longdesc lang="en">
|
||||
+Consider the resource failed if the interface has status DOWN or LOWERLAYERDOWN.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Consider the resource failed if the interface has status DOWN or LOWERLAYERDOWN</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_check_link_status_default}"/>
|
||||
+</parameter>
|
||||
</parameters>
|
||||
|
||||
<actions>
|
||||
@@ -581,6 +603,9 @@ ip_init() {
|
||||
elif [ "$__OCF_ACTION" = stop ]; then
|
||||
ocf_log warn "[$FINDIF] failed"
|
||||
exit $OCF_SUCCESS
|
||||
+ elif [ "$__OCF_ACTION" = start ]; then
|
||||
+ ocf_exit_reason "[$FINDIF] failed"
|
||||
+ exit $OCF_ERR_INSTALLED
|
||||
else
|
||||
ocf_exit_reason "[$FINDIF] failed"
|
||||
exit $rc
|
||||
@@ -1002,6 +1027,12 @@ ip_served() {
|
||||
return 0
|
||||
fi
|
||||
|
||||
+ if ocf_is_true "$OCF_RESKEY_check_link_status" && $IP2UTIL -f $FAMILY addr show $cur_nic | \
|
||||
+ grep -q "[[:space:]]\(DOWN\|LOWERLAYERDOWN\)[[:space:]]"; then
|
||||
+ echo "down"
|
||||
+ return 0
|
||||
+ fi
|
||||
+
|
||||
if [ -z "$IP_CIP" ]; then
|
||||
for i in $cur_nic; do
|
||||
# check address label
|
||||
@@ -1073,6 +1104,11 @@ ip_start() {
|
||||
exit $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
+ if [ "$ip_status" = "down" ]; then
|
||||
+ ocf_exit_reason "IP $OCF_RESKEY_ip available, but device has status $ip_status"
|
||||
+ exit $OCF_ERR_INSTALLED
|
||||
+ fi
|
||||
+
|
||||
if [ "$ip_status" = "partial3" ]; then
|
||||
ocf_exit_reason "IP $OCF_RESKEY_ip available, but label missing"
|
||||
exit $OCF_ERR_GENERIC
|
||||
@@ -1096,7 +1132,7 @@ ip_start() {
|
||||
echo "+$IP_INC_NO" >$IP_CIP_FILE
|
||||
fi
|
||||
|
||||
- if [ "$ip_status" = "no" ]; then
|
||||
+ if [ "$ip_status" != "ok" ]; then
|
||||
if ocf_is_true ${OCF_RESKEY_lvs_support}; then
|
||||
for i in `find_interface $OCF_RESKEY_ip 32`; do
|
||||
case $i in
|
||||
@@ -1213,6 +1249,7 @@ ip_monitor() {
|
||||
# interface health maybe via a daemon like FailSafe etc...
|
||||
|
||||
local ip_status=`ip_served`
|
||||
+ ocf_log debug "monitor: $ip_status"
|
||||
case $ip_status in
|
||||
ok)
|
||||
run_arp_sender refresh
|
||||
@@ -1221,6 +1258,9 @@ ip_monitor() {
|
||||
no)
|
||||
exit $OCF_NOT_RUNNING
|
||||
;;
|
||||
+ down)
|
||||
+ exit $OCF_ERR_INSTALLED
|
||||
+ ;;
|
||||
*)
|
||||
# Errors on this interface?
|
||||
return $OCF_ERR_GENERIC
|
||||
360
SOURCES/RHEL-79819-portblock-fix-version-detection.patch
Normal file
360
SOURCES/RHEL-79819-portblock-fix-version-detection.patch
Normal file
@ -0,0 +1,360 @@
|
||||
--- a/heartbeat/portblock 2021-11-03 10:12:01.000000000 +0100
|
||||
+++ b/heartbeat/portblock 2025-02-20 14:09:44.546869740 +0100
|
||||
@@ -25,6 +25,7 @@
|
||||
# Defaults
|
||||
OCF_RESKEY_protocol_default=""
|
||||
OCF_RESKEY_portno_default=""
|
||||
+OCF_RESKEY_direction_default="in"
|
||||
OCF_RESKEY_action_default=""
|
||||
OCF_RESKEY_ip_default="0.0.0.0/0"
|
||||
OCF_RESKEY_reset_local_on_unblock_stop_default="false"
|
||||
@@ -33,6 +34,7 @@
|
||||
|
||||
: ${OCF_RESKEY_protocol=${OCF_RESKEY_protocol_default}}
|
||||
: ${OCF_RESKEY_portno=${OCF_RESKEY_portno_default}}
|
||||
+: ${OCF_RESKEY_direction=${OCF_RESKEY_direction_default}}
|
||||
: ${OCF_RESKEY_action=${OCF_RESKEY_action_default}}
|
||||
: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}}
|
||||
: ${OCF_RESKEY_reset_local_on_unblock_stop=${OCF_RESKEY_reset_local_on_unblock_stop_default}}
|
||||
@@ -217,6 +219,18 @@
|
||||
<shortdesc lang="en">Connection state file synchronization script</shortdesc>
|
||||
<content type="string" default="${OCF_RESKEY_sync_script_default}" />
|
||||
</parameter>
|
||||
+
|
||||
+<parameter name="direction" unique="0" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Whether to block incoming or outgoing traffic. Can be either "in",
|
||||
+"out", or "both".
|
||||
+If "in" is used, the incoming ports are blocked on the INPUT chain.
|
||||
+If "out" is used, the outgoing ports are blocked on the OUTPUT chain.
|
||||
+If "both" is used, both the incoming and outgoing ports are blocked.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Whether to block incoming or outgoing traffic, or both</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_direction_default}" />
|
||||
+</parameter>
|
||||
</parameters>
|
||||
|
||||
<actions>
|
||||
@@ -240,19 +254,34 @@
|
||||
# and disable us -- but we're still in some sense active...
|
||||
#
|
||||
|
||||
-#active_grep_pat {udp|tcp} portno,portno
|
||||
+#active_grep_pat {udp|tcp} portno,portno ip {d|s}
|
||||
+# d = look for destination ports
|
||||
+# s = look for source ports
|
||||
active_grep_pat()
|
||||
{
|
||||
w="[ ][ ]*"
|
||||
any="0\\.0\\.0\\.0/0"
|
||||
- echo "^DROP${w}${1}${w}--${w}${any}${w}${3}${w}multiport${w}dports${w}${2}\>"
|
||||
+ src=$any dst=$3
|
||||
+ if [ "$4" = "s" ]; then
|
||||
+ local src=$3
|
||||
+ local dst=$any
|
||||
+ fi
|
||||
+ # iptables 1.8.9 briefly broke the output format, returning the
|
||||
+ # numeric protocol value instead of a string. Support both variants.
|
||||
+ if [ "$1" = "tcp" ]; then
|
||||
+ local prot="(tcp|6)"
|
||||
+ else
|
||||
+ local prot="(udp|17)"
|
||||
+ fi
|
||||
+ echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$"
|
||||
}
|
||||
|
||||
-#chain_isactive {udp|tcp} portno,portno ip
|
||||
+#chain_isactive {udp|tcp} portno,portno ip chain
|
||||
chain_isactive()
|
||||
{
|
||||
- PAT=`active_grep_pat "$1" "$2" "$3"`
|
||||
- $IPTABLES $wait -n -L INPUT | grep "$PAT" >/dev/null
|
||||
+ [ "$4" = "OUTPUT" ] && ds="s" || ds="d"
|
||||
+ PAT=$(active_grep_pat "$1" "$2" "$3" "$ds")
|
||||
+ $IPTABLES $wait -n -L "$4" | grep -qE "$PAT"
|
||||
}
|
||||
|
||||
# netstat -tn and ss -Htn, split on whitespace and colon,
|
||||
@@ -299,7 +328,6 @@
|
||||
tickle_remote()
|
||||
{
|
||||
[ -z "$OCF_RESKEY_tickle_dir" ] && return
|
||||
- echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle
|
||||
f=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip
|
||||
[ -r $f ] || return
|
||||
$TICKLETCP -n 3 < $f
|
||||
@@ -331,112 +359,140 @@
|
||||
|
||||
SayActive()
|
||||
{
|
||||
- echo "$CMD DROP rule for INPUT chain [$*] is running (OK)"
|
||||
+ ocf_log debug "$CMD DROP rule [$*] is running (OK)"
|
||||
}
|
||||
|
||||
SayConsideredActive()
|
||||
{
|
||||
- echo "$CMD DROP rule for INPUT chain [$*] considered to be running (OK)"
|
||||
+ ocf_log debug "$CMD DROP rule [$*] considered to be running (OK)"
|
||||
}
|
||||
|
||||
SayInactive()
|
||||
{
|
||||
- echo "$CMD DROP rule for INPUT chain [$*] is inactive"
|
||||
+ ocf_log debug "$CMD DROP rule [$*] is inactive"
|
||||
}
|
||||
|
||||
-#IptablesStatus {udp|tcp} portno,portno ip {block|unblock}
|
||||
+#IptablesStatus {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
|
||||
IptablesStatus() {
|
||||
- local rc
|
||||
- rc=$OCF_ERR_GENERIC
|
||||
- activewords="$CMD $1 $2 is running (OK)"
|
||||
- if chain_isactive "$1" "$2" "$3"; then
|
||||
- case $4 in
|
||||
- block)
|
||||
- SayActive $*
|
||||
- rc=$OCF_SUCCESS
|
||||
- ;;
|
||||
- *)
|
||||
- SayInactive $*
|
||||
- rc=$OCF_NOT_RUNNING
|
||||
- ;;
|
||||
- esac
|
||||
- else
|
||||
- case $4 in
|
||||
- block)
|
||||
- if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then
|
||||
- SayConsideredActive $*
|
||||
- rc=$OCF_SUCCESS
|
||||
- else
|
||||
- SayInactive $*
|
||||
- rc=$OCF_NOT_RUNNING
|
||||
- fi
|
||||
- ;;
|
||||
-
|
||||
- *)
|
||||
- if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then
|
||||
- SayActive $*
|
||||
- #This is only run on real monitor events.
|
||||
- save_tcp_connections
|
||||
- rc=$OCF_SUCCESS
|
||||
- else
|
||||
- SayInactive $*
|
||||
- rc=$OCF_NOT_RUNNING
|
||||
- fi
|
||||
- ;;
|
||||
- esac
|
||||
- fi
|
||||
-
|
||||
- return $rc
|
||||
+ local rc
|
||||
+ rc=$OCF_ERR_GENERIC
|
||||
+ is_active=0
|
||||
+ if [ "$4" = "in" ] || [ "$4" = "both" ]; then
|
||||
+ chain_isactive "$1" "$2" "$3" INPUT
|
||||
+ is_active=$?
|
||||
+ fi
|
||||
+ if [ "$4" = "out" ] || [ "$4" = "both" ]; then
|
||||
+ chain_isactive "$1" "$2" "$3" OUTPUT
|
||||
+ r=$?
|
||||
+ [ $r -gt $is_active ] && is_active=$r
|
||||
+ fi
|
||||
+ if [ $is_active -eq 0 ]; then
|
||||
+ case $5 in
|
||||
+ block)
|
||||
+ SayActive $*
|
||||
+ rc=$OCF_SUCCESS
|
||||
+ ;;
|
||||
+ *)
|
||||
+ SayInactive $*
|
||||
+ rc=$OCF_NOT_RUNNING
|
||||
+ ;;
|
||||
+ esac
|
||||
+ else
|
||||
+ case $5 in
|
||||
+ block)
|
||||
+ if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then
|
||||
+ SayConsideredActive $*
|
||||
+ rc=$OCF_SUCCESS
|
||||
+ else
|
||||
+ SayInactive $*
|
||||
+ rc=$OCF_NOT_RUNNING
|
||||
+ fi
|
||||
+ ;;
|
||||
+ *)
|
||||
+ if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then
|
||||
+ SayActive $*
|
||||
+ #This is only run on real monitor events.
|
||||
+ save_tcp_connections
|
||||
+ rc=$OCF_SUCCESS
|
||||
+ else
|
||||
+ SayInactive $*
|
||||
+ rc=$OCF_NOT_RUNNING
|
||||
+ fi
|
||||
+ ;;
|
||||
+ esac
|
||||
+ fi
|
||||
+ return $rc
|
||||
}
|
||||
|
||||
-#IptablesBLOCK {udp|tcp} portno,portno ip
|
||||
-IptablesBLOCK()
|
||||
+#DoIptables {-I|-D} {udp|tcp} portno,portno ip chain
|
||||
+DoIptables()
|
||||
{
|
||||
- local rc=0
|
||||
- local try_reset=false
|
||||
- if [ "$1/$4/$__OCF_ACTION" = tcp/unblock/stop ] &&
|
||||
- ocf_is_true $reset_local_on_unblock_stop
|
||||
- then
|
||||
- try_reset=true
|
||||
- fi
|
||||
- if
|
||||
- chain_isactive "$1" "$2" "$3"
|
||||
- then
|
||||
- : OK -- chain already active
|
||||
+ op=$1 proto=$2 ports=$3 ip=$4 chain=$5
|
||||
+ active=0; chain_isactive "$proto" "$ports" "$ip" "$chain" && active=1
|
||||
+ want_active=0; [ "$op" = "-I" ] && want_active=1
|
||||
+ ocf_log debug "active: $active want_active: $want_active"
|
||||
+ if [ $active -eq $want_active ] ; then
|
||||
+ : Chain already in desired state
|
||||
else
|
||||
- if $try_reset ; then
|
||||
- $IPTABLES $wait -I OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset
|
||||
- tickle_local
|
||||
+ [ "$chain" = "OUTPUT" ] && ds="s" || ds="d"
|
||||
+ $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" -m multiport --${ds}ports "$ports" -j DROP
|
||||
+ fi
|
||||
+}
|
||||
+
|
||||
+#IptablesBLOCK {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
|
||||
+IptablesBLOCK()
|
||||
+{
|
||||
+ local rc_in=0
|
||||
+ local rc_out=0
|
||||
+ if [ "$4" = "in" ] || [ "$4" = "both" ]; then
|
||||
+ local try_reset=false
|
||||
+ if [ "$1/$5/$__OCF_ACTION" = tcp/unblock/stop ] &&
|
||||
+ ocf_is_true $reset_local_on_unblock_stop
|
||||
+ then
|
||||
+ try_reset=true
|
||||
fi
|
||||
- $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP
|
||||
- rc=$?
|
||||
- if $try_reset ; then
|
||||
- $IPTABLES $wait -D OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset
|
||||
+ if
|
||||
+ chain_isactive "$1" "$2" "$3" INPUT
|
||||
+ then
|
||||
+ : OK -- chain already active
|
||||
+ else
|
||||
+ if $try_reset ; then
|
||||
+ $IPTABLES $wait -I OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset
|
||||
+ tickle_local
|
||||
+ fi
|
||||
+ $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP
|
||||
+ rc_in=$?
|
||||
+ if $try_reset ; then
|
||||
+ $IPTABLES $wait -D OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset
|
||||
+ fi
|
||||
fi
|
||||
fi
|
||||
+ if [ "$4" = "out" ] || [ "$4" = "both" ]; then
|
||||
+ DoIptables -I "$1" "$2" "$3" OUTPUT
|
||||
+ rc_out=$?
|
||||
+ fi
|
||||
|
||||
- return $rc
|
||||
+ [ $rc_in -gt $rc_out ] && return $rc_in || return $rc_out
|
||||
}
|
||||
|
||||
-#IptablesUNBLOCK {udp|tcp} portno,portno ip
|
||||
+#IptablesUNBLOCK {udp|tcp} portno,portno ip {in|out|both}
|
||||
IptablesUNBLOCK()
|
||||
{
|
||||
- if
|
||||
- chain_isactive "$1" "$2" "$3"
|
||||
- then
|
||||
- $IPTABLES $wait -D INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP
|
||||
- else
|
||||
- : Chain Not active
|
||||
+ if [ "$4" = "in" ] || [ "$4" = "both" ]; then
|
||||
+ DoIptables -D "$1" "$2" "$3" INPUT
|
||||
+ fi
|
||||
+ if [ "$4" = "out" ] || [ "$4" = "both" ]; then
|
||||
+ DoIptables -D "$1" "$2" "$3" OUTPUT
|
||||
fi
|
||||
|
||||
return $?
|
||||
}
|
||||
|
||||
-#IptablesStart {udp|tcp} portno,portno ip {block|unblock}
|
||||
+#IptablesStart {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
|
||||
IptablesStart()
|
||||
{
|
||||
ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start
|
||||
- case $4 in
|
||||
+ case $5 in
|
||||
block) IptablesBLOCK "$@";;
|
||||
unblock)
|
||||
IptablesUNBLOCK "$@"
|
||||
@@ -451,11 +507,11 @@
|
||||
return $?
|
||||
}
|
||||
|
||||
-#IptablesStop {udp|tcp} portno,portno ip {block|unblock}
|
||||
+#IptablesStop {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
|
||||
IptablesStop()
|
||||
{
|
||||
ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop
|
||||
- case $4 in
|
||||
+ case $5 in
|
||||
block) IptablesUNBLOCK "$@";;
|
||||
unblock)
|
||||
save_tcp_connections
|
||||
@@ -473,7 +529,7 @@
|
||||
CheckPort() {
|
||||
# Examples of valid port: "1080", "1", "0080"
|
||||
# Examples of invalid port: "1080bad", "0", "0000", ""
|
||||
- echo $1 |egrep -qx '[0-9]+(:[0-9]+)?(,[0-9]+(:[0-9]+)?)*'
|
||||
+ echo $1 | $EGREP -qx '[0-9]+(:[0-9]+)?(,[0-9]+(:[0-9]+)?)*'
|
||||
}
|
||||
|
||||
IptablesValidateAll()
|
||||
@@ -562,7 +618,7 @@
|
||||
fi
|
||||
|
||||
# iptables v1.4.20+ is required to use -w (wait)
|
||||
-version=$(iptables -V | awk -F ' v' '{print $NF}')
|
||||
+version=$(iptables -V | grep -oE '[0-9]+[\.0-9]+')
|
||||
ocf_version_cmp "$version" "1.4.19.1"
|
||||
if [ "$?" -eq "2" ]; then
|
||||
wait="-w"
|
||||
@@ -572,6 +628,7 @@
|
||||
|
||||
protocol=$OCF_RESKEY_protocol
|
||||
portno=$OCF_RESKEY_portno
|
||||
+direction=$OCF_RESKEY_direction
|
||||
action=$OCF_RESKEY_action
|
||||
ip=$OCF_RESKEY_ip
|
||||
reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop
|
||||
@@ -592,15 +649,15 @@
|
||||
|
||||
case $1 in
|
||||
start)
|
||||
- IptablesStart $protocol $portno $ip $action
|
||||
+ IptablesStart $protocol $portno $ip $direction $action
|
||||
;;
|
||||
|
||||
stop)
|
||||
- IptablesStop $protocol $portno $ip $action
|
||||
+ IptablesStop $protocol $portno $ip $direction $action
|
||||
;;
|
||||
|
||||
status|monitor)
|
||||
- IptablesStatus $protocol $portno $ip $action
|
||||
+ IptablesStatus $protocol $portno $ip $direction $action
|
||||
;;
|
||||
|
||||
validate-all)
|
||||
@ -0,0 +1,72 @@
|
||||
From f6a5f38405a93ab88e887aa657ee79593d1a4485 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Wed, 26 Mar 2025 09:48:06 +0100
|
||||
Subject: [PATCH 1/2] tomcat: fix CATALINA_PID not set issue
|
||||
|
||||
---
|
||||
heartbeat/tomcat | 10 ++++++----
|
||||
1 file changed, 6 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/tomcat b/heartbeat/tomcat
|
||||
index fa2715140b..6d47980296 100755
|
||||
--- a/heartbeat/tomcat
|
||||
+++ b/heartbeat/tomcat
|
||||
@@ -695,10 +695,12 @@ CATALINA_BASE="${OCF_RESKEY_catalina_base-${OCF_RESKEY_catalina_home}}"
|
||||
CATALINA_OUT="${OCF_RESKEY_catalina_out}"
|
||||
|
||||
CATALINA_PID=$OCF_RESKEY_catalina_pid
|
||||
-if [ -z "$CATALINA_PID" ] && [ "$__OCF_ACTION" = "start" ]; then
|
||||
- mkdir -p "${HA_RSCTMP}/${TOMCAT_NAME}_tomcatstate/"
|
||||
- if [ "${RESOURCE_TOMCAT_USER}" != "root" ]; then
|
||||
- chown ${RESOURCE_TOMCAT_USER} "${HA_RSCTMP}/${TOMCAT_NAME}_tomcatstate/"
|
||||
+if [ -z "$CATALINA_PID" ]; then
|
||||
+ if [ "$__OCF_ACTION" = "start" ]; then
|
||||
+ mkdir -p "${HA_RSCTMP}/${TOMCAT_NAME}_tomcatstate/"
|
||||
+ if [ "${RESOURCE_TOMCAT_USER}" != "root" ]; then
|
||||
+ chown ${RESOURCE_TOMCAT_USER} "${HA_RSCTMP}/${TOMCAT_NAME}_tomcatstate/"
|
||||
+ fi
|
||||
fi
|
||||
CATALINA_PID="${HA_RSCTMP}/${TOMCAT_NAME}_tomcatstate/catalina.pid"
|
||||
fi
|
||||
|
||||
From b0da375699ebfa544e6e4a13eae554af3e7d65c9 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Fri, 28 Mar 2025 10:50:17 +0100
|
||||
Subject: [PATCH 2/2] tomcat: fix catalina_base and catalina_out parameter
|
||||
defaults
|
||||
|
||||
---
|
||||
heartbeat/tomcat | 6 +++---
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/tomcat b/heartbeat/tomcat
|
||||
index 6d47980296..1e8f216384 100755
|
||||
--- a/heartbeat/tomcat
|
||||
+++ b/heartbeat/tomcat
|
||||
@@ -650,7 +650,6 @@ OCF_RESKEY_statusurl_default="http://127.0.0.1:8080"
|
||||
OCF_RESKEY_max_stop_time_default=""
|
||||
OCF_RESKEY_java_home_default=""
|
||||
OCF_RESKEY_java_opts_default=""
|
||||
-OCF_RESKEY_catalina_out_default="${OCF_RESKEY_catalina_base-${OCF_RESKEY_catalina_home}}/logs/catalina.out"
|
||||
OCF_RESKEY_catalina_pid_default=""
|
||||
OCF_RESKEY_tomcat_start_script_default="${TOMCAT_START_SCRIPT}"
|
||||
OCF_RESKEY_tomcat_start_opts_default=""
|
||||
@@ -670,7 +669,6 @@ OCF_RESKEY_logging_manager_default=""
|
||||
: ${OCF_RESKEY_max_stop_time=${OCF_RESKEY_max_stop_time_default}}
|
||||
: ${OCF_RESKEY_java_home=${OCF_RESKEY_java_home_default}}
|
||||
: ${OCF_RESKEY_java_opts=${OCF_RESKEY_java_opts_default}}
|
||||
-: ${OCF_RESKEY_catalina_out=${OCF_RESKEY_catalina_out_default}}
|
||||
: ${OCF_RESKEY_catalina_pid=${OCF_RESKEY_catalina_pid_default}}
|
||||
: ${OCF_RESKEY_tomcat_start_script=${OCF_RESKEY_tomcat_start_script_default}}
|
||||
: ${OCF_RESKEY_tomcat_start_opts=${OCF_RESKEY_tomcat_start_opts_default}}
|
||||
@@ -691,7 +689,9 @@ RESOURCE_STATUSURL="${OCF_RESKEY_statusurl}"
|
||||
JAVA_HOME="${OCF_RESKEY_java_home}"
|
||||
JAVA_OPTS="${OCF_RESKEY_java_opts}"
|
||||
CATALINA_HOME="${OCF_RESKEY_catalina_home}"
|
||||
-CATALINA_BASE="${OCF_RESKEY_catalina_base-${OCF_RESKEY_catalina_home}}"
|
||||
+CATALINA_BASE="${OCF_RESKEY_catalina_base:-${OCF_RESKEY_catalina_home}}"
|
||||
+OCF_RESKEY_catalina_out_default="${OCF_RESKEY_catalina_base:-${OCF_RESKEY_catalina_home}}/logs/catalina.out"
|
||||
+: ${OCF_RESKEY_catalina_out=${OCF_RESKEY_catalina_out_default}}
|
||||
CATALINA_OUT="${OCF_RESKEY_catalina_out}"
|
||||
|
||||
CATALINA_PID=$OCF_RESKEY_catalina_pid
|
||||
171
SOURCES/RHEL-88035-Filesystem-add-support-for-aznfs.patch
Normal file
171
SOURCES/RHEL-88035-Filesystem-add-support-for-aznfs.patch
Normal file
@ -0,0 +1,171 @@
|
||||
From 3bffa541f7bf66e143f14e51551fc91dfebec86c Mon Sep 17 00:00:00 2001
|
||||
From: Tobias Schug <happytobi@tscoding.de>
|
||||
Date: Mon, 28 Oct 2024 09:14:41 +0100
|
||||
Subject: [PATCH] Add azure aznfs filesystem support
|
||||
|
||||
---
|
||||
heartbeat/Filesystem | 37 ++++++++++++++++++++-----------------
|
||||
1 file changed, 20 insertions(+), 17 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index 3405e2c26..b48bee142 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -2,7 +2,7 @@
|
||||
#
|
||||
# Support: users@clusterlabs.org
|
||||
# License: GNU General Public License (GPL)
|
||||
-#
|
||||
+#
|
||||
# Filesystem
|
||||
# Description: Manages a Filesystem on a shared storage medium.
|
||||
# Original Author: Eric Z. Ayers (eric.ayers@compgen.com)
|
||||
@@ -142,7 +142,7 @@ meta_data() {
|
||||
|
||||
<longdesc lang="en">
|
||||
Resource script for Filesystem. It manages a Filesystem on a
|
||||
-shared storage medium.
|
||||
+shared storage medium.
|
||||
|
||||
The standard monitor operation of depth 0 (also known as probe)
|
||||
checks if the filesystem is mounted. If you want deeper tests,
|
||||
@@ -260,7 +260,7 @@ currently accessing the mount directory.
|
||||
"true" : Kill processes accessing mount point
|
||||
"safe" : Kill processes accessing mount point using methods that
|
||||
avoid functions that could potentially block during process
|
||||
- detection
|
||||
+ detection
|
||||
"false" : Do not kill any processes.
|
||||
|
||||
The 'safe' option uses shell logic to walk the /procs/ directory
|
||||
@@ -373,7 +373,7 @@ determine_blockdevice() {
|
||||
# Get the current real device name, if possible.
|
||||
# (specified devname could be -L or -U...)
|
||||
case "$FSTYPE" in
|
||||
- nfs4|nfs|efs|smbfs|cifs|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|none|lustre)
|
||||
+ nfs4|nfs|aznfs|efs|smbfs|cifs|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|none|lustre)
|
||||
: ;;
|
||||
*)
|
||||
match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}"
|
||||
@@ -455,7 +455,7 @@ is_fsck_needed() {
|
||||
no) false;;
|
||||
""|auto)
|
||||
case "$FSTYPE" in
|
||||
- ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|efs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs)
|
||||
+ ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|aznfs|efs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs)
|
||||
false;;
|
||||
*)
|
||||
true;;
|
||||
@@ -478,7 +478,7 @@ fstype_supported()
|
||||
fi
|
||||
|
||||
if [ -z "$FSTYPE" -o "$FSTYPE" = none ]; then
|
||||
- : No FSTYPE specified, rely on the system has the right file-system support already
|
||||
+ : No FSTYPE specified, rely on the system has the right file-system support already
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
@@ -487,6 +487,7 @@ fstype_supported()
|
||||
case "$FSTYPE" in
|
||||
fuse.*|glusterfs|rozofs) support="fuse";;
|
||||
efs) check_binary "mount.efs"; support="nfs4";;
|
||||
+ aznfs) check_binary "mount.aznfs"; support="nfs4";;
|
||||
esac
|
||||
|
||||
if [ "$support" != "$FSTYPE" ]; then
|
||||
@@ -530,7 +531,7 @@ fstype_supported()
|
||||
# node on the shared storage, and is not visible yet. Then try
|
||||
# partprobe to refresh /dev/disk/by-{label,uuid}/* up to date.
|
||||
#
|
||||
-# DEVICE can be /dev/xxx, -U, -L
|
||||
+# DEVICE can be /dev/xxx, -U, -L
|
||||
#
|
||||
trigger_udev_rules_if_needed()
|
||||
{
|
||||
@@ -545,12 +546,12 @@ trigger_udev_rules_if_needed()
|
||||
fi
|
||||
else
|
||||
tmp="$(echo $DEVICE|awk '{$1=""; print substr($0,2)}')"
|
||||
- case "$DEVICE" in
|
||||
- -U*|--uuid*)
|
||||
- tmp="/dev/disk/by-uuid/$tmp"
|
||||
+ case "$DEVICE" in
|
||||
+ -U*|--uuid*)
|
||||
+ tmp="/dev/disk/by-uuid/$tmp"
|
||||
;;
|
||||
-L*|--label*)
|
||||
- tmp="/dev/disk/by-label/$tmp"
|
||||
+ tmp="/dev/disk/by-label/$tmp"
|
||||
;;
|
||||
*)
|
||||
# bind mount?
|
||||
@@ -595,7 +596,7 @@ Filesystem_start()
|
||||
|
||||
fstype_supported || exit $OCF_ERR_INSTALLED
|
||||
|
||||
- # Check the filesystem & auto repair.
|
||||
+ # Check the filesystem & auto repair.
|
||||
# NOTE: Some filesystem types don't need this step... Please modify
|
||||
# accordingly
|
||||
|
||||
@@ -697,7 +698,7 @@ signal_processes() {
|
||||
local sig=$2
|
||||
local pids pid
|
||||
# fuser returns a non-zero return code if none of the
|
||||
- # specified files is accessed or in case of a fatal
|
||||
+ # specified files is accessed or in case of a fatal
|
||||
# error.
|
||||
pids=$(get_pids "$dir")
|
||||
if [ -z "$pids" ]; then
|
||||
@@ -745,6 +746,7 @@ fs_stop_loop() {
|
||||
try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS
|
||||
done
|
||||
}
|
||||
+
|
||||
fs_stop() {
|
||||
local SUB="$1" timeout=$2 grace_time ret
|
||||
grace_time=$((timeout/2))
|
||||
@@ -797,7 +799,7 @@ Filesystem_stop()
|
||||
|
||||
# For networked filesystems, there's merit in trying -f:
|
||||
case "$FSTYPE" in
|
||||
- nfs4|nfs|efs|cifs|smbfs) umount_force="-f" ;;
|
||||
+ nfs4|nfs|aznfs|efs|cifs|smbfs) umount_force="-f" ;;
|
||||
esac
|
||||
|
||||
# Umount all sub-filesystems mounted under $MOUNTPOINT/ too.
|
||||
@@ -942,6 +944,7 @@ Filesystem_monitor_20()
|
||||
fi
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
+
|
||||
Filesystem_monitor()
|
||||
{
|
||||
Filesystem_status
|
||||
@@ -1016,7 +1019,7 @@ set_blockdevice_var() {
|
||||
|
||||
# these are definitely not block devices
|
||||
case "$FSTYPE" in
|
||||
- nfs4|nfs|efs|smbfs|cifs|none|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|lustre) return;;
|
||||
+ nfs4|nfs|aznfs|efs|smbfs|cifs|none|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|lustre) return;;
|
||||
esac
|
||||
|
||||
if $(is_option "loop"); then
|
||||
@@ -1098,7 +1101,7 @@ set_blockdevice_var
|
||||
if [ -z "$OCF_RESKEY_directory" ]; then
|
||||
if [ X$OP = "Xstart" -o $blockdevice = "no" ]; then
|
||||
ocf_exit_reason "Please specify the directory"
|
||||
- exit $OCF_ERR_CONFIGURED
|
||||
+ exit $OCF_ERR_CONFIGURED
|
||||
fi
|
||||
else
|
||||
MOUNTPOINT="$(echo "$OCF_RESKEY_directory" | sed 's/\/*$//')"
|
||||
@@ -1166,7 +1169,7 @@ is_option "ro" &&
|
||||
CLUSTERSAFE=2
|
||||
|
||||
case "$FSTYPE" in
|
||||
-nfs4|nfs|efs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs|cvfs|lustre)
|
||||
+nfs4|nfs|aznfs|efs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs|cvfs|lustre)
|
||||
CLUSTERSAFE=1 # this is kind of safe too
|
||||
systemd_drop_in "99-Filesystem-remote" "After" "remote-fs.target"
|
||||
;;
|
||||
1643
SOURCES/RHEL-88429-1-podman-etcd-new-ra.patch
Normal file
1643
SOURCES/RHEL-88429-1-podman-etcd-new-ra.patch
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,31 @@
|
||||
From 6a3249aae260c081ccbcfd09444d5d85ebc4e3b3 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 28 Apr 2025 15:48:29 +0200
|
||||
Subject: [PATCH] podman-etcd: remove unused actions from metadata
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 4 +---
|
||||
1 file changed, 1 insertion(+), 3 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 514dd2e5b..3a2323260 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -236,8 +236,6 @@ to stop the container before pacemaker.
|
||||
<action name="start" timeout="600s" />
|
||||
<action name="stop" timeout="90s" />
|
||||
<action name="monitor" timeout="25s" interval="30s" depth="0" />
|
||||
-<action name="promote" timeout="300s" />
|
||||
-<action name="demote" timeout="120s" />
|
||||
<action name="meta-data" timeout="5s" />
|
||||
<action name="validate-all" timeout="30s" />
|
||||
</actions>
|
||||
@@ -251,7 +249,7 @@ REQUIRE_IMAGE_PULL=0
|
||||
podman_usage()
|
||||
{
|
||||
cat <<END
|
||||
-usage: $0 {start|stop|monitor|promote|demote|validate-all|meta-data}
|
||||
+usage: $0 {start|stop|monitor|validate-all|meta-data}
|
||||
|
||||
Expects to have a fully populated OCF RA-compliant environment set.
|
||||
END
|
||||
@ -0,0 +1,36 @@
|
||||
From 5f7b9b045d4713e8ff27a4fc8b2799669c1b823a Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Tue, 20 May 2025 09:34:03 +0200
|
||||
Subject: [PATCH] podman-etcd: fix listen-peer-urls binding (#2049)
|
||||
|
||||
This change ensures learner etcd listens on all interfaces for peer
|
||||
connections, resolving accessibility issues.
|
||||
|
||||
Fix: OCPBUGS-56447
|
||||
---
|
||||
heartbeat/podman-etcd | 12 +++---------
|
||||
1 file changed, 3 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 3a2323260..6762112ec 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -436,15 +436,9 @@ prepare_env() {
|
||||
ETCD_PEER_CERT=$(get_env_from_manifest "ETCDCTL_CERT")
|
||||
ETCD_PEER_KEY=$(get_env_from_manifest "ETCDCTL_KEY")
|
||||
|
||||
- if is_learner; then
|
||||
- LISTEN_CLIENT_URLS="$NODEIP"
|
||||
- LISTEN_PEER_URLS="$NODEIP"
|
||||
- LISTEN_METRICS_URLS="$NODEIP"
|
||||
- else
|
||||
- LISTEN_CLIENT_URLS="0.0.0.0"
|
||||
- LISTEN_PEER_URLS="0.0.0.0"
|
||||
- LISTEN_METRICS_URLS="0.0.0.0"
|
||||
- fi
|
||||
+ LISTEN_CLIENT_URLS="0.0.0.0"
|
||||
+ LISTEN_PEER_URLS="0.0.0.0"
|
||||
+ LISTEN_METRICS_URLS="0.0.0.0"
|
||||
}
|
||||
|
||||
archive_data_folder()
|
||||
@ -0,0 +1,24 @@
|
||||
From 9127148e15fc200356df2571c7c9e5a716854c24 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Thu, 26 Jun 2025 09:39:31 +0200
|
||||
Subject: [PATCH] Filesystem: fix issue with Vormetric mounts
|
||||
|
||||
---
|
||||
heartbeat/Filesystem | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
|
||||
index d10e5a714..515a3919d 100755
|
||||
--- a/heartbeat/Filesystem
|
||||
+++ b/heartbeat/Filesystem
|
||||
@@ -361,8 +361,8 @@ list_mounts() {
|
||||
fi
|
||||
done
|
||||
|
||||
- # Convert octal \040 to space characters
|
||||
- printf "$mount_list"
|
||||
+ # Convert octal \040 to space characters and ignore Vormetric mounts
|
||||
+ printf "$mount_list" | grep -v "secfs2$"
|
||||
}
|
||||
|
||||
determine_blockdevice() {
|
||||
@ -6,7 +6,7 @@ diff --color -uNr a/heartbeat/aliyun-vpc-move-ip b/heartbeat/aliyun-vpc-move-ip
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_endpoint_default="vpc.aliyuncs.com"
|
||||
-OCF_RESKEY_aliyuncli_default="detect"
|
||||
+OCF_RESKEY_aliyuncli_default="/usr/lib/fence-agents/support/aliyun/bin/aliyuncli"
|
||||
+OCF_RESKEY_aliyuncli_default="/usr/lib/fence-agents/support/aliyun/aliyun-cli/aliyun"
|
||||
|
||||
|
||||
: ${OCF_RESKEY_address=${OCF_RESKEY_address_default}}
|
||||
|
||||
@ -1,49 +0,0 @@
|
||||
diff --color -uNr a/heartbeat/awseip b/heartbeat/awseip
|
||||
--- a/heartbeat/awseip 2020-12-03 14:31:17.000000000 +0100
|
||||
+++ b/heartbeat/awseip 2021-02-15 16:47:36.624610378 +0100
|
||||
@@ -43,7 +43,7 @@
|
||||
#
|
||||
# Defaults
|
||||
#
|
||||
-OCF_RESKEY_awscli_default="/usr/bin/aws"
|
||||
+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
|
||||
OCF_RESKEY_auth_type_default="key"
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_region_default=""
|
||||
OCF_RESKEY_api_delay_default="3"
|
||||
diff --color -uNr a/heartbeat/awsvip b/heartbeat/awsvip
|
||||
--- a/heartbeat/awsvip 2020-12-03 14:31:17.000000000 +0100
|
||||
+++ b/heartbeat/awsvip 2021-02-15 16:47:48.960632484 +0100
|
||||
@@ -42,7 +42,7 @@
|
||||
#
|
||||
# Defaults
|
||||
#
|
||||
-OCF_RESKEY_awscli_default="/usr/bin/aws"
|
||||
+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
|
||||
OCF_RESKEY_auth_type_default="key"
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_region_default=""
|
||||
diff --color -uNr a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip
|
||||
--- a/heartbeat/aws-vpc-move-ip 2020-12-03 14:31:17.000000000 +0100
|
||||
+++ b/heartbeat/aws-vpc-move-ip 2021-02-15 16:47:55.484644118 +0100
|
||||
@@ -35,7 +35,7 @@
|
||||
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
||||
|
||||
# Defaults
|
||||
-OCF_RESKEY_awscli_default="/usr/bin/aws"
|
||||
+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
|
||||
OCF_RESKEY_auth_type_default="key"
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_region_default=""
|
||||
diff --color -uNr a/heartbeat/aws-vpc-route53.in b/heartbeat/aws-vpc-route53.in
|
||||
--- a/heartbeat/aws-vpc-route53.in 2020-12-03 14:31:17.000000000 +0100
|
||||
+++ b/heartbeat/aws-vpc-route53.in 2021-02-15 16:47:59.808651828 +0100
|
||||
@@ -45,7 +45,7 @@
|
||||
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
||||
|
||||
# Defaults
|
||||
-OCF_RESKEY_awscli_default="/usr/bin/aws"
|
||||
+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws"
|
||||
OCF_RESKEY_auth_type_default="key"
|
||||
OCF_RESKEY_profile_default="default"
|
||||
OCF_RESKEY_region_default=""
|
||||
19
SOURCES/ha-cloud-support-ibm.patch
Normal file
19
SOURCES/ha-cloud-support-ibm.patch
Normal file
@ -0,0 +1,19 @@
|
||||
--- a/heartbeat/powervs-subnet.in 2024-10-18 10:59:30.418142172 +0200
|
||||
+++ b/heartbeat/powervs-subnet.in 2024-10-18 12:30:15.954883160 +0200
|
||||
@@ -33,9 +33,13 @@
|
||||
import textwrap
|
||||
import time
|
||||
|
||||
-import requests
|
||||
-import requests.adapters
|
||||
-import urllib3.util
|
||||
+try:
|
||||
+ sys.path.insert(0, '/usr/lib/fence-agents/support/ibm')
|
||||
+ import requests
|
||||
+ import requests.adapters
|
||||
+ import urllib3.util
|
||||
+except ImportError:
|
||||
+ pass
|
||||
|
||||
OCF_FUNCTIONS_DIR = os.environ.get(
|
||||
"OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT")
|
||||
@ -45,7 +45,7 @@
|
||||
Name: resource-agents
|
||||
Summary: Open Source HA Reusable Cluster Resource Scripts
|
||||
Version: 4.10.0
|
||||
Release: 52%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.4.alma.1
|
||||
Release: 108%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.alma.1
|
||||
License: GPLv2+ and LGPLv2+
|
||||
URL: https://github.com/ClusterLabs/resource-agents
|
||||
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
|
||||
@ -119,17 +119,94 @@ Patch66: RHEL-16247-aws-vpc-move-ip-aws-vpc-route53-awseip-awsvip-auth_type-role
|
||||
Patch67: RHEL-17072-1-storage_mon-findif-leak-unitialized-values-EOS-fixes.patch
|
||||
Patch68: RHEL-17072-2-storage_mon-use-memset-to-fix-covscan-error.patch
|
||||
Patch69: RHEL-15304-2-findif.sh-dont-use-table-parameter.patch
|
||||
# Patches were taken from:
|
||||
# https://gitlab.com/redhat/centos-stream/rpms/resource-agents/-/commit/3f75ae39582fc9b00e6d3b6c4e7a50163763b5bc
|
||||
Patch70: RHEL-16246-aws-agents-use-curl_retry.patch
|
||||
# https://gitlab.com/redhat/centos-stream/rpms/resource-agents/-/commit/580971cd281c07f94afb5cb83a0da7b90a8b642b
|
||||
Patch71: RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch
|
||||
|
||||
Patch70: RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch
|
||||
Patch71: RHEL-16246-aws-agents-use-curl_retry.patch
|
||||
Patch72: RHEL-34777-Filesystem-fail-when-incorrect-device-mounted.patch
|
||||
Patch73: RHEL-24683-1-Filesystem-fail-leading-trailing-whitespace.patch
|
||||
Patch74: RHEL-24683-2-Filesystem-return-success-stop-action.patch
|
||||
Patch75: RHEL-32265-1-findif.sh-fix-corner-cases.patch
|
||||
Patch76: RHEL-32265-2-IPsrcaddr-add-IPv6-support.patch
|
||||
Patch77: RHEL-32265-3-IPaddr2-only-set-metric-value-for-IPv6-when-detected.patch
|
||||
Patch78: RHEL-32265-4-findif.sh-ignore-unreachable-blackhole-prohibit-routes.patch
|
||||
Patch79: RHEL-32265-5-IPsrcaddr-specify-dev-for-default-route.patch
|
||||
Patch80: RHEL-40393-Filesystem-1-dont-kill-unrelated-processes.patch
|
||||
Patch81: RHEL-40393-Filesystem-2-update-bsd-logic.patch
|
||||
Patch82: RHEL-32829-db2-fix-OCF_SUCESS-typo.patch
|
||||
Patch83: RHEL-43579-galera-mysql-redis-remove-Unpromoted-monitor-action.patch
|
||||
Patch84: RHEL-22715-LVM-activate-fix-false-positive.patch
|
||||
Patch85: RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch
|
||||
Patch86: RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch
|
||||
Patch87: RHEL-59172-nfsserver-also-stop-rpc-statd-for-nfsv4_only.patch
|
||||
Patch88: RHEL-58008-podman-force-remove-container-if-necessary.patch
|
||||
Patch89: RHEL-61888-ocf-shellfuncs-only-create-update-reload-systemd-drop-in-if-needed.patch
|
||||
Patch90: RHEL-62200-IPaddr2-improve-fail-logic-check-ip_status-after-adding-IP.patch
|
||||
Patch91: RHEL-40589-azure-events-az-update-API-versions-add-retry-for-metadata.patch
|
||||
Patch92: RHEL-58632-azure-events-use-node-name-from-cluster.patch
|
||||
Patch93: RHEL-42513-1-powervs-subnet-new-ra.patch
|
||||
Patch94: RHEL-66292-1-aws-agents-reuse-imds-token-until-it-expires.patch
|
||||
Patch95: RHEL-66292-2-aws-agents-reuse-imds-token-improvements.patch
|
||||
Patch96: RHEL-68739-awsvip-add-interface-parameter.patch
|
||||
Patch97: RHEL-69734-1-openstack-cinder-volume-wait-for-volume-to-be-available.patch
|
||||
Patch98: RHEL-69734-2-openstack-cinder-volume-fix-detach-not-working-during-start-action.patch
|
||||
Patch99: RHEL-85056-tomcat-fix-CATALINA_PID-not-set-and-parameter-defaults.patch
|
||||
Patch100: RHEL-76038-1-storage-mon-remove-unused-variables.patch
|
||||
Patch101: RHEL-76038-2-storage-mon-fix-daemon-mode-bug-that-caused-delayed-initial-score.patch
|
||||
Patch102: RHEL-76038-3-storage-mon-only-use-underscores-in-functions.patch
|
||||
Patch103: RHEL-76038-4-storage-mon-check-if-daemon-is-already-running.patch
|
||||
Patch104: RHEL-76038-5-storage-mon-log-storage_mon-is-already-running-in-start-action.patch
|
||||
Patch105: RHEL-79819-portblock-fix-version-detection.patch
|
||||
Patch106: RHEL-88035-Filesystem-add-support-for-aznfs.patch
|
||||
Patch107: RHEL-88429-1-podman-etcd-new-ra.patch
|
||||
Patch108: RHEL-88429-2-podman-etcd-remove-unused-actions-from-metadata.patch
|
||||
Patch109: RHEL-88429-3-podman-etcd-fix-listen-peer-urls-binding.patch
|
||||
Patch110: RHEL-70044-IPaddr2-IPsrcaddr-avoid-duplicate-route-issues.patch
|
||||
Patch111: RHEL-7688-IPaddr2-add-link-status-DOWN-LOWERLAYERDOWN-check.patch
|
||||
Patch112: RHEL-97123-Filesystem-fix-issue-with-Vormetric-mounts.patch
|
||||
Patch113: RHEL-102727-ocf-shellfuncs-remove-extra-sleep-from-curl_retry.patch
|
||||
Patch114: RHEL-102610-podman-etcd-add-oom-parameter.patch
|
||||
Patch115: RHEL-42513-2-build-dont-build-powervs-subnet-if-dependencies-are-missing.patch
|
||||
Patch116: RHEL-114489-1-powervs-move-ip-new-ra.patch
|
||||
Patch117: RHEL-114489-2-powervs-move-ip-set-bundled-path.patch
|
||||
Patch118: RHEL-115785-RHEL-115782-1-db2-add-skip_basic_sql_health_check-and-monitor-parameters.patch
|
||||
Patch119: RHEL-113767-podman-etcd-wrap-ipv6-address-in-brackets.patch
|
||||
Patch120: RHEL-113766-podman-etcd-preserve-containers-for-debugging.patch
|
||||
Patch121: RHEL-116206-podman-etcd-add-cluster-wide-force_new_cluster-attribute-check.patch
|
||||
Patch122: RHEL-116151-1-ocf-shellfuncs-add-ocf_promotion_score.patch
|
||||
Patch123: RHEL-116151-2-portblock-add-promotable-support.patch
|
||||
Patch124: RHEL-116151-3-portblock-fixes-add-method-and-status_check-parameters.patch
|
||||
Patch125: RHEL-119495-podman-etcd-add-automatic-learner-member-promotion.patch
|
||||
Patch126: RHEL-118624-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch
|
||||
Patch127: RHEL-123887-podman-etcd-certificate-rotation.patch
|
||||
Patch128: RHEL-123906-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch
|
||||
Patch129: RHEL-115785-RHEL-115782-2-db2-fix-variable-name.patch
|
||||
Patch130: RHEL-118621-MailTo-add-s-nail-support-for-multiple-recipients.patch
|
||||
Patch131: RHEL-64949-oracle-improve-monpassword-description.patch
|
||||
Patch132: RHEL-109485-1-nfsserver-support-non-clustered-kerberized-mounts.patch
|
||||
Patch133: RHEL-109485-2-nfsserver-fix-error-message.patch
|
||||
Patch134: RHEL-114489-3-powervs-move-ip-add-iflabel-parameter.patch
|
||||
Patch135: RHEL-127006-storage_mon-fix-handling-of-4k-block-devices.patch
|
||||
Patch136: RHEL-127891-podman-etcd-exclude-stopping-resources-from-active-count.patch
|
||||
Patch137: RHEL-126087-1-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
|
||||
Patch138: RHEL-121986-Filesystem-speed-up-get-PIDs.patch
|
||||
Patch139: RHEL-130580-1-podman-etcd-prevent-last-active-member-from-leaving.patch
|
||||
Patch140: RHEL-130580-2-podman-etcd-remove-test-code.patch
|
||||
Patch141: RHEL-126087-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch
|
||||
Patch142: RHEL-131185-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch
|
||||
Patch143: RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch
|
||||
Patch144: RHEL-133937-podman-etcd-align-variable-names-with-etcd-3.6-pod-manifest.patch
|
||||
Patch145: RHEL-139519-podman-etcd-verify-no-containers-running-or-being-deleted.patch
|
||||
Patch146: RHEL-42513-powervs-subnet-wait-for-IP.patch
|
||||
Patch147: RHEL-143527-powervs-move-ip-powervs-subnet-fix-error-logging.patch
|
||||
Patch148: RHEL-145628-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch
|
||||
Patch149: RHEL-150700-podman-etcd-set-attributes-if-they-fail-during-force-new-cluster.patch
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
Patch500: ha-cloud-support-aws.patch
|
||||
Patch501: ha-cloud-support-aliyun.patch
|
||||
Patch502: ha-cloud-support-gcloud.patch
|
||||
Patch500: ha-cloud-support-aliyun.patch
|
||||
Patch501: ha-cloud-support-gcloud.patch
|
||||
Patch502: ha-cloud-support-ibm.patch
|
||||
|
||||
# AlmaLinux Patch
|
||||
Patch1000: 1000-ocf-distro-add-AlmaLinux-to-RHEL-based-distro-detection.patch
|
||||
|
||||
Obsoletes: heartbeat-resources <= %{version}
|
||||
Provides: heartbeat-resources = %{version}
|
||||
@ -150,7 +227,7 @@ BuildRequires: python-devel
|
||||
# for pgsqlms
|
||||
BuildRequires: perl-devel perl-English perl-FindBin
|
||||
|
||||
%ifarch x86_64
|
||||
%ifarch x86_64 ppc64le
|
||||
BuildRequires: ha-cloud-support
|
||||
%endif
|
||||
|
||||
@ -178,7 +255,15 @@ Requires: which
|
||||
Requires: /sbin/fsck
|
||||
Requires: /usr/sbin/fsck.ext2 /usr/sbin/fsck.ext3 /usr/sbin/fsck.ext4
|
||||
Requires: /usr/sbin/fsck.xfs
|
||||
%if 0%{?fedora} > 40 || 0%{?rhel} > 9 || 0%{?suse_version}
|
||||
Recommends: /usr/sbin/mount.nfs /usr/sbin/mount.nfs4
|
||||
%else
|
||||
%if 0%{?rhel} > 8
|
||||
Recommends: /sbin/mount.nfs /sbin/mount.nfs4
|
||||
%else
|
||||
Requires: /sbin/mount.nfs /sbin/mount.nfs4
|
||||
%endif
|
||||
%endif
|
||||
%if (0%{?fedora} && 0%{?fedora} < 33) || (0%{?rhel} && 0%{?rhel} < 9) || (0%{?centos} && 0%{?centos} < 9) || 0%{?suse_version}
|
||||
%if (0%{?rhel} && 0%{?rhel} < 8) || (0%{?centos} && 0%{?centos} < 8)
|
||||
Requires: /usr/sbin/mount.cifs
|
||||
@ -194,7 +279,20 @@ Requires: /sbin/ip
|
||||
Requires: /usr/sbin/lvm
|
||||
|
||||
# nfsserver / netfs.sh
|
||||
Requires: /usr/sbin/rpc.nfsd /sbin/rpc.statd /usr/sbin/rpc.mountd
|
||||
%if 0%{?fedora} > 40 || 0%{?rhel} > 9 || 0%{?suse_version}
|
||||
Recommends: /usr/sbin/rpc.statd
|
||||
%else
|
||||
%if 0%{?rhel} > 8
|
||||
Recommends: /sbin/rpc.statd
|
||||
%else
|
||||
Requires: /sbin/rpc.statd
|
||||
%endif
|
||||
%endif
|
||||
%if 0%{?fedora} > 40 || 0%{?rhel} > 8 || 0%{?suse_version}
|
||||
Recommends: /usr/sbin/rpc.nfsd /usr/sbin/rpc.mountd
|
||||
%else
|
||||
Requires: /usr/sbin/rpc.nfsd /usr/sbin/rpc.mountd
|
||||
%endif
|
||||
|
||||
# ocf.py
|
||||
Requires: python3
|
||||
@ -215,12 +313,12 @@ A set of scripts to interface with several services to operate in a
|
||||
High Availability environment for both Pacemaker and rgmanager
|
||||
service managers.
|
||||
|
||||
%ifarch x86_64
|
||||
%ifarch x86_64 ppc64le
|
||||
%package cloud
|
||||
License: GPLv2+ and LGPLv2+
|
||||
Summary: Cloud resource agents
|
||||
Requires: %{name} = %{version}-%{release}
|
||||
Requires: ha-cloud-support
|
||||
Requires: ha-cloud-support >= 4.10.0-63
|
||||
Requires: socat
|
||||
Provides: resource-agents-aliyun
|
||||
Obsoletes: resource-agents-aliyun <= %{version}
|
||||
@ -248,6 +346,9 @@ databases to be managed in a cluster environment.
|
||||
exit 1
|
||||
%endif
|
||||
%setup -q -n %{upstream_prefix}-%{upstream_version}
|
||||
|
||||
# Applying AlmaLinux Patch
|
||||
%patch -P 1000 -p1 -b .1000-ocf-distro-add-AlmaLinux-to-RHEL-based-distro-detection
|
||||
%patch -p1 -P 0 -F1
|
||||
%patch -p1 -P 1
|
||||
%patch -p1 -P 2
|
||||
@ -320,6 +421,84 @@ exit 1
|
||||
%patch -p1 -P 69
|
||||
%patch -p1 -P 70
|
||||
%patch -p1 -P 71
|
||||
%patch -p1 -P 72
|
||||
%patch -p1 -P 73
|
||||
%patch -p1 -P 74
|
||||
%patch -p1 -P 75
|
||||
%patch -p1 -P 76
|
||||
%patch -p1 -P 77
|
||||
%patch -p1 -P 78
|
||||
%patch -p1 -P 79
|
||||
%patch -p1 -P 80
|
||||
%patch -p1 -P 81
|
||||
%patch -p1 -P 82
|
||||
%patch -p1 -P 83
|
||||
%patch -p1 -P 84
|
||||
%patch -p1 -P 85
|
||||
%patch -p1 -P 86
|
||||
%patch -p1 -P 87
|
||||
%patch -p1 -P 88
|
||||
%patch -p1 -P 89
|
||||
%patch -p1 -P 90
|
||||
%patch -p1 -P 91
|
||||
%patch -p1 -P 92
|
||||
%patch -p1 -P 93
|
||||
%patch -p1 -P 94
|
||||
%patch -p1 -P 95
|
||||
%patch -p1 -P 96
|
||||
%patch -p1 -P 97
|
||||
%patch -p1 -P 98
|
||||
%patch -p1 -P 99
|
||||
%patch -p1 -P 100
|
||||
%patch -p1 -P 101
|
||||
%patch -p1 -P 102
|
||||
%patch -p1 -P 103
|
||||
%patch -p1 -P 104
|
||||
%patch -p1 -P 105
|
||||
%patch -p1 -P 106
|
||||
%patch -p1 -P 107 -F1
|
||||
%patch -p1 -P 108
|
||||
%patch -p1 -P 109
|
||||
%patch -p1 -P 110
|
||||
%patch -p1 -P 111
|
||||
%patch -p1 -P 112
|
||||
%patch -p1 -P 113
|
||||
%patch -p1 -P 114
|
||||
%patch -p1 -P 115 -F2
|
||||
%patch -p1 -P 116
|
||||
%patch -p1 -P 117
|
||||
%patch -p1 -P 118
|
||||
%patch -p1 -P 119
|
||||
%patch -p1 -P 120
|
||||
%patch -p1 -P 121
|
||||
%patch -p1 -P 122
|
||||
%patch -p1 -P 123
|
||||
%patch -p1 -P 124
|
||||
%patch -p1 -P 125
|
||||
%patch -p1 -P 126
|
||||
%patch -p1 -P 127
|
||||
%patch -p1 -P 128
|
||||
%patch -p1 -P 129
|
||||
%patch -p1 -P 130
|
||||
%patch -p1 -P 131
|
||||
%patch -p1 -P 132
|
||||
%patch -p1 -P 133
|
||||
%patch -p1 -P 134
|
||||
%patch -p1 -P 135
|
||||
%patch -p1 -P 136
|
||||
%patch -p1 -P 137 -F2
|
||||
%patch -p1 -P 138
|
||||
%patch -p1 -P 139
|
||||
%patch -p1 -P 140
|
||||
%patch -p1 -P 141
|
||||
%patch -p1 -P 142
|
||||
%patch -p1 -P 143
|
||||
%patch -p1 -P 144
|
||||
%patch -p1 -P 145
|
||||
%patch -p1 -P 146
|
||||
%patch -p1 -P 147
|
||||
%patch -p1 -P 148
|
||||
%patch -p1 -P 149
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
%patch -p1 -P 500
|
||||
@ -361,6 +540,9 @@ export CFLAGS
|
||||
%endif
|
||||
%ifarch x86_64
|
||||
PYTHONPATH="%{_usr}/lib/fence-agents/support/google" \
|
||||
%endif
|
||||
%ifarch ppc64le
|
||||
PYTHONPATH="%{_usr}/lib/fence-agents/support/ibm" \
|
||||
%endif
|
||||
%{conf_opt_fatal} \
|
||||
%if %{defined _unitdir}
|
||||
@ -455,6 +637,8 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%exclude %{_mandir}/man7/*aliyun-vpc-move-ip*
|
||||
%exclude /usr/lib/ocf/resource.d/heartbeat/gcp*
|
||||
%exclude %{_mandir}/man7/*gcp*
|
||||
%exclude /usr/lib/ocf/resource.d/heartbeat/powervs-*
|
||||
%exclude %{_mandir}/man7/*powervs-*
|
||||
%exclude /usr/lib/ocf/resource.d/heartbeat/pgsqlms
|
||||
%exclude %{_mandir}/man7/*pgsqlms*
|
||||
%exclude %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
|
||||
@ -618,8 +802,9 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%{_libexecdir}/heartbeat
|
||||
%endif
|
||||
|
||||
%ifarch x86_64
|
||||
%ifarch x86_64 ppc64le
|
||||
%files cloud
|
||||
%ifarch x86_64
|
||||
/usr/lib/ocf/resource.d/heartbeat/aliyun-*
|
||||
%{_mandir}/man7/*aliyun-*
|
||||
/usr/lib/ocf/resource.d/heartbeat/aws*
|
||||
@ -631,6 +816,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%exclude /usr/lib/ocf/resource.d/heartbeat/gcp-vpc-move-ip
|
||||
%exclude %{_mandir}/man7/*gcp-vpc-move-ip*
|
||||
%endif
|
||||
%ifarch ppc64le
|
||||
/usr/lib/ocf/resource.d/heartbeat/powervs-*
|
||||
%{_mandir}/man7/*powervs-*
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%files paf
|
||||
%doc paf_README.md
|
||||
@ -641,9 +831,280 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
|
||||
|
||||
%changelog
|
||||
* Thu Jun 13 2024 Eduard Abdullin <eabdullin@redhat.com> - 4.10.0-52.4.alma.1
|
||||
- galera: allow joiner to report non-Primary during initial IST
|
||||
- ocf-shellfuncs: add curl_retry()
|
||||
* Tue May 26 2026 Andrew Lukoshko <alukoshko@almalinux.org> - 4.10.0-108.alma.1
|
||||
- Add AlmaLinux to RHEL-based distro detection in ocf-distro
|
||||
|
||||
* Wed Feb 18 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-108
|
||||
- podman-etcd: set attributes if they fail during force-new-cluster
|
||||
|
||||
Resolves: RHEL-150700
|
||||
|
||||
* Wed Feb 4 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-107
|
||||
- podman-etcd: enhance etcd data backup with snapshots and retention
|
||||
|
||||
Resolves: RHEL-145628
|
||||
|
||||
* Thu Jan 22 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-106
|
||||
- powervs-move-ip/powervs-subnet: fix error logging
|
||||
|
||||
Resolves: RHEL-143527
|
||||
|
||||
* Wed Jan 14 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-105
|
||||
- powervs-subnet: new resource agent
|
||||
|
||||
Resolves: RHEL-42513
|
||||
|
||||
* Thu Jan 8 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-104
|
||||
- podman-etcd: verify that no static pod containers are running or
|
||||
being deleted before starting
|
||||
|
||||
Resolves: RHEL-139519
|
||||
|
||||
* Mon Dec 8 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-103
|
||||
- podman-etcd: align environment variable names with Etcd v3.6 Pod
|
||||
manifest
|
||||
|
||||
Resolves: RHEL-133937
|
||||
|
||||
* Tue Dec 2 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-102
|
||||
- podman-etcd: prevent retries on fatal errors
|
||||
|
||||
Resolves: RHEL-132052
|
||||
|
||||
* Thu Nov 27 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-101
|
||||
- podman-etcd: prevent learner from starting before cluster is ready
|
||||
|
||||
Resolves: RHEL-131185
|
||||
|
||||
* Tue Nov 25 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-100
|
||||
- podman-etcd: add container crash detection with coordinated recovery
|
||||
|
||||
Resolves: RHEL-126087
|
||||
|
||||
* Mon Nov 24 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-99
|
||||
- podman-etcd: prevent last active member from leaving the etcd member
|
||||
list
|
||||
|
||||
Resolves: RHEL-130580
|
||||
|
||||
* Thu Nov 20 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-98
|
||||
- Filesystem: speed up get PIDs
|
||||
|
||||
Resolves: RHEL-121986
|
||||
|
||||
* Thu Nov 13 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-97
|
||||
- podman-etcd: exclude stopping resources from active count
|
||||
|
||||
Resolves: RHEL-127891
|
||||
|
||||
* Mon Nov 10 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-96
|
||||
- storage_mon: fix handling of 4k block devices
|
||||
|
||||
Resolves: RHEL-127006
|
||||
|
||||
* Mon Nov 3 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-95
|
||||
- powervs-move-ip: new resource agent
|
||||
|
||||
Resolves: RHEL-114489
|
||||
|
||||
* Fri Oct 31 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-94
|
||||
- nfsserver: add ability to set e.g. "pipefs-directory=/run/nfs/rpc_pipefs"
|
||||
in /etc/nfs.conf to avoid issues with non-clustered Kerberized mounts
|
||||
|
||||
Resolves: RHEL-109485
|
||||
|
||||
* Wed Oct 29 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-92
|
||||
- MailTo: add s-nail support for multiple recipients
|
||||
- oracle: improve monpassword description
|
||||
|
||||
Resolves: RHEL-118621, RHEL-64949
|
||||
|
||||
* Wed Oct 29 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-91
|
||||
- db2: add "skip_basic_sql_health_check" parameter to avoid failing on
|
||||
systems with high load
|
||||
- db2: add "monitor_retries", "monitor_sleep", and "monitor_retry_all_errors"
|
||||
parameters to be able to avoid failing on first try
|
||||
|
||||
Resolves: RHEL-115785, RHEL-115782
|
||||
|
||||
* Tue Oct 28 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-90
|
||||
- podman-etcd: add support for cert rotation
|
||||
- podman-etcd: compute dynamic revision bump from maxRaftIndex
|
||||
|
||||
Resolves: RHEL-123887, RHEL-123906
|
||||
|
||||
* Wed Oct 22 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-89
|
||||
- portblock: add promotable support, and method and status_check
|
||||
parameters
|
||||
- db2: use reintegration flag to avoid race condition on cluster
|
||||
reintegration
|
||||
|
||||
Resolves: RHEL-116151, RHEL-118624
|
||||
|
||||
* Thu Oct 9 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-88
|
||||
- podman-etcd: add automatic learner member promotion
|
||||
|
||||
Resolves: RHEL-119495
|
||||
|
||||
* Wed Oct 8 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-87
|
||||
- build: make nfs-utils a weak dependency
|
||||
|
||||
Resolves: RHEL-116100
|
||||
|
||||
* Mon Sep 22 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-85
|
||||
- podman-etcd: wrap ipv6 address in brackets
|
||||
- podman-etcd: preserve containers for debugging
|
||||
- podman-etcd: add cluster-wide force_new_cluster attribute check
|
||||
|
||||
Resolves: RHEL-113767, RHEL-113766, RHEL-116206
|
||||
|
||||
* Tue Sep 9 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-81
|
||||
- podman-etcd: add oom parameter to be able to tune the Out-Of-Memory (OOM)
|
||||
score for etcd containers
|
||||
|
||||
Resolves: RHEL-102610
|
||||
|
||||
* Tue Jul 15 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-80
|
||||
- ocf-shellfuncs/AWS agents: don't sleep after the final try in
|
||||
curl_retry()
|
||||
|
||||
Resolves: RHEL-102727
|
||||
|
||||
* Thu Jul 3 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-78
|
||||
- Filesystem: fix issue with Vormetric mounts
|
||||
|
||||
Resolves: RHEL-97123
|
||||
|
||||
* Tue Jun 17 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-77
|
||||
- IPaddr2/IPsrcaddr: fix to avoid duplicate route issues
|
||||
- IPaddr2: add link status DOWN/LOWERLAYERDOWN check
|
||||
|
||||
Resolves: RHEL-70044, RHEL-7688
|
||||
|
||||
* Tue May 20 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-76
|
||||
- podman-etcd: new resource agent
|
||||
|
||||
Resolves: RHEL-88429
|
||||
|
||||
* Tue Apr 22 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-74
|
||||
- Filesystem: add support for aznfs
|
||||
|
||||
Resolves: RHEL-88035
|
||||
|
||||
* Wed Apr 9 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-73
|
||||
- storage-mon: fix daemon mode bug that caused delayed initial score
|
||||
- portblock: fix iptables version detection
|
||||
|
||||
Resolves: RHEL-76038, RHEL-79819
|
||||
|
||||
* Tue Apr 1 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-72
|
||||
- tomcat: fix CATALINA_PID not set, and catalina_base and catalina_out
|
||||
parameter defaults
|
||||
|
||||
Resolves: RHEL-85056
|
||||
|
||||
* Fri Jan 10 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-71
|
||||
- openstack-cinder-volume: wait for volume to be available
|
||||
|
||||
Resolves: RHEL-69734
|
||||
|
||||
* Wed Nov 27 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-69
|
||||
- AWS agents: reuse IMDS token until it expires
|
||||
- awsvip: add interface parameter
|
||||
|
||||
Resolves: RHEL-66292
|
||||
Resolves: RHEL-68739
|
||||
|
||||
* Wed Oct 23 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-68
|
||||
- powervs-subnet: new resource agent
|
||||
|
||||
Resolves: RHEL-42513
|
||||
|
||||
* Mon Oct 14 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-67
|
||||
- ocf-shellfuncs: only create/update and reload systemd drop-in if
|
||||
needed
|
||||
- IPaddr2: improve fail logic and check ip_status after adding IP
|
||||
- azure-events-az: update API versions, and add retry functionality
|
||||
for metadata requests
|
||||
- azure-events*: use node name from cluster instead of hostname to
|
||||
avoid failing if they're not the same
|
||||
|
||||
Resolves: RHEL-61888
|
||||
Resolves: RHEL-62200
|
||||
Resolves: RHEL-40589
|
||||
Resolves: RHEL-58632
|
||||
|
||||
* Wed Oct 2 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-66
|
||||
- nfsserver: also stop rpc-statd for nfsv4_only to avoid stop failing
|
||||
in some cases
|
||||
- podman: force-remove containers in stopping state if necessary
|
||||
|
||||
Resolves: RHEL-59172
|
||||
Resolves: RHEL-58008
|
||||
|
||||
* Wed Sep 25 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-65
|
||||
- Filesystem: dont sleep during stop-action when there are no
|
||||
processes to kill, and only use force argument for network
|
||||
filesystems after sending kill_signals
|
||||
- Filesystem: try umount first during stop-action, and avoid potential
|
||||
"Argument list too long" for force_unmount=safe
|
||||
- AWS agents: use awscli2
|
||||
|
||||
Resolves: RHEL-58038
|
||||
Resolves: RHEL-59576
|
||||
Resolves: RHEL-46233
|
||||
|
||||
* Thu Aug 29 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-64
|
||||
- IPsrcaddr: add IPv6 support
|
||||
|
||||
Resolves: RHEL-32265
|
||||
|
||||
* Tue Aug 13 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-61
|
||||
- LVM-activate: fail when both "system_id_source" and "volume_list"
|
||||
are set in lvm.conf to avoid false positive activation of the VG
|
||||
|
||||
Resolves: RHEL-22715
|
||||
|
||||
* Fri Jun 28 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-60
|
||||
- galera/mysql/redis: remove Unpromoted monitor-action
|
||||
|
||||
Resolves: RHEL-43579
|
||||
|
||||
* Tue Jun 25 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-59
|
||||
- Filesystem: fail when leading or trailing whitespace is present in
|
||||
device or directory parameters
|
||||
|
||||
Resolves: RHEL-24683
|
||||
|
||||
* Tue Jun 11 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-58
|
||||
- Filesystem: dont kill unrelated processes during stop-action
|
||||
- db2: fix OCF_SUCESS typo
|
||||
|
||||
Resolves: RHEL-40393
|
||||
Resolves: RHEL-32829
|
||||
|
||||
* Wed May 15 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-57
|
||||
- Filesystem: fail when incorrect device mounted on mountpoint, and
|
||||
don't unmount the mountpoint in this case, or if mountpoint is set to "/"
|
||||
|
||||
Resolves: RHEL-34777
|
||||
|
||||
* Tue Apr 30 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-56
|
||||
- AWS agents: retry failed metadata requests to avoid instantly
|
||||
failing when there is a hiccup in the network or metadata service
|
||||
|
||||
Resolves: RHEL-16246
|
||||
|
||||
* Wed Apr 10 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-54
|
||||
- galera: fix issue where joiner promotion fails if the node reports
|
||||
being in non-primary state
|
||||
|
||||
Resolves: RHEL-31763
|
||||
|
||||
* Wed Mar 6 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-53
|
||||
- aliyun-vpc-move-ip: use new aliyun-cli
|
||||
|
||||
Resolves: RHEL-26666
|
||||
|
||||
* Thu Feb 8 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-52
|
||||
- findif.sh: fix loopback IP handling
|
||||
|
||||
Loading…
Reference in New Issue
Block a user