diff --git a/SOURCES/RHEL-102610-podman-etcd-add-oom-parameter.patch b/SOURCES/RHEL-102610-podman-etcd-add-oom-parameter.patch new file mode 100644 index 0000000..f72576a --- /dev/null +++ b/SOURCES/RHEL-102610-podman-etcd-add-oom-parameter.patch @@ -0,0 +1,85 @@ +From d08a7f74427ea2cf7d355a0f7f6d8f583e2d0cba Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Thu, 3 Jul 2025 12:22:12 +0200 +Subject: [PATCH] OCPBUGS-58324: podman-etcd Add OOM score adjustment for etcd + containers + +This change introduces a new `oom` parameter to the `podman-etcd` OCF +agent. This allows tuning the Out-Of-Memory (OOM) score adjustment for +the etcd container. + +The `oom` parameter accepts integer values from -1000 to 1000, +defaulting to -997 (system-node-critical equivalent). + +see https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#node-out-of-memory-behavior + +Key changes: +- Added `OCF_RESKEY_oom` parameter to agent definition (`content type="integer"`). +- Integrated `--oom-score-adj` option into `podman_start()`. +- Implemented input validation for `oom` in `podman_validate()`, + ensuring values are within the [-1000:1000] range. 
+--- + heartbeat/podman-etcd | 22 +++++++++++++++++++++- + 1 file changed, 21 insertions(+), 1 deletion(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 6762112ec..884b7c579 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -45,6 +45,7 @@ OCF_RESKEY_nic_default="br-ex" + OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json" + OCF_RESKEY_allow_pull_default="1" + OCF_RESKEY_reuse_default="0" ++OCF_RESKEY_oom_default="-997" + + : ${OCF_RESKEY_image=${OCF_RESKEY_image_default}} + : ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}} +@@ -53,6 +54,7 @@ OCF_RESKEY_reuse_default="0" + : ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}} + : ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}} + : ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}} ++: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}} + + ####################################################################### + +@@ -230,6 +232,16 @@ to stop the container before pacemaker. + drop-in dependency + + ++ ++ ++ ++Tune the host's Out-Of-Memory (OOM) preferences for containers (accepts values from -1000 to 1000). ++Default to same OOM score as system-node-critical ++https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#node-out-of-memory-behavior ++ ++OOM for container ++ ++ + + + +@@ -1226,7 +1238,10 @@ podman_start() + fi + + podman_create_mounts +- local run_opts="-d --name=${CONTAINER}" ++ local run_opts="--detach --name=${CONTAINER}" ++ ++ run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}" ++ + # check to see if the container has already started + podman_simple_status + if [ $? 
-eq $OCF_SUCCESS ]; then +@@ -1513,6 +1528,11 @@ podman_validate() + exit $OCF_ERR_CONFIGURED + fi + ++ if [ "$OCF_RESKEY_oom" -lt -1000 ] || [ "$OCF_RESKEY_oom" -gt 1000 ]; then ++ ocf_exit_reason "'oom' value ${OCF_RESKEY_oom} is out of range [-1000:1000]" ++ exit $OCF_ERR_CONFIGURED ++ fi ++ + return $OCF_SUCCESS + } + diff --git a/SOURCES/RHEL-109485-1-nfsserver-support-non-clustered-kerberized-mounts.patch b/SOURCES/RHEL-109485-1-nfsserver-support-non-clustered-kerberized-mounts.patch new file mode 100644 index 0000000..a88052d --- /dev/null +++ b/SOURCES/RHEL-109485-1-nfsserver-support-non-clustered-kerberized-mounts.patch @@ -0,0 +1,92 @@ +From a4fd26a37b20e86e7c188b45d40e31d240f3decf Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 14 Aug 2025 09:33:17 +0200 +Subject: [PATCH] nfsserver: add ability to set e.g. + "pipefs-directory=/run/nfs/rpc_pipefs" in /etc/nfs.conf to avoid issues with + non-clustered Kerberized mounts + +--- + heartbeat/nfsserver | 28 +++++++++++++++++----------- + 1 file changed, 17 insertions(+), 11 deletions(-) + +diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver +index 5b02924a9..83f4bac51 100755 +--- a/heartbeat/nfsserver ++++ b/heartbeat/nfsserver +@@ -264,7 +264,7 @@ set_exec_mode() + ## + # If the user defined an init script, It must exist for us to continue + ## +- if [ -n "$OCF_RESKEY_nfs_init_script" ]; then ++ if [ $systemd_running -ne 0 ] && [ -n "$OCF_RESKEY_nfs_init_script" ]; then + # check_binary will exit the process if init script does not exist + check_binary ${OCF_RESKEY_nfs_init_script} + EXEC_MODE=1 +@@ -274,7 +274,7 @@ set_exec_mode() + ## + # Check to see if the default init script exists, if so we'll use that. 
+ ## +- if which $DEFAULT_INIT_SCRIPT > /dev/null 2>&1; then ++ if [ $systemd_running -ne 0 ] && which $DEFAULT_INIT_SCRIPT > /dev/null 2>&1; then + OCF_RESKEY_nfs_init_script=$DEFAULT_INIT_SCRIPT + EXEC_MODE=1 + return 0 +@@ -780,7 +780,7 @@ nfsserver_start () + # the uts namespace is useless in that case. + # If systemd is running, mangle the nfs-server.service unit, + # independent of the "EXEC_MODE" we detected. +- if $systemd_is_running ; then ++ if [ $systemd_running -eq 0 ]; then + if [ -z "$OCF_RESKEY_nfs_server_scope" ] ; then + remove_unshare_uts_dropins + else +@@ -789,7 +789,9 @@ nfsserver_start () + fi + + if ! `mount | grep -q " on $OCF_RESKEY_rpcpipefs_dir "`; then +- mount -t rpc_pipefs sunrpc $OCF_RESKEY_rpcpipefs_dir ++ if [ $systemd_running -ne 0 ] || { [ $systemd_running -eq 0 ] && systemctl -q is-enabled var-lib-nfs-rpc_pipefs.mount ;}; then ++ mount -t rpc_pipefs sunrpc $OCF_RESKEY_rpcpipefs_dir ++ fi + fi + + # remove the sm-notify pid so sm-notify will be allowed to run again without requiring a reboot. +@@ -1003,11 +1005,15 @@ nfsserver_stop () + fi + fi + +- # systemd +- case $EXEC_MODE in +- [23]) nfs_exec stop rpc-gssd > /dev/null 2>&1 +- ocf_log info "Stop: rpc-gssd" +- esac ++ ++ if mount | grep -q " on $OCF_RESKEY_rpcpipefs_dir "; then ++ # systemd ++ case $EXEC_MODE in ++ [23]) ++ nfs_exec stop rpc-gssd > /dev/null 2>&1 ++ ocf_log info "Stop: rpc-gssd" ++ esac ++ fi + + unbind_tree + rc=$? +@@ -1017,7 +1023,7 @@ nfsserver_stop () + ocf_log info "NFS server stopped" + fi + +- if $systemd_is_running; then ++ if [ $systemd_running -eq 0 ]; then + remove_unshare_uts_dropins + fi + +@@ -1057,7 +1063,7 @@ nfsserver_validate () + } + + nfsserver_validate +-systemd_is_running && systemd_is_running=true || systemd_is_running=false ++systemd_is_running; systemd_running=$? 
+ + case $__OCF_ACTION in + start) nfsserver_start diff --git a/SOURCES/RHEL-109485-2-nfsserver-fix-error-message.patch b/SOURCES/RHEL-109485-2-nfsserver-fix-error-message.patch new file mode 100644 index 0000000..277c1a6 --- /dev/null +++ b/SOURCES/RHEL-109485-2-nfsserver-fix-error-message.patch @@ -0,0 +1,24 @@ +From 72620db5b52c943358faaf77ce5a15fb41169fab Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Fri, 31 Oct 2025 11:22:46 +0100 +Subject: [PATCH] nfsserver: set systemd_running before nfsserver_validate() to + avoid error message + +--- + heartbeat/nfsserver | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver +index 83f4bac51..71a711305 100755 +--- a/heartbeat/nfsserver ++++ b/heartbeat/nfsserver +@@ -1062,8 +1062,8 @@ nfsserver_validate () + return $OCF_SUCCESS + } + +-nfsserver_validate + systemd_is_running; systemd_running=$? ++nfsserver_validate + + case $__OCF_ACTION in + start) nfsserver_start diff --git a/SOURCES/RHEL-113766-podman-etcd-preserve-containers-for-debugging.patch b/SOURCES/RHEL-113766-podman-etcd-preserve-containers-for-debugging.patch new file mode 100644 index 0000000..ae4fd06 --- /dev/null +++ b/SOURCES/RHEL-113766-podman-etcd-preserve-containers-for-debugging.patch @@ -0,0 +1,686 @@ +From 6e9200dc2ffc89382188794742361985309936b2 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Wed, 23 Jul 2025 09:34:13 +0200 +Subject: [PATCH] podman-etcd: preserve containers for debugging + +This change modifies the agent to keep stopped containers for log +inspection and debugging, with supporting changes to enable this +behavior. 
+ +* Conditionally reuse existing containers when configuration unchanged +* Move etcd inline configuration flags to external file to allow + restarts without container recreation (mainly for the + force-new-cluster flag) +* Archive previous container by renaming it to *-previous, and its + configuration files into /var/lib/etcd/config-previous.tar.gz archive. + The tar.gz archive consists of: + * the pod manifest created by CEO, used to generate the etcd + configuration file + * the etcd configuration file + * the auth json file + Only one copy is maintained to limit disk usage. +* Both configuration and backup file locations are configurable with 2 + new input arguments. + +Signed-off-by: Carlo Lobrano +--- + heartbeat/podman-etcd | 438 ++++++++++++++++++++++++++++++++---------- + 1 file changed, 336 insertions(+), 102 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 4969fbaaf..33804414a 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -46,6 +46,8 @@ OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json" + OCF_RESKEY_allow_pull_default="1" + OCF_RESKEY_reuse_default="0" + OCF_RESKEY_oom_default="-997" ++OCF_RESKEY_config_location_default="/var/lib/etcd" ++OCF_RESKEY_backup_location_default="/var/lib/etcd" + + : ${OCF_RESKEY_image=${OCF_RESKEY_image_default}} + : ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}} +@@ -55,6 +57,9 @@ OCF_RESKEY_oom_default="-997" + : ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}} + : ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}} + : ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}} ++: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}} ++: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}} ++ + + ####################################################################### + +@@ -242,6 +247,23 @@ https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/# + OOM for container + + ++ ++ ++ ++The directory where the 
resource agent stores its state files, such as the generated etcd configuration and a copy of the pod manifest. ++ ++Resource agent state directory ++ ++ ++ ++ ++ ++The directory where the resource agent stores its backups. ++ ++Resource agent backup directory ++ ++ ++ + + + +@@ -309,42 +331,52 @@ container_exists() + return 1 + } + +-remove_container() ++# archive_current_container archives the current ++# podman etcd container and its configuration files. ++archive_current_container() + { +- local rc +- local execids ++ # don't attempt to archive a container that doesn't exist ++ if ! container_exists; then ++ return ++ fi + +- if ocf_is_true "$OCF_RESKEY_reuse"; then +- # never remove the container if we have reuse enabled. +- return 0 ++ # delete any container named "*-previous", or we won't be able to archive the current container. ++ if podman inspect "${CONTAINER}-previous" >/dev/null 2>&1; then ++ ocf_log info "removing old archived container '$CONTAINER-previous'" ++ if ! ocf_run podman rm --volumes --force "$CONTAINER-previous"; then ++ ocf_log warn "could not remove old archived container (podman rm failed, error code: $?). Won't be able to archive current container" ++ return ++ fi + fi + +- if ! container_exists; then +- # don't attempt to remove a container that doesn't exist +- return 0 ++ ocf_log info "archiving '$CONTAINER' container as '$CONTAINER-previous' for debugging purposes" ++ if ! ocf_run podman rename "$CONTAINER" "$CONTAINER-previous"; then ++ ocf_log err "could not archive container '$CONTAINER', error code: $?" ++ return + fi +- ocf_log notice "Cleaning up inactive container, ${CONTAINER}." +- ocf_run podman rm -v "$CONTAINER" +- rc=$? +- if [ $rc -ne 0 ]; then +- if [ $rc -eq 2 ]; then +- if podman inspect --format '{{.State.Status}}' "$CONTAINER" | grep -wq "stopping"; then +- ocf_log err "Inactive container ${CONTAINER} is stuck in 'stopping' state. Force-remove it." +- ocf_run podman rm -f "$CONTAINER" +- rc=$? 
+- fi +- fi +- # due to a podman bug (rhbz#1841485), sometimes a stopped +- # container can still be associated with Exec sessions, in +- # which case the "podman rm" has to be forced +- execids=$(podman inspect "$CONTAINER" --format '{{len .ExecIDs}}') +- if [ "$execids" -ne "0" ]; then +- ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-remove it." +- ocf_run podman rm -f "$CONTAINER" +- rc=$? ++ ++ # archive corresponding etcd configuration files ++ local files_to_archive="" ++ for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do ++ if [ -f "$file" ]; then ++ files_to_archive="$files_to_archive $file" ++ else ++ ocf_log warn "file '$file' is missing and won't be archived" + fi ++ done ++ ++ if [ -z "$files_to_archive" ]; then ++ ocf_log warn "could not find any file to archive." ++ return ++ fi ++ ++ # NOTE: tar will override any existing archive as wanted ++ # shellcheck disable=SC2086 ++ if ! ocf_run tar --create --verbose --gzip --file "$ETCD_BACKUP_FILE" $files_to_archive; then ++ ocf_log warn "container archived successfully, but configuration backup failed (error: $?). Container debugging available, but without matching configuration files" ++ else ++ ocf_log info "container configuration also archived in '$ETCD_BACKUP_FILE'" + fi +- return $rc + } + + # Correctly wraps an ipv6 in [] for url otherwise use return normal ipv4 address. 
+@@ -365,6 +397,7 @@ attribute_node_ip() + local attribute="node_ip" + local ip_addr name + ++ # TODO: We can retrieve both the local and peer IP addresses from this map, which eliminates the need to use CIB to share them between nodes + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do + name=$(echo "$node" | cut -d: -f1) + # ignore other nodes +@@ -375,7 +408,7 @@ attribute_node_ip() + done + + if [ -z "$ip_addr" ]; then +- ocf_log err "ip address was empty when querying (getent ahosts) for hostname: $(hostname -f)" ++ ocf_log err "could not get local ip address from node_ip_map: '$OCF_RESKEY_node_ip_map'" + return 1 + fi + +@@ -384,9 +417,9 @@ attribute_node_ip() + echo "$ip_addr" + ;; + update) +- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then ++ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$ip_addr"; then + rc="$?" +- ocf_log err "could not set $attribute to $value, error code: $rc" ++ ocf_log err "could not set $attribute to $ip_addr, error code: $rc" + return "$rc" + fi + ;; +@@ -428,6 +461,48 @@ get_env_from_manifest() { + echo "$env_var_value" + } + ++# etcd configuration file expects duration to be expressed in nanoseconds ++convert_duration_in_nanoseconds() { ++ local duration=$1 ++ local value unit nanoseconds ++ ++ if [ -z "$duration" ]; then ++ ocf_log err "convert_duration_in_nanoseconds: no duration provided" ++ return 1 ++ fi ++ ++ if ! echo "$duration" | grep -qE '^[0-9]+[numµ]?s$'; then ++ ocf_log err "convert_duration_in_nanoseconds: invalid duration format \"$duration\". 
Expected format: where unit is one of s, ms, us, µs, ns" ++ return 1 ++ fi ++ ++ # Extract numeric value and unit from duration string ++ value=$(echo "$duration" | sed 's/[^0-9]*$//') ++ unit=$(echo "$duration" | sed 's/^[0-9]*//') ++ ++ case "$unit" in ++ ns) ++ nanoseconds=$value ++ ;; ++ us|µs) ++ nanoseconds=$((value * 1000)) ++ ;; ++ ms) ++ nanoseconds=$((value * 1000000)) ++ ;; ++ s) ++ nanoseconds=$((value * 1000000000)) ++ ;; ++ *) ++ # this should not happen as the input is already validated ++ ocf_log err "convert_duration_in_nanoseconds: unknown duration unit \"$unit\"" ++ return 1 ++ ;; ++ esac ++ ++ echo "$nanoseconds" ++} ++ + prepare_env() { + local name ip ipurl standalone_node + +@@ -457,9 +532,14 @@ prepare_env() { + ETCDCTL_API=$(get_env_from_manifest "ETCDCTL_API") + ETCD_CIPHER_SUITES=$(get_env_from_manifest "ETCD_CIPHER_SUITES") + ETCD_DATA_DIR=$(get_env_from_manifest "ETCD_DATA_DIR") ++ if [ ! -d "$ETCD_DATA_DIR" ]; then ++ ocf_log err "could not find data-dir at path \"$ETCD_DATA_DIR\"" ++ return "$OCF_ERR_ARGS" ++ else ++ ocf_log info "using data-dir: $ETCD_DATA_DIR" ++ fi + ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT") + ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF") +- ETCD_EXPERIMENTAL_MAX_LEARNERS=$(get_env_from_manifest "ETCD_EXPERIMENTAL_MAX_LEARNERS") + ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") + ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") + ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL") +@@ -475,6 +555,62 @@ prepare_env() { + LISTEN_METRICS_URLS="0.0.0.0" + } + ++ ++generate_etcd_configuration() { ++ if is_force_new_cluster; then ++ # The embedded newline is required for correct YAML formatting. 
++ FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true ++force-new-cluster-bump-amount: 1000000000" ++ else ++ FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false" ++ fi ++ ++ cat > "$ETCD_CONFIGURATION_FILE" << EOF ++logger: zap ++log-level: info ++snapshot-count: 10000 ++name: $NODENAME ++data-dir: $ETCD_DATA_DIR ++$FORCE_NEW_CLUSTER_CONFIG ++socket-reuse-address: $ETCD_SOCKET_REUSE_ADDRESS ++election-timeout: $ETCD_ELECTION_TIMEOUT ++enable-pprof: $ETCD_ENABLE_PPROF ++heartbeat-interval: $ETCD_HEARTBEAT_INTERVAL ++quota-backend-bytes: $ETCD_QUOTA_BACKEND_BYTES ++initial-advertise-peer-urls: "$NODEIPURL:2380" ++listen-peer-urls: "$(ip_url ${LISTEN_PEER_URLS}):2380" ++listen-client-urls: "$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0" ++initial-cluster: $ETCD_INITIAL_CLUSTER ++initial-cluster-state: $ETCD_INITIAL_CLUSTER_STATE ++client-transport-security: ++ cert-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt ++ key-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key ++ client-cert-auth: true ++ trusted-ca-file: $SERVER_CACERT ++peer-transport-security: ++ cert-file: $ETCD_PEER_CERT ++ key-file: $ETCD_PEER_KEY ++ client-cert-auth: true ++ trusted-ca-file: $SERVER_CACERT ++advertise-client-urls: "$NODEIPURL:2379" ++listen-metrics-urls: "$(ip_url ${LISTEN_METRICS_URLS}):9978" ++metrics: extensive ++experimental-initial-corrupt-check: true ++experimental-max-learners: 1 ++experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") ++experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") ++EOF ++ ++ { ++ if [ -n "$ETCD_CIPHER_SUITES" ]; then ++ echo "cipher-suites:" ++ echo "$ETCD_CIPHER_SUITES" | tr ',' '\n' | while read -r cipher; do ++ echo " - \"$cipher\"" ++ done ++ fi ++ } >> "$ETCD_CONFIGURATION_FILE" ++} ++ + archive_data_folder() + { + # 
TODO: use etcd snapshots +@@ -634,7 +770,7 @@ add_member_as_learner() + local endpoint_url=$(ip_url $(attribute_node_ip get)) + local peer_url=$(ip_url $member_ip) + +- ocf_log info "add $member_name ($member_ip) to the member list as learner" ++ ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner" + out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner) + rc=$? + if [ $rc -ne 0 ]; then +@@ -1104,18 +1240,18 @@ compare_revision() + peer_revision=$(attribute_node_revision_peer) + + if [ "$revision" = "" ] || [ "$revision" = "null" ] || [ "$peer_revision" = "" ] || [ "$peer_revision" = "null" ]; then +- ocf_log err "could not compare revisions: $NODENAME local revision: $revision, peer revision: $peer_revision" ++ ocf_log err "could not compare revisions: '$NODENAME' local revision='$revision', peer revision='$peer_revision'" + return "$OCF_ERR_GENERIC" + fi + + if [ "$revision" -gt "$peer_revision" ]; then +- ocf_log info "$NODENAME revision: $revision is newer than peer revision: $peer_revision" ++ ocf_log info "$NODENAME revision: '$revision' is newer than peer revision: '$peer_revision'" + echo "newer" + elif [ "$revision" -eq "$peer_revision" ]; then +- ocf_log info "$NODENAME revision: $revision is equal to peer revision: $peer_revision" ++ ocf_log info "$NODENAME revision: '$revision' is equal to peer revision: '$peer_revision'" + echo "equal" + else +- ocf_log info "$NODENAME revision: $revision is older than peer revision: $peer_revision" ++ ocf_log info "$NODENAME revision: '$revision' is older than peer revision: '$peer_revision'" + echo "older" + fi + return "$OCF_SUCCESS" +@@ -1144,6 +1280,100 @@ ensure_pod_manifest_exists() + return "$OCF_SUCCESS" + } + ++filter_pod_manifest() { ++ # Remove pod-version related fields from POD manifest ++ local pod_manifest="$1" ++ local temporary_file ++ local 
jq_filter='del(.metadata.labels.revision) | .spec.containers[] |= ( .env |= map(select( .name != "ETCD_STATIC_POD_VERSION" ))) | .spec.volumes |= map( select( .name != "resource-dir" ))' ++ ++ if ! temporary_file=$(mktemp); then ++ ocf_log err "could not create temporary file for '$pod_manifest', error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ if ! jq "$jq_filter" "$pod_manifest" > "$temporary_file"; then ++ ocf_log err "could not remove pod version related data from '$pod_manifest', error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ echo "$temporary_file" ++} ++ ++can_reuse_container() { ++ # Decide whether to reuse the existing container or create a new one based on etcd pod manifest changes. ++ # NOTE: explicitly ignore POD version and POD version related data, as the content might be the same even if the revision number has changed. ++ local cp_rc ++ local diff_rc ++ local filtered_original_pod_manifest ++ local filtered_copy_pod_manifest ++ ++ ++ # If the container does not exist it cannot be reused ++ if ! container_exists; then ++ OCF_RESKEY_reuse=0 ++ return "$OCF_SUCCESS" ++ fi ++ ++ # If the manifest copy doesn't exist, we need a new container. ++ if [ ! -f "$POD_MANIFEST_COPY" ]; then ++ ocf_log info "a working copy of $OCF_RESKEY_pod_manifest was not found. A new etcd container will be created." ++ OCF_RESKEY_reuse=0 ++ return "$OCF_SUCCESS" ++ fi ++ ++ if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then ++ return $OCF_ERR_GENERIC ++ fi ++ if ! filtered_copy_pod_manifest=$(filter_pod_manifest "$POD_MANIFEST_COPY"); then ++ return $OCF_ERR_GENERIC ++ fi ++ ++ ocf_log info "comparing $OCF_RESKEY_pod_manifest with local copy $POD_MANIFEST_COPY" ++ ocf_run diff -s "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest" ++ diff_rc="$?" 
++ # clean up temporary files ++ rm -f "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest" ++ case "$diff_rc" in ++ 0) ++ ocf_log info "Reusing the existing etcd container" ++ OCF_RESKEY_reuse=1 ++ ;; ++ 1) ++ ocf_log info "Etcd pod manifest changes detected: creating a new etcd container to apply the changes" ++ if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then ++ cp_rc="$?" ++ ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc" ++ return "$OCF_ERR_GENERIC" ++ fi ++ ocf_log info "A working copy of $OCF_RESKEY_pod_manifest was created" ++ OCF_RESKEY_reuse=0 ++ ;; ++ *) ++ ocf_log err "Could not check if etcd pod manifest has changed, diff rc: $diff_rc" ++ return "$OCF_ERR_GENERIC" ++ ;; ++ esac ++ ++ return "$OCF_SUCCESS" ++} ++ ++ensure_pod_manifest_copy_exists() { ++ local cp_rc ++ ++ if [ -f "$POD_MANIFEST_COPY" ]; then ++ return "$OCF_SUCCESS" ++ fi ++ ++ # If the manifest copy doesn't exist, create it and ensure a new container. ++ if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then ++ cp_rc="$?" ++ ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc" ++ return "$OCF_ERR_GENERIC" ++ fi ++ ++ ocf_log info "a new working copy of $OCF_RESKEY_pod_manifest was created" ++ ++ return "$OCF_SUCCESS" ++} ++ + podman_start() + { + local cid +@@ -1173,6 +1403,13 @@ podman_start() + return $OCF_ERR_GENERIC + fi + ++ # check if the container has already started ++ podman_simple_status ++ if [ $? -eq $OCF_SUCCESS ]; then ++ ocf_log info "the '$CONTAINER' has already started. Nothing to do" ++ return "$OCF_SUCCESS" ++ fi ++ + if ! 
ensure_pod_manifest_exists; then + ocf_exit_reason "could not find etcd pod manifest ($OCF_RESKEY_pod_manifest)" + return "$OCF_ERR_GENERIC" +@@ -1186,8 +1423,9 @@ podman_start() + ocf_log info "static pod was running: start normally" + else + if is_force_new_cluster; then +- ocf_log notice "$NODENAME marked to force-new-cluster" ++ ocf_log notice "'$NODENAME' marked to force-new-cluster" + else ++ ocf_log info "'$NODENAME' is not marked to force-new-cluster" + # When the local agent starts, we can infer the cluster state by counting + # how many agents are starting or already active: + # - 1 active agent: it's the peer (we are just starting) +@@ -1195,6 +1433,7 @@ podman_start() + # - 0 active agents, 2 starting: both agents are starting simultaneously + local active_resources_count + active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w) ++ ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')" + case "$active_resources_count" in + 1) + if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then +@@ -1205,17 +1444,17 @@ podman_start() + fi + ;; + 0) ++ # count how many agents are starting now ++ local start_resources_count ++ start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w) ++ ocf_log info "found '$start_resources_count' starting etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_start_resource')" ++ + # we need to compare the revisions in any of the following branches + # so call the function only once here + if ! revision_compare_result=$(compare_revision); then + ocf_log err "could not compare revisions, error code: $?" 
+ return "$OCF_ERR_GENERIC" + fi +- +- # count how many agents are starting now +- local start_resources_count +- start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w) +- + case "$start_resources_count" in + 1) + ocf_log debug "peer not starting: ensure we can start a new cluster" +@@ -1231,6 +1470,7 @@ podman_start() + fi + ;; + 2) ++ # TODO: can we start "normally", regardless the revisions, if the container-id is the same on both nodes? + ocf_log info "peer starting" + if [ "$revision_compare_result" = "newer" ]; then + set_force_new_cluster +@@ -1263,7 +1503,7 @@ podman_start() + fi + + podman_create_mounts +- local run_opts="--detach --name=${CONTAINER}" ++ local run_opts="--detach --name=${CONTAINER} --replace" + + run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}" + +@@ -1297,61 +1537,59 @@ podman_start() + archive_data_folder + fi + +- prepare_env ++ ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced" ++ if ! can_reuse_container ; then ++ rc="$?" ++ ocf_log err "could not determine etcd container reuse strategy, rc: $rc" ++ return "$rc" ++ fi ++ ++ # Archive current container and its configuration before creating ++ # new configuration files. ++ if ! ocf_is_true "$OCF_RESKEY_reuse"; then ++ # Log archive container failures but don't block, as the priority ++ # is ensuring the etcd container starts successfully. ++ archive_current_container ++ fi ++ ++ if ! ensure_pod_manifest_copy_exists; then ++ return $OCF_ERR_GENERIC ++ fi ++ ++ if ! prepare_env; then ++ ocf_log err "Could not prepare environment for podman, error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ if ! generate_etcd_configuration; then ++ ocf_log err "Could not generate etcd configuration, error code: $?" 
++ return $OCF_ERR_GENERIC ++ fi + +- # add etcd-specific opts + run_opts="$run_opts \ +- --network=host \ +- -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \ +- -v /var/lib/etcd:/var/lib/etcd \ +- --env ALL_ETCD_ENDPOINTS=$ALL_ETCD_ENDPOINTS \ +- --env ETCD_CIPHER_SUITES=$ETCD_CIPHER_SUITES \ +- --env ETCD_DATA_DIR=$ETCD_DATA_DIR \ +- --env ETCD_ELECTION_TIMEOUT=$ETCD_ELECTION_TIMEOUT \ +- --env ETCD_ENABLE_PPROF=$ETCD_ENABLE_PPROF \ +- --env ETCD_EXPERIMENTAL_MAX_LEARNERS=$ETCD_EXPERIMENTAL_MAX_LEARNERS \ +- --env ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION \ +- --env ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL \ +- --env ETCD_HEARTBEAT_INTERVAL=$ETCD_HEARTBEAT_INTERVAL \ +- --env ETCD_INITIAL_CLUSTER=$ETCD_INITIAL_CLUSTER \ +- --env ETCD_INITIAL_CLUSTER_STATE=$ETCD_INITIAL_CLUSTER_STATE \ +- --env ETCD_NAME=$NODENAME \ +- --env ETCD_QUOTA_BACKEND_BYTES=$ETCD_QUOTA_BACKEND_BYTES \ +- --env ETCD_SOCKET_REUSE_ADDRESS=$ETCD_SOCKET_REUSE_ADDRESS \ +- --env ETCDCTL_API=$ETCDCTL_API \ +- --env ETCDCTL_CACERT=$SERVER_CACERT \ +- --env ETCDCTL_CERT=$ETCD_PEER_CERT \ +- --env ETCDCTL_KEY=$ETCD_PEER_KEY \ +- --authfile=$OCF_RESKEY_authfile \ +- --security-opt label=disable" ++ --network=host \ ++ -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \ ++ -v /var/lib/etcd:/var/lib/etcd \ ++ --env ETCDCTL_API=$ETCDCTL_API \ ++ --env ETCDCTL_CACERT=$SERVER_CACERT \ ++ --env ETCDCTL_CERT=$ETCD_PEER_CERT \ ++ --env ETCDCTL_KEY=$ETCD_PEER_KEY \ ++ --authfile=$OCF_RESKEY_authfile \ ++ --security-opt label=disable" + if [ -n "$OCF_RESKEY_run_opts" ]; then + run_opts="$run_opts $OCF_RESKEY_run_opts" + fi + +- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --logger=zap \ +- --log-level=info \ +- --experimental-initial-corrupt-check=true \ +- --snapshot-count=10000 \ +- --initial-advertise-peer-urls=$NODEIPURL:2380 \ +- 
--cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \ +- --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \ +- --trusted-ca-file=$SERVER_CACERT \ +- --client-cert-auth=true \ +- --peer-cert-file=$ETCD_PEER_CERT \ +- --peer-key-file=$ETCD_PEER_KEY \ +- --peer-trusted-ca-file=$SERVER_CACERT \ +- --peer-client-cert-auth=true \ +- --advertise-client-urls=$NODEIPURL:2379 \ +- --listen-client-urls=$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0 \ +- --listen-peer-urls=$(ip_url ${LISTEN_PEER_URLS}):2380 \ +- --metrics=extensive \ +- --listen-metrics-urls=$(ip_url ${LISTEN_METRICS_URLS}):9978" +- if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then +- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts" ++ if [ -f "$ETCD_CONFIGURATION_FILE" ]; then ++ ocf_log info "using etcd configuration file: $ETCD_CONFIGURATION_FILE" ++ else ++ ocf_log err "could not find $ETCD_CONFIGURATION_FILE" ++ return "$OCF_ERR_GENERIC" + fi + +- if is_force_new_cluster; then +- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --force-new-cluster" ++ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --config-file=$ETCD_CONFIGURATION_FILE" ++ if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then ++ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts" + fi + + if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then +@@ -1377,9 +1615,7 @@ podman_start() + ocf_log info "starting existing container $CONTAINER." + ocf_run podman start "$CONTAINER" + else +- # make sure any previous container matching our container name is cleaned up first. +- # we already know at this point it wouldn't be running +- remove_container ++ ocf_log info "starting new container $CONTAINER." + run_new_container "$run_opts" "$OCF_RESKEY_image" "$OCF_RESKEY_run_cmd" + if [ $? -eq 125 ]; then + return $OCF_ERR_GENERIC +@@ -1439,7 +1675,6 @@ podman_stop() + local rc + podman_simple_status + if [ $? 
-eq $OCF_NOT_RUNNING ]; then +- remove_container + ocf_log info "could not leave members list: etcd container not running" + return $OCF_SUCCESS + fi +@@ -1475,7 +1710,7 @@ podman_stop() + ocf_run podman kill "$CONTAINER" + rc=$? + else +- ocf_log debug "waiting $timeout second[s] before killing container" ++ ocf_log info "waiting $timeout second[s] before killing container" + ocf_run podman stop -t="$timeout" "$CONTAINER" + rc=$? + # on stop, systemd will automatically delete any transient +@@ -1496,11 +1731,6 @@ podman_stop() + fi + fi + +- if ! remove_container; then +- ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." +- return $OCF_ERR_GENERIC +- fi +- + return $OCF_SUCCESS + } + +@@ -1532,6 +1762,7 @@ podman_validate() + check_binary oc + check_binary podman + check_binary jq ++ check_binary tar + + if [ -z "$OCF_RESKEY_node_ip_map" ]; then + ocf_exit_reason "'node_ip_map' option is required" +@@ -1589,6 +1820,9 @@ else + fi + + CONTAINER=$OCF_RESKEY_name ++POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml" ++ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml" ++ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz" + + # Note: we currently monitor podman containers by with the "podman exec" + # command, so make sure that invocation is always valid by enforcing the diff --git a/SOURCES/RHEL-113767-podman-etcd-wrap-ipv6-address-in-brackets.patch b/SOURCES/RHEL-113767-podman-etcd-wrap-ipv6-address-in-brackets.patch new file mode 100644 index 0000000..b544e6d --- /dev/null +++ b/SOURCES/RHEL-113767-podman-etcd-wrap-ipv6-address-in-brackets.patch @@ -0,0 +1,193 @@ +From 11cdff8c886c72c83c26e48e46a8620c06e4c2f0 Mon Sep 17 00:00:00 2001 +From: E Hila +Date: Tue, 9 Sep 2025 06:06:12 -0400 +Subject: [PATCH] OCPBUGS-60977: podman-etcd: wrap ipv6 address in brackets for + attribute_node_ip (#2068) + +When trying to determine the node ip address we need to make sure we 
account for ipv6 and dualstack deployments, and accordingly wrap ipv6 in brackets so it correctly resolves. Since the node ip mapping is provided by the controller, we parse out the IP address of the node from there and use a helper function for building URLs with ports to correctly use brackets for ipv6 ip addresses. + +Signed-off-by: ehila +--- + heartbeat/podman-etcd | 77 ++++++++++++++++++++++++++++--------------- + 1 file changed, 51 insertions(+), 26 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 884b7c579..4969fbaaf 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -347,21 +347,41 @@ remove_container() + return $rc + } + ++# Correctly wraps an ipv6 in [] for url otherwise use return normal ipv4 address. ++ip_url() { ++ local ip_addr=$1 ++ local value ++ if echo "$ip_addr" | grep -q ":" ; then ++ value="[$ip_addr]" ++ else ++ value="$ip_addr" ++ fi ++ echo "https://$value" ++} ++ + attribute_node_ip() + { + local action="$1" + local attribute="node_ip" +- local value ++ local ip_addr name + +- if ! value=$(ip -brief addr show "$OCF_RESKEY_nic" | awk '{gsub("/.*", "", $3); print $3}'); then +- rc=$? +- ocf_log err "could not get node ip, error code: $rc" +- return "$rc" ++ for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do ++ name=$(echo "$node" | cut -d: -f1) ++ # ignore other nodes ++ if [ "$name" != "$NODENAME" ]; then ++ continue ++ fi ++ ip_addr=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6 ++ done ++ ++ if [ -z "$ip_addr" ]; then ++ ocf_log err "ip address was empty when querying (getent ahosts) for hostname: $(hostname -f)" ++ return 1 + fi + + case "$action" in + get) +- echo "$value" ++ echo "$ip_addr" + ;; + update) + if ! 
crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then +@@ -409,26 +429,28 @@ get_env_from_manifest() { + } + + prepare_env() { +- local name ip standalone_node ++ local name ip ipurl standalone_node + + NODEIP="$(attribute_node_ip get)" ++ NODEIPURL=$(ip_url $NODEIP) + + if is_force_new_cluster; then +- ALL_ETCD_ENDPOINTS="https://$NODEIP:2379" ++ ALL_ETCD_ENDPOINTS="$NODEIPURL:2379" + ETCD_INITIAL_CLUSTER_STATE="new" +- ETCD_INITIAL_CLUSTER="$NODENAME=https://$NODEIP:2380" ++ ETCD_INITIAL_CLUSTER="$NODENAME=$NODEIPURL:2380" + else + ETCD_INITIAL_CLUSTER_STATE="existing" + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do +- name=$(echo "$node" | awk -F":" '{print $1}') +- ip=$(echo "$node" | awk -F":" '{print $2}') ++ name=$(echo "$node" | cut -d: -f1) ++ ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6 ++ ipurl="$(ip_url $ip)" + if [ -z "$name" ] || [ -z "$ip" ]; then + ocf_exit_reason "name or ip missing for 1 or more nodes" + exit $OCF_ERR_CONFIGURED + fi + +- [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379" +- [ -z "$ETCD_INITIAL_CLUSTER" ] && ETCD_INITIAL_CLUSTER="$name=https://$ip:2380" || ETCD_INITIAL_CLUSTER="$ETCD_INITIAL_CLUSTER,$name=https://$ip:2380" ++ [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="$ipurl:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,$ipurl:2379" ++ [ -z "$ETCD_INITIAL_CLUSTER" ] && ETCD_INITIAL_CLUSTER="$name=$ipurl:2380" || ETCD_INITIAL_CLUSTER="$ETCD_INITIAL_CLUSTER,$name=$ipurl:2380" + done + fi + +@@ -609,9 +631,11 @@ add_member_as_learner() + local rc + local member_name=$1 + local member_ip=$2 ++ local endpoint_url=$(ip_url $(attribute_node_ip get)) ++ local peer_url=$(ip_url $member_ip) + + ocf_log info "add $member_name ($member_ip) to the member list as learner" +- out=$(podman exec "${CONTAINER}" etcdctl 
--endpoints="https://$(attribute_node_ip get):2379" member add "$member_name" --peer-urls="https://$member_ip:2380" --learner) ++ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner) + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not add $member_name as learner, error code: $rc" +@@ -806,14 +830,15 @@ get_peer_node_name() { + + get_all_etcd_endpoints() { + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do +- name=$(echo "$node" | awk -F":" '{print $1}') +- ip=$(echo "$node" | awk -F":" '{print $2}') ++ name=$(echo "$node" | cut -d: -f1) ++ ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6 ++ ipurl="$(ip_url $ip)" + if [ -z "$name" ] || [ -z "$ip" ]; then + ocf_exit_reason "name or ip missing for 1 or more nodes" + exit $OCF_ERR_CONFIGURED + fi + +- [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379" ++ [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="$ipurl:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,$ipurl:2379" + done + echo "$ALL_ETCD_ENDPOINTS" + } +@@ -831,7 +856,7 @@ get_member_list_json() { + # Get the list of members visible to the current node + local this_node_endpoint + +- this_node_endpoint="https://$(attribute_node_ip get):2379" ++ this_node_endpoint="$(ip_url $(attribute_node_ip get)):2379" + podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json + } + +@@ -886,14 +911,14 @@ check_peers() + # ] + # } + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do +- name=$(echo "$node" | awk -F":" '{print $1}') ++ name=$(echo "$node" | cut -d: -f1) + # do not check itself + if [ "$name" = "$NODENAME" ]; then + continue + fi + + # Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name. 
+- ip=$(echo "$node" | awk -F":" '{print $2}') ++ ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6 + id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID") + if [ -z "$id" ]; then + ocf_log info "$name is not in the members list" +@@ -1307,7 +1332,7 @@ podman_start() + --log-level=info \ + --experimental-initial-corrupt-check=true \ + --snapshot-count=10000 \ +- --initial-advertise-peer-urls=https://${NODEIP}:2380 \ ++ --initial-advertise-peer-urls=$NODEIPURL:2380 \ + --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \ + --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \ + --trusted-ca-file=$SERVER_CACERT \ +@@ -1316,11 +1341,11 @@ podman_start() + --peer-key-file=$ETCD_PEER_KEY \ + --peer-trusted-ca-file=$SERVER_CACERT \ + --peer-client-cert-auth=true \ +- --advertise-client-urls=https://${NODEIP}:2379 \ +- --listen-client-urls=https://${LISTEN_CLIENT_URLS}:2379,unixs://${NODEIP}:0 \ +- --listen-peer-urls=https://${LISTEN_PEER_URLS}:2380 \ ++ --advertise-client-urls=$NODEIPURL:2379 \ ++ --listen-client-urls=$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0 \ ++ --listen-peer-urls=$(ip_url ${LISTEN_PEER_URLS}):2380 \ + --metrics=extensive \ +- --listen-metrics-urls=https://${LISTEN_METRICS_URLS}:9978" ++ --listen-metrics-urls=$(ip_url ${LISTEN_METRICS_URLS}):9978" + if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then + OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts" + fi +@@ -1430,7 +1455,7 @@ podman_stop() + ocf_log info "last member. Not leaving the member list" + else + ocf_log info "leaving members list as member with ID $member_id" +- endpoint="https://$(attribute_node_ip get):2379" ++ endpoint="$(ip_url $(attribute_node_ip get)):2379" + if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then + rc=$? 
+ ocf_log err "error leaving members list, error code: $rc" diff --git a/SOURCES/RHEL-114489-1-powervs-move-ip-new-ra.patch b/SOURCES/RHEL-114489-1-powervs-move-ip-new-ra.patch new file mode 100644 index 0000000..e94214e --- /dev/null +++ b/SOURCES/RHEL-114489-1-powervs-move-ip-new-ra.patch @@ -0,0 +1,1127 @@ +From 3e15ecb7457e55f39fc5e48eecf250937819f8c5 Mon Sep 17 00:00:00 2001 +From: ehaefele <30649454+ehaefele@users.noreply.github.com> +Date: Fri, 12 Sep 2025 12:17:17 +0200 +Subject: [PATCH] powervs-move-ip: new resource agent (#2072) + +* powervs-move-ip: new resource agent + +Resource agent to move a virtual IP address between two virtual server instances, +and manage the corresponding network routes in the IBM Power Virtual Server workspaces. +--- + .gitignore | 1 + + configure.ac | 8 + + doc/man/Makefile.am | 4 + + heartbeat/Makefile.am | 4 + + heartbeat/powervs-move-ip.in | 1035 ++++++++++++++++++++++++++++++++++ + 5 files changed, 1052 insertions(+) + create mode 100755 heartbeat/powervs-move-ip.in + +diff --git a/.gitignore b/.gitignore +index 0a6d45e65..8dd29db29 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -79,6 +79,7 @@ heartbeat/mariadb + heartbeat/mpathpersist + heartbeat/nfsnotify + heartbeat/openstack-info ++heartbeat/powervs-move-ip + heartbeat/powervs-subnet + heartbeat/rabbitmq-cluster + heartbeat/redis +diff --git a/configure.ac b/configure.ac +index 8a74e6684..3765ac858 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -560,6 +560,13 @@ if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then + fi + AM_CONDITIONAL(BUILD_GCP_VPC_MOVE_VIP, test $BUILD_GCP_VPC_MOVE_VIP -eq 1) + ++BUILD_POWERVS_MOVE_IP=1 ++if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0 || test "x${HAVE_PYMOD_REQUESTS}" != xyes || test "x${HAVE_PYMOD_URLLIB3}" != xyes; then ++ BUILD_POWERVS_MOVE_IP=0 ++ AC_MSG_WARN("Not building powervs-move-ip") ++fi ++AM_CONDITIONAL(BUILD_POWERVS_MOVE_IP, test $BUILD_POWERVS_MOVE_IP -eq 1) ++ + BUILD_POWERVS_SUBNET=1 + if test -z "$PYTHON" || 
test $BUILD_OCF_PY -eq 0 || test "x${HAVE_PYMOD_REQUESTS}" != xyes || test "x${HAVE_PYMOD_URLLIB3}" != xyes; then + BUILD_POWERVS_SUBNET=0 +@@ -1044,6 +1051,7 @@ AC_CONFIG_FILES([heartbeat/mariadb], [chmod +x heartbeat/mariadb]) + AC_CONFIG_FILES([heartbeat/mpathpersist], [chmod +x heartbeat/mpathpersist]) + AC_CONFIG_FILES([heartbeat/nfsnotify], [chmod +x heartbeat/nfsnotify]) + AC_CONFIG_FILES([heartbeat/openstack-info], [chmod +x heartbeat/openstack-info]) ++AC_CONFIG_FILES([heartbeat/powervs-move-ip], [chmod +x heartbeat/powervs-move-ip]) + AC_CONFIG_FILES([heartbeat/powervs-subnet], [chmod +x heartbeat/powervs-subnet]) + AC_CONFIG_FILES([heartbeat/rabbitmq-cluster], [chmod +x heartbeat/rabbitmq-cluster]) + AC_CONFIG_FILES([heartbeat/redis], [chmod +x heartbeat/redis]) +diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am +index 0d34c7c65..0dee5e9e1 100644 +--- a/doc/man/Makefile.am ++++ b/doc/man/Makefile.am +@@ -238,6 +238,10 @@ if BUILD_GCP_VPC_MOVE_VIP + man_MANS += ocf_heartbeat_gcp-vpc-move-vip.7 + endif + ++if BUILD_POWERVS_MOVE_IP ++man_MANS += ocf_heartbeat_powervs-move-ip.7 ++endif ++ + if BUILD_POWERVS_SUBNET + man_MANS += ocf_heartbeat_powervs-subnet.7 + endif +diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am +index 839505af9..b5374163d 100644 +--- a/heartbeat/Makefile.am ++++ b/heartbeat/Makefile.am +@@ -207,6 +207,10 @@ if BUILD_GCP_VPC_MOVE_VIP + ocf_SCRIPTS += gcp-vpc-move-vip + endif + ++if BUILD_POWERVS_MOVE_IP ++ocf_SCRIPTS += powervs-move-ip ++endif ++ + if BUILD_POWERVS_SUBNET + ocf_SCRIPTS += powervs-subnet + endif +diff --git a/heartbeat/powervs-move-ip.in b/heartbeat/powervs-move-ip.in +new file mode 100755 +index 000000000..d55979e52 +--- /dev/null ++++ b/heartbeat/powervs-move-ip.in +@@ -0,0 +1,1035 @@ ++#!@PYTHON@ -tt ++# ------------------------------------------------------------------------ ++# Description: Resource agent for moving an overlay IP address between ++# virtual server instances in different PowerVS 
workspaces. ++# ++# Authors: Edmund Haefele ++# Walter Orb ++# ++# Copyright (c) 2025 International Business Machines, Inc. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++# ------------------------------------------------------------------------ ++ ++import fcntl ++import ipaddress ++import json ++import os ++import socket ++import subprocess ++import sys ++import textwrap ++import time ++from pathlib import Path ++from urllib.parse import urlparse ++ ++import requests ++import requests.adapters ++import urllib3.util ++ ++# Constants ++OCF_FUNCTIONS_DIR = os.environ.get( ++ "OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT") ++) ++RESOURCE_OPTIONS = ( ++ "ip", ++ "api_key", ++ "api_type", ++ "region", ++ "route_host_map", ++ "use_token_cache", ++ "monitor_api", ++ "device", ++ "proxy", ++) ++IP_CMD = "/usr/sbin/ip" ++REQUESTS_TIMEOUT = 5 # Timeout for requests calls ++HTTP_MAX_RETRIES = 3 # Maximum number of retries for HTTP requests ++HTTP_BACKOFF_FACTOR = 0.3 # Sleep (factor * (2^number of previous retries)) secs ++HTTP_STATUS_FORCE_RETRIES = (500, 502, 503, 504) # HTTP status codes to retry on ++HTTP_RETRY_ALLOWED_METHODS = frozenset({"GET", "POST", "PUT", "DELETE"}) ++CIDR_NETMASK = "32" ++ ++sys.path.append(OCF_FUNCTIONS_DIR) ++try: ++ import ocf ++except ImportError: ++ sys.stderr.write("ImportError: ocf module import failed.") ++ sys.exit(5) ++ ++ ++class OCFExitError(Exception): ++ """Exception class for OCF (Open Cluster 
Framework) exit errors.""" ++ ++ def __init__(self, message, exit_code): ++ ocf.ocf_exit_reason(message) ++ sys.exit(exit_code) ++ ++ ++class CmdError(OCFExitError): ++ """Exception class for errors when running system commands.""" ++ ++ def __init__(self, message, exit_code): ++ super().__init__(f"[CmdError] {message}", exit_code) ++ ++ ++def os_cmd(cmd_args, is_json=False, timeout=10): ++ """Run a system command and optionally parse JSON output.""" ++ ocf.logger.debug(f"[os_cmd]: args: {cmd_args}") ++ try: ++ result = subprocess.run( ++ cmd_args, ++ capture_output=True, ++ text=True, ++ check=True, ++ timeout=timeout, ++ env={"LANG": "C"}, ++ ) ++ if is_json: ++ try: ++ return json.loads(result.stdout) ++ except json.JSONDecodeError as e: ++ raise CmdError(f"os_cmd: JSON parsing failed: {e}", ocf.OCF_ERR_GENERIC) ++ ++ return result.returncode ++ ++ except subprocess.CalledProcessError as e: ++ raise CmdError( ++ f"os_cmd: command failed: {e.stderr}", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ except subprocess.TimeoutExpired: ++ raise CmdError("os_cmd: command timed out", ocf.OCF_ERR_GENERIC) ++ ++ ++def ip_cmd(*args, is_json=False): ++ """Generic wrapper for the ip command.""" ++ return os_cmd([IP_CMD] + list(args), is_json=is_json) ++ ++ ++def ip_address_show(): ++ """Show IP addresses in JSON format.""" ++ return ip_cmd("-json", "address", "show", is_json=True) ++ ++ ++def ip_address_add(cidr, device, label=None): ++ """Add an IP address to a device.""" ++ cmd = ["address", "add", cidr, "dev", device] ++ if label: ++ cmd += ["label", label] ++ return ip_cmd(*cmd) ++ ++ ++def ip_address_delete(cidr, device): ++ """Delete an IP address from a device.""" ++ return ip_cmd("address", "delete", cidr, "dev", device) ++ ++ ++def ip_find_device(ip): ++ """Find the device associated with a given IP address.""" ++ for iface in ip_address_show(): ++ addresses = [a["local"] for a in iface["addr_info"]] ++ if ip in addresses and "UP" in iface["flags"]: ++ return iface["ifname"] ++ ++ 
return None ++ ++ ++def ip_check_device(device): ++ """Verify that a device with the specified interface name (device) exists.""" ++ for iface in ip_address_show(): ++ if iface["ifname"] == device and "UP" in iface["flags"]: ++ return True ++ ++ return False ++ ++ ++def ip_alias_add(ip, device): ++ """Add an IP alias to the given device.""" ++ ip_cidr = f"{ip}/{CIDR_NETMASK}" ++ ocf.logger.debug( ++ f"[ip_alias_add]: adding IP alias '{ip_cidr}' to interface '{device}'" ++ ) ++ _ = ip_address_add(ip_cidr, device) ++ ++ ++def ip_alias_remove(ip): ++ """Find the device with the given IP alias and remove the alias.""" ++ device = ip_find_device(ip) ++ if device: ++ ip_cidr = f"{ip}/{CIDR_NETMASK}" ++ ocf.logger.debug( ++ f"[ip_alias_remove]: removing IP alias '{ip_cidr}' from interface '{device}'" ++ ) ++ _ = ip_address_delete(ip_cidr, device) ++ ++ ++def create_session_with_retries(): ++ """Create a request session with a retry strategy.""" ++ retry_strategy = urllib3.util.Retry( ++ total=HTTP_MAX_RETRIES, ++ status_forcelist=HTTP_STATUS_FORCE_RETRIES, ++ allowed_methods=HTTP_RETRY_ALLOWED_METHODS, ++ backoff_factor=HTTP_BACKOFF_FACTOR, ++ raise_on_status=False, ++ ) ++ adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) ++ session = requests.Session() ++ session.mount("https://", adapter) ++ return session ++ ++ ++class PowerCloudTokenManagerError(OCFExitError): ++ """Exception class for errors in the PowerCloudTokenManager.""" ++ ++ def __init__(self, message, exit_code): ++ super().__init__(f"[PowerCloudTokenManagerError] {message}", exit_code) ++ ++ ++class PowerCloudTokenManager: ++ """Request and cache IBM Cloud tokens.""" ++ ++ _DEFAULT_RESOURCE_INSTANCE = "powervs-move-ip" ++ _TOKEN_REFRESH_BUFFER = 900 # 15 minutes ++ ++ def __init__( ++ self, ++ api_type="", ++ api_key="", ++ proxy="", ++ use_cache=False, ++ ): ++ self._auth_url = ( ++ "https://private.iam.cloud.ibm.com/identity/token" ++ if api_type == "private" ++ else 
"https://iam.cloud.ibm.com/identity/token" ++ ) ++ self._api_key = self._load_api_key(api_key) ++ self._proxy = proxy ++ self._session = create_session_with_retries() ++ self._cache_file = None ++ ++ if use_cache: ++ resource_instance = os.environ.get( ++ "OCF_RESOURCE_INSTANCE", self._DEFAULT_RESOURCE_INSTANCE ++ ) ++ self._cache_file = Path( ++ f"/var/run/resource-agents/{resource_instance}-token.json" ++ ) ++ self._cache_file.parent.mkdir(parents=True, exist_ok=True) ++ if not self._cache_file.exists(): ++ self._cache_file.touch() ++ os.chmod(self._cache_file, 0o600) ++ ++ def _load_api_key(self, api_key): ++ """Load API key from string or file.""" ++ if not api_key: ++ raise PowerCloudTokenManagerError( ++ "_load_api_key: API key is missing", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ # API key in string ++ if not api_key.startswith("@"): ++ return api_key ++ ++ # API key in file ++ api_key_path = Path(api_key[1:]) ++ if not api_key_path.is_file(): ++ raise PowerCloudTokenManagerError( ++ f"_load_api_key: API key file not found: '{api_key_path}'", ++ ocf.OCF_ERR_ARGS, ++ ) ++ ++ try: ++ content = api_key_path.read_text().strip() ++ api_key_field = json.loads(content).get("apikey", "") ++ except json.JSONDecodeError: ++ # data is text, return as is ++ api_key_field = content ++ ++ if not api_key_field: ++ raise PowerCloudTokenManagerError( ++ f"_load_api_key: invalid API key in file '{api_key_path}'", ++ ocf.OCF_ERR_ARGS, ++ ) ++ ++ return api_key_field ++ ++ def _request_new_token(self): ++ """Request a new access token.""" ++ headers = { ++ "content-type": "application/x-www-form-urlencoded", ++ "accept": "application/json", ++ } ++ data = { ++ "grant_type": "urn:ibm:params:oauth:grant-type:apikey", ++ "apikey": f"{self._api_key}", ++ } ++ ++ current_time = time.time() ++ try: ++ response = self._session.post( ++ self._auth_url, ++ headers=headers, ++ data=data, ++ proxies=self._proxy, ++ timeout=REQUESTS_TIMEOUT, ++ ) ++ response.raise_for_status() ++ token_data = 
response.json() ++ return ( ++ token_data["access_token"], ++ current_time + token_data["expires_in"], ++ current_time, ++ ) ++ except requests.RequestException as e: ++ ocf.logger.warning( ++ f"[PowerCloudTokenManager] _request_new_token: failed to request token: '{e}'" ++ ) ++ return None ++ ++ def _read_cache(self): ++ """Read token cache.""" ++ try: ++ with self._cache_file.open("r") as f: ++ fcntl.flock(f, fcntl.LOCK_EX) ++ try: ++ return json.load(f) ++ finally: ++ fcntl.flock(f, fcntl.LOCK_UN) ++ except (json.JSONDecodeError, FileNotFoundError, PermissionError): ++ ocf.logger.warning( ++ "[PowerCloudTokenManager] _read_cache: failed to read token cache read due to missing file or malformed JSON." ++ ) ++ return {} ++ ++ def _write_cache(self, token, expiration, refreshed_at): ++ """Write token cache.""" ++ try: ++ with self._cache_file.open("w") as f: ++ fcntl.flock(f, fcntl.LOCK_EX) ++ try: ++ json.dump( ++ { ++ "token": token, ++ "expiration": expiration, ++ "refreshed_at": refreshed_at, ++ }, ++ f, ++ ) ++ finally: ++ fcntl.flock(f, fcntl.LOCK_UN) ++ except Exception as e: ++ raise PowerCloudTokenManagerError( ++ f"_write_cache: failed to write token cache file: '{e}'", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ ++ def _is_token_expired(self, expiration): ++ """Check if token is expired or near expiry.""" ++ return time.time() + self._TOKEN_REFRESH_BUFFER >= expiration ++ ++ def get_token(self): ++ """Get a valid access token, using cache if enabled.""" ++ if not self._cache_file: ++ result = self._request_new_token() ++ if result: ++ token, _, _ = result ++ return token ++ raise PowerCloudTokenManagerError( ++ "get_token: token request failed and no cache available", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ ++ cache = self._read_cache() ++ token = cache.get("token") ++ expiration = cache.get("expiration", 0) ++ ++ if not token or self._is_token_expired(expiration): ++ result = self._request_new_token() ++ if result: ++ token, expiration, refreshed_at = result ++ 
refresh_time = time.ctime(refreshed_at) ++ ocf.logger.debug( ++ f"[PowerCloudTokenManager] get_token: refreshed token at '{refresh_time}'" ++ ) ++ self._write_cache(token, expiration, refreshed_at) ++ else: ++ ocf.logger.error( ++ "[PowerCloudTokenManager] get_token: failed to refresh token" ++ ) ++ if token and time.time() < expiration: ++ ocf.logger.warning( ++ "[PowerCloudTokenManager] get_token: using cached token as fallback" ++ ) ++ else: ++ raise PowerCloudTokenManagerError( ++ "get_token: no valid token available", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ ++ return token ++ ++ ++class PowerCloudAPIError(OCFExitError): ++ """Exception class for errors in PowerCloudAPI.""" ++ ++ def __init__(self, message, exit_code): ++ super().__init__(f"[PowerCloudAPIError] {message}", exit_code) ++ ++ ++class PowerCloudAPI: ++ """Offers a convenient method for sending requests to the IBM Power Cloud API.""" ++ ++ _ALLOWED_API_TYPES = {"public", "private"} ++ ++ def __init__( ++ self, ++ api_key="", ++ api_type="", ++ region="", ++ crn="", ++ proxy="", ++ use_cache=False, ++ ): ++ """Initialize class variables, including the IBM Power Cloud API endpoint URL and HTTP header, and get an API token.""" ++ ++ self._crn = crn ++ self._proxy = self._get_proxy(proxy) ++ self._api_url = self._get_api_url(region, api_type) ++ token_manager = PowerCloudTokenManager( ++ api_type=api_type, api_key=api_key, proxy=self._proxy, use_cache=use_cache ++ ) ++ self._token = token_manager.get_token() ++ self._header = self._get_header() ++ self._session = create_session_with_retries() ++ ++ def _get_proxy(self, proxy): ++ """Validate a proxy URL and test TCP connectivity. 
Returns a proxy dict if reachable.""" ++ if not proxy: ++ return None ++ ++ parsed_url = urlparse(proxy) ++ is_valid_url = ( ++ parsed_url.hostname ++ and parsed_url.port ++ and parsed_url.scheme in ("http", "https") ++ ) ++ ++ if not is_valid_url: ++ raise PowerCloudAPIError( ++ f"_get_proxy: invalid proxy URL '{proxy}'", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ try: ++ with socket.create_connection( ++ (parsed_url.hostname, parsed_url.port), timeout=REQUESTS_TIMEOUT ++ ): ++ return {"https": proxy} ++ except OSError as e: ++ raise PowerCloudAPIError( ++ f"_get_proxy: cannot connect to proxy '{proxy}': {e}", ++ ocf.OCF_ERR_ARGS, ++ ) ++ ++ def _get_api_url(self, region, api_type): ++ """Generate and return the API URL for a given region and API type.""" ++ if not region: ++ raise PowerCloudAPIError( ++ "_get_api_url: missing region parameter", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ api_type = str(api_type).lower() ++ if api_type not in self._ALLOWED_API_TYPES: ++ raise PowerCloudAPIError( ++ f"_get_api_url: invalid api_type: '{api_type}', must be one of {self._ALLOWED_API_TYPES} ", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ if api_type == "public" and not self._proxy: ++ raise PowerCloudAPIError( ++ "_get_api_url: api_type 'public' requires a proxy", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ subdomain = "private." 
if api_type == "private" else "" ++ return f"https://{subdomain}{region}.power-iaas.cloud.ibm.com" ++ ++ def _get_header(self): ++ """Construct request header.""" ++ return { ++ "Authorization": f"Bearer {self._token}", ++ "CRN": self._crn, ++ "Content-Type": "application/json", ++ } ++ ++ def send_api_request(self, method, resource, **kwargs): ++ """Perform an HTTP API call to the specified resource using the given method""" ++ url = f"{self._api_url}{resource}" ++ method = method.upper() ++ ocf.logger.debug(f"[PowerCloudAPI] send_api_request: '{method}' '{resource}'") ++ ++ try: ++ response = self._session.request( ++ method, ++ url, ++ headers=self._header, ++ proxies=self._proxy, ++ timeout=REQUESTS_TIMEOUT, ++ **kwargs, ++ ) ++ response.raise_for_status() ++ return response.json() ++ except requests.RequestException as e: ++ raise PowerCloudAPIError( ++ f"send_api_request: request error occured: '{method}' - '{url}' - '{e}'", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ ++ ++class PowerCloudRouteError(OCFExitError): ++ """Exception class for errors encountered while managing PowerVS network routes.""" ++ ++ def __init__(self, message, exit_code): ++ super().__init__(f"[PowerCloudRouteError] {message}", exit_code) ++ ++ ++class PowerCloudRoute(PowerCloudAPI): ++ """Provides methods for managing network routes in Power Virtual Server.""" ++ ++ _CRN_PREFIX_INDEX = 0 ++ _CRN_TYPE_INDEX = 8 ++ _CRN_ROUTE_ID_INDEX = 9 ++ _CRN_EXPECTED_LENGTH = 10 ++ ++ def __init__( ++ self, ++ ip="", ++ api_key="", ++ api_type="", ++ region="", ++ route_host_map="", ++ device="", ++ proxy="", ++ monitor_api="", ++ use_token_cache="", ++ is_remote_route=False, ++ ): ++ """Initialize PowerCloudRoute instance.""" ++ self._is_remote_route = is_remote_route ++ self.ip = self._get_ip_info(ip) ++ self.crn, self.route_id = self._parse_route_map(route_host_map) ++ use_cache = str(use_token_cache).lower() == "true" ++ super().__init__( ++ api_key=api_key, ++ api_type=api_type, ++ region=region, ++ 
crn=self.crn, ++ proxy=proxy, ++ use_cache=use_cache, ++ ) ++ self.route_info = self._get_route_info() ++ self.route_name = self.route_info["name"] ++ self.device = self._get_device_name(device) ++ ++ def _get_ip_info(self, ip): ++ """Validate the given IP address and return its standard form.""" ++ try: ++ return str(ipaddress.ip_address(ip)) ++ except ValueError: ++ raise PowerCloudRouteError( ++ f"_get_ip_info: invalid IP address '{ip}'", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ def _parse_route_crn(self, route_crn): ++ """Parses a PowerVS route CRN and extract its base CRN and route ID.""" ++ crn_parts = route_crn.split(":") ++ ++ if ( ++ len(crn_parts) != self._CRN_EXPECTED_LENGTH ++ or crn_parts[self._CRN_PREFIX_INDEX] != "crn" ++ or crn_parts[self._CRN_TYPE_INDEX] != "route" ++ ): ++ raise PowerCloudAPIError( ++ f"_parse_route_crn: invalid CRN format for network-route: '{route_crn}'", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ workspace_crn = ":".join(crn_parts[: self._CRN_TYPE_INDEX]) + "::" ++ route_id = crn_parts[self._CRN_ROUTE_ID_INDEX] ++ ++ return workspace_crn, route_id ++ ++ def _parse_route_map(self, route_host_map): ++ """Validate the route host map and extract the associated CRN and route ID.""" ++ try: ++ route_map = dict(item.split(":", 1) for item in route_host_map.split(";")) ++ except ValueError: ++ raise PowerCloudRouteError( ++ f"_parse_route_map: invalid route_host_map format: '{route_host_map}'", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ hostname = os.uname().nodename ++ # set nodename to local hostname or get hostname of remote host from route_map ++ nodename = ( ++ hostname ++ if not self._is_remote_route ++ else next((h for h in route_map if h != hostname), None) ++ ) ++ ++ if not nodename or nodename not in route_map: ++ raise PowerCloudRouteError( ++ f"_parse_route_map: hostname '{nodename}' not found in route_host_map '{route_host_map}'", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ return self._parse_route_crn(route_map[nodename]) ++ ++ def 
_get_route_info(self): ++ """Retrieve and validate attributes of a PowerVS network route.""" ++ resource = f"/v1/routes/{self.route_id}" ++ route_info = self.send_api_request("GET", resource) ++ ++ zone = "remote" if self._is_remote_route else "local" ++ ocf.logger.debug( ++ f"[PowerCloudRoute] _get_route_info: {zone} route info: '{route_info}'" ++ ) ++ ++ if self.ip != route_info["destination"]: ++ raise PowerCloudRouteError( ++ f"_get_route_info: IP '{self.ip}' does not match the route destination address '{route_info['destination']}'", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ if route_info["advertise"] != "enable": ++ raise PowerCloudRouteError( ++ f"_get_route_info: route '{route_info['name']}' advertise flag must be set to enable", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ return route_info ++ ++ def _get_device_name(self, name): ++ """Verify the existence of a network interface with the specified name.""" ++ if self._is_remote_route: ++ return "" ++ ++ if name: ++ if ip_check_device(name): ++ return name ++ raise PowerCloudRouteError( ++ f"_get_device_name: network interface '{name}' does not exist or is down", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ next_hop = self.route_info["nextHop"] ++ interface_name = ip_find_device(next_hop) ++ if interface_name: ++ return interface_name ++ ++ raise PowerCloudRouteError( ++ f"_get_device_name: network interface with next hop '{next_hop}' does not exist or is down", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ def _set_route_enabled(self, enabled: bool): ++ """Enable or disable the PowerVS network route.""" ++ resource = f"/v1/routes/{self.route_id}" ++ data = json.dumps({"enabled": enabled}) ++ ++ state = "enabled" if enabled else "disabled" ++ response = self.send_api_request("PUT", resource, data=data) ++ ocf.logger.debug( ++ f"[PowerCloudRoute] _set_route_enabled: successfully {state} route '{self.route_name}', response: '{response}'" ++ ) ++ ++ def is_enabled(self): ++ """Check whether the PowerVS network route is currently 
enabled.""" ++ return self.route_info["state"] == "deployed" ++ ++ def enable(self): ++ """Enable the PowerVS network route.""" ++ if not self.is_enabled(): ++ self._set_route_enabled(True) ++ ++ def disable(self): ++ """Disable the PowerVS network route.""" ++ if self.is_enabled(): ++ self._set_route_enabled(False) ++ ++ ++def create_route_instance(options, is_remote_route=False, catch_exception=False): ++ """Instantiate a PowerCloudRoute object and handle errors. ++ ++ Returns: ++ - PowerCloudRoute: The initialized route object if successful. ++ - None: If an error occurs and catch_exception is True. ++ ++ Raises: ++ - PowerCloudRouteError: If instantiation fails and catch_exception is False. ++ """ ++ # Filter only the valid resource agent options from options dictionary. ++ resource_options = {k: options.get(k, "") for k in RESOURCE_OPTIONS} ++ ++ try: ++ return PowerCloudRoute(**resource_options, is_remote_route=is_remote_route) ++ except Exception as e: ++ zone = "remote" if is_remote_route else "local" ++ ocf.logger.error( ++ f"[create_route_instance]: failed to instantiate {zone} route: '{e}'" ++ ) ++ if catch_exception: ++ return None ++ raise ++ ++ ++def start_action( ++ ip="", ++ api_key="", ++ api_type="", ++ region="", ++ route_host_map="", ++ use_token_cache="", ++ monitor_api="", ++ device="", ++ proxy="", ++): ++ """Assign the service IP. ++ ++ This function performs the following actions: ++ - Adds the specified IP address as an alias to the given network interface or the interface matching the route's next hop. ++ - Disables the remote network route. ++ - Enables the network route associated with the provided route host map. 
++ """ ++ resource_options = locals() ++ ++ ocf.logger.info("[start_action]: enabling overlay IP") ++ ocf.logger.debug(f"[start_action]: options: '{resource_options}'") ++ ++ remote_route = create_route_instance(resource_options, is_remote_route=True) ++ # Disable remote route ++ ocf.logger.debug( ++ f"[start_action]: disabling remote route '{remote_route.route_name}'" ++ ) ++ remote_route.disable() ++ ++ local_route = create_route_instance(resource_options) ++ ++ # Add IP alias ++ ip_alias_add(ip, local_route.device) ++ ++ # Enable local route ++ ocf.logger.debug(f"[start_action]: enabling local route '{local_route.route_name}'") ++ local_route.enable() ++ ++ monitor_result = monitor_action(**resource_options) ++ if monitor_result != ocf.OCF_SUCCESS: ++ raise PowerCloudRouteError( ++ f"start_action: failed to enable local route '{local_route.route_name}'", ++ monitor_result, ++ ) ++ ++ ocf.logger.info( ++ f"[start_action]: successfully added IP alias '{ip}' and enabled local route '{local_route.route_name}'" ++ ) ++ return ocf.OCF_SUCCESS ++ ++ ++def stop_action( ++ ip="", ++ api_key="", ++ api_type="", ++ region="", ++ route_host_map="", ++ use_token_cache="", ++ monitor_api="", ++ device="", ++ proxy="", ++): ++ """Remove the service IP. ++ ++ This function performs the following actions: ++ - Disables the network route associated with the provided route host map. ++ - Removes the IP alias from the network interface. 
++ """ ++ ++ resource_options = locals() ++ ++ ocf.logger.info("[stop_action]: disabling overlay IP") ++ ocf.logger.debug(f"[stop_action]: options: '{resource_options}'") ++ ++ try: ++ remote_route = create_route_instance(resource_options, is_remote_route=True) ++ ocf.logger.debug( ++ f"[stop_action]: disabling remote route '{remote_route.route_name}'" ++ ) ++ remote_route.disable() ++ ++ local_route = create_route_instance(resource_options) ++ ocf.logger.debug( ++ f"[stop_action]: disabling local route '{local_route.route_name}'" ++ ) ++ local_route.disable() ++ finally: ++ # Remove IP alias ++ ip_alias_remove(ip) ++ ++ monitor_result = monitor_action(**resource_options) ++ if monitor_result != ocf.OCF_NOT_RUNNING: ++ raise PowerCloudRouteError( ++ f"stop_action: failed to disable local route '{local_route.route_name}'", ++ monitor_result, ++ ) ++ ++ ocf.logger.info( ++ f"[stop_action]: successfully removed IP alias '{ip}' and disabled local route '{local_route.route_name}'" ++ ) ++ return ocf.OCF_SUCCESS ++ ++ ++def monitor_action( ++ ip="", ++ api_key="", ++ api_type="", ++ region="", ++ route_host_map="", ++ use_token_cache="", ++ monitor_api="", ++ device="", ++ proxy="", ++): ++ """Monitor the service IP. ++ ++ Checks the status of the assigned service IP address. 
++ """ ++ resource_options = locals() ++ is_probe = ocf.is_probe() ++ use_extended_monitor = ocf.OCF_ACTION == "start" or ( ++ str(monitor_api).lower() == "true" and not is_probe ++ ) ++ ++ ocf.logger.debug( ++ f"[monitor_action]: options: '{resource_options}', is_probe: '{is_probe}'" ++ ) ++ ++ interface_name = ip_find_device(ip) ++ ++ if not use_extended_monitor: ++ if interface_name: ++ ocf.logger.debug( ++ f"[monitor_action]: IP alias '{ip}' is active'" ++ ) ++ return ocf.OCF_SUCCESS ++ else: ++ ocf.logger.debug( ++ f"[monitor_action]: IP alias '{ip}' is not active" ++ ) ++ return ocf.OCF_NOT_RUNNING ++ ++ remote_route = create_route_instance( ++ resource_options, is_remote_route=True, catch_exception=True ++ ) ++ if remote_route is None: ++ ocf.logger.error("[monitor_action]: failed to instantiate remote route") ++ return ocf.OCF_ERR_GENERIC ++ elif remote_route.is_enabled(): ++ ocf.logger.error( ++ f"[monitor_action]: remote route '{remote_route.route_name}' is enabled" ++ ) ++ return ocf.OCF_ERR_GENERIC ++ ++ local_route = create_route_instance( ++ resource_options, is_remote_route=False, catch_exception=True ++ ) ++ ++ if local_route is None: ++ ocf.logger.error("[monitor_action]: failed to instantiate local route") ++ return ocf.OCF_ERR_GENERIC ++ ++ if interface_name: ++ if local_route.is_enabled(): ++ ocf.logger.debug( ++ f"[monitor_action]: IP alias '{ip}' is active, local route '{local_route.route_name}' is enabled" ++ ) ++ return ocf.OCF_SUCCESS ++ else: ++ ocf.logger.error( ++ f"[monitor_action]: local route '{local_route.route_name}' is not enabled" ++ ) ++ return ocf.OCF_ERR_GENERIC ++ else: ++ if local_route.is_enabled(): ++ ocf.logger.error( ++ f"[monitor_action]: local route '{local_route.route_name}' is enabled, but IP alias is not configured" ++ ) ++ return ocf.OCF_ERR_GENERIC ++ else: ++ ocf.logger.debug( ++ f"[monitor_action]: IP alias '{ip}' is not active and local route '{local_route.route_name}' is disabled" ++ ) ++ return 
ocf.OCF_NOT_RUNNING ++ ++ ++def validate_all_action( ++ ip="", ++ api_key="", ++ api_type="", ++ region="", ++ route_host_map="", ++ use_token_cache="", ++ monitor_api="", ++ device="", ++ proxy="", ++): ++ """Validate resource agent parameters. ++ ++ Verifies the provided resource agent options by attempting to instantiate route objects for both local and remote routes. ++ """ ++ resource_options = locals() ++ ++ ocf.logger.info("[validate_all_action]: validate local and remote routes") ++ _ = create_route_instance(resource_options) ++ _ = create_route_instance(resource_options, is_remote_route=True) ++ ++ return ocf.OCF_SUCCESS ++ ++ ++def main(): ++ """Instantiate the resource agent.""" ++ agent_description = textwrap.dedent("""\ ++ Resource Agent to move an IP address from one Power Virtual Server instance to another. ++ ++ Prerequisites: ++ 1. Red Hat Enterprise Linux 9.4 or higher ++ ++ 2. Two-node cluster ++ - Distributed across two PowerVS workspaces in separate data centers within the same region. ++ ++ 3. IBM Cloud API Key: ++ - Create a service API key with privileges for both workspaces. ++ - Save the key in a file and copy it to both cluster nodes using the same path and filename. ++ - Reference the key file path in the resource definition. ++ ++ For detailed guidance on high availability for SAP applications on PowerVS, visit: ++ https://cloud.ibm.com/docs/sap?topic=sap-ha-overview. ++ """) ++ ++ agent = ocf.Agent( ++ "powervs-move-ip", ++ shortdesc="Manages Power Virtual Server overlay IP routes.", ++ longdesc=agent_description, ++ version=1.00, ++ ) ++ ++ agent.add_parameter( ++ "ip", ++ shortdesc="IP address", ++ longdesc=( ++ "The virtual IP address is the destination address of a network route." ++ ), ++ content_type="string", ++ required=True, ++ ) ++ agent.add_parameter( ++ "api_key", ++ shortdesc="API Key or @API_KEY_FILE_PATH", ++ longdesc=( ++ "API Key or @API_KEY_FILE_PATH for IBM Cloud access. 
" ++ "The API key content or the path of an API key file that is indicated by the @ symbol." ++ ), ++ content_type="string", ++ required=True, ++ ) ++ agent.add_parameter( ++ "api_type", ++ shortdesc="API type", ++ longdesc="Connect to Power Virtual Server regional endpoints over a public or private network (public|private).", ++ content_type="string", ++ default="private", ++ required=True, ++ ) ++ agent.add_parameter( ++ "region", ++ shortdesc="Power Virtual Server region", ++ longdesc=( ++ "Region that represents the geographic area where the instance is located. " ++ "The region is used to identify the Cloud API endpoint." ++ ), ++ content_type="string", ++ required=True, ++ ) ++ agent.add_parameter( ++ "route_host_map", ++ shortdesc="Mapping of hostnames to IBM Cloud route CRNs", ++ longdesc=( ++ "Map the hostname of the Power Virtual Server instance to the route CRN of the overlay IP route. " ++ "Separate hostname and route CRN with a colon ':', separate different hostname and route CRN pairs with a semicolon ';'. " ++ "Example: hostname1:route-crn-of-instance1;hostname2:route-crn-of-instance2" ++ ), ++ content_type="string", ++ required=True, ++ ) ++ agent.add_parameter( ++ "use_token_cache", ++ shortdesc="Enable API token cache", ++ longdesc="Enable caching of the API access token in a local file to reduce authentication overhead. ", ++ content_type="string", ++ default="True", ++ required=False, ++ ) ++ agent.add_parameter( ++ "monitor_api", ++ shortdesc="Enhanced API Monitoring", ++ longdesc="Enable enhanced monitoring by using Power Cloud API calls to verify route configuration correctness. ", ++ content_type="string", ++ default="False", ++ required=False, ++ ) ++ agent.add_parameter( ++ "device", ++ shortdesc="Network adapter for the overlay IP address", ++ longdesc=( ++ "Network adapter for the overlay IP address. " ++ "The adapter must have the same name on all Power Virtual Server instances. 
" ++ "If the `device` parameter is not specified, the IP alias is assigned to the interface whose configured IP address matches the route's next hop address. " ++ ), ++ content_type="string", ++ default="", ++ required=False, ++ ) ++ agent.add_parameter( ++ "proxy", ++ shortdesc="Proxy", ++ longdesc=( ++ "Proxy server used to access IBM Cloud API endpoints. " ++ "The value must be a valid URL in the format 'http[s]://hostname:port'. " ++ ), ++ content_type="string", ++ default="", ++ required=False, ++ ) ++ agent.add_action("start", timeout=60, handler=start_action) ++ agent.add_action("stop", timeout=60, handler=stop_action) ++ agent.add_action( ++ "monitor", depth=0, timeout=60, interval=60, handler=monitor_action ++ ) ++ agent.add_action("validate-all", timeout=60, handler=validate_all_action) ++ agent.run() ++ ++ ++if __name__ == "__main__": ++ main() diff --git a/SOURCES/RHEL-114489-2-powervs-move-ip-set-bundled-path.patch b/SOURCES/RHEL-114489-2-powervs-move-ip-set-bundled-path.patch new file mode 100644 index 0000000..167bdf2 --- /dev/null +++ b/SOURCES/RHEL-114489-2-powervs-move-ip-set-bundled-path.patch @@ -0,0 +1,19 @@ +--- a/heartbeat/powervs-move-ip.in 2025-09-15 16:13:34.225046827 +0200 ++++ b/heartbeat/powervs-move-ip.in 2025-09-15 17:39:02.746258434 +0200 +@@ -33,9 +33,13 @@ + from pathlib import Path + from urllib.parse import urlparse + +-import requests +-import requests.adapters +-import urllib3.util ++try: ++ sys.path.insert(0, '/usr/lib/fence-agents/support/ibm') ++ import requests ++ import requests.adapters ++ import urllib3.util ++except ImportError: ++ pass + + # Constants + OCF_FUNCTIONS_DIR = os.environ.get( diff --git a/SOURCES/RHEL-114489-3-powervs-move-ip-add-iflabel-parameter.patch b/SOURCES/RHEL-114489-3-powervs-move-ip-add-iflabel-parameter.patch new file mode 100644 index 0000000..536ae0c --- /dev/null +++ b/SOURCES/RHEL-114489-3-powervs-move-ip-add-iflabel-parameter.patch @@ -0,0 +1,197 @@ +From 
a4e496e5e6d9abde1b071fa2dfa1c6e7ba899cf1 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edmund=20H=C3=A4fele?= +Date: Thu, 30 Oct 2025 13:03:22 +0100 +Subject: [PATCH] Update powervs-move-ip + +- Add `iflabel` argument. +- Increase maximum number of retries for HTTP requests to four. +--- + heartbeat/powervs-move-ip.in | 66 +++++++++++++++++++++++++----------- + 1 file changed, 47 insertions(+), 19 deletions(-) + +diff --git a/heartbeat/powervs-move-ip.in b/heartbeat/powervs-move-ip.in +index d55979e52..e2250c998 100755 +--- a/heartbeat/powervs-move-ip.in ++++ b/heartbeat/powervs-move-ip.in +@@ -50,11 +50,13 @@ RESOURCE_OPTIONS = ( + "use_token_cache", + "monitor_api", + "device", ++ "iflabel", + "proxy", + ) + IP_CMD = "/usr/sbin/ip" ++IFLABEL_MAX_LEN = 15 # Maximum character limit for interface labels + REQUESTS_TIMEOUT = 5 # Timeout for requests calls +-HTTP_MAX_RETRIES = 3 # Maximum number of retries for HTTP requests ++HTTP_MAX_RETRIES = 4 # Maximum number of retries for HTTP requests + HTTP_BACKOFF_FACTOR = 0.3 # Sleep (factor * (2^number of previous retries)) secs + HTTP_STATUS_FORCE_RETRIES = (500, 502, 503, 504) # HTTP status codes to retry on + HTTP_RETRY_ALLOWED_METHODS = frozenset({"GET", "POST", "PUT", "DELETE"}) +@@ -154,13 +156,13 @@ def ip_check_device(device): + return False + + +-def ip_alias_add(ip, device): ++def ip_alias_add(ip, device, label=None): + """Add an IP alias to the given device.""" + ip_cidr = f"{ip}/{CIDR_NETMASK}" + ocf.logger.debug( +- f"[ip_alias_add]: adding IP alias '{ip_cidr}' to interface '{device}'" ++ f"[ip_alias_add]: adding IP alias '{ip_cidr}' with label '{label}' to interface '{device}'" + ) +- _ = ip_address_add(ip_cidr, device) ++ _ = ip_address_add(ip_cidr, device, label) + + + def ip_alias_remove(ip): +@@ -522,6 +524,7 @@ class PowerCloudRoute(PowerCloudAPI): + region="", + route_host_map="", + device="", ++ iflabel="", + proxy="", + monitor_api="", + use_token_cache="", +@@ -543,6 +546,7 @@ class 
PowerCloudRoute(PowerCloudAPI): + self.route_info = self._get_route_info() + self.route_name = self.route_info["name"] + self.device = self._get_device_name(device) ++ self.iflabel = self._make_iflabel(iflabel) + + def _get_ip_info(self, ip): + """Validate the given IP address and return its standard form.""" +@@ -588,7 +592,7 @@ class PowerCloudRoute(PowerCloudAPI): + nodename = ( + hostname + if not self._is_remote_route +- else next((h for h in route_map if h != hostname), None) ++ else next((host for host in route_map if host != hostname), None) + ) + + if not nodename or nodename not in route_map: +@@ -646,6 +650,21 @@ class PowerCloudRoute(PowerCloudAPI): + ocf.OCF_ERR_CONFIGURED, + ) + ++ def _make_iflabel(self, label=None): ++ """Constructs an interface label in the format 'device:label' if both are provided.""" ++ if not label or self._is_remote_route: ++ return None ++ ++ iflabel = f"{self.device}:{label}" ++ ++ if len(iflabel) > IFLABEL_MAX_LEN: ++ raise PowerCloudRouteError( ++ f"_make_iflabel: interface label '{iflabel}' exceeds limit of {IFLABEL_MAX_LEN} characters", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ return iflabel ++ + def _set_route_enabled(self, enabled: bool): + """Enable or disable the PowerVS network route.""" + resource = f"/v1/routes/{self.route_id}" +@@ -706,6 +725,7 @@ def start_action( + use_token_cache="", + monitor_api="", + device="", ++ iflabel="", + proxy="", + ): + """Assign the service IP. +@@ -730,7 +750,7 @@ def start_action( + local_route = create_route_instance(resource_options) + + # Add IP alias +- ip_alias_add(ip, local_route.device) ++ ip_alias_add(ip, local_route.device, local_route.iflabel) + + # Enable local route + ocf.logger.debug(f"[start_action]: enabling local route '{local_route.route_name}'") +@@ -758,6 +778,7 @@ def stop_action( + use_token_cache="", + monitor_api="", + device="", ++ iflabel="", + proxy="", + ): + """Remove the service IP. 
+@@ -810,6 +831,7 @@ def monitor_action( + use_token_cache="", + monitor_api="", + device="", ++ iflabel="", + proxy="", + ): + """Monitor the service IP. +@@ -829,15 +851,11 @@ def monitor_action( + interface_name = ip_find_device(ip) + + if not use_extended_monitor: +- if interface_name: +- ocf.logger.debug( +- f"[monitor_action]: IP alias '{ip}' is active'" +- ) ++ if interface_name: ++ ocf.logger.debug(f"[monitor_action]: IP alias '{ip}' is active'") + return ocf.OCF_SUCCESS +- else: +- ocf.logger.debug( +- f"[monitor_action]: IP alias '{ip}' is not active" +- ) ++ else: ++ ocf.logger.debug(f"[monitor_action]: IP alias '{ip}' is not active") + return ocf.OCF_NOT_RUNNING + + remote_route = create_route_instance( +@@ -893,6 +911,7 @@ def validate_all_action( + use_token_cache="", + monitor_api="", + device="", ++ iflabel="", + proxy="", + ): + """Validate resource agent parameters. +@@ -914,12 +933,10 @@ def main(): + Resource Agent to move an IP address from one Power Virtual Server instance to another. + + Prerequisites: +- 1. Red Hat Enterprise Linux 9.4 or higher +- +- 2. Two-node cluster ++ 1. Two-node cluster + - Distributed across two PowerVS workspaces in separate data centers within the same region. + +- 3. IBM Cloud API Key: ++ 2. IBM Cloud API Key: + - Create a service API key with privileges for both workspaces. + - Save the key in a file and copy it to both cluster nodes using the same path and filename. + - Reference the key file path in the resource definition. +@@ -932,7 +949,7 @@ def main(): + "powervs-move-ip", + shortdesc="Manages Power Virtual Server overlay IP routes.", + longdesc=agent_description, +- version=1.00, ++ version=1.01, + ) + + agent.add_parameter( +@@ -1011,6 +1028,17 @@ def main(): + default="", + required=False, + ) ++ agent.add_parameter( ++ "iflabel", ++ shortdesc="Network interface label", ++ longdesc=( ++ "A custom suffix for the IP address label. " ++ "It is appended to the interface name in the format device:label. 
" ++ "The full label must not exceed 15 characters. " ++ ), ++ content_type="string", ++ required=False, ++ ) + agent.add_parameter( + "proxy", + shortdesc="Proxy", diff --git a/SOURCES/RHEL-115785-RHEL-115782-1-db2-add-skip_basic_sql_health_check-and-monitor-parameters.patch b/SOURCES/RHEL-115785-RHEL-115782-1-db2-add-skip_basic_sql_health_check-and-monitor-parameters.patch new file mode 100644 index 0000000..4659bae --- /dev/null +++ b/SOURCES/RHEL-115785-RHEL-115782-1-db2-add-skip_basic_sql_health_check-and-monitor-parameters.patch @@ -0,0 +1,258 @@ +From fc240bdff60aae7133a532c7752c6253ce8f65ca Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Mon, 4 Aug 2025 16:53:09 +0200 +Subject: [PATCH 1/2] db2: add "skip_basic_sql_health_check" parameter to avoid + failing on systems with high load + +--- + heartbeat/db2 | 63 +++++++++++++++++++++++++++++++-------------------- + 1 file changed, 38 insertions(+), 25 deletions(-) + +diff --git a/heartbeat/db2 b/heartbeat/db2 +index 1cd66f15a..da6c9d5f1 100755 +--- a/heartbeat/db2 ++++ b/heartbeat/db2 +@@ -40,10 +40,12 @@ + # Parameter defaults + + OCF_RESKEY_instance_default="" ++OCF_RESKEY_skip_basic_sql_health_check_default="false" + OCF_RESKEY_admin_default="" + OCF_RESKEY_dbpartitionnum_default="0" + + : ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}} ++: ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}} + : ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}} + : ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}} + +@@ -102,6 +104,15 @@ Defaults to all databases in the instance. Specify one db for HADR mode. + List of databases to be managed + + ++ ++ ++Skip basic health check SQL query. ++ ++Only set to "true" to avoid issues during high load. ++ ++Skip basic health check SQL query ++ ++ + + + DEPRECATED: The admin user of the instance. 
+@@ -695,31 +706,33 @@ db2_monitor() { + # set master preference accordingly + case "$hadr" in + PRIMARY/*|Primary/*|Standard/*) +- # perform a basic health check +- CMD="if db2 connect to $db; +- then +- db2 select \* from sysibm.sysversions ; rc=\$?; +- db2 terminate; +- else +- rc=\$?; +- fi; +- exit \$rc" +- +- if ! output=$(runasdb2 $CMD) +- then +- case "$output" in +- SQL1776N*) +- # can't connect/select on standby, may be spurious turing takeover +- ;; +- +- *) +- ocf_log err "DB2 database $instance($db2node)/$db is not working" +- ocf_log err "DB2 message: $output" +- +- # dead primary, remove master score +- master_score -D -l reboot +- return $OCF_ERR_GENERIC +- esac ++ if ! ocf_is_true "$OCF_RESKEY_skip_basic_sql_health_check"; then ++ # perform a basic health check ++ CMD="if db2 connect to $db; ++ then ++ db2 select \* from sysibm.sysversions ; rc=\$?; ++ db2 terminate; ++ else ++ rc=\$?; ++ fi; ++ exit \$rc" ++ ++ if ! output=$(runasdb2 $CMD) ++ then ++ case "$output" in ++ SQL1776N*) ++ # can't connect/select on standby, may be spurious turing takeover ++ ;; ++ ++ *) ++ ocf_log err "DB2 database $instance($db2node)/$db is not working" ++ ocf_log err "DB2 message: $output" ++ ++ # dead primary, remove master score ++ master_score -D -l reboot ++ return $OCF_ERR_GENERIC ++ esac ++ fi + fi + + ocf_log debug "DB2 database $instance($db2node)/$db appears to be working" + +From ded016f84d3fb77dc0542e3f4226774526910d97 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 7 Aug 2025 13:55:11 +0200 +Subject: [PATCH 2/2] db2: add "monitor_retries", "monitor_sleep", and + "monitor_retry_all_errors" parameters to be able to avoid failing on first + try + +--- + heartbeat/db2 | 80 +++++++++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 72 insertions(+), 8 deletions(-) + +diff --git a/heartbeat/db2 b/heartbeat/db2 +index da6c9d5f1..fe1d9b892 100755 +--- a/heartbeat/db2 ++++ b/heartbeat/db2 +@@ -41,11 +41,17 @@ + + 
OCF_RESKEY_instance_default="" + OCF_RESKEY_skip_basic_sql_health_check_default="false" ++OCF_RESKEY_monitor_retries_default="1" ++OCF_RESKEY_monitor_sleep_default="1" ++OCF_RESKEY_monitor_retry_all_errors_default="false" + OCF_RESKEY_admin_default="" + OCF_RESKEY_dbpartitionnum_default="0" + + : ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}} + : ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}} ++: ${OCF_RESKEY_monitor_retries=${OCF_RESKEY_monitor_retries_default}} ++: ${OCF_RESKEY_monitor_sleep=${OCF_RESKEY_monitor_sleep_default}} ++: ${OCF_RESKEY_monitor_retry_all_errors=${OCF_RESKEY_monitor_retry_all_errors_default}} + : ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}} + : ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}} + +@@ -108,11 +114,33 @@ Defaults to all databases in the instance. Specify one db for HADR mode. + + Skip basic health check SQL query. + +-Only set to "true" to avoid issues during high load. ++Only set to "true" when the "monitor_retries" and "monitor_retry_all_errors" parameters arent ++enough to avoid issues under high load. + + Skip basic health check SQL query + + ++ ++ ++Monitor retries before failing. ++ ++Monitor retries ++ ++ ++ ++ ++Monitor sleep between tries. ++ ++Monitor sleep ++ ++ ++ ++ ++Set to true to retry monitor-action for all errors instead of the default "db2pd" race conditions. ++ ++Retry monitor for all errors ++ ++ + + + DEPRECATED: The admin user of the instance. +@@ -666,6 +694,7 @@ db2_hadr_status() { + local output + + output=$(runasdb2 db2pd -hadr -db $db) ++ ocf_log debug "db2_hadr_status: $output" + if [ $? 
!= 0 ] + then + echo "Down/Off" +@@ -676,7 +705,34 @@ db2_hadr_status() { + awk '/^\s+HADR_(ROLE|STATE) =/ {printf $3"/"} + /^\s+HADR_CONNECT_STATUS =/ {print $3; exit; } + /^HADR is not active/ {print "Standard/Standalone"; exit; } +- /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }' ++ /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; } ++ /^Option -hadr requires -db or -alldbs option and active database./ { exit 255 } ++ /^Another possibility of this failure is the Virtual Address Space Randomization is currently enabled on this system./ { exit 255 } ++ /^Changing data structure forced command termination./ { exit 255 }' ++} ++ ++db2_monitor_retry() { ++ local tries=$(($OCF_RESKEY_monitor_retries + 1)) ++ ++ for try in $(seq $tries); do ++ ocf_log debug "monitor try $try of $tries" ++ db2_monitor ++ rc=$? ++ [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ] && [ $rc -ne $OCF_NOT_RUNNING ] && ocf_log warn "Monitor failed with rc $rc." ++ if [ $rc -eq $OCF_SUCCESS ] || [ $rc -eq $OCF_RUNNING_MASTER ] || [ $rc -eq $OCF_NOT_RUNNING ] || { [ $rc -ne 255 ] && ! ocf_is_true "$OCF_RESKEY_monitor_retry_all_errors" ;} ;then ++ break ++ fi ++ [ $try -lt $tries ] && sleep $OCF_RESKEY_monitor_sleep ++ done ++ ++ [ $rc -eq 255 ] && rc=$OCF_ERR_GENERIC ++ ++ if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ]; then ++ # instance is dead remove master score ++ master_score -D -l reboot ++ fi ++ ++ return $rc + } + + # +@@ -690,9 +746,7 @@ db2_monitor() { + db2_instance_status + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then +- # instance is dead remove master score +- master_score -D -l reboot +- exit $rc ++ return $rc + fi + + [ $db2node = 0 ] || return 0 +@@ -700,8 +754,18 @@ db2_monitor() { + + for db in $dblist + do +- hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC ++ hadr=$(db2_hadr_status $db) ++ rc=$? 
+ ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr" ++ if [ "$rc" -eq 255 ]; then ++ if [ "$__OCF_ACTION" = "monitor" ]; then ++ return $rc ++ else ++ return $OCF_ERR_GENERIC ++ fi ++ elif [ "$rc" -ne 0 ]; then ++ return $OCF_ERR_GENERIC ++ fi + + # set master preference accordingly + case "$hadr" in +@@ -915,9 +979,9 @@ case "$__OCF_ACTION" in + exit $? + ;; + +- monitor) ++ monitor) + db2_validate +- db2_monitor ++ db2_monitor_retry + exit $? + ;; + diff --git a/SOURCES/RHEL-115785-RHEL-115782-2-db2-fix-variable-name.patch b/SOURCES/RHEL-115785-RHEL-115782-2-db2-fix-variable-name.patch new file mode 100644 index 0000000..634a894 --- /dev/null +++ b/SOURCES/RHEL-115785-RHEL-115782-2-db2-fix-variable-name.patch @@ -0,0 +1,49 @@ +From 54714646c6e2c4ba851e366e63316adb1092af61 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Tue, 28 Oct 2025 16:34:54 +0100 +Subject: [PATCH] db2: fix monitor_retries_sleep variable name + +--- + heartbeat/db2 | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/heartbeat/db2 b/heartbeat/db2 +index 83020fc70..82f2f82c3 100755 +--- a/heartbeat/db2 ++++ b/heartbeat/db2 +@@ -49,7 +49,7 @@ fi + OCF_RESKEY_instance_default="" + OCF_RESKEY_skip_basic_sql_health_check_default="false" + OCF_RESKEY_monitor_retries_default="1" +-OCF_RESKEY_monitor_sleep_default="1" ++OCF_RESKEY_monitor_retries_sleep_default="1" + OCF_RESKEY_monitor_retry_all_errors_default="false" + OCF_RESKEY_admin_default="" + OCF_RESKEY_dbpartitionnum_default="0" +@@ -57,7 +57,7 @@ OCF_RESKEY_dbpartitionnum_default="0" + : ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}} + : ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}} + : ${OCF_RESKEY_monitor_retries=${OCF_RESKEY_monitor_retries_default}} +-: ${OCF_RESKEY_monitor_sleep=${OCF_RESKEY_monitor_sleep_default}} ++: ${OCF_RESKEY_monitor_retries_sleep=${OCF_RESKEY_monitor_retries_sleep_default}} + : 
${OCF_RESKEY_monitor_retry_all_errors=${OCF_RESKEY_monitor_retry_all_errors_default}} + : ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}} + : ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}} +@@ -140,7 +140,7 @@ Monitor retries before failing. + Monitor sleep between tries. + + Monitor sleep +- ++ + + + +@@ -776,7 +776,7 @@ db2_monitor_retry() { + if [ $rc -eq $OCF_SUCCESS ] || [ $rc -eq $OCF_RUNNING_MASTER ] || [ $rc -eq $OCF_NOT_RUNNING ] || { [ $rc -ne 255 ] && ! ocf_is_true "$OCF_RESKEY_monitor_retry_all_errors" ;} ;then + break + fi +- [ $try -lt $tries ] && sleep $OCF_RESKEY_monitor_sleep ++ [ $try -lt $tries ] && sleep $OCF_RESKEY_monitor_retries_sleep + done + + [ $rc -eq 255 ] && rc=$OCF_ERR_GENERIC diff --git a/SOURCES/RHEL-116151-1-ocf-shellfuncs-add-ocf_promotion_score.patch b/SOURCES/RHEL-116151-1-ocf-shellfuncs-add-ocf_promotion_score.patch new file mode 100644 index 0000000..059d505 --- /dev/null +++ b/SOURCES/RHEL-116151-1-ocf-shellfuncs-add-ocf_promotion_score.patch @@ -0,0 +1,19 @@ +--- a/heartbeat/ocf-shellfuncs.in 2025-09-29 14:01:55.762931795 +0200 ++++ b/heartbeat/ocf-shellfuncs.in 2025-09-29 14:09:28.651731793 +0200 +@@ -1093,6 +1093,16 @@ + echo $1 + } + ++ocf_promotion_score() { ++ ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.10.0" ++ res=$? ++ if [ $res -eq 2 ] || [ $res -eq 1 ] || ! 
have_binary "crm_master"; then ++ ${HA_SBIN_DIR}/crm_attribute -p ${OCF_RESOURCE_INSTANCE} $@ ++ else ++ ${HA_SBIN_DIR}/crm_master -l reboot $@ ++ fi ++} ++ + __ocf_set_defaults "$@" + + : ${OCF_TRACE_RA:=$OCF_RESKEY_trace_ra} diff --git a/SOURCES/RHEL-116151-2-portblock-add-promotable-support.patch b/SOURCES/RHEL-116151-2-portblock-add-promotable-support.patch new file mode 100644 index 0000000..0ae29e3 --- /dev/null +++ b/SOURCES/RHEL-116151-2-portblock-add-promotable-support.patch @@ -0,0 +1,362 @@ +--- a/heartbeat/portblock 2025-09-30 09:52:13.967530030 +0200 ++++ b/heartbeat/portblock 2025-09-30 09:52:49.018382542 +0200 +@@ -4,6 +4,7 @@ + # + # Author: Sun Jiang Dong (initial version) + # Philipp Reisner (per-IP filtering) ++# Sebastian Baszczyj (nftables code) + # + # License: GNU General Public License (GPL) + # +@@ -43,11 +44,15 @@ + ####################################################################### + CMD=`basename $0` + TICKLETCP=$HA_BIN/tickle_tcp ++TABLE="portblock" ++# Promotion scores ++SCORE_UNPROMOTED=5 ++SCORE_PROMOTED=10 + + usage() + { + cat <&2 +- usage: $CMD {start|stop|status|monitor|meta-data|validate-all} ++ usage: $CMD {start|stop|promote|demote|status|monitor|meta-data|validate-all} + + $CMD is used to temporarily block ports using iptables. + +@@ -86,8 +91,8 @@ + NOTE: iptables is Linux-specific. + + An additional feature in the portblock RA is the tickle ACK function +- enabled by specifying the tickle_dir parameter. The tickle ACK +- triggers the clients to faster reconnect their TCP connections to the ++ enabled by specifying the tickle_dir parameter. The tickle ACK ++ triggers the clients to faster reconnect their TCP connections to the + fail-overed server. + + Please note that this feature is often used for the floating IP fail- +@@ -95,7 +100,7 @@ + It doesn't support the cluster alias IP scenario. 
+ + When using the tickle ACK function, in addition to the normal usage +- of portblock RA, the parameter tickle_dir must be specified in the ++ of portblock RA, the parameter tickle_dir must be specified in the + action=unblock instance of the portblock resources. + For example, you may stack resources like below: + portblock action=block +@@ -103,18 +108,18 @@ + portblock action=unblock tickle_dir=/tickle/state/dir + + If you want to tickle all the TCP connections which connected to _one_ +- floating IP but different ports, no matter how many portblock resources +- you have defined, you should enable tickles for _one_ portblock ++ floating IP but different ports, no matter how many portblock resources ++ you have defined, you should enable tickles for _one_ portblock + resource(action=unblock) only. +- +- The tickle_dir is a location which stores the established TCP +- connections. It can be a shared directory(which is cluster-visible to ++ ++ The tickle_dir is a location which stores the established TCP ++ connections. It can be a shared directory(which is cluster-visible to + all nodes) or a local directory. + If you use the shared directory, you needn't do any other things. + If you use the local directory, you must also specify the sync_script + paramater. We recommend you to use csync2 as the sync_script. +- For example, if you use the local directory /tmp/tickle as tickle_dir, +- you could setup the csync2 as the csync2 documentation says and ++ For example, if you use the local directory /tmp/tickle as tickle_dir, ++ you could setup the csync2 as the csync2 documentation says and + configure your /etc/csync2/csync2.cfg like: + group ticklegroup { + host node1; +@@ -137,15 +142,19 @@ + 1.0 + + +-Resource script for portblock. It is used to temporarily block ports ++Resource script for portblock. It is used to block ports + using iptables. In addition, it may allow for faster TCP reconnects + for clients on failover. 
Use that if there are long lived TCP + connections to an HA service. This feature is enabled by setting the + tickle_dir parameter and only in concert with action set to unblock. + Note that the tickle ACK function is new as of version 3.0.2 and + hasn't yet seen widespread use. ++ ++In Promotable mode, the promote action unblocks the port(s) on the Promoted node ++and blocks the port(s) on the Unpromoted node(s) when action=unblock, and vice versa ++when action=block. + +-Block and unblocks access to TCP and UDP ports ++Blocks and unblocks access to TCP and UDP ports + + + +@@ -167,6 +176,10 @@ + + + The action (block/unblock) to be done on the protocol::portno. ++ ++In Promotable mode it is the action for the promote action, ++and the opposite action will be used for the start and demote ++actions. + + action + +@@ -202,7 +215,7 @@ + + + +-The shared or local directory (_must_ be absolute path) which ++The shared or local directory (_must_ be absolute path) which + stores the established TCP connections. + + Tickle directory +@@ -236,6 +249,8 @@ + + + ++ ++ + + + +@@ -269,9 +284,9 @@ + # iptables 1.8.9 briefly broke the output format, returning the + # numeric protocol value instead of a string. Support both variants. 
+ if [ "$1" = "tcp" ]; then +- local prot="(tcp|6)" ++ local prot="\(tcp\|6\)" + else +- local prot="(udp|17)" ++ local prot="\(udp\|17\)" + fi + echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$" + } +@@ -281,7 +296,7 @@ + { + [ "$4" = "OUTPUT" ] && ds="s" || ds="d" + PAT=$(active_grep_pat "$1" "$2" "$3" "$ds") +- $IPTABLES $wait -n -L "$4" | grep -qE "$PAT" ++ $IPTABLES $wait -n -L "$4" | grep -q "$PAT" + } + + # netstat -tn and ss -Htn, split on whitespace and colon, +@@ -397,6 +412,17 @@ + rc=$OCF_NOT_RUNNING + ;; + esac ++ elif ocf_is_ms; then ++ case $5 in ++ block) ++ SayInactive $* ++ rc=$OCF_NOT_RUNNING ++ ;; ++ *) ++ SayActive $* ++ rc=$OCF_SUCCESS ++ ;; ++ esac + else + case $5 in + block) +@@ -493,18 +519,21 @@ + { + ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start + case $5 in +- block) IptablesBLOCK "$@";; ++ block) IptablesBLOCK "$@" ++ rc=$? ++ ;; + unblock) + IptablesUNBLOCK "$@" + rc=$? + tickle_remote + #ignore run_tickle_tcp exit code! +- return $rc + ;; +- *) usage; return 1; ++ *) usage; return $OCF_ERR_CONFIGURED ; + esac + +- return $? ++ ocf_is_ms && ocf_promotion_score -v $SCORE_UNPROMOTED -N $nodename ++ ++ return $rc + } + + #IptablesStop {udp|tcp} portno,portno ip {in|out|both} {block|unblock} +@@ -512,17 +541,73 @@ + { + ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop + case $5 in +- block) IptablesUNBLOCK "$@";; ++ block) IptablesUNBLOCK "$@" ++ rc=$? ++ ;; + unblock) + save_tcp_connections + IptablesBLOCK "$@" ++ rc=$? + ;; +- *) usage; return 1;; ++ *) usage; return $OCF_ERR_CONFIGURED ;; + esac + ++ ocf_is_ms && ocf_promotion_score -D -N $nodename ++ ++ return $rc ++} ++ ++IptablesPromote() { ++ IptablesStatus "$@" ++ rc=$? ++ if [ $rc -eq $OCF_SUCCESS ] && [ $promotion_score -eq $SCORE_PROMOTED ]; then ++ ocf_log info "Promote: resource already promoted." 
++ return $rc ++ elif [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then ++ ocf_exit_reason "Promote: IptablesStatus failed with rc: $rc." ++ return $rc ++ fi ++ case $5 in ++ block) IptablesBLOCK "$@" ++ rc=$? ++ ;; ++ unblock) ++ IptablesUNBLOCK "$@" ++ rc=$? ++ tickle_remote ++ #ignore run_tickle_tcp exit code! ++ ;; ++ *) usage; return $OCF_ERR_CONFIGURED ; ++ esac ++ ocf_promotion_score -v $SCORE_PROMOTED -N $nodename + return $? + } + ++IptablesDemote() { ++ IptablesStatus "$@" ++ rc=$? ++ if [ $rc -eq $OCF_SUCCESS ] && [ $promotion_score -eq $SCORE_UNPROMOTED ]; then ++ ocf_log info "Demote: resource already demoted." ++ return $rc ++ elif [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then ++ ocf_exit_reason "Demote: IptablesStatus failed with rc: $rc." ++ return $rc ++ fi ++ case $5 in ++ block) ++ save_tcp_connections ++ IptablesBLOCK "$@" ++ rc=$? ++ ;; ++ unblock) IptablesUNBLOCK "$@" ++ rc=$? ++ ;; ++ *) usage; return $OCF_ERR_CONFIGURED ;; ++ esac ++ ocf_promotion_score -v $SCORE_UNPROMOTED -N $nodename ++ return $rc ++} ++ + # + # Check if the port is valid, this function code is not decent, but works + # +@@ -558,17 +643,17 @@ + fi + if [ ! -d "$OCF_RESKEY_tickle_dir" ]; then + ocf_log err "The tickle dir doesn't exist!" +- exit $OCF_ERR_INSTALLED ++ exit $OCF_ERR_INSTALLED + fi + fi + + case $action in +- block|unblock) ++ block|unblock) + ;; +- *) ++ *) + ocf_log err "Invalid action $action!" 
+ exit $OCF_ERR_CONFIGURED +- ;; ++ ;; + esac + + if ocf_is_true $reset_local_on_unblock_stop; then +@@ -591,7 +676,7 @@ + exit $OCF_ERR_ARGS + fi + +-case $1 in ++case $__OCF_ACTION in + meta-data) meta_data + exit $OCF_SUCCESS + ;; +@@ -605,12 +690,12 @@ + if [ -z "$OCF_RESKEY_protocol" ]; then + ocf_log err "Please set OCF_RESKEY_protocol" + exit $OCF_ERR_CONFIGURED +-fi ++fi + + if [ -z "$OCF_RESKEY_portno" ]; then + ocf_log err "Please set OCF_RESKEY_portno" + exit $OCF_ERR_CONFIGURED +-fi ++fi + + if [ -z "$OCF_RESKEY_action" ]; then + ocf_log err "Please set OCF_RESKEY_action" +@@ -632,6 +717,7 @@ + action=$OCF_RESKEY_action + ip=$OCF_RESKEY_ip + reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop ++nodename=$(ocf_local_nodename) + + + # If "tickle" is enabled, we need to record the list of currently established +@@ -647,17 +733,35 @@ + fi + fi + +-case $1 in +- start) +- IptablesStart $protocol $portno $ip $direction $action ++if ocf_is_ms; then ++ promotion_score=$(ocf_promotion_score -G -N $nodename -q 2> /dev/null) ++ if { [ "$__OCF_ACTION" = "monitor" ] && [ "$promotion_score" = "$SCORE_UNPROMOTED" ]; } || [ "$__OCF_ACTION" = "demote" ] || [ "$__OCF_ACTION" = "start" ]; then ++ case $action in ++ block) action="unblock" ;; ++ unblock) action="block" ;; ++ esac ++ fi ++fi ++ ++case $__OCF_ACTION in ++ start) ++ IptablesStart "$protocol" "$portno" "$ip" "$direction" "$action" ++ ;; ++ ++ stop) ++ IptablesStop "$protocol" "$portno" "$ip" "$direction" "$action" ++ ;; ++ ++ promote) ++ IptablesPromote "$protocol" "$portno" "$ip" "$direction" "$action" + ;; + +- stop) +- IptablesStop $protocol $portno $ip $direction $action ++ demote) ++ IptablesDemote "$protocol" "$portno" "$ip" "$direction" "$action" + ;; + +- status|monitor) +- IptablesStatus $protocol $portno $ip $direction $action ++ status|monitor) ++ IptablesStatus "$protocol" "$portno" "$ip" "$direction" "$action" + ;; + + validate-all) diff --git 
a/SOURCES/RHEL-116151-3-portblock-fixes-add-method-and-status_check-parameters.patch b/SOURCES/RHEL-116151-3-portblock-fixes-add-method-and-status_check-parameters.patch new file mode 100644 index 0000000..39a7da3 --- /dev/null +++ b/SOURCES/RHEL-116151-3-portblock-fixes-add-method-and-status_check-parameters.patch @@ -0,0 +1,180 @@ +--- a/heartbeat/portblock 2025-10-21 09:27:41.753028260 +0200 ++++ b/heartbeat/portblock 2025-10-21 09:28:55.573855995 +0200 +@@ -28,6 +28,8 @@ + OCF_RESKEY_portno_default="" + OCF_RESKEY_direction_default="in" + OCF_RESKEY_action_default="" ++OCF_RESKEY_method_default="drop" ++OCF_RESKEY_status_check_default="rule" + OCF_RESKEY_ip_default="0.0.0.0/0" + OCF_RESKEY_reset_local_on_unblock_stop_default="false" + OCF_RESKEY_tickle_dir_default="" +@@ -37,6 +39,8 @@ + : ${OCF_RESKEY_portno=${OCF_RESKEY_portno_default}} + : ${OCF_RESKEY_direction=${OCF_RESKEY_direction_default}} + : ${OCF_RESKEY_action=${OCF_RESKEY_action_default}} ++: ${OCF_RESKEY_method=${OCF_RESKEY_method_default}} ++: ${OCF_RESKEY_status_check=${OCF_RESKEY_status_check_default}} + : ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}} + : ${OCF_RESKEY_reset_local_on_unblock_stop=${OCF_RESKEY_reset_local_on_unblock_stop_default}} + : ${OCF_RESKEY_tickle_dir=${OCF_RESKEY_tickle_dir_default}} +@@ -185,6 +189,26 @@ + + + ++ ++ ++Block method: ++drop: Use DROP rule. ++reject: Use REJECT rule w/conntrack to clear connections when blocking. ++ ++Block method ++ ++ ++ ++ ++ ++Status check: ++rule: Check rule. ++pseudo: Check pseudo status when rule is absent. 
++ ++Status check ++ ++ ++ + + + If for some reason the long lived server side TCP sessions won't be cleaned up +@@ -253,6 +277,7 @@ + + + ++ + + + +@@ -288,7 +313,11 @@ + else + local prot="\(udp\|17\)" + fi +- echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$" ++ if [ "$method" = "DROP" ]; then ++ echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$" ++ else ++ echo "^REJECT${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}${w}ctstate${w}NEW,RELATED,ESTABLISHED${w}reject-with${w}tcp-reset$" ++ fi + } + + #chain_isactive {udp|tcp} portno,portno ip chain +@@ -374,17 +403,17 @@ + + SayActive() + { +- ocf_log debug "$CMD DROP rule [$*] is running (OK)" ++ ocf_log debug "$CMD $method rule [$*] is running (OK)" + } + + SayConsideredActive() + { +- ocf_log debug "$CMD DROP rule [$*] considered to be running (OK)" ++ ocf_log debug "$CMD $method rule [$*] considered to be running (OK)" + } + + SayInactive() + { +- ocf_log debug "$CMD DROP rule [$*] is inactive" ++ ocf_log debug "$CMD $method rule [$*] is inactive" + } + + #IptablesStatus {udp|tcp} portno,portno ip {in|out|both} {block|unblock} +@@ -405,14 +434,18 @@ + case $5 in + block) + SayActive $* +- rc=$OCF_SUCCESS ++ if [ "$__OCF_ACTION" = "monitor" ] && [ "$promotion_score" = "$SCORE_PROMOTED" ]; then ++ rc=$OCF_RUNNING_MASTER ++ else ++ rc=$OCF_SUCCESS ++ fi + ;; + *) + SayInactive $* + rc=$OCF_NOT_RUNNING + ;; + esac +- elif ocf_is_ms; then ++ elif [ "$OCF_RESKEY_status_check" = "rule" ]; then + case $5 in + block) + SayInactive $* +@@ -420,7 +453,11 @@ + ;; + *) + SayActive $* +- rc=$OCF_SUCCESS ++ if [ "$__OCF_ACTION" = "monitor" ] && [ "$promotion_score" = "$SCORE_PROMOTED" ]; then ++ rc=$OCF_RUNNING_MASTER ++ else ++ rc=$OCF_SUCCESS ++ fi + ;; + esac + else +@@ -461,7 +498,11 @@ + : Chain already in desired state + else + [ "$chain" = "OUTPUT" ] && ds="s" || ds="d" +- $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" 
-m multiport --${ds}ports "$ports" -j DROP ++ if [ "$method" = "DROP" ]; then ++ $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" -m multiport --${ds}ports "$ports" -j DROP ++ else ++ $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" -m multiport --${ds}ports "$ports" -m conntrack --ctstate NEW,ESTABLISHED,RELATED -j REJECT --reject-with tcp-reset ++ fi + fi + } + +@@ -486,7 +527,11 @@ + $IPTABLES $wait -I OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset + tickle_local + fi +- $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP ++ if [ "$method" = "DROP" ]; then ++ $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP ++ else ++ $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -m conntrack --ctstate NEW,ESTABLISHED,RELATED -j REJECT --reject-with tcp-reset ++ fi + rc_in=$? + if $try_reset ; then + $IPTABLES $wait -D OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset +@@ -718,6 +763,13 @@ + ip=$OCF_RESKEY_ip + reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop + nodename=$(ocf_local_nodename) ++case "$OCF_RESKEY_method" in ++ drop) method="DROP" ;; ++ reject) method="REJECT" ;; ++ *) ocf_log err "method: $OCF_RESKEY_method not supported" ++ exit $OCF_ERR_CONFIGURED ++ ;; ++esac + + + # If "tickle" is enabled, we need to record the list of currently established +@@ -743,6 +795,8 @@ + fi + fi + ++IptablesValidateAll ++ + case $__OCF_ACTION in + start) + IptablesStart "$protocol" "$portno" "$ip" "$direction" "$action" +@@ -765,7 +819,6 @@ + ;; + + validate-all) +- IptablesValidateAll + ;; + + *) usage diff --git a/SOURCES/RHEL-116206-podman-etcd-add-cluster-wide-force_new_cluster-attribute-check.patch b/SOURCES/RHEL-116206-podman-etcd-add-cluster-wide-force_new_cluster-attribute-check.patch new file mode 100644 index 0000000..5938026 --- /dev/null +++ 
b/SOURCES/RHEL-116206-podman-etcd-add-cluster-wide-force_new_cluster-attribute-check.patch @@ -0,0 +1,186 @@ +From 1afdd91b2961061937fc802c575304ede8d79286 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Wed, 10 Sep 2025 16:56:56 +0200 +Subject: [PATCH] podman-etcd: Add cluster-wide force_new_cluster attribute + checking + +Implement cluster-wide validation of force_new_cluster attribute to resolve +race conditions during automated cluster recovery. The enhancement ensures +agents check for the cluster-wide attribute before falling back to local +etcd revision comparison. + +Key changes: +- Enhanced get_force_new_cluster() to query all cluster nodes +- Ensure force_new_cluster is not set in both nodes to prevent + conflicting recovery attempts +- Updated startup logic to prioritize cluster-wide attribute checking + +fixes OCPBUGS-61117 +--- + heartbeat/podman-etcd | 107 ++++++++++++++++++++++++++++-------------- + 1 file changed, 72 insertions(+), 35 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 33804414a..f3a6da5e2 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -794,54 +794,72 @@ set_force_new_cluster() + return $rc + } + ++# get_force_new_cluster returns a space-separated list of nodes that have the force_new_cluster attribute set. ++# Return values: ++# - Exit code 0 with non-empty output: One or more nodes have the force_new_cluster attribute set ++# - Exit code 0 with empty output: No nodes have the force_new_cluster attribute set ++# - Exit code 1 with empty output: Error occurred while querying the cluster nodes + get_force_new_cluster() + { +- crm_attribute --lifetime reboot --query --name "force_new_cluster" | awk -F"value=" '{print $2}' ++ local node nodes value ++ local holders="" ++ ++ if ! nodes=$(crm_node -l | awk '{print $2}'); then ++ ocf_log err "could not get force_new_cluster attribute, crm_node error code: $?" 
++ return 1 ++ fi ++ if [ -z "$nodes" ]; then ++ ocf_log err "could not get force_new_cluster attribute, the list of nodes is empty" ++ return 1 ++ fi ++ ++ for node in $nodes; do ++ if ! value=$(crm_attribute --query --lifetime reboot --name "force_new_cluster" --node "$node" 2>/dev/null | awk -F'value=' '{print $2}' | tr -d "'"); then ++ ocf_log err "could not get force_new_cluster attribute, crm_attribut error code: $?" ++ return 1 ++ fi ++ if [ -n "$value" ]; then ++ holders="$holders$node " ++ fi ++ done ++ echo "$holders" + } + ++ + clear_force_new_cluster() + { +- local force_new_cluster_node +- +- force_new_cluster_node=$(get_force_new_cluster) +- if [ -z "$force_new_cluster_node" ]; then +- ocf_log info "$NODENAME: force_new_cluster attribute not set" ++ # only the holder of "force_new_cluster" attribute can delete it ++ if ! is_force_new_cluster; then ++ ocf_log info "force_new_cluster unset or not owned by $NODENAME" + return $OCF_SUCCESS + fi + +- # only the holder of "force_new_cluster" attribute can delete it +- if [ "$NODENAME" = "$force_new_cluster_node" ]; then +- crm_attribute --lifetime reboot --name "force_new_cluster" --delete +- rc=$? +- if [ $rc -ne 0 ]; then +- ocf_log err "could not clear force_new_cluster attribute, error code: $rc" +- else +- ocf_log info "$NODENAME: force_new_cluster attribute cleared" +- fi +- return $rc +- else +- ocf_log info "$NODENAME does not hold force_new_cluster ($force_new_cluster_node has it)" +- return $OCF_SUCCESS ++ if ! crm_attribute --delete --lifetime reboot --node "$NODENAME" --name "force_new_cluster"; then ++ ocf_log err "could not clear force_new_cluster attribute, error code: $?" ++ return $OCF_ERR_GENERIC + fi ++ ++ ocf_log info "$NODENAME: force_new_cluster attribute cleared" ++ return $OCF_SUCCESS + } + ++ + is_force_new_cluster() + { +- # Return 0 if 'force_new_cluster' is set and the value matches the current node name, 1 otherwise. 
+- local value ++ # Return 0 if 'force_new_cluster' is set on the current node, 1 otherwise. ++ local fnc_holders + +- value=$(get_force_new_cluster) +- if [ -z "$value" ]; then +- ocf_log debug "force_new_cluster attribute is not set" +- return 1 ++ if ! fnc_holders=$(get_force_new_cluster); then ++ ocf_exit_reason "is_force_new_cluster: Failed to get force_new_cluster node holders" ++ exit $OCF_ERR_GENERIC + fi + +- if [ "$value" = "$NODENAME" ]; then ++ if echo "$fnc_holders" | grep -q -w "$NODENAME"; then + ocf_log debug "$NODENAME has force_new_cluster set" + return 0 + fi + +- ocf_log info "force_new_cluster attribute set on peer node $value" ++ ocf_log debug "force_new_cluster attribute is not set on $NODENAME" + return 1 + } + +@@ -1415,17 +1433,34 @@ podman_start() + return "$OCF_ERR_GENERIC" + fi + +- # force-new-cluster property is a runtime-scoped flag that instructs the agent to force a new cluster-of-1. +- # Since this attribute is configured with a reboot-lifetime, it is automatically cleared when the machine reboots. +- # If the agent detects during its start that this property is set, it indicates that the flag was explicitly set +- # during the current node boot session, implying a deliberate request to recover the cluster. + if ocf_is_true "$pod_was_running"; then + ocf_log info "static pod was running: start normally" + else +- if is_force_new_cluster; then +- ocf_log notice "'$NODENAME' marked to force-new-cluster" ++ local fnc_holders ++ if ! 
fnc_holders=$(get_force_new_cluster); then ++ ocf_exit_reason "Failed to get force_new_cluster node holders" ++ return "$OCF_ERR_GENERIC" ++ fi ++ ++ local fnc_holder_count ++ fnc_holder_count=$(echo "$fnc_holders" | wc -w) ++ if [ "$fnc_holder_count" -gt 1 ]; then ++ ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)" ++ return "$OCF_ERR_GENERIC" ++ fi ++ ++ if [ "$fnc_holder_count" -eq 1 ]; then ++ if echo "$fnc_holders" | grep -q -w "$NODENAME"; then ++ # Attribute is set on the local node. ++ ocf_log notice "$NODENAME marked to force-new-cluster" ++ JOIN_AS_LEARNER=false ++ else ++ # Attribute is set on a peer node. ++ ocf_log info "$NODENAME shall join as learner because force_new_cluster is set on peer $fnc_holders" ++ JOIN_AS_LEARNER=true ++ fi + else +- ocf_log info "'$NODENAME' is not marked to force-new-cluster" ++ ocf_log info "no node is marked to force-new-cluster" + # When the local agent starts, we can infer the cluster state by counting + # how many agents are starting or already active: + # - 1 active agent: it's the peer (we are just starting) +@@ -1522,7 +1557,7 @@ podman_start() + for try in $(seq $retries); do + learner_node=$(attribute_learner_node get) + if [ "$NODENAME" != "$learner_node" ]; then +- ocf_log info "$learner_node is not in the member list yet. Retry in $poll_interval_sec seconds." ++ ocf_log info "$NODENAME is not in the member list yet. Retry in $poll_interval_sec seconds." + sleep $poll_interval_sec + continue + fi +@@ -1673,6 +1708,8 @@ podman_stop() + { + local timeout=60 + local rc ++ ++ ocf_log notice "podman-etcd stop" + podman_simple_status + if [ $? 
-eq $OCF_NOT_RUNNING ]; then + ocf_log info "could not leave members list: etcd container not running" diff --git a/SOURCES/RHEL-118621-MailTo-add-s-nail-support-for-multiple-recipients.patch b/SOURCES/RHEL-118621-MailTo-add-s-nail-support-for-multiple-recipients.patch new file mode 100644 index 0000000..69a636a --- /dev/null +++ b/SOURCES/RHEL-118621-MailTo-add-s-nail-support-for-multiple-recipients.patch @@ -0,0 +1,36 @@ +From 1e546b85010e5fdbf7a0f31207dce144c14c50ec Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 29 Oct 2025 15:17:30 +0100 +Subject: [PATCH] MailTo: add s-nail support for multiple recipients + +--- + heartbeat/MailTo | 16 ++++++++++------ + 1 file changed, 10 insertions(+), 6 deletions(-) + +diff --git a/heartbeat/MailTo b/heartbeat/MailTo +index 56940bafaa..a3ee6a04c8 100755 +--- a/heartbeat/MailTo ++++ b/heartbeat/MailTo +@@ -92,12 +92,16 @@ END + } + + MailProgram() { +- $MAILCMD -s "$1" "$email" < +Date: Thu, 25 Sep 2025 14:23:20 +0200 +Subject: [PATCH] db2: use reintegration flag to avoid race condition on + cluster reintegration, and removed FAL, as it's no longer needed + +--- + heartbeat/db2 | 306 ++++++++++++++++++++++++++++++++------------------ + 1 file changed, 197 insertions(+), 109 deletions(-) + +diff --git a/heartbeat/db2 b/heartbeat/db2 +index fe1d9b892..83020fc70 100755 +--- a/heartbeat/db2 ++++ b/heartbeat/db2 +@@ -37,6 +37,13 @@ + : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} + . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + ++# Use runuser if available for SELinux. 
++if [ -x "/sbin/runuser" ]; then ++ SU="runuser" ++else ++ SU="su" ++fi ++ + # Parameter defaults + + OCF_RESKEY_instance_default="" +@@ -55,11 +62,12 @@ OCF_RESKEY_dbpartitionnum_default="0" + : ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}} + : ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}} + ++POSIX_UNICODE_LOCALE="C.UTF-8" + ####################################################################### + + + db2_usage() { +- echo "db2 start|stop|monitor|promote|demote|notify|validate-all|meta-data" ++ echo "db2 start|stop|monitor|promote|demote|validate-all|meta-data" + } + + db2_meta_data() { +@@ -162,7 +170,6 @@ The number of the partition (DBPARTITIONNUM) to be managed. + + + +- + + + +@@ -273,7 +280,18 @@ master_score() + # Run the given command as db2 instance user + # + runasdb2() { +- su $instance -c ". $db2profile; $*" ++ $SU $instance -c ". $db2profile; $*" ++} ++ ++# ++# Run the given command as db2 instance user using $SU ++# We run this function as opposed to runasdb2 whenever we have to issue commands ++# that leave processes running on the system, such as db2start ++# We do not want these processes to hog the resources as they were run with elevated privileges ++# ++runasdb2_session() { ++ # Override db2profile with unicode locale is required to maintain compatibility with unicode CODEPAGE ++ $SU "$instance" -c "ksh -c '. 
$db2profile; export LC_ALL="$POSIX_UNICODE_LOCALE"; export LANG="$POSIX_UNICODE_LOCALE"; $*'" + } + + # +@@ -294,48 +312,6 @@ logasdb2() { + } + + +-# +-# maintain the fal (first active log) attribute +-# db2_fal_attrib DB {set val|get} +-# +-db2_fal_attrib() { +- local db=$1 +- local attr val rc id node member me +- +- attr=db2hadr_${instance}_${db}_fal +- +- case "$2" in +- set) +- me=$(ocf_local_nodename) +- +- # loop over all member nodes and set attribute +- crm_node -l | +- while read id node member +- do +- [ "$member" = member -a "$node" != "$me" ] || continue +- crm_attribute -l forever --node=$node -n $attr -v "$3" +- rc=$? +- ocf_log info "DB2 instance $instance($db2node/$db: setting attrib for FAL to $FIRST_ACTIVE_LOG @ $node" +- [ $rc != 0 ] && break +- done +- ;; +- +- get) +- crm_attribute -l forever -n $attr -G --quiet 2>&1 +- rc=$? +- if ! ocf_is_true "$OCF_RESKEY_CRM_meta_notify" && [ $rc != 0 ] +- then +- ocf_log warn "DB2 instance $instance($db2node/$db: can't retrieve attribute $attr, are you sure notifications are enabled ?" +- fi +- ;; +- +- *) +- exit $OCF_ERR_CONFIGURED +- esac +- +- return $rc +-} +- + # + # unfortunately a first connect after a crash may need several minutes + # for some internal cleanup stuff in DB2. +@@ -429,6 +405,42 @@ db2_check_config_compatibility() { + + } + ++# ++# Start HADR as standby. ++# ++# Parameters ++# 1 - Database name ++# 2 - Calling function ++# 3 - Calling function's line number ++# Return codes: ++# 0 - Start as standby successful ++# 1 - Start as standby failed ++# ++reintegrateAsStandby() { ++ db=$1 ++ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint" ++ ocf_log info "$__OCF_ACTION: $LINENO: reintegrateAsStandby called by $2 at $3. Attempting to reintegrate $db as standby." 
++ if output=$(runasdb2_session "db2 start hadr on db $db as standby"); then ++ rc=0 ++ ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated" ++ else ++ case $output in ++ SQL1777N*) ++ # SQL1777N: HADR is already started in given state. ++ ocf_log info "$__OCF_ACTION: $LINENO: $output" ++ rc=0 ++ ;; ++ ++ *) ++ rc=1 ++ ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. Please reintegrate manually: $output, return with rc=$rc" ++ ;; ++ esac ++ fi ++ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever ++ return $rc ++} ++ + # + # Start instance and DB. + # Standard mode is through "db2 activate" in order to start in previous +@@ -478,6 +490,8 @@ db2_start() { + + for db in $dblist + do ++ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint" ++ + # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG + db2_get_cfg $db || return $? + +@@ -488,20 +502,13 @@ db2_start() { + + if [ $HADR_ROLE = PRIMARY ] + then +- local master_fal +- +- # communicate our FAL to other nodes the might start concurrently +- db2_fal_attrib $db set $FIRST_ACTIVE_LOG +- +- # ignore false positive: +- # error: Can't use > in [ ]. Escape it or use [[..]]. 
[SC2073] +- # see https://github.com/koalaman/shellcheck/issues/691 +- # shellcheck disable=SC2073 +- if master_fal=$(db2_fal_attrib $db get) && [ "$master_fal" '>' $FIRST_ACTIVE_LOG ] +- then ++ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}') ++ ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value'" ++ if [ "$cib_value" = "1" ]; then + ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary" + start_cmd="db2 start hadr on db $db as standby" + HADR_ROLE=STANDBY ++ standby_reintegration=1 + fi + fi + +@@ -511,27 +518,65 @@ db2_start() { + [ $HADR_ROLE != STANDBY ] && db2_run_connect $db & + else + case $output in +- SQL1490W*|SQL1494W*|SQL1497W*|SQL1777N*) +- ocf_log info "DB2 database $instance($db2node)/$db already activated: $output" ++ SQL1490W* | SQL1494W* | SQL1497W* | SQL1777N*) ++ # SQL1490W Activate database is successful, however, the database has already been activated on one or more nodes. ++ # SQL1494W Activate database is successful, however, there is already a connection to the database. ++ # SQL1497W Activate/Deactivate database was successful, however, an error occurred on some nodes. ++ # SQL1777N HADR is already started. ++ ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is already activated: $output" + ;; + +- SQL1768N*"Reason code = \"7\""*) +- ocf_log err "DB2 database $instance($db2node)/$db is a Primary and the Standby is down" +- ocf_log err "Possible split brain ! Manual intervention required." ++ SQL1768N*"Reason code = \"7\""*) ++ rc="$OCF_ERR_GENERIC" ++ ++ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is a Primary and the Standby is down" ++ ocf_log err "Possible split brain! Manual intervention required." 
+ ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\"" +- ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\"" ++ ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\". db2_start() exit with rc=$rc." + +- # might be the Standby is not yet there +- # might be a timing problem because "First active log" is delayed +- # on the next start attempt we might succeed when FAL was advanced +- # might be manual intervention is required +- # ... so let pacemaker give it another try and we will succeed then +- return $OCF_ERR_GENERIC ++ # let pacemaker give it another try and we will succeed then ++ return "$rc" + ;; + +- *) +- ocf_log err "DB2 database $instance($db2node)/$db didn't start: $output" +- return $OCF_ERR_GENERIC ++ SQL1776N*"Reason code = \"6\""*) ++ # SQL1776N The command cannot be issued on an HADR database. ++ # Reason code 6: ++ # This database is an old primary database. It cannot be started ++ # because the standby has become the new primary through forced ++ # takeover. ++ ++ rc="$OCF_ERR_GENERIC" ++ ocf_log err "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db didn't start: $output, return with rc=$rc" ++ ocf_log err "$__OCF_ACTION: $LINENO: This database is an old primary database. Trying start again as standby" ++ ++ start_cmd="db2 start hadr on db $db as standby" ++ if output=$(runasdb2_session "$start_cmd"); then ++ rc="$OCF_SUCCESS" ++ ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated" ++ else ++ case $output in ++ SQL1777N*) ++ # SQL1777N: HADR is already started. ++ ocf_log info "$__OCF_ACTION: $LINENO: $output" ++ rc="$OCF_SUCCESS" ++ ;; ++ ++ *) ++ rc="$OCF_ERR_GENERIC" ++ ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. 
Please reintegrate manually: $output, return with rc=$rc" ++ ;; ++ esac ++ fi ++ ++ return "$rc" ++ ;; ++ ++ *) ++ rc="$OCF_ERR_GENERIC" ++ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database didn't start: $output, db2_start() exit with rc=$rc." ++ return "$rc" ++ ;; + esac + fi + done +@@ -539,6 +584,15 @@ db2_start() { + # come here with success + # Even if we are a db2 Primary pacemaker requires start to end up in slave mode + echo SLAVE > $STATE_FILE ++ ++ # Unset primary failover attribute as host was successfully reintegrated as standby ++ if [ "$standby_reintegration" = "1" ]; then ++ for db in $dblist; do ++ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint" ++ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever ++ done ++ fi ++ + return $OCF_SUCCESS + } + +@@ -737,7 +791,7 @@ db2_monitor_retry() { + + # + # Monitor the db +-# And as side effect set crm_master / FAL attribute ++# And as side effect set crm_master + # + db2_monitor() { + local CMD output hadr db +@@ -754,6 +808,22 @@ db2_monitor() { + + for db in $dblist + do ++ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint" ++ ++ #Check for the reintegration file, then set the flag if it exists and delete the file ++ if [ -e "/tmp/$reint_attr" ] && [ -n "$remote_host" ]; then ++ #The file exist, try to set the reintegration attribute ++ crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever ++ cib_value=$(crm_attribute -n "$reint_attr" -N "$remote_host" -G | awk -v FS=' value=' '{print $2}') ++ ++ if [ "$cib_value" = "1" ]; then ++ ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value', reintegration flag file will now be deleted." ++ rm -f "/tmp/$reint_attr" ++ else ++ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The reintegration flag file exists, but its attribute failed to set." ++ fi ++ fi ++ + hadr=$(db2_hadr_status $db) + rc=$? 
+ ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr" +@@ -804,6 +874,14 @@ db2_monitor() { + ;; + + STANDBY/*PEER/*|Standby/*Peer) ++ # If db is in standby peer, then it has already reintegrated. ++ # If the reintegrate flag is still set, remove it ++ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}') ++ if [ "$cib_value" = "1" ]; then ++ ocf_log info "$__OCF_ACTION: $LINENO: Reintegrate flag detected for $db, but it has already reintegrated as standby. Removing reintegration flag." ++ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever ++ fi ++ + master_score -v 8000 -l reboot + ;; + +@@ -812,6 +890,34 @@ db2_monitor() { + master_score -D -l reboot + ;; + ++ Down/Off) ++ # If db is a deactivated primary and it has a reintegration flag, then reintegrate as standby. ++ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}') ++ if [ "$cib_value" = "1" ]; then ++ output=$(runasdb2 "db2 get db cfg for $db" | grep 'HADR database role' | awk '{print $5}') ++ if [ "PRIMARY" = "$output" ]; then ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Database is deactivated with Primary role and the reintegration flag is set. Role: $output, Reintegration flag: $reint_attr = $cib_value" ++ # Reintegrate as the standby database. ++ if reintegrateAsStandby "$db" 'db2_monitor' $LINENO; then ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration succeeded." ++ # Setting slave state here will cause rc to be OCF_SUCCESS below. ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Echoing SLAVE into $STATE_FILE" ++ echo SLAVE >"$STATE_FILE" ++ # Update master score to reflect standby state. ++ master_score -v 8000 -l reboot ++ else ++ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration failed." 
++ return "$OCF_ERR_GENERIC" ++ fi ++ fi ++ else ++ rc="$OCF_NOT_RUNNING" ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database has HADR status $hadr." ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: db2_monitor() exit with rc=$rc." ++ return "$rc" ++ fi ++ ;; ++ + *) + return $OCF_ERR_GENERIC + esac +@@ -875,8 +981,6 @@ db2_promote() { + # update pacemaker's view + echo MASTER > $STATE_FILE + +- # turn the log so we rapidly get a new FAL +- logasdb2 "db2 archive log for db $db" + return $OCF_SUCCESS + fi + +@@ -914,26 +1018,6 @@ db2_demote() { + return $? + } + +-# +-# handle pre start notification +-# We record our first active log on the other nodes. +-# If two primaries come up after a crash they can safely determine who is +-# the outdated one. +-# +-db2_notify() { +- local node +- +- # only interested in pre-start +- [ $OCF_RESKEY_CRM_meta_notify_type = pre \ +- -a $OCF_RESKEY_CRM_meta_notify_operation = start ] || return $OCF_SUCCESS +- +- # gets FIRST_ACTIVE_LOG +- db2_get_cfg $dblist || return $? +- +- db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC +- exit $OCF_SUCCESS +-} +- + ######## + # Main # + ######## +@@ -947,50 +1031,54 @@ case "$__OCF_ACTION" in + db2_usage + exit $OCF_SUCCESS + ;; ++esac + ++local_host=$(ocf_local_nodename) ++inst1=$(echo "$OCF_RESKEY_instance" | cut -d"," -f1) ++inst2=$(echo "$OCF_RESKEY_instance" | cut -d"," -f2) ++host1=$(crm_node -l | sort | awk '{print $2;}' | sed -n 1p) ++ ++if [ "$host1" = "$local_host" ]; then ++ remote_host=$(crm_node -l | sort | awk '{print $2;}' | sed -n 2p) ++else ++ remote_host="$host1" ++fi ++ ++db2_validate; validate_rc=$? ++ ++case "$__OCF_ACTION" in + start) +- db2_validate + db2_start || exit $? + db2_monitor +- exit $? + ;; + + stop) +- db2_validate + db2_stop +- exit $? + ;; + + promote) +- db2_validate + db2_promote +- exit $? + ;; + + demote) +- db2_validate + db2_demote +- exit $? 
+ ;; + + notify) +- db2_validate +- db2_notify +- exit $? ++ ocf_log debug "notify-action has been DEPRECATED, and should be removed" + ;; + + monitor) +- db2_validate + db2_monitor_retry +- exit $? + ;; + + validate-all) +- db2_validate +- exit $? ++ exit $validate_rc + ;; + + *) + db2_usage + exit $OCF_ERR_UNIMPLEMENTED + esac ++ ++exit $? diff --git a/SOURCES/RHEL-119495-podman-etcd-add-automatic-learner-member-promotion.patch b/SOURCES/RHEL-119495-podman-etcd-add-automatic-learner-member-promotion.patch new file mode 100644 index 0000000..74795e5 --- /dev/null +++ b/SOURCES/RHEL-119495-podman-etcd-add-automatic-learner-member-promotion.patch @@ -0,0 +1,321 @@ +From a31f15104fc712cd25f8a59d49f1bbcdbbbc5434 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Tue, 30 Sep 2025 11:54:44 +0200 +Subject: [PATCH 1/2] Refactor(podman-etcd): improve peer checking and + leadership loss detection + +The check_peers function is broken up into smaller, more manageable +functions. This refactoring separates the logic for detecting a loss of +cluster leadership from the logic for managing peer membership. + +The main function is renamed to check_peer as there is only 1 peer to +check (it was check_peers). +--- + heartbeat/podman-etcd | 78 +++++++++++++++++++++++++------------------ + 1 file changed, 45 insertions(+), 33 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index f3a6da5e2..3d1e4c520 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1014,42 +1014,35 @@ get_member_list_json() { + podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json + } + +-check_peers() ++detect_cluster_leadership_loss() + { +- # Check peers endpoint status and locally accessible member list +- local member_list_json +- +- if ! container_exists; then +- # we need a running container to execute etcdctl. 
+- return $OCF_SUCCESS ++ endpoint_status_json=$(get_endpoint_status_json) ++ ocf_log info "endpoint status: $endpoint_status_json" ++ ++ count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l) ++ if [ "$count_endpoints" -eq 1 ]; then ++ ocf_log info "one endpoint only: checking status errors" ++ endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors") ++ if echo "$endpoint_status_errors" | grep -q "no leader"; then ++ set_force_new_cluster ++ set_standalone_node ++ ocf_exit_reason "$NODENAME must force a new cluster" ++ return $OCF_ERR_GENERIC ++ fi ++ if [ "$endpoint_status_errors" != "null" ]; then ++ ocf_log err "unmanaged endpoint status error: $endpoint_status_errors" ++ fi + fi + +- member_list_json=$(get_member_list_json) +- rc=$? +- ocf_log debug "member list: $member_list_json" +- if [ $rc -ne 0 ]; then +- ocf_log info "podman failed to get member list, error code: $rc" +- +- endpoint_status_json=$(get_endpoint_status_json) +- ocf_log info "endpoint status: $endpoint_status_json" +- +- count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l) +- if [ "$count_endpoints" -eq 1 ]; then +- ocf_log info "one endpoint only: checking status errors" +- endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors") +- if echo "$endpoint_status_errors" | grep -q "no leader"; then +- set_force_new_cluster +- set_standalone_node +- ocf_exit_reason "$NODENAME must force a new cluster" +- return $OCF_ERR_GENERIC +- fi +- if [ "$endpoint_status_errors" != "null" ]; then +- ocf_log err "unmanaged endpoint status error: $endpoint_status_errors" +- fi +- fi ++ return $OCF_SUCCESS ++} + +- return $OCF_SUCCESS +- fi ++manage_peer_membership() ++{ ++ # Read etcd member list to detect the status of the peer member. 
++ # If the peer is missing from the member list, it will be added back as learner ++ # If the peer is back in the member list, we ensure that the related CIB attributes (standalone and learner_node) are reset ++ local member_list_json="$1" + + # Example of .members[] instance fields in member list json format: + # NOTE that "name" is present in voting members only, while "isLearner" in learner members only +@@ -1083,6 +1076,25 @@ check_peers() + clear_standalone_and_learner_if_not_learners "$member_list_json" + fi + done ++} ++ ++check_peer() ++{ ++ # Check peers endpoint status and locally accessible member list ++ local member_list_json ++ ++ # we need a running container to execute etcdctl. ++ if ! container_exists; then ++ return $OCF_SUCCESS ++ fi ++ ++ if ! member_list_json=$(get_member_list_json); then ++ ocf_log info "podman failed to get member list, error code: $?" ++ detect_cluster_leadership_loss ++ return $? ++ fi ++ ++ manage_peer_membership "$member_list_json" + return $OCF_SUCCESS + } + +@@ -1124,7 +1136,7 @@ podman_monitor() + # monitor operation to fail. + # TODO: move this inside check_peers where we already query member list json + attribute_node_member_id update +- if ! check_peers; then ++ if ! check_peer; then + return $OCF_ERR_GENERIC + fi + + +From de7c73a933cefb8f7b9e810bd23c3d12f6d6f29a Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Tue, 30 Sep 2025 18:38:06 +0200 +Subject: [PATCH 2/2] OCPBUGS-42808: podman-etcd: add automatic learner member + promotion + +Automatically promote etcd learner members to voting members when detected. +Includes refactored member management functions and improved validation. 
+--- + heartbeat/podman-etcd | 108 ++++++++++++++++++++++++++++++------------ + 1 file changed, 79 insertions(+), 29 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 3d1e4c520..e1425ec02 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -712,6 +712,22 @@ attribute_node_revision_peer() + crm_attribute --query --type nodes --node "$nodename" --name "revision" | awk -F"value=" '{print $2}' + } + ++# Converts a decimal number to hexadecimal format with validation ++# Args: $1 - decimal number (test for non-negative integer too) ++# Returns: 0 on success, OCF_ERR_GENERIC on invalid input ++# Outputs: hexadecimal representation to stdout ++decimal_to_hex() { ++ local dec=$1 ++ ++ if ! echo "$dec" | grep -q "^[1-9][0-9]*$"; then ++ ocf_log err "Invalid member ID format: '$dec' (expected decimal number)" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ printf "%x" "$dec" ++ return $OCF_SUCCESS ++} ++ + attribute_node_member_id() + { + local action="$1" +@@ -737,7 +753,7 @@ attribute_node_member_id() + return "$rc" + fi + +- local value ++ local value value_hex + if ! value=$(echo -n "$member_list_json" | jq -r ".header.member_id"); then + rc=$? + ocf_log err "could not get $attribute from member list JSON, error code: $rc" +@@ -745,8 +761,11 @@ attribute_node_member_id() + fi + + # JSON member_id is decimal, while etcdctl command needs the hex version +- value=$(printf "%x" "$value") +- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then ++ if ! value_hex=$(decimal_to_hex "$value"); then ++ ocf_log err "could not convert decimal member_id '$value' to hex, error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"; then + rc=$? 
+ ocf_log err "could not update etcd $attribute, error code: $rc" + return "$rc" +@@ -905,42 +924,70 @@ clear_standalone_node() + crm_attribute --name "standalone_node" --delete + } + +-clear_standalone_and_learner_if_not_learners() ++ ++# Promotes an etcd learner member to a voting member ++# Args: $1 - learner member ID in decimal format ++# Returns: OCF_SUCCESS (even on expected promotion failures), OCF_ERR_GENERIC on conversion errors ++# Note: Promotion failures are expected and logged as info (peer may not be up-to-date) ++promote_learner_member() ++{ ++ local learner_member_id=$1 ++ ++ # JSON member_id is decimal, while etcdctl command needs the hex version ++ if ! learner_member_id_hex=$(decimal_to_hex "$learner_member_id"); then ++ ocf_log err "could not convert decimal member_id '$learner_member_id' to hex, error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then ++ # promotion is expected to fail if the peer is not yet up-to-date ++ ocf_log info "could not promote member $learner_member_id_hex, error code: $?" 
++ return $OCF_SUCCESS ++ fi ++ ocf_log info "successfully promoted member '$learner_member_id_hex'" ++ return $OCF_SUCCESS ++} ++ ++# Reconciles etcd cluster member states ++# Promotes learner members or clears standalone/learner attributes as needed ++# Args: $1 - member list JSON from etcdctl ++# Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors ++# Note: Only operates when exactly 2 started members are present ++reconcile_member_state() + { + local rc + local member_list_json="$1" + +- number_of_members=$(printf "%s" "$member_list_json" | jq -r ".members[].ID" | wc -l) +- if [ "$number_of_members" -ne 2 ]; then +- ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_members members, need 2" ++ # count only the started members, which have the ".name" JSON field ++ number_of_started_members=$(printf "%s" "$member_list_json" | jq -r ".members[].name | select(. != null)" | wc -l) ++ if [ "$number_of_started_members" -ne 2 ]; then ++ ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_started_members members, need 2" + return $OCF_SUCCESS + fi + +- id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID") ++ learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID") + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not get isLearner field from member list, error code: $rc" + return $rc + fi + +- if [ -z "$id" ]; then +- clear_standalone_node +- rc=$? +- if [ $rc -ne 0 ]; then +- ocf_og error "could not clear standalone_node attribute, error code: $rc" +- return $rc +- fi ++ if [ -n "$learner_member_id" ]; then ++ promote_learner_member "$learner_member_id" ++ return $? + fi +- if [ -z "$id" ]; then +- attribute_learner_node clear +- rc=$? 
+- if [ $rc -ne 0 ]; then +- ocf_og error "could not clear learner_node attribute, error code: $rc" +- return $rc ++ ++ if [ -z "$learner_member_id" ]; then ++ if ! clear_standalone_node; then ++ ocf_log error "could not clear standalone_node attribute, error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ if ! attribute_learner_node clear; then ++ ocf_log error "could not clear learner_node attribute, error code: $?" ++ return $OCF_ERR_GENERIC + fi + fi + +- return $rc ++ return $OCF_SUCCESS + } + + attribute_learner_node() +@@ -1019,7 +1066,7 @@ detect_cluster_leadership_loss() + endpoint_status_json=$(get_endpoint_status_json) + ocf_log info "endpoint status: $endpoint_status_json" + +- count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l) ++ count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l) + if [ "$count_endpoints" -eq 1 ]; then + ocf_log info "one endpoint only: checking status errors" + endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors") +@@ -1037,11 +1084,14 @@ detect_cluster_leadership_loss() + return $OCF_SUCCESS + } + ++ ++# Manages etcd peer membership by detecting and handling missing or rejoining peers ++# Adds missing peers as learners and reconciles member states when peers rejoin ++# Args: $1 - member list JSON from etcdctl ++# Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors ++# Note: Iterates through all peer nodes to ensure proper cluster membership + manage_peer_membership() + { +- # Read etcd member list to detect the status of the peer member. 
+- # If the peer is missing from the member list, it will be added back as learner +- # If the peer is back in the member list, we ensure that the related CIB attributes (standalone and learner_node) are reset + local member_list_json="$1" + + # Example of .members[] instance fields in member list json format: +@@ -1066,14 +1116,14 @@ manage_peer_membership() + + # Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name. + ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6 +- id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID") +- if [ -z "$id" ]; then ++ peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID") ++ if [ -z "$peer_member_id" ]; then + ocf_log info "$name is not in the members list" + add_member_as_learner "$name" "$ip" + set_standalone_node + else + ocf_log debug "$name is in the members list by IP: $ip" +- clear_standalone_and_learner_if_not_learners "$member_list_json" ++ reconcile_member_state "$member_list_json" + fi + done + } diff --git a/SOURCES/RHEL-121986-Filesystem-speed-up-get-PIDs.patch b/SOURCES/RHEL-121986-Filesystem-speed-up-get-PIDs.patch new file mode 100644 index 0000000..60f31c8 --- /dev/null +++ b/SOURCES/RHEL-121986-Filesystem-speed-up-get-PIDs.patch @@ -0,0 +1,135 @@ +From 93729d83fa5bf15f4ec694e08e9777bde858fb41 Mon Sep 17 00:00:00 2001 +From: Lars Ellenberg +Date: Thu, 16 Oct 2025 10:58:37 +0200 +Subject: [PATCH 1/2] Filesystem: speed up get_pids + +With force_umount=safe, we "manually" scan the /proc/ file system. + +We look for symlinks pointing into the path we are interested in. +Specifically, we are interested in + /proc//{root,exe,cwd} + /proc//fd/ +We also look for relevant memory mappings in /proc//maps + +All these are per process, not per "task" or "thread". +see procfs(5) and pthreads(7). 
+Still, we currently also scan /proc//task// +for all the same things. + +With a large system with many heavily threaded processes, +this can significantly slow down this scanning, +without gaining new information. + +Adding -maxdepth to the find command line avoids this useless work, +potentially reducing the scanning time by orders of magnitute +on systems with many heavily threaded processes. + +We could also write a dedicated helper in C to do the very same thing, +with the option to "short circuit" and proceed with the next pid +as soon as the first "match" is found for the currently inspected pid. + +That could further reduce the scanning time +by about an additional factor of 10. +--- + heartbeat/Filesystem | 25 +++++++++++++++++++++---- + 1 file changed, 21 insertions(+), 4 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 6d3960162..f76339fd6 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -680,14 +680,31 @@ get_pids() + # -path "/proc/[!0-9]*" -prune -o ... + # -path "/proc/[0-9]*" -a ... + # the latter seemd to be significantly faster for this one in my naive test. ++ ++ # root, cwd, exe, maps, fd: all per process, not per task ("thread"). ++ # -maxdepth to avoid repeatedly scanning the same thing ++ # for all threads of a heavily threaded process. ++ # ++ # Adding -maxdepth reduced scanning from > 16 seconds to < 2 seconds ++ # on a mostly idle system that happened to run a few java processes. ++ # ++ # We can also add a dedicated helper in C do twhat is done below, ++ # which would reduce the scanning time by an ++ # additional factor of 10 again. ++ # ++ # Or trust that fuser (above) learned something in the last 15 years ++ # and avoids blocking operations meanwhile? 
+ procs=$(exec 2>/dev/null; +- find /proc -path "/proc/[0-9]*" -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print | ++ find /proc -mindepth 1 -maxdepth 3 \ ++ -path "/proc/[0-9]*" \ ++ -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print | + awk -F/ '{print $3}' | uniq) + +- # This finds both /proc//maps and /proc//task//maps; +- # if you don't want the latter, add -maxdepth. ++ # memory mappings are also per process, not per task. ++ # This finds only /proc//maps, and not /proc//task//maps; ++ # if you also want the latter, drop -maxdepth. + mmap_procs=$(exec 2>/dev/null; +- find /proc -path "/proc/[0-9]*/maps" -print | ++ find /proc -mindepth 2 -maxdepth 2 -path "/proc/[0-9]*/maps" -print | + xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq) + printf "${procs}\n${mmap_procs}" | sort -u + fi + +From 3d34db0c60a125126361b45ff8303358b6275298 Mon Sep 17 00:00:00 2001 +From: Lars Ellenberg +Date: Thu, 16 Oct 2025 11:31:00 +0200 +Subject: [PATCH 2/2] Filesystem: futher speed up get_pids + +If we have /proc//map_files/* symlinks, +we don't need to additionally grep /proc//maps. + +Also don't first collect output of commands into variables +just to pipe them to sort -u later, +just pipe the output of the commands through sort -u directly. +--- + heartbeat/Filesystem | 31 +++++++++++++++++++------------ + 1 file changed, 19 insertions(+), 12 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index f76339fd6..7021f13da 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -694,19 +694,26 @@ get_pids() + # + # Or trust that fuser (above) learned something in the last 15 years + # and avoids blocking operations meanwhile? +- procs=$(exec 2>/dev/null; +- find /proc -mindepth 1 -maxdepth 3 \ +- -path "/proc/[0-9]*" \ +- -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print | +- awk -F/ '{print $3}' | uniq) +- +- # memory mappings are also per process, not per task. 
+- # This finds only /proc//maps, and not /proc//task//maps; +- # if you also want the latter, drop -maxdepth. +- mmap_procs=$(exec 2>/dev/null; ++ ( ++ # If you want to debug this, drop this redirection. ++ # But it producess too much "No such file" noise for kernel ++ # threads or due to races with exiting processes or closing fds. ++ exec 2>/dev/null; ++ find /proc -mindepth 1 -maxdepth 3 \ ++ -path "/proc/[0-9]*" \ ++ -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print | ++ awk -F/ '{print $3}' | uniq ++ ++ # If we have "map_files/", "find" above already found the ++ # relevant symlinks, and we don't need to grep "maps" below. ++ # Available since kernel 3.3, respectively 4.3. ++ test -d /proc/$$/map_files || ++ # memory mappings are also per process, not per task. ++ # This finds only /proc//maps, and not /proc//task//maps; ++ # if you also want the latter, drop -maxdepth. + find /proc -mindepth 2 -maxdepth 2 -path "/proc/[0-9]*/maps" -print | +- xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq) +- printf "${procs}\n${mmap_procs}" | sort -u ++ xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq ++ ) | sort -u + fi + } + diff --git a/SOURCES/RHEL-123887-podman-etcd-certificate-rotation.patch b/SOURCES/RHEL-123887-podman-etcd-certificate-rotation.patch new file mode 100644 index 0000000..7774492 --- /dev/null +++ b/SOURCES/RHEL-123887-podman-etcd-certificate-rotation.patch @@ -0,0 +1,166 @@ +From 6bfbe1dc3a0dad234decd77330ca6189e932bb89 Mon Sep 17 00:00:00 2001 +From: ehila +Date: Thu, 16 Oct 2025 23:39:32 -0400 +Subject: [PATCH] feat: add support for podman-etcd cert rotation + +added a cert check function to the monitor call to force a restart of etcd when the certs have been changed + +Signed-off-by: ehila +--- + heartbeat/podman-etcd | 87 ++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 86 insertions(+), 1 deletion(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index e1425ec02..b8dfb2f9e 100755 +--- 
a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -40,6 +40,7 @@ + # Parameter defaults + OCF_RESKEY_image_default="default" + OCF_RESKEY_pod_manifest_default="/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml" ++OCF_RESKEY_etcd_certs_dir_default="/etc/kubernetes/static-pod-resources/etcd-certs" + OCF_RESKEY_name_default="etcd" + OCF_RESKEY_nic_default="br-ex" + OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json" +@@ -51,6 +52,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd" + + : ${OCF_RESKEY_image=${OCF_RESKEY_image_default}} + : ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}} ++: ${OCF_RESKEY_etcd_certs_dir=${OCF_RESKEY_etcd_certs_dir_default}} + : ${OCF_RESKEY_name=${OCF_RESKEY_name_default}} + : ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}} + : ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}} +@@ -88,6 +90,15 @@ The Pod manifest with the configuration for Etcd. + + + ++ ++ ++The Etcd certificates directory mounted into the etcd container. ++The agent will monitor this directory for changes and restart the etcd container if the certificates have changed. ++ ++Etcd certificates directory ++ ++ ++ + + + The podman image to base this container off of. +@@ -289,6 +300,59 @@ Expects to have a fully populated OCF RA-compliant environment set. + END + } + ++etcd_certificates_hash_manager() ++{ ++ local action="$1" ++ local current_hash ++ local stored_hash ++ ++ # If the certs directory doesn't exist, consider it unchanged ++ if [ ! -d "$OCF_RESKEY_etcd_certs_dir" ]; then ++ ocf_log warn "certificates directory $OCF_RESKEY_etcd_certs_dir does not exist, skipping certificate monitoring" ++ return $OCF_SUCCESS ++ fi ++ ++ # Calculate hash of all certificate files, ignore key files to avoid accidental disclosure of sensitive information ++ # we only need to monitor the certificate files to detect changes. ++ if ! 
current_hash=$(find "$OCF_RESKEY_etcd_certs_dir" -type f \( -name "*.crt" \) -exec sha256sum {} \; | sort | sha256sum | cut -d' ' -f1); then ++ ocf_log err "failed to calculate certificate files hash" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ # If no stored hash exists, create one and return success ++ if [ ! -f "$ETCD_CERTS_HASH_FILE" ]; then ++ echo "$current_hash" > "$ETCD_CERTS_HASH_FILE" ++ ocf_log info "created initial certificate hash: $current_hash" ++ return $OCF_SUCCESS ++ fi ++ ++ case "$action" in ++ "update") ++ if ! echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"; then ++ ocf_log err "failed to update certificate hash file $ETCD_CERTS_HASH_FILE" ++ fi ++ ocf_log info "updated certificate hash: $current_hash" ++ ;; ++ "check") ++ if ! stored_hash=$(cat "$ETCD_CERTS_HASH_FILE"); then ++ ocf_log err "failed to read stored certificate hash from $ETCD_CERTS_HASH_FILE" ++ # This should not happen but if for some reason we can not read the stored hash, ++ # use the current hash and log the error but allow etcd to run as long as possible. 
++ stored_hash="$current_hash" ++ fi ++ if [ "$current_hash" != "$stored_hash" ]; then ++ ocf_exit_reason "$NODENAME etcd certificate files have changed (stored: $stored_hash, current: $current_hash)" ++ return $OCF_ERR_GENERIC ++ fi ++ ;; ++ *) ++ ocf_log err "unsupported action: $action" ++ return $OCF_ERR_GENERIC ++ ;; ++ esac ++ ++ return $OCF_SUCCESS ++} + + monitor_cmd_exec() + { +@@ -357,7 +421,7 @@ archive_current_container() + + # archive corresponding etcd configuration files + local files_to_archive="" +- for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do ++ for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE" "$ETCD_CERTS_HASH_FILE"; do + if [ -f "$file" ]; then + files_to_archive="$files_to_archive $file" + else +@@ -1178,6 +1242,11 @@ podman_monitor() + return $rc + fi + ++ # Check if certificate files have changed, if they have, etcd needs to be restarted ++ if ! etcd_certificates_hash_manager "check"; then ++ return $OCF_ERR_GENERIC ++ fi ++ + if is_learner; then + ocf_log info "$NODENAME is learner. Cannot get member id" + return "$OCF_SUCCESS" +@@ -1483,6 +1552,14 @@ podman_start() + return $OCF_ERR_GENERIC + fi + ++ # Update the certificate hash after the container has started successfully ++ # this is to ensure that the certificate hash is updated after a restart is initiated ++ # by a cert rotation event from the monitor command. ++ if ! etcd_certificates_hash_manager "update"; then ++ ocf_exit_reason "etcd certificate hash manager failed to update the certificate hash" ++ return $OCF_ERR_GENERIC ++ fi ++ + # check if the container has already started + podman_simple_status + if [ $? -eq $OCF_SUCCESS ]; then +@@ -1888,6 +1965,13 @@ podman_validate() + exit $OCF_ERR_CONFIGURED + fi + ++ if ! echo "validation test" > "$ETCD_CERTS_HASH_FILE" \ ++ || ! cat "$ETCD_CERTS_HASH_FILE" >/dev/null 2>&1 \ ++ || ! 
rm "$ETCD_CERTS_HASH_FILE"; then ++ ocf_exit_reason "cannot read/write to certificate hash file $ETCD_CERTS_HASH_FILE" ++ exit $OCF_ERR_GENERIC ++ fi ++ + return $OCF_SUCCESS + } + +@@ -1922,6 +2006,7 @@ CONTAINER=$OCF_RESKEY_name + POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml" + ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml" + ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz" ++ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash" + + # Note: we currently monitor podman containers by with the "podman exec" + # command, so make sure that invocation is always valid by enforcing the diff --git a/SOURCES/RHEL-123906-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch b/SOURCES/RHEL-123906-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch new file mode 100644 index 0000000..00a31ec --- /dev/null +++ b/SOURCES/RHEL-123906-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch @@ -0,0 +1,115 @@ +From 6a5608f02a657cf006b6d44d31200342c4bd19b9 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Tue, 28 Oct 2025 12:47:10 +0100 +Subject: [PATCH] podman-etcd: compute dynamic revision bump from maxRaftIndex + (#2087) + +Replace hardcoded 1 billion revision bump with dynamic calculation based +on 20% of the last known maxRaftIndex from revision.json. + +This aligns with the logic used by cluster-etcd-operator's +quorum-restore-pod utility and ensures the bump amount is proportional +to the cluster's actual revision state. 
+ +The implementation: +- Adds compute_bump_revision() function with safe fallback to 1bn + default +- Extracts magic values to named constants + (ETCD_REVISION_BUMP_PERCENTAGE, ETCD_BUMP_REV_DEFAULT, + ETCD_REVISION_JSON) +- Validates computed values (non-zero, not exceeding default) +- Logs computation results for debugging + +Reference: +https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da9166 +22c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34 +--- + heartbeat/podman-etcd | 38 ++++++++++++++++++++++++++++++++++---- + 1 file changed, 34 insertions(+), 4 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index b8dfb2f9e..551d37a20 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -619,16 +619,43 @@ prepare_env() { + LISTEN_METRICS_URLS="0.0.0.0" + } + ++compute_bump_revision() { ++ # Same logic used by cluster-etcd-operator quorum-restore-pod utility. ++ # see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34 ++ # set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days. ++ BUMP_REV=$ETCD_BUMP_REV_DEFAULT ++ if [ ! -f "${ETCD_REVISION_JSON}" ]; then ++ ocf_log err "could not compute bump revision: ${ETCD_REVISION_JSON} not found. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump" ++ return ++ fi ++ ++ # this will bump by the amount of 20% of the last known live revision. ++ if ! COMPUTED_BUMP=$(jq -r "(.maxRaftIndex*${ETCD_REVISION_BUMP_PERCENTAGE}|floor)" "${ETCD_REVISION_JSON}"); then ++ ocf_log err "could not compute maxRaftIndex for bump revision, jq error code: $?. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump" ++ return ++ fi ++ ++ if [ -z "${COMPUTED_BUMP}" ] || [ "${COMPUTED_BUMP}" -le 0 ] || [ "${COMPUTED_BUMP}" -gt "${ETCD_BUMP_REV_DEFAULT}" ]; then ++ ocf_log err "computed bump revision (${COMPUTED_BUMP}) is invalid. 
Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump" ++ return ++ fi ++ ++ BUMP_REV="${COMPUTED_BUMP}" ++ ocf_log info "bumping etcd revisions by ${BUMP_REV}" ++} + + generate_etcd_configuration() { + if is_force_new_cluster; then ++ compute_bump_revision + # The embedded newline is required for correct YAML formatting. + FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true +-force-new-cluster-bump-amount: 1000000000" ++force-new-cluster-bump-amount: $BUMP_REV" + else + FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false" + fi + ++ # the space indentation for client-transport-security and peer-transport-security ++ # is required for correct YAML formatting. + cat > "$ETCD_CONFIGURATION_FILE" << EOF + logger: zap + log-level: info +@@ -707,7 +734,7 @@ attribute_node_cluster_id() + { + local action="$1" + local value +- if ! value=$(jq -r ".clusterId" /var/lib/etcd/revision.json); then ++ if ! value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON"); then + rc=$? + ocf_log err "could not get cluster_id, error code: $rc" + return "$rc" +@@ -745,7 +772,7 @@ attribute_node_revision() + local value + local attribute="revision" + +- if ! value=$(jq -r ".maxRaftIndex" /var/lib/etcd/revision.json); then ++ if ! value=$(jq -r ".maxRaftIndex" "$ETCD_REVISION_JSON"); then + rc=$? + ocf_log err "could not get $attribute, error code: $rc" + return "$rc" +@@ -1456,7 +1483,7 @@ can_reuse_container() { + + + # If the container does not exist it cannot be reused +- if ! container_exists; then ++ if ! 
container_exists; then + OCF_RESKEY_reuse=0 + return "$OCF_SUCCESS" + fi +@@ -2006,6 +2033,9 @@ CONTAINER=$OCF_RESKEY_name + POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml" + ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml" + ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz" ++ETCD_REVISION_JSON="/var/lib/etcd/revision.json" ++ETCD_REVISION_BUMP_PERCENTAGE=0.2 ++ETCD_BUMP_REV_DEFAULT=1000000000 + ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash" + + # Note: we currently monitor podman containers by with the "podman exec" diff --git a/SOURCES/RHEL-126087-1-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch b/SOURCES/RHEL-126087-1-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch new file mode 100644 index 0000000..0c2f3e9 --- /dev/null +++ b/SOURCES/RHEL-126087-1-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch @@ -0,0 +1,222 @@ +From e8fb2ad9cc14e91b74b5cde1e012d92afcddb1a5 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Sat, 25 Oct 2025 17:27:42 +0200 +Subject: [PATCH] podman-etcd: add container crash detection with coordinated + recovery + +This change prevents the agent from starting prematurely when the etcd +container has failed. Previously, an early start would cause the agent +to block while waiting for peer-initiated recovery. This blocking +prevented Pacemaker from allowing the surviving agent to stop and +properly recover the cluster. + +The change introduces `container_health_check` function to monitor the +container's state and catch etcd failures. This check uses a state file +to distinguish between a planned shutdown and an unexpected failure: + +* Container Running: The state file is created or updated with the + current epoch (timestamp). Returns: "healthy". +* Container Not Running + No State File: It's the first check. Returns: + "not-running". 
+* Container Not Running + State File: An unexpected failure is detected. + * If force_new_cluster is set, the status is: "failed-restart-now". + * Otherwise, the status is: "failed-wait-for-peer". + +The state file is written in a temporary directory (HA_RSCTMP) to ensure +automatic cleanup on reboot. It is also explicitly removed in +`podman_start` and `podman_stop` to mark planned transitions. + +A new helper function `get_time_since_last_heartbeat()` calculates +elapsed time since the last healthy check for diagnostic logging. + +Monitor behavior changes: +* failed-wait-for-peer: Returns OCF_SUCCESS to keep resource running + while waiting for peer-initiated recovery, as the agent is not able + to recover the cluster from a failed state. +* failed-restart-now: Returns OCF_ERR_GENERIC to trigger restart once + peer has set force_new_cluster +--- + heartbeat/podman-etcd | 133 +++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 124 insertions(+), 9 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index b8dfb2f9e..d596c6f2a 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1226,22 +1226,122 @@ podman_simple_status() + return $rc + } + +-podman_monitor() ++# get_time_since_last_heartbeat returns the time in seconds since the heartbeat file was last updated. ++# ++# Returns: time in seconds since last heartbeat, or empty string if file doesn't exist ++get_time_since_last_heartbeat() + { ++ local last_heartbeat ++ ++ if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then ++ return ++ fi ++ ++ last_heartbeat=$(cat "$CONTAINER_HEARTBEAT_FILE") ++ echo $(($(date +%s) - last_heartbeat)) ++} ++ ++# container_health_check performs comprehensive health monitoring for the container. ++# This function allows coordinated failure handling where the agent waits for ++# peer-initiated cluster recovery in case of container failure. 
++# ++# Uses a state file to track container state: ++# - Container running: Update state file with current epoch, return "healthy" ++# - Container not running + no state file: Return "not-running" (never checked before) ++# - Container not running + state file: Failure detected, check force_new_cluster ++# - If force_new_cluster set: Return "failed-restart-now" ++# - Otherwise: Return "failed-wait-for-peer" ++# ++# Returns: healthy, not-running, failed-restart-now, failed-wait-for-peer ++ ++container_health_check() ++{ ++ local rc ++ + # We rely on running podman exec to monitor the container + # state because that command seems to be less prone to + # performance issue under IO load. + # + # For probes to work, we expect cmd_exec to be able to report +- # when a container is not running. Here, we're not interested +- # in distinguishing whether it's stopped or non existing +- # (there's function container_exists for that) ++ # when a container is not running. Here, we're not interested ++ # in distinguishing whether it's stopped or non existing ++ # (there's function container_exists for that) ++ # For monitor, however, we still need to know if it has stopped ++ # recently (i.e. a failure), or not (fresh start) + monitor_cmd_exec + rc=$? +- if [ $rc -ne 0 ]; then +- return $rc ++ if [ "$rc" -eq 0 ]; then ++ # Container is running - update state file with current epoch ++ local current_epoch ++ current_epoch=$(date +%s) ++ if ! echo "$current_epoch" > "$CONTAINER_HEARTBEAT_FILE"; then ++ ocf_log warn "Failed to update container heartbeat file, error code: $?" ++ # wait for peer to detect any real issue with the etcd cluster or wait for the ++ # next monitor interval ++ echo "failed-wait-for-peer" ++ return ++ fi ++ echo "healthy" ++ return + fi + ++ # Check if state file exists (was container running on last check?) ++ if [ ! 
-f "$CONTAINER_HEARTBEAT_FILE" ]; then ++ # No state file - container was never checked before ++ ocf_log debug "Container ${CONTAINER} has no previous state" ++ echo "not-running" ++ # NOTE: this is where the probe is expected to exit, keeping the logic ++ # quick and less prone to performance issue under IO load. ++ return ++ fi ++ ++ # State file exists - the container failed, check recovery status in this lifecycle ++ local time_since_heartbeat ++ time_since_heartbeat=$(get_time_since_last_heartbeat) ++ ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)" ++ ++ # Check if peer has set force_new_cluster for recovery ++ local fnc_holders ++ if ! fnc_holders=$(get_force_new_cluster); then ++ ocf_log err "Could not detect peer-initiated recovery. Checking again in the next monitor cycle" ++ echo "failed-wait-for-peer" ++ return ++ fi ++ ++ if [ -n "$fnc_holders" ]; then ++ ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart" ++ echo "failed-restart-now" ++ return ++ fi ++ ++ echo "failed-wait-for-peer" ++} ++ ++podman_monitor() ++{ ++ local container_health_state ++ ++ container_health_state=$(container_health_check) ++ case "$container_health_state" in ++ healthy) ++ # Continue with normal monitoring ++ ;; ++ not-running) ++ return $OCF_NOT_RUNNING ++ ;; ++ failed-restart-now) ++ return $OCF_ERR_GENERIC ++ ;; ++ failed-wait-for-peer) ++ # Continue running, waiting for peer recovery ++ return $OCF_SUCCESS ++ ;; ++ *) ++ ocf_log err "Unknown health state: $container_health_state" ++ return $OCF_ERR_GENERIC ++ ;; ++ esac ++ + # Check if certificate files have changed, if they have, etcd needs to be restarted + if ! etcd_certificates_hash_manager "check"; then + return $OCF_ERR_GENERIC +@@ -1533,6 +1633,12 @@ podman_start() + local pod_was_running=false + + ocf_log notice "podman-etcd start" ++ ++ # Clear container health check state file ++ if ! 
rm -f "$CONTAINER_HEARTBEAT_FILE"; then ++ ocf_log err "could not delete container health check state file" ++ fi ++ + attribute_node_ip update + attribute_node_cluster_id update + attribute_node_revision update +@@ -1849,15 +1955,21 @@ podman_stop() + local rc + + ocf_log notice "podman-etcd stop" ++ ++ # Clear container health check state file ++ if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then ++ ocf_log err "could not delete container health check state file" ++ fi ++ ++ attribute_node_revision update ++ attribute_node_cluster_id update ++ + podman_simple_status + if [ $? -eq $OCF_NOT_RUNNING ]; then + ocf_log info "could not leave members list: etcd container not running" + return $OCF_SUCCESS + fi + +- attribute_node_revision update +- attribute_node_cluster_id update +- + if ! member_id=$(attribute_node_member_id get); then + ocf_log err "error leaving members list: could not get member-id" + else +@@ -2007,6 +2119,9 @@ POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml" + ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml" + ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz" + ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash" ++# State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot. ++# This is intentional - reboots are controlled stops, not failures requiring detection. 
++CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running + + # Note: we currently monitor podman containers by with the "podman exec" + # command, so make sure that invocation is always valid by enforcing the diff --git a/SOURCES/RHEL-126087-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch b/SOURCES/RHEL-126087-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch new file mode 100644 index 0000000..6ef5612 --- /dev/null +++ b/SOURCES/RHEL-126087-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch @@ -0,0 +1,47 @@ +From a155018f6d65edf99493804dad99412b50d13e6c Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Wed, 5 Nov 2025 13:48:38 +0100 +Subject: [PATCH] podman-etcd: fix count of fnc holders in + container_health_check + +The variable `fnc_holders` (a list of nodes that have force_new_cluster +CIB attribute set) can contain empty spaces. Because of this, the +shell's simple `-n` test is not enough to establish if there are no +`fnc_holders`. + +Fixed counting the number of words inside the variable. + +Moreover +* Enhanced comment for clarity. +* Log level changed to `info`. We want visibility when the monitor + detects the peer node is ready for recovery, and this is rare enough + not to flood the logs. 
+--- + heartbeat/podman-etcd | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 5bdc6d184..7795130a6 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1366,7 +1366,7 @@ container_health_check() + return + fi + +- # State file exists - the container failed, check recovery status in this lifecycle ++ # Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle + local time_since_heartbeat + time_since_heartbeat=$(get_time_since_last_heartbeat) + ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)" +@@ -1379,8 +1379,9 @@ container_health_check() + return + fi + +- if [ -n "$fnc_holders" ]; then +- ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart" ++ local fnc_holder_count=$(echo "$fnc_holders" | wc -w) ++ if [ "$fnc_holder_count" -gt 0 ]; then ++ ocf_log info "force_new_cluster detected (set by: $fnc_holders), triggering restart" + echo "failed-restart-now" + return + fi diff --git a/SOURCES/RHEL-127006-storage_mon-fix-handling-of-4k-block-devices.patch b/SOURCES/RHEL-127006-storage_mon-fix-handling-of-4k-block-devices.patch new file mode 100644 index 0000000..bdf7452 --- /dev/null +++ b/SOURCES/RHEL-127006-storage_mon-fix-handling-of-4k-block-devices.patch @@ -0,0 +1,158 @@ +From 48455cb6cef9c5b849045bc838bc2b5ccd01b0fe Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Fri, 7 Nov 2025 17:06:57 +0100 +Subject: [PATCH 1/3] storage_mon: refactor removing basically duplicate code + +--- + tools/storage_mon.c | 45 ++++++++++++++++----------------------------- + 1 file changed, 16 insertions(+), 29 deletions(-) + +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index 27d2ff1d1..fa9bd0cbc 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -119,6 +119,8 @@ static void *test_device(const char *device, int verbose, 
int inject_error_perce + int device_fd; + int res; + off_t seek_spot; ++ int sec_size = 512; ++ void *buffer; + + if (verbose) { + printf("Testing device %s\n", device); +@@ -164,9 +166,6 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + } + + if (flags & O_DIRECT) { +- int sec_size = 0; +- void *buffer; +- + #ifdef __FreeBSD__ + res = ioctl(device_fd, DIOCGSECTORSIZE, &sec_size); + #else +@@ -176,33 +175,21 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno)); + goto error; + } ++ } + +- if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) { +- PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno)); +- goto error; +- } +- res = read(device_fd, buffer, sec_size); +- free(buffer); +- if (res < 0) { +- PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno)); +- goto error; +- } +- if (res < sec_size) { +- PRINT_STORAGE_MON_ERR("Failed to read %d bytes from %s, got %d", sec_size, device, res); +- goto error; +- } +- } else { +- char buffer[512]; +- +- res = read(device_fd, buffer, sizeof(buffer)); +- if (res < 0) { +- PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno)); +- goto error; +- } +- if (res < (int)sizeof(buffer)) { +- PRINT_STORAGE_MON_ERR("Failed to read %ld bytes from %s, got %d", sizeof(buffer), device, res); +- goto error; +- } ++ if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) { ++ PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno)); ++ goto error; ++ } ++ res = read(device_fd, buffer, sec_size); ++ free(buffer); ++ if (res < 0) { ++ PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno)); ++ goto error; ++ } ++ if (res < sec_size) { ++ PRINT_STORAGE_MON_ERR("Failed to read %d bytes from %s, got %d", sec_size, device, res); ++ goto error; + } + + /* 
Fake an error */ + +From 310f224fc7d9a6f4fca234f10696e6049c8f2666 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Fri, 7 Nov 2025 17:14:06 +0100 +Subject: [PATCH 2/3] storage_mon.c: refactor moving up getting blocksize + +if that fails we can bail out without unnecessary seek +--- + tools/storage_mon.c | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index fa9bd0cbc..960266a74 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -152,6 +152,18 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + PRINT_STORAGE_MON_INFO("%s: opened %s O_DIRECT, size=%zu", device, (flags & O_DIRECT)?"with":"without", devsize); + } + ++ if (flags & O_DIRECT) { ++#ifdef __FreeBSD__ ++ res = ioctl(device_fd, DIOCGSECTORSIZE, &sec_size); ++#else ++ res = ioctl(device_fd, BLKSSZGET, &sec_size); ++#endif ++ if (res < 0) { ++ PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno)); ++ goto error; ++ } ++ } ++ + /* Don't fret about real randomness */ + srand(time(NULL) + getpid()); + /* Pick a random place on the device - sector aligned */ +@@ -165,18 +177,6 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + PRINT_STORAGE_MON_INFO("%s: reading from pos %ld", device, seek_spot); + } + +- if (flags & O_DIRECT) { +-#ifdef __FreeBSD__ +- res = ioctl(device_fd, DIOCGSECTORSIZE, &sec_size); +-#else +- res = ioctl(device_fd, BLKSSZGET, &sec_size); +-#endif +- if (res < 0) { +- PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno)); +- goto error; +- } +- } +- + if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) { + PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno)); + goto error; + +From ac19911ce550d5eca42be6cb44632384bdf8e1c9 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Fri, 7 Nov 
2025 17:18:45 +0100 +Subject: [PATCH 3/3] storage_mon.c: fix block-seek mask deriving it from the + block-size + +now this is as well working for e.g. 4K block-devices +--- + tools/storage_mon.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index 960266a74..6c4555f04 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -167,7 +167,7 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + /* Don't fret about real randomness */ + srand(time(NULL) + getpid()); + /* Pick a random place on the device - sector aligned */ +- seek_spot = (rand() % (devsize-1024)) & 0xFFFFFFFFFFFFFE00; ++ seek_spot = (rand() % (devsize-sec_size)) & ~(((off_t) sec_size)-1); + res = lseek(device_fd, seek_spot, SEEK_SET); + if (res < 0) { + PRINT_STORAGE_MON_ERR("Failed to seek %s: %s", device, strerror(errno)); diff --git a/SOURCES/RHEL-127891-podman-etcd-exclude-stopping-resources-from-active-count.patch b/SOURCES/RHEL-127891-podman-etcd-exclude-stopping-resources-from-active-count.patch new file mode 100644 index 0000000..d065a34 --- /dev/null +++ b/SOURCES/RHEL-127891-podman-etcd-exclude-stopping-resources-from-active-count.patch @@ -0,0 +1,106 @@ +From d5b4428e6cd66fd47680531ff0244d9b56e4e4c2 Mon Sep 17 00:00:00 2001 +From: Pablo Fontanilla +Date: Tue, 14 Oct 2025 11:57:09 +0200 +Subject: [PATCH 1/2] Redo counting of active_resources + +--- + heartbeat/podman-etcd | 46 +++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 44 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index e1425ec02..dbf16918d 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1029,6 +1029,48 @@ get_peer_node_name() { + crm_node -l | awk '{print $2}' | grep -v "$NODENAME" + } + ++# Calculate the count of truly active resources by excluding those being stopped. 
++# According to Pacemaker documentation, during "Post-notification (stop) / ++# Pre-notification (start)" transitions, the true active resource count should be: ++# Active resources = $OCF_RESKEY_CRM_meta_notify_active_resource ++# minus $OCF_RESKEY_CRM_meta_notify_stop_resource ++# This handles the case where a resource appears in both the active and stop lists ++# during rapid restart scenarios (e.g., process crash recovery). ++get_truly_active_resources_count() { ++ local active_list="$OCF_RESKEY_CRM_meta_notify_active_resource" ++ local stop_list="$OCF_RESKEY_CRM_meta_notify_stop_resource" ++ local truly_active="" ++ ++ # If no active resources, return 0 ++ if [ -z "$active_list" ]; then ++ echo "0" ++ return ++ fi ++ ++ # If no resources being stopped, return count of active resources ++ if [ -z "$stop_list" ]; then ++ echo "$active_list" | wc -w ++ return ++ fi ++ ++ # Filter out resources that are being stopped from the active list ++ for resource in $active_list; do ++ local is_stopping=0 ++ for stop_resource in $stop_list; do ++ if [ "$resource" = "$stop_resource" ]; then ++ is_stopping=1 ++ break ++ fi ++ done ++ if [ $is_stopping -eq 0 ]; then ++ truly_active="$truly_active $resource" ++ fi ++ done ++ ++ # Count the truly active resources (trim leading space and count words) ++ echo "$truly_active" | wc -w ++} ++ + get_all_etcd_endpoints() { + for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do + name=$(echo "$node" | cut -d: -f1) +@@ -1529,8 +1571,8 @@ podman_start() + # - 0 active agents, 1 starting: we are starting; the peer is not starting + # - 0 active agents, 2 starting: both agents are starting simultaneously + local active_resources_count +- active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w) +- ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')" ++ 
active_resources_count=$(get_truly_active_resources_count) ++ ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')" + case "$active_resources_count" in + 1) + if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then + +From 0114ddf83c95122a7f9fe9f704f864242cdb284a Mon Sep 17 00:00:00 2001 +From: Pablo Fontanilla +Date: Wed, 29 Oct 2025 12:49:17 +0100 +Subject: [PATCH 2/2] Update truly active resources count with safer empty + calculation + +--- + heartbeat/podman-etcd | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index dbf16918d..8fc92a537 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1042,13 +1042,15 @@ get_truly_active_resources_count() { + local truly_active="" + + # If no active resources, return 0 +- if [ -z "$active_list" ]; then ++ # Use word count to handle whitespace-only values ++ if [ "$(echo "$active_list" | wc -w)" -eq 0 ]; then + echo "0" + return + fi + + # If no resources being stopped, return count of active resources +- if [ -z "$stop_list" ]; then ++ # Use word count to handle whitespace-only values ++ if [ "$(echo "$stop_list" | wc -w)" -eq 0 ]; then + echo "$active_list" | wc -w + return + fi diff --git a/SOURCES/RHEL-130580-1-podman-etcd-prevent-last-active-member-from-leaving.patch b/SOURCES/RHEL-130580-1-podman-etcd-prevent-last-active-member-from-leaving.patch new file mode 100644 index 0000000..1ec5dcd --- /dev/null +++ b/SOURCES/RHEL-130580-1-podman-etcd-prevent-last-active-member-from-leaving.patch @@ -0,0 +1,161 @@ +From 578e6d982e5ab705dac216cecf85c50fe3842af5 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Sun, 16 Nov 2025 19:40:30 +0100 +Subject: [PATCH] OCPBUGS-60098: podman-etcd: prevent last active member from + leaving the etcd member list + +When stopping etcd instances, simultaneous member 
removal from both +nodes can corrupt the etcd Write-Ahead Log (WAL). This change implements +a two-part solution: + +1. Concurrent stop protection: When multiple nodes are stopping, the + alphabetically second node delays its member removal by 10 + seconds. This prevents simultaneous member list updates that can + corrupt WAL. + +2. Last member detection: Checks active resource count after any + delay. If this is the last active member, skips member removal to + avoid leaving an empty cluster. + +Additionally, reorders podman_stop() to clear the member_id attribute +after leaving the member list, ensuring the attribute reflects actual +cluster state during shutdown. +--- + heartbeat/podman-etcd | 86 ++++++++++++++++++++++++++++++++++--------- + 1 file changed, 69 insertions(+), 17 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 7795130a6..7b6e08f11 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1341,6 +1341,11 @@ container_health_check() + # recently (i.e. a failure), or not (fresh start) + monitor_cmd_exec + rc=$? ++ if [ "$rc" -ne 0 ]; then ++ ocf_log info "Container ${CONTAINER} not-running" ++ echo "not-running" ++ return ++ fi + if [ "$rc" -eq 0 ]; then + # Container is running - update state file with current epoch + local current_epoch +@@ -1639,7 +1644,7 @@ can_reuse_container() { + OCF_RESKEY_reuse=0 + return "$OCF_SUCCESS" + fi +- ++ + if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then + return $OCF_ERR_GENERIC + fi +@@ -1866,7 +1871,7 @@ podman_start() + fi + + if ocf_is_true "$JOIN_AS_LEARNER"; then +- local wait_timeout_sec=$((10*60)) ++ local wait_timeout_sec=60 + local poll_interval_sec=5 + local retries=$(( wait_timeout_sec / poll_interval_sec )) + +@@ -2021,6 +2026,64 @@ podman_start() + done + } + ++# leave_etcd_member_list removes the current node from the etcd member list during ++# shutdown to ensure clean cluster state. 
++# ++# Skips removal if this is the standalone (last) node. When both nodes are stopping ++# concurrently, delays the second node to prevent simultaneous member removal that ++# could corrupt the etcd WAL. ++leave_etcd_member_list() ++{ ++ if ! member_id=$(attribute_node_member_id get); then ++ ocf_log err "error leaving members list: could not get member-id" ++ return ++ fi ++ ++ if is_standalone; then ++ ocf_log info "last member. Not leaving the member list" ++ return ++ fi ++ ++ local stopping_resources_count ++ stopping_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_stop_resource" | wc -w) ++ ocf_log info "found '$stopping_resources_count' stopping etcd resources (stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')" ++ if [ "$stopping_resources_count" -gt 1 ]; then ++ # Prevent WAL corruption by delaying the alphabetically second node's member ++ # removal when both nodes are stopping concurrently. ++ local delayed_node ++ ++ node_names_sorted=$(echo "$OCF_RESKEY_node_ip_map" | sed 's/:[^;]*//g; s/;/ /g' | tr ' ' '\n' | sort | tr '\n' ' ') ++ delayed_node="$(echo "$node_names_sorted" | cut -d' ' -f2)" ++ ++ if [ -z "$delayed_node" ]; then ++ ocf_log warn "could not determine node to be delayed: not leaving the member list" ++ return ++ fi ++ ++ if [ "$NODENAME" = "$delayed_node" ]; then ++ ocf_log info "delaying stop for ${DELAY_SECOND_NODE_LEAVE_SEC}s to prevent simultaneous etcd member removal" ++ sleep $DELAY_SECOND_NODE_LEAVE_SEC ++ fi ++ fi ++ ++ # Ensure we're not the last active resource before leaving. The `standalone_node` property ++ # may not be set if stop was called before monitor check, or after the delayed node waited. ++ local active_resources_count ++ active_resources_count=$(get_truly_active_resources_count) ++ if [ "$active_resources_count" -lt 1 ]; then ++ ocf_log info "last member. 
Not leaving the member list" ++ return ++ fi ++ ++ ocf_log info "leaving members list as member with ID $member_id" ++ local endpoint ++ endpoint="$(ip_url $(attribute_node_ip get)):2379" ++ if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then ++ rc=$? ++ ocf_log err "error leaving members list, error code: $rc" ++ fi ++} ++ + podman_stop() + { + local timeout=60 +@@ -2039,24 +2102,12 @@ podman_stop() + podman_simple_status + if [ $? -eq $OCF_NOT_RUNNING ]; then + ocf_log info "could not leave members list: etcd container not running" ++ attribute_node_member_id clear + return $OCF_SUCCESS + fi + +- if ! member_id=$(attribute_node_member_id get); then +- ocf_log err "error leaving members list: could not get member-id" +- else +- # TODO: is it worth/possible to check the current status instead than relying on cached attributes? +- if is_standalone; then +- ocf_log info "last member. Not leaving the member list" +- else +- ocf_log info "leaving members list as member with ID $member_id" +- endpoint="$(ip_url $(attribute_node_ip get)):2379" +- if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then +- rc=$? +- ocf_log err "error leaving members list, error code: $rc" +- fi +- fi +- fi ++ leave_etcd_member_list ++ # clear node_member_id CIB attribute only after leaving the member list + attribute_node_member_id clear + + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then +@@ -2197,6 +2248,7 @@ ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash" + # State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot. + # This is intentional - reboots are controlled stops, not failures requiring detection. 
+ CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running ++DELAY_SECOND_NODE_LEAVE_SEC=10 + + # Note: we currently monitor podman containers by with the "podman exec" + # command, so make sure that invocation is always valid by enforcing the diff --git a/SOURCES/RHEL-130580-2-podman-etcd-remove-test-code.patch b/SOURCES/RHEL-130580-2-podman-etcd-remove-test-code.patch new file mode 100644 index 0000000..8929e47 --- /dev/null +++ b/SOURCES/RHEL-130580-2-podman-etcd-remove-test-code.patch @@ -0,0 +1,42 @@ +From 29df4255c5f65ea94fb6de997805dca65e31071c Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Mon, 24 Nov 2025 12:21:55 +0100 +Subject: [PATCH] podman-etcd: remove test code (#2103) + +--- + heartbeat/podman-etcd | 8 +------- + 1 file changed, 1 insertion(+), 7 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 7b6e08f11..b1f52cd5c 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1341,11 +1341,6 @@ container_health_check() + # recently (i.e. a failure), or not (fresh start) + monitor_cmd_exec + rc=$? +- if [ "$rc" -ne 0 ]; then +- ocf_log info "Container ${CONTAINER} not-running" +- echo "not-running" +- return +- fi + if [ "$rc" -eq 0 ]; then + # Container is running - update state file with current epoch + local current_epoch +@@ -1644,7 +1639,6 @@ can_reuse_container() { + OCF_RESKEY_reuse=0 + return "$OCF_SUCCESS" + fi +- + if ! 
filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then + return $OCF_ERR_GENERIC + fi +@@ -1871,7 +1865,7 @@ podman_start() + fi + + if ocf_is_true "$JOIN_AS_LEARNER"; then +- local wait_timeout_sec=60 ++ local wait_timeout_sec=$((10*60)) + local poll_interval_sec=5 + local retries=$(( wait_timeout_sec / poll_interval_sec )) + diff --git a/SOURCES/RHEL-131185-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch b/SOURCES/RHEL-131185-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch new file mode 100644 index 0000000..191f430 --- /dev/null +++ b/SOURCES/RHEL-131185-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch @@ -0,0 +1,107 @@ +From 5cc74acd67c294da36b3f40e44842a82aa7d0957 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Wed, 26 Nov 2025 11:43:25 +0100 +Subject: [PATCH] OCPEDGE-2213: podman-etcd: fix to prevent learner from + starting before cluster is ready (#2098) + +* OCPEDGE-2213: fix(podman-etcd): prevent learner from starting before cluster is ready + +Clear stale learner_node attribute during stop and on restart when no +active resources exist, ensuring learner always waits for peer +availability. 
+ +* fix: podman-etcd should cleanup standalone/learner attributes when promotion succeeds + +* fix: remove misleading endpoint IP from log +--- + heartbeat/podman-etcd | 33 +++++++++++++++++++-------------- + 1 file changed, 19 insertions(+), 14 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index b1f52cd5c..3e3f1d60e 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -880,7 +880,7 @@ add_member_as_learner() + local endpoint_url=$(ip_url $(attribute_node_ip get)) + local peer_url=$(ip_url $member_ip) + +- ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner" ++ ocf_log info "add $member_name ($member_ip) to the member list as learner" + out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner) + rc=$? + if [ $rc -ne 0 ]; then +@@ -1032,7 +1032,7 @@ promote_learner_member() + if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then + # promotion is expected to fail if the peer is not yet up-to-date + ocf_log info "could not promote member $learner_member_id_hex, error code: $?" +- return $OCF_SUCCESS ++ return $OCF_ERR_GENERIC + fi + ocf_log info "successfully promoted member '$learner_member_id_hex'" + return $OCF_SUCCESS +@@ -1063,19 +1063,19 @@ reconcile_member_state() + fi + + if [ -n "$learner_member_id" ]; then +- promote_learner_member "$learner_member_id" +- return $? +- fi +- +- if [ -z "$learner_member_id" ]; then +- if ! clear_standalone_node; then +- ocf_log error "could not clear standalone_node attribute, error code: $?" +- return $OCF_ERR_GENERIC +- fi +- if ! attribute_learner_node clear; then +- ocf_log error "could not clear learner_node attribute, error code: $?" ++ if ! promote_learner_member "$learner_member_id"; then + return $OCF_ERR_GENERIC + fi ++ # promotion succeded: continue to clear standalone_node and learner_node ++ fi ++ ++ if ! 
clear_standalone_node; then ++ ocf_log error "could not clear standalone_node attribute, error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ if ! attribute_learner_node clear; then ++ ocf_log error "could not clear learner_node attribute, error code: $?" ++ return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +@@ -1258,6 +1258,7 @@ manage_peer_membership() + set_standalone_node + else + ocf_log debug "$name is in the members list by IP: $ip" ++ # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss. + reconcile_member_state "$member_list_json" + fi + done +@@ -1369,7 +1370,7 @@ container_health_check() + # Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle + local time_since_heartbeat + time_since_heartbeat=$(get_time_since_last_heartbeat) +- ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)" ++ ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)" + + # Check if peer has set force_new_cluster for recovery + local fnc_holders +@@ -1795,6 +1796,9 @@ podman_start() + fi + ;; + 0) ++ # No active resources: clear any stale learner_node attribute from previous failed session ++ ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)" ++ attribute_learner_node clear + # count how many agents are starting now + local start_resources_count + start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w) +@@ -2090,6 +2094,7 @@ podman_stop() + ocf_log err "could not delete container health check state file" + fi + ++ attribute_learner_node clear + attribute_node_revision update + attribute_node_cluster_id update + diff --git a/SOURCES/RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch 
b/SOURCES/RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch new file mode 100644 index 0000000..3297c6c --- /dev/null +++ b/SOURCES/RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch @@ -0,0 +1,146 @@ +From 192b0ecbe015e8b8a4d32f8b066ead3a6dba0589 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Tue, 2 Dec 2025 10:01:01 +0100 +Subject: [PATCH] OCPEDGE-2231: podman-etcd: improve error handling to support + retry on start errors (#2105) + +* podman-etcd: improve add_member_as_learner error log + +Improving add_member_as_learner error log to better debug rare issue +when the podman exec command returns error, but the etcd member is added +to the list anyway. This is critical as the `learner_node` attribute +won't be cleaned up anymore. + +Signed-off-by: Carlo Lobrano + +* podman-etcd: remove duplicated check for container already started + +* podman-etcd: improve error return codes to support start retries + +Improved and/or changed some returns code to allow or forbid retry in +case of start errors. + +see: OCPEDGE-2231 + +--------- + +Signed-off-by: Carlo Lobrano +--- + heartbeat/podman-etcd | 40 +++++++++++++++++++++++++--------------- + 1 file changed, 25 insertions(+), 15 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 3e3f1d60e..242226bb1 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -617,9 +617,13 @@ prepare_env() { + LISTEN_CLIENT_URLS="0.0.0.0" + LISTEN_PEER_URLS="0.0.0.0" + LISTEN_METRICS_URLS="0.0.0.0" ++ ++ return $OCF_SUCCESS + } + + compute_bump_revision() { ++ local rc ++ + # Same logic used by cluster-etcd-operator quorum-restore-pod utility. + # see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34 + # set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days. 
+@@ -691,7 +695,13 @@ experimental-max-learners: 1 + experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") + experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") + EOF ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ ocf_log err "could not create etcd configuration, 'cat' error code: $rc" ++ return $OCF_ERR_CONFIGURED ++ fi + ++ # Append cipher suites from the env variable where the entries are comma separated. + { + if [ -n "$ETCD_CIPHER_SUITES" ]; then + echo "cipher-suites:" +@@ -700,6 +710,13 @@ EOF + done + fi + } >> "$ETCD_CONFIGURATION_FILE" ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ ocf_log err "could not append cipher suites to etcd configuration, error code: $rc" ++ return $OCF_ERR_CONFIGURED ++ fi ++ ++ return $OCF_SUCCESS + } + + archive_data_folder() +@@ -884,7 +901,7 @@ add_member_as_learner() + out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner) + rc=$? 
+ if [ $rc -ne 0 ]; then +- ocf_log err "could not add $member_name as learner, error code: $rc" ++ ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out" + return $rc + fi + ocf_log info "$out" +@@ -1763,7 +1780,7 @@ podman_start() + fnc_holder_count=$(echo "$fnc_holders" | wc -w) + if [ "$fnc_holder_count" -gt 1 ]; then + ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)" +- return "$OCF_ERR_GENERIC" ++ return "$OCF_ERR_CONFIGURED" + fi + + if [ "$fnc_holder_count" -eq 1 ]; then +@@ -1837,7 +1854,7 @@ podman_start() + ocf_log info "same cluster_id and revision: start normal" + else + ocf_exit_reason "same revision but different cluster id" +- return "$OCF_ERR_GENERIC" ++ return "$OCF_ERR_CONFIGURED" + fi + fi + ;; +@@ -1862,12 +1879,6 @@ podman_start() + + run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}" + +- # check to see if the container has already started +- podman_simple_status +- if [ $? -eq $OCF_SUCCESS ]; then +- return "$OCF_SUCCESS" +- fi +- + if ocf_is_true "$JOIN_AS_LEARNER"; then + local wait_timeout_sec=$((10*60)) + local poll_interval_sec=5 +@@ -1894,9 +1905,8 @@ podman_start() + + ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced" + if ! can_reuse_container ; then +- rc="$?" +- ocf_log err "could not determine etcd container reuse strategy, rc: $rc" +- return "$rc" ++ ocf_log err "could not determine etcd container reuse strategy" ++ return $OCF_ERR_GENERIC + fi + + # Archive current container and its configuration before creating +@@ -1912,13 +1922,13 @@ podman_start() + fi + + if ! prepare_env; then +- ocf_log err "Could not prepare environment for podman, error code: $?" ++ ocf_log err "Could not prepare environment for podman" + return $OCF_ERR_GENERIC + fi + + if ! generate_etcd_configuration; then +- ocf_log err "Could not generate etcd configuration, error code: $?" 
+- return $OCF_ERR_GENERIC ++ ocf_log err "Could not generate etcd configuration" ++ return $OCF_ERR_CONFIGURED + fi + + run_opts="$run_opts \ diff --git a/SOURCES/RHEL-133937-podman-etcd-align-variable-names-with-etcd-3.6-pod-manifest.patch b/SOURCES/RHEL-133937-podman-etcd-align-variable-names-with-etcd-3.6-pod-manifest.patch new file mode 100644 index 0000000..0eb6dd4 --- /dev/null +++ b/SOURCES/RHEL-133937-podman-etcd-align-variable-names-with-etcd-3.6-pod-manifest.patch @@ -0,0 +1,52 @@ +From 8b70d5026fee0910a52f0fdefcaf930b2c0a3909 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Wed, 3 Dec 2025 11:38:25 +0100 +Subject: [PATCH] podman-etcd: sync environment variables with Pod manifest + +The EXPERIMENTAL substring was removed from +ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION and +ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERNAL in the Pod +manifest. This change aligns our config with those updates. + +NOTE: Some Etcd flags deprecated in v3.6 will be replaced in a future +change. + +See: https://github.com/openshift/cluster-etcd-operator/pull/1507 +--- + heartbeat/podman-etcd | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 242226bb1..bb2900536 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -604,8 +604,8 @@ prepare_env() { + fi + ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT") + ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF") +- ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") +- ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") ++ ETCD_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_WARNING_APPLY_DURATION") ++ ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL") + ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest 
"ETCD_HEARTBEAT_INTERVAL") + ETCD_QUOTA_BACKEND_BYTES=$(get_env_from_manifest "ETCD_QUOTA_BACKEND_BYTES") + ETCD_SOCKET_REUSE_ADDRESS=$(get_env_from_manifest "ETCD_SOCKET_REUSE_ADDRESS") +@@ -660,6 +660,7 @@ force-new-cluster-bump-amount: $BUMP_REV" + + # the space indentation for client-transport-security and peer-transport-security + # is required for correct YAML formatting. ++ # TODO: replace flags deprecated in Etcd v3.6 + cat > "$ETCD_CONFIGURATION_FILE" << EOF + logger: zap + log-level: info +@@ -692,8 +693,8 @@ listen-metrics-urls: "$(ip_url ${LISTEN_METRICS_URLS}):9978" + metrics: extensive + experimental-initial-corrupt-check: true + experimental-max-learners: 1 +-experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") +-experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") ++experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_WARNING_APPLY_DURATION") ++experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL") + EOF + rc=$? 
+ if [ $rc -ne 0 ]; then diff --git a/SOURCES/RHEL-139519-podman-etcd-verify-no-containers-running-or-being-deleted.patch b/SOURCES/RHEL-139519-podman-etcd-verify-no-containers-running-or-being-deleted.patch new file mode 100644 index 0000000..b0ba9a8 --- /dev/null +++ b/SOURCES/RHEL-139519-podman-etcd-verify-no-containers-running-or-being-deleted.patch @@ -0,0 +1,25 @@ +From 7449fd88d21650db1eaafdc7ef85bf3553f6ac7f Mon Sep 17 00:00:00 2001 +From: Pablo Fontanilla +Date: Thu, 8 Jan 2026 09:42:42 +0100 +Subject: [PATCH] OCPBUGS-64765: podman-etcd: add -a option to crictl ps + (#2112) + +--- + heartbeat/podman-etcd | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index bb2900536..591a663bf 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -738,8 +738,8 @@ archive_data_folder() + + etcd_pod_container_exists() { + local count_matches +- # Check whether the etcd pod exists on the same node (header line included) +- count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l) ++ # Check whether the etcd pod exists on the same node (including stopped/exited containers) ++ count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps -a --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l) + if [ "$count_matches" -eq 1 ]; then + # etcd pod found + return 0 diff --git a/SOURCES/RHEL-143527-powervs-move-ip-powervs-subnet-fix-error-logging.patch b/SOURCES/RHEL-143527-powervs-move-ip-powervs-subnet-fix-error-logging.patch new file mode 100644 index 0000000..9558783 --- /dev/null +++ b/SOURCES/RHEL-143527-powervs-move-ip-powervs-subnet-fix-error-logging.patch @@ -0,0 +1,54 @@ +From 8f5c5a2a472ab404b6fd15ff492e72904dc8ac20 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 22 Jan 2026 07:37:40 +0100 +Subject: [PATCH] 
powervs-move-ip/powervs-subnet: fix error logging + +--- + heartbeat/powervs-move-ip.in | 4 ++-- + heartbeat/powervs-subnet.in | 10 ++++++---- + 2 files changed, 8 insertions(+), 6 deletions(-) + +diff --git a/heartbeat/powervs-move-ip.in b/heartbeat/powervs-move-ip.in +index e2250c998..0eea89f1d 100755 +--- a/heartbeat/powervs-move-ip.in ++++ b/heartbeat/powervs-move-ip.in +@@ -310,9 +310,9 @@ class PowerCloudTokenManager: + return json.load(f) + finally: + fcntl.flock(f, fcntl.LOCK_UN) +- except (json.JSONDecodeError, FileNotFoundError, PermissionError): ++ except (json.JSONDecodeError, FileNotFoundError, PermissionError) as e: + ocf.logger.warning( +- "[PowerCloudTokenManager] _read_cache: failed to read token cache read due to missing file or malformed JSON." ++ f"[PowerCloudTokenManager] _read_cache: failed to read token cache read due to missing file or malformed JSON: '{e}'" + ) + return {} + +diff --git a/heartbeat/powervs-subnet.in b/heartbeat/powervs-subnet.in +index 062b1235e..b8f3864e9 100755 +--- a/heartbeat/powervs-subnet.in ++++ b/heartbeat/powervs-subnet.in +@@ -837,8 +837,9 @@ def start_action( + if rc != ocf.OCF_SUCCESS: + return rc + +- if monitor_action(**res_options) != ocf.OCF_SUCCESS: +- raise PowerCloudAPIError(f"start_action: start subnet: {ws.subnet_name} failed") ++ rc = monitor_action(**res_options) ++ if rc != ocf.OCF_SUCCESS: ++ raise PowerCloudAPIError(f"start_action: start subnet: {ws.subnet_name} failed", rc) + + ocf.logger.info( + f"start_action: finished, added connection {conn_name} for subnet {ws.subnet_name}" +@@ -872,8 +873,9 @@ def stop_action( + + ws.subnet_remove() + +- if monitor_action(**res_options) != ocf.OCF_NOT_RUNNING: +- raise PowerCloudAPIError(f"stop_action: stop subnet {ws.subnet_name} failed") ++ rc = monitor_action(**res_options) ++ if rc != ocf.OCF_NOT_RUNNING: ++ raise PowerCloudAPIError(f"stop_action: stop subnet {ws.subnet_name} failed", rc) + + ocf.logger.info( + f"stop_action: finished, deleted connection 
for subnet {ws.subnet_name}" diff --git a/SOURCES/RHEL-145628-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch b/SOURCES/RHEL-145628-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch new file mode 100644 index 0000000..56d7701 --- /dev/null +++ b/SOURCES/RHEL-145628-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch @@ -0,0 +1,278 @@ +From 8df1e4dfdee960b971fb598c043b4ccb2b9fefca Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Mon, 3 Nov 2025 12:34:29 +0100 +Subject: [PATCH] podman-etcd: enhance etcd data backup with snapshots and + retention + +Replace basic data directory backup with proper etcd database snapshot +functionality. The new implementation: +- Creates timestamped snapshot files instead of moving the entire data directory +- Stores backups in a non-volatile location (backup_location parameter) instead + of the previous volatile HA_RSCTMP directory +- Validates backup file existence and size after creation +- Implements configurable retention policy via max_backup_snapshots parameter +- Automatically cleans up old snapshots to control storage usage + +Default retention is set to 3 snapshots, with backups stored in /var/lib/etcd +by default. This provides better backup reliability, persistence across reboots, +and storage management for etcd databases. 
+--- + heartbeat/podman-etcd | 205 ++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 196 insertions(+), 9 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index bb2900536..1d717ec00 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0" + OCF_RESKEY_oom_default="-997" + OCF_RESKEY_config_location_default="/var/lib/etcd" + OCF_RESKEY_backup_location_default="/var/lib/etcd" ++OCF_RESKEY_max_backup_snapshots_default="3" + + : ${OCF_RESKEY_image=${OCF_RESKEY_image_default}} + : ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}} +@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd" + : ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}} + : ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}} + : ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}} ++: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}} + + + ####################################################################### +@@ -275,6 +277,17 @@ The directory where the resource agent stores its backups. + + + ++ ++ ++Maximum number of etcd database snapshots to retain. When a new snapshot is created, ++older snapshots will be automatically removed to maintain this limit. This helps ++control storage usage while ensuring recent backups are available for recovery. ++Set max_backup_snapshots=0 to disable backups. ++ ++Maximum number of backup snapshots to retain ++ ++ ++ + + + +@@ -720,20 +733,190 @@ EOF + return $OCF_SUCCESS + } + ++# Remove etcd member directory to allow the node to rejoin the cluster as a learner. ++# ++# When a node rejoins an etcd cluster, it must start fresh as a learner to prevent ++# data inconsistencies. This function removes the member directory and syncs to disk. 
++# ++# Returns: ++# OCF_SUCCESS - Member directory successfully removed ++# OCF_ERR_GENERIC - Failed to remove member directory (critical error) ++wipe_data_folder_for_learner() ++{ ++ ocf_log info "deleting etcd member directory ($ETCD_MEMBER_DIR) to enable learner rejoin" ++ if ! rm -rf "$ETCD_MEMBER_DIR"; then ++ ocf_log err "could not delete etcd member directory ($ETCD_MEMBER_DIR), error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ sync ++ return $OCF_SUCCESS ++} ++ ++ ++# Calculate available disk space in bytes for a given directory. ++# ++# This function queries the filesystem and returns available space in bytes. ++# It converts df output (KB) to bytes for consistent size comparisons. ++# ++# Arguments: ++# $1 - Target directory path to check ++# ++# Returns: ++# OCF_SUCCESS - Available space in bytes (via stdout) ++# OCF_ERR_GENERIC - Failed to determine available space (error message via stdout) ++get_available_space_in_directory() ++{ ++ local target_dir=$1 ++ local available_space_kb ++ local available_space_bytes ++ ++ available_space_kb=$(df -P "$target_dir" | awk 'NR==2 {print $4}' 2>&1) ++ ++ # Validate output is numeric ++ if ! echo "$available_space_kb" | grep -q '^[0-9]\+$'; then ++ echo "df command failed or returned invalid value: $available_space_kb" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ available_space_bytes=$((available_space_kb*1024)) ++ echo "$available_space_bytes" ++ return $OCF_SUCCESS ++} ++ ++# Archive etcd database with backup and cleanup ++# ++# This function creates a backup copy of the etcd database, validates it, and ++# removes old backups according to the retention policy. Backups are optional ++# and can be disabled by setting max_backup_snapshots=0. ++# ++# Error handling strategy: ++# All backup failures return OCF_SUCCESS to prevent blocking cluster recovery. ++# Backups are beneficial but not critical for recovery operations. 
++# ++# NOTE: This function cannot use etcdctl/etcdutl utilities because the etcd ++# server is not running when this backup is performed. + archive_data_folder() + { +- # TODO: use etcd snapshots +- local dest_dir_name +- local data_dir="/var/lib/etcd/member" ++ local backup_dir="$OCF_RESKEY_backup_location" ++ local etcd_db_path="$ETCD_MEMBER_DIR/snap/db" + +- dest_dir_name="members-snapshot-$(date +%Y%M%d%H%M%S)" +- if [ ! -d $data_dir ]; then +- ocf_log info "no data dir to backup" ++ if [ "$OCF_RESKEY_max_backup_snapshots" -eq 0 ]; then ++ ocf_log debug "etcd backup disabled (max_backup_snapshots=0)" + return $OCF_SUCCESS + fi +- ocf_log info "backing up $data_dir under $HA_RSCTMP/$dest_dir_name" +- mv "$data_dir" "$HA_RSCTMP/$dest_dir_name" +- sync ++ ++ # Check if the etcd database file exists ++ if [ ! -f "$etcd_db_path" ]; then ++ ocf_log warn "backup skipped: etcd database file not found at '$etcd_db_path'" ++ return $OCF_SUCCESS ++ fi ++ ++ # Ensure backup directory exists ++ if [ ! -d "$backup_dir" ]; then ++ ocf_log debug "creating backup directory: '$backup_dir'" ++ if ! mkdir -p "$backup_dir"; then ++ ocf_log warn "backup skipped: failed to create backup directory '$backup_dir'" ++ return $OCF_SUCCESS ++ fi ++ fi ++ ++ ocf_log debug "checking disk space: backup_dir=$backup_dir" ++ local available_space_bytes ++ if ! available_space_bytes=$(get_available_space_in_directory "$backup_dir"); then ++ ocf_log warn "backup skipped: could not compute available disk space in '$backup_dir', error msg: $available_space_bytes" ++ return $OCF_SUCCESS ++ fi ++ ++ local required_space_bytes ++ required_space_bytes=$(stat -c %s "$etcd_db_path" 2>&1) ++ if ! 
echo "$required_space_bytes" | grep -q '^[0-9]\+$'; then ++ ocf_log warn "backup skipped: could not compute etcd database size at '$etcd_db_path', error msg: $required_space_bytes" ++ return $OCF_SUCCESS ++ fi ++ ++ if [ "$required_space_bytes" -gt "$available_space_bytes" ]; then ++ ocf_log warn "backup skipped: insufficient disk space (required: ${required_space_bytes}B, available: ${available_space_bytes}B)" ++ return $OCF_SUCCESS ++ fi ++ ++ # Generate timestamp and backup filename ++ local timestamp ++ timestamp=$(date +%Y%m%d-%H%M%S) ++ ++ local backup_file ++ backup_file="$backup_dir/snapshot-$timestamp.db" ++ ++ ocf_log info "creating etcd database backup: '$backup_file'" ++ ++ # Create the backup by copying the database file (enable Copy-on-Write copy) ++ if ! cp --reflink=auto "$etcd_db_path" "$backup_file"; then ++ ocf_log warn "backup creation failed: could not copy '$etcd_db_path' to '$backup_file', error code: $?" ++ return $OCF_SUCCESS ++ fi ++ ++ # Validate the backup file exists and has the expected size ++ if [ ! -f "$backup_file" ]; then ++ ocf_log warn "backup validation failed: snapshot file '$backup_file' does not exist" ++ return $OCF_SUCCESS ++ fi ++ ++ local backup_size_bytes ++ backup_size_bytes=$(stat -c %s "$backup_file" 2>/dev/null || echo "0") ++ if [ "$backup_size_bytes" -ne "$required_space_bytes" ]; then ++ ocf_log warn "backup validation failed: size mismatch (expected: ${required_space_bytes}B, got: ${backup_size_bytes}B)" ++ rm -f "$backup_file" ++ return $OCF_SUCCESS ++ fi ++ ++ ocf_log info "backup created successfully: $backup_file (${backup_size_bytes}B)" ++ ++ # Cleanup old backups based on retention policy ++ cleanup_old_backups "$backup_dir" ++ ++ return $OCF_SUCCESS ++} ++ ++cleanup_old_backups() ++{ ++ local backup_dir="$1" ++ local max_snapshots="$OCF_RESKEY_max_backup_snapshots" ++ local backup_count ++ local backups_to_remove ++ local old_backups ++ ++ # Validate max_snapshots is a positive integer ++ if ! 
echo "$max_snapshots" | grep -q '^[1-9][0-9]*$'; then ++ ocf_log warn "invalid max_backup_snapshots value. Positive integer expected, got '$max_snapshots' instead, skipping cleanup" ++ return $OCF_SUCCESS ++ fi ++ ++ # Count existing backup files ++ backup_count=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f 2>/dev/null | wc -l) ++ ++ if [ "$backup_count" -le "$max_snapshots" ]; then ++ ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed" ++ return $OCF_SUCCESS ++ fi ++ ++ # Calculate how many backups to remove ++ backups_to_remove=$((backup_count - max_snapshots)) ++ ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots" ++ ++ # Find oldest backups sorted by modification time ++ # -t sorts by modification time, -r reverses (oldest first) ++ # -print0 and -0 handle filenames with spaces/special characters ++ old_backups=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f -print0 2>/dev/null | \ ++ xargs -0 -r ls -tr | \ ++ head -n "$backups_to_remove") ++ ++ if [ -n "$old_backups" ]; then ++ ocf_log info "removing old backups: $old_backups" ++ if ! echo "$old_backups" | xargs -r rm -f; then ++ ocf_log warn "failed to remove some old backups, error code: $?" ++ fi ++ fi ++ ++ return $OCF_SUCCESS + } + + etcd_pod_container_exists() { +@@ -1902,6 +2085,9 @@ podman_start() + fi + + archive_data_folder ++ if ! 
wipe_data_folder_for_learner; then ++ return "$OCF_ERR_GENERIC" ++ fi + fi + + ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced" +@@ -2251,6 +2437,7 @@ CONTAINER=$OCF_RESKEY_name + POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml" + ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml" + ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz" ++ETCD_MEMBER_DIR="/var/lib/etcd/member" + ETCD_REVISION_JSON="/var/lib/etcd/revision.json" + ETCD_REVISION_BUMP_PERCENTAGE=0.2 + ETCD_BUMP_REV_DEFAULT=1000000000 diff --git a/SOURCES/RHEL-150700-podman-etcd-set-attributes-if-they-fail-during-force-new-cluster.patch b/SOURCES/RHEL-150700-podman-etcd-set-attributes-if-they-fail-during-force-new-cluster.patch new file mode 100644 index 0000000..cd458ea --- /dev/null +++ b/SOURCES/RHEL-150700-podman-etcd-set-attributes-if-they-fail-during-force-new-cluster.patch @@ -0,0 +1,111 @@ +From e4d311b40d8ded2a1921a0e5c01cb49a07c9fb35 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Thu, 5 Feb 2026 19:31:42 +0100 +Subject: [PATCH] podman-etcd: fix learner node attribute not set after etcdctl + failure + +Ensure that learner_node attribute is always set when the member list +contains one learner member. + +Moreover: +* Ensure set_standalone_node is called after adding a learner member. +* Capture stderr from etcdctl for better error logging. 
+--- + heartbeat/podman-etcd | 61 +++++++++++++++++++++++++++---------------- + 1 file changed, 38 insertions(+), 23 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 77525ddb7..06814ad89 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1082,7 +1082,7 @@ add_member_as_learner() + local peer_url=$(ip_url $member_ip) + + ocf_log info "add $member_name ($member_ip) to the member list as learner" +- out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner) ++ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner 2>&1) + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out" +@@ -1429,10 +1429,22 @@ detect_cluster_leadership_loss() + manage_peer_membership() + { + local member_list_json="$1" ++ local peer_ip_map_entry ++ local peer_member_name ++ local peer_member_ip ++ local peer_member_id ++ ++ # Get peer node name and IP ++ peer_ip_map_entry=$(echo "$OCF_RESKEY_node_ip_map" | tr ';' '\n' | grep -vF "$NODENAME") ++ if [ -z "$peer_ip_map_entry" ]; then ++ ocf_exit_reason "manage_peer_membership: could not parse node_ip_map: '$OCF_RESKEY_node_ip_map'" ++ exit $OCF_ERR_CONFIGURED ++ fi ++ peer_member_name=$(echo "$peer_ip_map_entry" | cut -d: -f1) ++ peer_member_ip=$(echo "$peer_ip_map_entry" | cut -d: -f2-) + +- # Example of .members[] instance fields in member list json format: +- # NOTE that "name" is present in voting members only, while "isLearner" in learner members only +- # and the value is always true (not a string) in that case. ++ # Parsing the member list's json output to find a "learner" member. 
++ # Example of .members[] instance fields in member list json format: + # { + # "ID": , + # "name": "", +@@ -1443,26 +1455,28 @@ manage_peer_membership() + # "https://:2379" + # ] + # } +- for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do +- name=$(echo "$node" | cut -d: -f1) +- # do not check itself +- if [ "$name" = "$NODENAME" ]; then +- continue +- fi ++ # NOTE that the "name" field is present in voting members only, while "isLearner" ++ # field in learner members only and the value is always true (not a string) in that case. ++ peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID") ++ if [ -z "$peer_member_id" ]; then ++ ocf_log info "$peer_member_name is not in the members list" ++ add_member_as_learner "$peer_member_name" "$peer_member_ip" ++ set_standalone_node ++ return ++ fi + +- # Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name. +- ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6 +- peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID") +- if [ -z "$peer_member_id" ]; then +- ocf_log info "$name is not in the members list" +- add_member_as_learner "$name" "$ip" +- set_standalone_node +- else +- ocf_log debug "$name is in the members list by IP: $ip" +- # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss. 
+- reconcile_member_state "$member_list_json" +- fi +- done ++ # Ensure learner_node attribute is always set when we have a learner member ++ local learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID") ++ local current_learner_node=$(attribute_learner_node get) ++ if [ -n "$learner_member_id" ] && [ -z "$current_learner_node" ]; then ++ ocf_log debug "$peer_member_name found as learner in member list, but learner_node attribute was not set. Updating" ++ attribute_learner_node update "$peer_member_name" ++ return ++ fi ++ ++ ocf_log debug "$peer_member_name is in the members list by IP: $peer_member_ip" ++ # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss. ++ reconcile_member_state "$member_list_json" + } + + check_peer() +@@ -2209,6 +2223,7 @@ podman_start() + peer_node_ip="$(attribute_node_ip_peer)" + if [ -n "$peer_node_name" ] && [ -n "$peer_node_ip" ]; then + add_member_as_learner "$peer_node_name" "$peer_node_ip" ++ set_standalone_node + else + ocf_log err "could not add peer as learner (peer node name: ${peer_node_name:-unknown}, peer ip: ${peer_node_ip:-unknown})" + fi diff --git a/SOURCES/RHEL-42513-powervs-subnet-new-ra.patch b/SOURCES/RHEL-42513-1-powervs-subnet-new-ra.patch similarity index 100% rename from SOURCES/RHEL-42513-powervs-subnet-new-ra.patch rename to SOURCES/RHEL-42513-1-powervs-subnet-new-ra.patch diff --git a/SOURCES/RHEL-42513-2-build-dont-build-powervs-subnet-if-dependencies-are-missing.patch b/SOURCES/RHEL-42513-2-build-dont-build-powervs-subnet-if-dependencies-are-missing.patch new file mode 100644 index 0000000..dd092d9 --- /dev/null +++ b/SOURCES/RHEL-42513-2-build-dont-build-powervs-subnet-if-dependencies-are-missing.patch @@ -0,0 +1,84 @@ +From 277370f569b34e1cfb49637f9a00afc20bcd4c54 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: 
Wed, 17 Jul 2024 10:43:29 +0200 +Subject: [PATCH] build: dont build powervs-subnet if dependencies are missing + +--- + configure.ac | 9 +++++++++ + doc/man/Makefile.am | 5 ++++- + heartbeat/Makefile.am | 5 ++++- + 3 files changed, 17 insertions(+), 2 deletions(-) + +diff --git a/configure.ac b/configure.ac +index b785e2c2c..21ce27423 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -519,6 +519,8 @@ fi + + AC_PYTHON_MODULE(json) + AC_PYTHON_MODULE(pyroute2) ++AC_PYTHON_MODULE(requests) ++AC_PYTHON_MODULE(urllib3) + + AS_VERSION_COMPARE([$PYTHON_VERSION], [3.6], [BUILD_OCF_PY=0], [BUILD_OCF_PY=1], [BUILD_OCF_PY=1]) + +@@ -557,6 +559,13 @@ if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then + fi + AM_CONDITIONAL(BUILD_GCP_VPC_MOVE_VIP, test $BUILD_GCP_VPC_MOVE_VIP -eq 1) + ++BUILD_POWERVS_SUBNET=1 ++if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0 || test "x${HAVE_PYMOD_REQUESTS}" != xyes || test "x${HAVE_PYMOD_URLLIB3}" != xyes; then ++ BUILD_POWERVS_SUBNET=0 ++ AC_MSG_WARN("Not building powervs-subnet") ++fi ++AM_CONDITIONAL(BUILD_POWERVS_SUBNET, test $BUILD_POWERVS_SUBNET -eq 1) ++ + AC_PATH_PROGS(ROUTE, route) + AC_DEFINE_UNQUOTED(ROUTE, "$ROUTE", path to route command) + +diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am +index e577e6357..ef7639bff 100644 +--- a/doc/man/Makefile.am ++++ b/doc/man/Makefile.am +@@ -190,7 +190,6 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \ + ocf_heartbeat_portblock.7 \ + ocf_heartbeat_postfix.7 \ + ocf_heartbeat_pound.7 \ +- ocf_heartbeat_powervs-subnet.7 \ + ocf_heartbeat_proftpd.7 \ + ocf_heartbeat_rabbitmq-cluster.7 \ + ocf_heartbeat_rabbitmq-server-ha.7 \ +@@ -238,6 +237,10 @@ if BUILD_GCP_VPC_MOVE_VIP + man_MANS += ocf_heartbeat_gcp-vpc-move-vip.7 + endif + ++if BUILD_POWERVS_SUBNET ++man_MANS += ocf_heartbeat_powervs-subnet.7 ++endif ++ + xmlfiles = $(man_MANS:.7=.xml) + + %.1 %.5 %.7 %.8: %.xml +diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am +index ff73a15aa..409847970 100644 +--- a/heartbeat/Makefile.am 
++++ b/heartbeat/Makefile.am +@@ -162,7 +162,6 @@ ocf_SCRIPTS = AoEtarget \ + portblock \ + postfix \ + pound \ +- powervs-subnet \ + proftpd \ + rabbitmq-cluster \ + rabbitmq-server-ha \ +@@ -207,6 +206,10 @@ if BUILD_GCP_VPC_MOVE_VIP + ocf_SCRIPTS += gcp-vpc-move-vip + endif + ++if BUILD_POWERVS_SUBNET ++ocf_SCRIPTS += powervs-subnet ++endif ++ + ocfcommondir = $(OCF_LIB_DIR_PREFIX)/heartbeat + ocfcommon_DATA = ocf-shellfuncs \ + ocf-binaries \ diff --git a/SOURCES/RHEL-42513-powervs-subnet-wait-for-IP.patch b/SOURCES/RHEL-42513-powervs-subnet-wait-for-IP.patch new file mode 100644 index 0000000..60f5cea --- /dev/null +++ b/SOURCES/RHEL-42513-powervs-subnet-wait-for-IP.patch @@ -0,0 +1,43 @@ +From 0b4bf9c23eb60455da6c6a16c1df19282ab2a8b5 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Fri, 9 Jan 2026 12:56:14 +0100 +Subject: [PATCH] powervs-subnet: wait until IP is activated before running + monitor-check + +--- + heartbeat/powervs-subnet.in | 15 +++++++++++++-- + 1 file changed, 13 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/powervs-subnet.in b/heartbeat/powervs-subnet.in +index 84e86c0c4..062b1235e 100755 +--- a/heartbeat/powervs-subnet.in ++++ b/heartbeat/powervs-subnet.in +@@ -243,7 +243,16 @@ class nmcli: + + @staticmethod + def up(name, **kwargs): +- return nmcli._nmcli_cmd("connection", "up", name, **kwargs) ++ nmcli._nmcli_cmd("connection", "up", name, **kwargs) ++ ++ for i in range(1, 10): ++ time.sleep(1) ++ status = nmcli._nmcli_cmd("connection", "show", name, **kwargs) ++ if len(status.get("IP4.ADDRESS[1]", "")) > 0: ++ return ocf.OCF_SUCCESS ++ ocf.logger.warning(f"nmcli.connection.up: check {i} of 10: IP not yet available.") ++ ++ return ocf.OCF_ERR_GENERIC + + @staticmethod + def find(match_key, match_value): +@@ -824,7 +833,9 @@ def start_action( + conn_options.update({"802-3-ethernet.mtu": "9000", "ethtool.feature-tso": "on"}) + + nmcli.connection.add(conn_name, options=conn_options) +- nmcli.connection.up(conn_name) ++ rc = 
nmcli.connection.up(conn_name) ++ if rc != ocf.OCF_SUCCESS: ++ return rc + + if monitor_action(**res_options) != ocf.OCF_SUCCESS: + raise PowerCloudAPIError(f"start_action: start subnet: {ws.subnet_name} failed") diff --git a/SOURCES/RHEL-64949-oracle-improve-monpassword-description.patch b/SOURCES/RHEL-64949-oracle-improve-monpassword-description.patch new file mode 100644 index 0000000..c6d6827 --- /dev/null +++ b/SOURCES/RHEL-64949-oracle-improve-monpassword-description.patch @@ -0,0 +1,23 @@ +From eac983c14f4695f491fe430a78d8d18a1481c60c Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 29 Oct 2025 15:15:54 +0100 +Subject: [PATCH] oracle: improve monpassword description + +--- + heartbeat/oracle | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/heartbeat/oracle b/heartbeat/oracle +index 8cf4e3649c..c85e499833 100755 +--- a/heartbeat/oracle ++++ b/heartbeat/oracle +@@ -132,8 +132,7 @@ that the password for this user does not expire. + + Password for the monitoring user. Make sure + that the password for this user does not expire. +-Need to explicitly set a password to a new monitor +-user for the security reason. ++Set to avoid using the agents default password for "monuser". 
+ + monpassword + diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec index 2d0fdd2..86bc4bf 100644 --- a/SPECS/resource-agents.spec +++ b/SPECS/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.10.0 -Release: 80%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 108%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -142,7 +142,7 @@ Patch89: RHEL-61888-ocf-shellfuncs-only-create-update-reload-systemd-drop-in-if- Patch90: RHEL-62200-IPaddr2-improve-fail-logic-check-ip_status-after-adding-IP.patch Patch91: RHEL-40589-azure-events-az-update-API-versions-add-retry-for-metadata.patch Patch92: RHEL-58632-azure-events-use-node-name-from-cluster.patch -Patch93: RHEL-42513-powervs-subnet-new-ra.patch +Patch93: RHEL-42513-1-powervs-subnet-new-ra.patch Patch94: RHEL-66292-1-aws-agents-reuse-imds-token-until-it-expires.patch Patch95: RHEL-66292-2-aws-agents-reuse-imds-token-improvements.patch Patch96: RHEL-68739-awsvip-add-interface-parameter.patch @@ -163,6 +163,42 @@ Patch110: RHEL-70044-IPaddr2-IPsrcaddr-avoid-duplicate-route-issues.patch Patch111: RHEL-7688-IPaddr2-add-link-status-DOWN-LOWERLAYERDOWN-check.patch Patch112: RHEL-97123-Filesystem-fix-issue-with-Vormetric-mounts.patch Patch113: RHEL-102727-ocf-shellfuncs-remove-extra-sleep-from-curl_retry.patch +Patch114: RHEL-102610-podman-etcd-add-oom-parameter.patch +Patch115: RHEL-42513-2-build-dont-build-powervs-subnet-if-dependencies-are-missing.patch +Patch116: RHEL-114489-1-powervs-move-ip-new-ra.patch +Patch117: RHEL-114489-2-powervs-move-ip-set-bundled-path.patch +Patch118: RHEL-115785-RHEL-115782-1-db2-add-skip_basic_sql_health_check-and-monitor-parameters.patch +Patch119: 
RHEL-113767-podman-etcd-wrap-ipv6-address-in-brackets.patch +Patch120: RHEL-113766-podman-etcd-preserve-containers-for-debugging.patch +Patch121: RHEL-116206-podman-etcd-add-cluster-wide-force_new_cluster-attribute-check.patch +Patch122: RHEL-116151-1-ocf-shellfuncs-add-ocf_promotion_score.patch +Patch123: RHEL-116151-2-portblock-add-promotable-support.patch +Patch124: RHEL-116151-3-portblock-fixes-add-method-and-status_check-parameters.patch +Patch125: RHEL-119495-podman-etcd-add-automatic-learner-member-promotion.patch +Patch126: RHEL-118624-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch +Patch127: RHEL-123887-podman-etcd-certificate-rotation.patch +Patch128: RHEL-123906-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch +Patch129: RHEL-115785-RHEL-115782-2-db2-fix-variable-name.patch +Patch130: RHEL-118621-MailTo-add-s-nail-support-for-multiple-recipients.patch +Patch131: RHEL-64949-oracle-improve-monpassword-description.patch +Patch132: RHEL-109485-1-nfsserver-support-non-clustered-kerberized-mounts.patch +Patch133: RHEL-109485-2-nfsserver-fix-error-message.patch +Patch134: RHEL-114489-3-powervs-move-ip-add-iflabel-parameter.patch +Patch135: RHEL-127006-storage_mon-fix-handling-of-4k-block-devices.patch +Patch136: RHEL-127891-podman-etcd-exclude-stopping-resources-from-active-count.patch +Patch137: RHEL-126087-1-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch +Patch138: RHEL-121986-Filesystem-speed-up-get-PIDs.patch +Patch139: RHEL-130580-1-podman-etcd-prevent-last-active-member-from-leaving.patch +Patch140: RHEL-130580-2-podman-etcd-remove-test-code.patch +Patch141: RHEL-126087-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch +Patch142: RHEL-131185-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch +Patch143: RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch +Patch144: 
RHEL-133937-podman-etcd-align-variable-names-with-etcd-3.6-pod-manifest.patch +Patch145: RHEL-139519-podman-etcd-verify-no-containers-running-or-being-deleted.patch +Patch146: RHEL-42513-powervs-subnet-wait-for-IP.patch +Patch147: RHEL-143527-powervs-move-ip-powervs-subnet-fix-error-logging.patch +Patch148: RHEL-145628-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch +Patch149: RHEL-150700-podman-etcd-set-attributes-if-they-fail-during-force-new-cluster.patch # bundled ha-cloud-support libs Patch500: ha-cloud-support-aliyun.patch @@ -188,7 +224,7 @@ BuildRequires: python-devel # for pgsqlms BuildRequires: perl-devel perl-English perl-FindBin -%ifarch x86_64 +%ifarch x86_64 ppc64le BuildRequires: ha-cloud-support %endif @@ -216,7 +252,15 @@ Requires: which Requires: /sbin/fsck Requires: /usr/sbin/fsck.ext2 /usr/sbin/fsck.ext3 /usr/sbin/fsck.ext4 Requires: /usr/sbin/fsck.xfs +%if 0%{?fedora} > 40 || 0%{?rhel} > 9 || 0%{?suse_version} +Recommends: /usr/sbin/mount.nfs /usr/sbin/mount.nfs4 +%else +%if 0%{?rhel} > 8 +Recommends: /sbin/mount.nfs /sbin/mount.nfs4 +%else Requires: /sbin/mount.nfs /sbin/mount.nfs4 +%endif +%endif %if (0%{?fedora} && 0%{?fedora} < 33) || (0%{?rhel} && 0%{?rhel} < 9) || (0%{?centos} && 0%{?centos} < 9) || 0%{?suse_version} %if (0%{?rhel} && 0%{?rhel} < 8) || (0%{?centos} && 0%{?centos} < 8) Requires: /usr/sbin/mount.cifs @@ -232,7 +276,20 @@ Requires: /sbin/ip Requires: /usr/sbin/lvm # nfsserver / netfs.sh -Requires: /usr/sbin/rpc.nfsd /sbin/rpc.statd /usr/sbin/rpc.mountd +%if 0%{?fedora} > 40 || 0%{?rhel} > 9 || 0%{?suse_version} +Recommends: /usr/sbin/rpc.statd +%else +%if 0%{?rhel} > 8 +Recommends: /sbin/rpc.statd +%else +Requires: /sbin/rpc.statd +%endif +%endif +%if 0%{?fedora} > 40 || 0%{?rhel} > 8 || 0%{?suse_version} +Recommends: /usr/sbin/rpc.nfsd /usr/sbin/rpc.mountd +%else +Requires: /usr/sbin/rpc.nfsd /usr/sbin/rpc.mountd +%endif # ocf.py Requires: python3 @@ -400,6 +457,42 @@ exit 1 %patch -p1 -P 111 %patch 
-p1 -P 112 %patch -p1 -P 113 +%patch -p1 -P 114 +%patch -p1 -P 115 -F2 +%patch -p1 -P 116 +%patch -p1 -P 117 +%patch -p1 -P 118 +%patch -p1 -P 119 +%patch -p1 -P 120 +%patch -p1 -P 121 +%patch -p1 -P 122 +%patch -p1 -P 123 +%patch -p1 -P 124 +%patch -p1 -P 125 +%patch -p1 -P 126 +%patch -p1 -P 127 +%patch -p1 -P 128 +%patch -p1 -P 129 +%patch -p1 -P 130 +%patch -p1 -P 131 +%patch -p1 -P 132 +%patch -p1 -P 133 +%patch -p1 -P 134 +%patch -p1 -P 135 +%patch -p1 -P 136 +%patch -p1 -P 137 -F2 +%patch -p1 -P 138 +%patch -p1 -P 139 +%patch -p1 -P 140 +%patch -p1 -P 141 +%patch -p1 -P 142 +%patch -p1 -P 143 +%patch -p1 -P 144 +%patch -p1 -P 145 +%patch -p1 -P 146 +%patch -p1 -P 147 +%patch -p1 -P 148 +%patch -p1 -P 149 # bundled ha-cloud-support libs %patch -p1 -P 500 @@ -441,6 +534,9 @@ export CFLAGS %endif %ifarch x86_64 PYTHONPATH="%{_usr}/lib/fence-agents/support/google" \ +%endif +%ifarch ppc64le + PYTHONPATH="%{_usr}/lib/fence-agents/support/ibm" \ %endif %{conf_opt_fatal} \ %if %{defined _unitdir} @@ -729,6 +825,136 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Wed Feb 18 2026 Oyvind Albrigtsen - 4.10.0-108 +- podman-etcd: set attributes if they fail during force-new-cluster + + Resolves: RHEL-150700 + +* Wed Feb 4 2026 Oyvind Albrigtsen - 4.10.0-107 +- podman-etcd: enhance etcd data backup with snapshots and retention + + Resolves: RHEL-145628 + +* Thu Jan 22 2026 Oyvind Albrigtsen - 4.10.0-106 +- powervs-move-ip/powervs-subnet: fix error logging + + Resolves: RHEL-143527 + +* Wed Jan 14 2026 Oyvind Albrigtsen - 4.10.0-105 +- powervs-subnet: new resource agent + + Resolves: RHEL-42513 + +* Thu Jan 8 2026 Oyvind Albrigtsen - 4.10.0-104 +- podman-etcd: verify that no static pod containers are running or + being deleted before starting + + Resolves: RHEL-139519 + +* Mon Dec 8 2025 Oyvind Albrigtsen - 4.10.0-103 +- podman-etcd: align environment variable names with Etcd v3.6 Pod + manifest + + Resolves: 
RHEL-133937 + +* Tue Dec 2 2025 Oyvind Albrigtsen - 4.10.0-102 +- podman-etcd: prevent retries on fatal errors + + Resolves: RHEL-132052 + +* Thu Nov 27 2025 Oyvind Albrigtsen - 4.10.0-101 +- podman-etcd: prevent learner from starting before cluster is ready + + Resolves: RHEL-131185 + +* Tue Nov 25 2025 Oyvind Albrigtsen - 4.10.0-100 +- podman-etcd: add container crash detection with coordinated recovery + + Resolves: RHEL-126087 + +* Mon Nov 24 2025 Oyvind Albrigtsen - 4.10.0-99 +- podman-etcd: prevent last active member from leaving the etcd member + list + + Resolves: RHEL-130580 + +* Thu Nov 20 2025 Oyvind Albrigtsen - 4.10.0-98 +- Filesystem: speed up get PIDs + + Resolves: RHEL-121986 + +* Thu Nov 13 2025 Oyvind Albrigtsen - 4.10.0-97 +- podman-etcd: exclude stopping resources from active count + + Resolves: RHEL-127891 + +* Mon Nov 10 2025 Oyvind Albrigtsen - 4.10.0-96 +- storage_mon: fix handling of 4k block devices + + Resolves: RHEL-127006 + +* Mon Nov 3 2025 Oyvind Albrigtsen - 4.10.0-95 +- powervs-move-ip: new resource agent + + Resolves: RHEL-114489 + +* Fri Oct 31 2025 Oyvind Albrigtsen - 4.10.0-94 +- nfsserver: add ability to set e.g. 
"pipefs-directory=/run/nfs/rpc_pipefs" + in /etc/nfs.conf to avoid issues with non-clustered Kerberized mounts + + Resolves: RHEL-109485 + +* Wed Oct 29 2025 Oyvind Albrigtsen - 4.10.0-92 +- MailTo: add s-nail support for multiple recipients +- oracle: improve monpassword description + + Resolves: RHEL-118621, RHEL-64949 + +* Wed Oct 29 2025 Oyvind Albrigtsen - 4.10.0-91 +- db2: add "skip_basic_sql_health_check" parameter to avoid failing on + systems with high load +- db2: add "monitor_retries", "monitor_sleep", and "monitor_retry_all_errors" + parameters to be able to avoid failing on first try + + Resolves: RHEL-115785, RHEL-115782 + +* Tue Oct 28 2025 Oyvind Albrigtsen - 4.10.0-90 +- podman-etcd: add support for cert rotation +- podman-etcd: compute dynamic revision bump from maxRaftIndex + + Resolves: RHEL-123887, RHEL-123906 + +* Wed Oct 22 2025 Oyvind Albrigtsen - 4.10.0-89 +- portblock: add promotable support, and method and status_check + parameters +- db2: use reintegration flag to avoid race condition on cluster + reintegration + + Resolves: RHEL-116151, RHEL-118624 + +* Thu Oct 9 2025 Oyvind Albrigtsen - 4.10.0-88 +- podman-etcd: add automatic learner member promotion + + Resolves: RHEL-119495 + +* Wed Oct 8 2025 Oyvind Albrigtsen - 4.10.0-87 +- build: make nfs-utils a weak dependency + + Resolves: RHEL-116100 + +* Mon Sep 22 2025 Oyvind Albrigtsen - 4.10.0-85 +- podman-etcd: wrap ipv6 address in brackets +- podman-etcd: preserve containers for debugging +- podman-etcd: add cluster-wide force_new_cluster attribute check + + Resolves: RHEL-113767, RHEL-113766, RHEL-116206 + +* Tue Sep 9 2025 Oyvind Albrigtsen - 4.10.0-81 +- podman-etcd: add oom parameter to be able to tune the Out-Of-Memory (OOM) + score for etcd containers + + Resolves: RHEL-102610 + * Tue Jul 15 2025 Oyvind Albrigtsen - 4.10.0-80 - ocf-shellfuncs/AWS agents: dont sleep after the final try in curl_retry()