From bf87c3b9844d336e2ecbcee5d72a5c403eb29482 Mon Sep 17 00:00:00 2001 From: Oyvind Albrigtsen Date: Mon, 20 Oct 2025 09:19:50 +0200 Subject: [PATCH] - db2: use reintegration flag to avoid race condition on cluster reintegration - Revert "portblock: add promotable support" Resolves: RHEL-118625 Reverts: RHEL-116150 --- ...f-shellfuncs-add-ocf_promotion_score.patch | 19 - ...0-2-portblock-add-promotable-support.patch | 362 ------------- ...e-condition-on-cluster-reintegration.patch | 481 ++++++++++++++++++ resource-agents.spec | 15 +- 4 files changed, 488 insertions(+), 389 deletions(-) delete mode 100644 RHEL-116150-1-ocf-shellfuncs-add-ocf_promotion_score.patch delete mode 100644 RHEL-116150-2-portblock-add-promotable-support.patch create mode 100644 RHEL-118625-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch diff --git a/RHEL-116150-1-ocf-shellfuncs-add-ocf_promotion_score.patch b/RHEL-116150-1-ocf-shellfuncs-add-ocf_promotion_score.patch deleted file mode 100644 index 059d505..0000000 --- a/RHEL-116150-1-ocf-shellfuncs-add-ocf_promotion_score.patch +++ /dev/null @@ -1,19 +0,0 @@ ---- a/heartbeat/ocf-shellfuncs.in 2025-09-29 14:01:55.762931795 +0200 -+++ b/heartbeat/ocf-shellfuncs.in 2025-09-29 14:09:28.651731793 +0200 -@@ -1093,6 +1093,16 @@ - echo $1 - } - -+ocf_promotion_score() { -+ ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.10.0" -+ res=$? -+ if [ $res -eq 2 ] || [ $res -eq 1 ] || ! have_binary "crm_master"; then -+ ${HA_SBIN_DIR}/crm_attribute -p ${OCF_RESOURCE_INSTANCE} $@ -+ else -+ ${HA_SBIN_DIR}/crm_master -l reboot $@ -+ fi -+} -+ - __ocf_set_defaults "$@" - - : ${OCF_TRACE_RA:=$OCF_RESKEY_trace_ra} diff --git a/RHEL-116150-2-portblock-add-promotable-support.patch b/RHEL-116150-2-portblock-add-promotable-support.patch deleted file mode 100644 index 0ae29e3..0000000 --- a/RHEL-116150-2-portblock-add-promotable-support.patch +++ /dev/null @@ -1,362 +0,0 @@ ---- a/heartbeat/portblock 2025-09-30 09:52:13.967530030 +0200 -+++ b/heartbeat/portblock 2025-09-30 09:52:49.018382542 +0200 -@@ -4,6 +4,7 @@ - # - # Author: Sun Jiang Dong (initial version) - # Philipp Reisner (per-IP filtering) -+# Sebastian Baszczyj (nftables code) - # - # License: GNU General Public License (GPL) - # -@@ -43,11 +44,15 @@ - ####################################################################### - CMD=`basename $0` - TICKLETCP=$HA_BIN/tickle_tcp -+TABLE="portblock" -+# Promotion scores -+SCORE_UNPROMOTED=5 -+SCORE_PROMOTED=10 - - usage() - { - cat <&2 -- usage: $CMD {start|stop|status|monitor|meta-data|validate-all} -+ usage: $CMD {start|stop|promote|demote|status|monitor|meta-data|validate-all} - - $CMD is used to temporarily block ports using iptables. - -@@ -86,8 +91,8 @@ - NOTE: iptables is Linux-specific. - - An additional feature in the portblock RA is the tickle ACK function -- enabled by specifying the tickle_dir parameter. The tickle ACK -- triggers the clients to faster reconnect their TCP connections to the -+ enabled by specifying the tickle_dir parameter. The tickle ACK -+ triggers the clients to faster reconnect their TCP connections to the - fail-overed server. - - Please note that this feature is often used for the floating IP fail- -@@ -95,7 +100,7 @@ - It doesn't support the cluster alias IP scenario. - - When using the tickle ACK function, in addition to the normal usage -- of portblock RA, the parameter tickle_dir must be specified in the -+ of portblock RA, the parameter tickle_dir must be specified in the - action=unblock instance of the portblock resources. - For example, you may stack resources like below: - portblock action=block -@@ -103,18 +108,18 @@ - portblock action=unblock tickle_dir=/tickle/state/dir - - If you want to tickle all the TCP connections which connected to _one_ -- floating IP but different ports, no matter how many portblock resources -- you have defined, you should enable tickles for _one_ portblock -+ floating IP but different ports, no matter how many portblock resources -+ you have defined, you should enable tickles for _one_ portblock - resource(action=unblock) only. -- -- The tickle_dir is a location which stores the established TCP -- connections. It can be a shared directory(which is cluster-visible to -+ -+ The tickle_dir is a location which stores the established TCP -+ connections. It can be a shared directory(which is cluster-visible to - all nodes) or a local directory. - If you use the shared directory, you needn't do any other things. - If you use the local directory, you must also specify the sync_script - paramater. We recommend you to use csync2 as the sync_script. -- For example, if you use the local directory /tmp/tickle as tickle_dir, -- you could setup the csync2 as the csync2 documentation says and -+ For example, if you use the local directory /tmp/tickle as tickle_dir, -+ you could setup the csync2 as the csync2 documentation says and - configure your /etc/csync2/csync2.cfg like: - group ticklegroup { - host node1; -@@ -137,15 +142,19 @@ - 1.0 - - --Resource script for portblock. It is used to temporarily block ports -+Resource script for portblock. It is used to block ports - using iptables. In addition, it may allow for faster TCP reconnects - for clients on failover. Use that if there are long lived TCP - connections to an HA service. This feature is enabled by setting the - tickle_dir parameter and only in concert with action set to unblock. - Note that the tickle ACK function is new as of version 3.0.2 and - hasn't yet seen widespread use. -+ -+In Promotable mode, the promote action unblocks the port(s) on the Promoted node -+and blocks the port(s) on the Unpromoted node(s) when action=unblock, and vice versa -+when action=block. - --Block and unblocks access to TCP and UDP ports -+Blocks and unblocks access to TCP and UDP ports - - - -@@ -167,6 +176,10 @@ - - - The action (block/unblock) to be done on the protocol::portno. -+ -+In Promotable mode it is the action for the promote action, -+and the opposite action will be used for the start and demote -+actions. - - action - -@@ -202,7 +215,7 @@ - - - --The shared or local directory (_must_ be absolute path) which -+The shared or local directory (_must_ be absolute path) which - stores the established TCP connections. - - Tickle directory -@@ -236,6 +249,8 @@ - - - -+ -+ - - - -@@ -269,9 +284,9 @@ - # iptables 1.8.9 briefly broke the output format, returning the - # numeric protocol value instead of a string. Support both variants. - if [ "$1" = "tcp" ]; then -- local prot="(tcp|6)" -+ local prot="\(tcp\|6\)" - else -- local prot="(udp|17)" -+ local prot="\(udp\|17\)" - fi - echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$" - } -@@ -281,7 +296,7 @@ - { - [ "$4" = "OUTPUT" ] && ds="s" || ds="d" - PAT=$(active_grep_pat "$1" "$2" "$3" "$ds") -- $IPTABLES $wait -n -L "$4" | grep -qE "$PAT" -+ $IPTABLES $wait -n -L "$4" | grep -q "$PAT" - } - - # netstat -tn and ss -Htn, split on whitespace and colon, -@@ -397,6 +412,17 @@ - rc=$OCF_NOT_RUNNING - ;; - esac -+ elif ocf_is_ms; then -+ case $5 in -+ block) -+ SayInactive $* -+ rc=$OCF_NOT_RUNNING -+ ;; -+ *) -+ SayActive $* -+ rc=$OCF_SUCCESS -+ ;; -+ esac - else - case $5 in - block) -@@ -493,18 +519,21 @@ - { - ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start - case $5 in -- block) IptablesBLOCK "$@";; -+ block) IptablesBLOCK "$@" -+ rc=$? -+ ;; - unblock) - IptablesUNBLOCK "$@" - rc=$? - tickle_remote - #ignore run_tickle_tcp exit code! -- return $rc - ;; -- *) usage; return 1; -+ *) usage; return $OCF_ERR_CONFIGURED ; - esac - -- return $? -+ ocf_is_ms && ocf_promotion_score -v $SCORE_UNPROMOTED -N $nodename -+ -+ return $rc - } - - #IptablesStop {udp|tcp} portno,portno ip {in|out|both} {block|unblock} -@@ -512,17 +541,73 @@ - { - ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop - case $5 in -- block) IptablesUNBLOCK "$@";; -+ block) IptablesUNBLOCK "$@" -+ rc=$? -+ ;; - unblock) - save_tcp_connections - IptablesBLOCK "$@" -+ rc=$? - ;; -- *) usage; return 1;; -+ *) usage; return $OCF_ERR_CONFIGURED ;; - esac - -+ ocf_is_ms && ocf_promotion_score -D -N $nodename -+ -+ return $rc -+} -+ -+IptablesPromote() { -+ IptablesStatus "$@" -+ rc=$? -+ if [ $rc -eq $OCF_SUCCESS ] && [ $promotion_score -eq $SCORE_PROMOTED ]; then -+ ocf_log info "Promote: resource already promoted." -+ return $rc -+ elif [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then -+ ocf_exit_reason "Promote: IptablesStatus failed with rc: $rc." -+ return $rc -+ fi -+ case $5 in -+ block) IptablesBLOCK "$@" -+ rc=$? -+ ;; -+ unblock) -+ IptablesUNBLOCK "$@" -+ rc=$? -+ tickle_remote -+ #ignore run_tickle_tcp exit code! -+ ;; -+ *) usage; return $OCF_ERR_CONFIGURED ; -+ esac -+ ocf_promotion_score -v $SCORE_PROMOTED -N $nodename - return $? - } - -+IptablesDemote() { -+ IptablesStatus "$@" -+ rc=$? -+ if [ $rc -eq $OCF_SUCCESS ] && [ $promotion_score -eq $SCORE_UNPROMOTED ]; then -+ ocf_log info "Demote: resource already demoted." -+ return $rc -+ elif [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then -+ ocf_exit_reason "Demote: IptablesStatus failed with rc: $rc." -+ return $rc -+ fi -+ case $5 in -+ block) -+ save_tcp_connections -+ IptablesBLOCK "$@" -+ rc=$? -+ ;; -+ unblock) IptablesUNBLOCK "$@" -+ rc=$? -+ ;; -+ *) usage; return $OCF_ERR_CONFIGURED ;; -+ esac -+ ocf_promotion_score -v $SCORE_UNPROMOTED -N $nodename -+ return $rc -+} -+ - # - # Check if the port is valid, this function code is not decent, but works - # -@@ -558,17 +643,17 @@ - fi - if [ ! -d "$OCF_RESKEY_tickle_dir" ]; then - ocf_log err "The tickle dir doesn't exist!" -- exit $OCF_ERR_INSTALLED -+ exit $OCF_ERR_INSTALLED - fi - fi - - case $action in -- block|unblock) -+ block|unblock) - ;; -- *) -+ *) - ocf_log err "Invalid action $action!" - exit $OCF_ERR_CONFIGURED -- ;; -+ ;; - esac - - if ocf_is_true $reset_local_on_unblock_stop; then -@@ -591,7 +676,7 @@ - exit $OCF_ERR_ARGS - fi - --case $1 in -+case $__OCF_ACTION in - meta-data) meta_data - exit $OCF_SUCCESS - ;; -@@ -605,12 +690,12 @@ - if [ -z "$OCF_RESKEY_protocol" ]; then - ocf_log err "Please set OCF_RESKEY_protocol" - exit $OCF_ERR_CONFIGURED --fi -+fi - - if [ -z "$OCF_RESKEY_portno" ]; then - ocf_log err "Please set OCF_RESKEY_portno" - exit $OCF_ERR_CONFIGURED --fi -+fi - - if [ -z "$OCF_RESKEY_action" ]; then - ocf_log err "Please set OCF_RESKEY_action" -@@ -632,6 +717,7 @@ - action=$OCF_RESKEY_action - ip=$OCF_RESKEY_ip - reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop -+nodename=$(ocf_local_nodename) - - - # If "tickle" is enabled, we need to record the list of currently established -@@ -647,17 +733,35 @@ - fi - fi - --case $1 in -- start) -- IptablesStart $protocol $portno $ip $direction $action -+if ocf_is_ms; then -+ promotion_score=$(ocf_promotion_score -G -N $nodename -q 2> /dev/null) -+ if { [ "$__OCF_ACTION" = "monitor" ] && [ "$promotion_score" = "$SCORE_UNPROMOTED" ]; } || [ "$__OCF_ACTION" = "demote" ] || [ "$__OCF_ACTION" = "start" ]; then -+ case $action in -+ block) action="unblock" ;; -+ unblock) action="block" ;; -+ esac -+ fi -+fi -+ -+case $__OCF_ACTION in -+ start) -+ IptablesStart "$protocol" "$portno" "$ip" "$direction" "$action" -+ ;; -+ -+ stop) -+ IptablesStop "$protocol" "$portno" "$ip" "$direction" "$action" -+ ;; -+ -+ promote) -+ IptablesPromote "$protocol" "$portno" "$ip" "$direction" "$action" - ;; - -- stop) -- IptablesStop $protocol $portno $ip $direction $action -+ demote) -+ IptablesDemote "$protocol" "$portno" "$ip" "$direction" "$action" - ;; - -- status|monitor) -- IptablesStatus $protocol $portno $ip $direction $action -+ status|monitor) -+ IptablesStatus "$protocol" "$portno" "$ip" "$direction" "$action" - ;; - - validate-all) diff --git a/RHEL-118625-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch b/RHEL-118625-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch new file mode 100644 index 0000000..0ef688b --- /dev/null +++ b/RHEL-118625-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch @@ -0,0 +1,481 @@ +From dbc0d2647d73bed986bf7208df33f092f56e8523 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 25 Sep 2025 14:23:20 +0200 +Subject: [PATCH] db2: use reintegration flag to avoid race condition on + cluster reintegration, and removed FAL, as it's no longer needed + +--- + heartbeat/db2 | 306 ++++++++++++++++++++++++++++++++------------------ + 1 file changed, 197 insertions(+), 109 deletions(-) + +diff --git a/heartbeat/db2 b/heartbeat/db2 +index fe1d9b892..83020fc70 100755 +--- a/heartbeat/db2 ++++ b/heartbeat/db2 +@@ -37,6 +37,13 @@ + : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} + . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + ++# Use runuser if available for SELinux. ++if [ -x "/sbin/runuser" ]; then ++ SU="runuser" ++else ++ SU="su" ++fi ++ + # Parameter defaults + + OCF_RESKEY_instance_default="" +@@ -55,11 +62,12 @@ OCF_RESKEY_dbpartitionnum_default="0" + : ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}} + : ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}} + ++POSIX_UNICODE_LOCALE="C.UTF-8" + ####################################################################### + + + db2_usage() { +- echo "db2 start|stop|monitor|promote|demote|notify|validate-all|meta-data" ++ echo "db2 start|stop|monitor|promote|demote|validate-all|meta-data" + } + + db2_meta_data() { +@@ -162,7 +170,6 @@ The number of the partition (DBPARTITIONNUM) to be managed. + + + +- + + + +@@ -273,7 +280,18 @@ master_score() + # Run the given command as db2 instance user + # + runasdb2() { +- su $instance -c ". $db2profile; $*" ++ $SU $instance -c ". $db2profile; $*" ++} ++ ++# ++# Run the given command as db2 instance user using $SU ++# We run this function as opposed to runasdb2 whenever we have to issue commands ++# that leave processes running on the system, such as db2start ++# We do not want these processes to hog the resources as they were run with elevated privileges ++# ++runasdb2_session() { ++ # Override db2profile with unicode locale is required to maintain compatibility with unicode CODEPAGE ++ $SU "$instance" -c "ksh -c '. $db2profile; export LC_ALL="$POSIX_UNICODE_LOCALE"; export LANG="$POSIX_UNICODE_LOCALE"; $*'" + } + + # +@@ -294,48 +312,6 @@ logasdb2() { + } + + +-# +-# maintain the fal (first active log) attribute +-# db2_fal_attrib DB {set val|get} +-# +-db2_fal_attrib() { +- local db=$1 +- local attr val rc id node member me +- +- attr=db2hadr_${instance}_${db}_fal +- +- case "$2" in +- set) +- me=$(ocf_local_nodename) +- +- # loop over all member nodes and set attribute +- crm_node -l | +- while read id node member +- do +- [ "$member" = member -a "$node" != "$me" ] || continue +- crm_attribute -l forever --node=$node -n $attr -v "$3" +- rc=$? +- ocf_log info "DB2 instance $instance($db2node/$db: setting attrib for FAL to $FIRST_ACTIVE_LOG @ $node" +- [ $rc != 0 ] && break +- done +- ;; +- +- get) +- crm_attribute -l forever -n $attr -G --quiet 2>&1 +- rc=$? +- if ! ocf_is_true "$OCF_RESKEY_CRM_meta_notify" && [ $rc != 0 ] +- then +- ocf_log warn "DB2 instance $instance($db2node/$db: can't retrieve attribute $attr, are you sure notifications are enabled ?" +- fi +- ;; +- +- *) +- exit $OCF_ERR_CONFIGURED +- esac +- +- return $rc +-} +- + # + # unfortunately a first connect after a crash may need several minutes + # for some internal cleanup stuff in DB2. +@@ -429,6 +405,42 @@ db2_check_config_compatibility() { + + } + ++# ++# Start HADR as standby. ++# ++# Parameters ++# 1 - Calling function ++# 2 - Calling functions line number ++# ++# Return codes: ++# 0 - Start as standby successful ++# 1 - Start as standby failed ++# ++reintegrateAsStandby() { ++ db=$1 ++ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint" ++ ocf_log info "$__OCF_ACTION: $LINENO: reintegrateAsStandby called by $2 at $3. Attempting to reintegrate $db as standby." ++ if output=$(runasdb2_session "db2 start hadr on db $db as standby"); then ++ rc=0 ++ ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated" ++ else ++ case $output in ++ SQL1777N*) ++ # SQL1777N: HADR is already started in given state. ++ ocf_log info "$__OCF_ACTION: $LINENO: $output" ++ rc=0 ++ ;; ++ ++ *) ++ rc=1 ++ ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. Please reintegrate manually: $output, return with rc=$rc" ++ ;; ++ esac ++ fi ++ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever ++ return $rc ++} ++ + # + # Start instance and DB. + # Standard mode is through "db2 activate" in order to start in previous +@@ -478,6 +490,8 @@ db2_start() { + + for db in $dblist + do ++ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint" ++ + # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG + db2_get_cfg $db || return $? + +@@ -488,20 +502,13 @@ db2_start() { + + if [ $HADR_ROLE = PRIMARY ] + then +- local master_fal +- +- # communicate our FAL to other nodes the might start concurrently +- db2_fal_attrib $db set $FIRST_ACTIVE_LOG +- +- # ignore false positive: +- # error: Can't use > in [ ]. Escape it or use [[..]]. [SC2073] +- # see https://github.com/koalaman/shellcheck/issues/691 +- # shellcheck disable=SC2073 +- if master_fal=$(db2_fal_attrib $db get) && [ "$master_fal" '>' $FIRST_ACTIVE_LOG ] +- then ++ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}') ++ ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value'" ++ if [ "$cib_value" = "1" ]; then + ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary" + start_cmd="db2 start hadr on db $db as standby" + HADR_ROLE=STANDBY ++ standby_reintegration=1 + fi + fi + +@@ -511,27 +518,65 @@ db2_start() { + [ $HADR_ROLE != STANDBY ] && db2_run_connect $db & + else + case $output in +- SQL1490W*|SQL1494W*|SQL1497W*|SQL1777N*) +- ocf_log info "DB2 database $instance($db2node)/$db already activated: $output" ++ SQL1490W* | SQL1494W* | SQL1497W* | SQL1777N*) ++ # SQL1490W Activate database is successful, however, the database has already been activated on one or more nodes. ++ # SQL1494W Activate database is successful, however, there is already a connection to the database. ++ # SQL1497W Activate/Deactivate database was successful, however, an error occurred on some nodes. ++ # SQL1777N HADR is already started. ++ ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is already activated: $output" + ;; + +- SQL1768N*"Reason code = \"7\""*) +- ocf_log err "DB2 database $instance($db2node)/$db is a Primary and the Standby is down" +- ocf_log err "Possible split brain ! Manual intervention required." ++ SQL1768N*"Reason code = \"7\""*) ++ rc="$OCF_ERR_GENERIC" ++ ++ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is a Primary and the Standby is down" ++ ocf_log err "Possible split brain! Manual intervention required." + ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\"" +- ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\"" ++ ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\". db2_start() exit with rc=$rc." + +- # might be the Standby is not yet there +- # might be a timing problem because "First active log" is delayed +- # on the next start attempt we might succeed when FAL was advanced +- # might be manual intervention is required +- # ... so let pacemaker give it another try and we will succeed then +- return $OCF_ERR_GENERIC ++ # let pacemaker give it another try and we will succeed then ++ return "$rc" + ;; + +- *) +- ocf_log err "DB2 database $instance($db2node)/$db didn't start: $output" +- return $OCF_ERR_GENERIC ++ SQL1776N*"Reason code = \"6\""*) ++ # SQL1776N The command cannot be issued on an HADR database. ++ # Reason code 6: ++ # This database is an old primary database. It cannot be started ++ # because the standby has become the new primary through forced ++ # takeover. ++ ++ rc="$OCF_ERR_GENERIC" ++ ocf_log err "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db didn't start: $output, return with rc=$rc" ++ ocf_log err "$__OCF_ACTION: $LINENO: This database is an old primary database. Trying start again as standby" ++ ++ start_cmd="db2 start hadr on db $db as standby" ++ if output=$(runasdb2_session "$start_cmd"); then ++ rc="$OCF_SUCCESS" ++ ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated" ++ else ++ case $output in ++ SQL1777N*) ++ # SQL1777N: HADR is already started. ++ ocf_log info "$__OCF_ACTION: $LINENO: $output" ++ rc="$OCF_SUCCESS" ++ ;; ++ ++ *) ++ rc="$OCF_ERR_GENERIC" ++ ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. Please reintegrate manually: $output, return with rc=$rc" ++ ;; ++ esac ++ fi ++ ++ return "$rc" ++ ;; ++ ++ *) ++ rc="$OCF_ERR_GENERIC" ++ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database didn't start: $output, db2_start() exit with rc=$rc." ++ return "$rc" ++ ;; + esac + fi + done +@@ -539,6 +584,15 @@ db2_start() { + # come here with success + # Even if we are a db2 Primary pacemaker requires start to end up in slave mode + echo SLAVE > $STATE_FILE ++ ++ # Unset primary failover attribute as host was successfully reintegrated as standby ++ if [ "$standby_reintegration" = "1" ]; then ++ for db in $dblist; do ++ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint" ++ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever ++ done ++ fi ++ + return $OCF_SUCCESS + } + +@@ -737,7 +791,7 @@ db2_monitor_retry() { + + # + # Monitor the db +-# And as side effect set crm_master / FAL attribute ++# And as side effect set crm_master + # + db2_monitor() { + local CMD output hadr db +@@ -754,6 +808,22 @@ db2_monitor() { + + for db in $dblist + do ++ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint" ++ ++ #Check for the reintegration file, then set the flag if it exists and delete the file ++ if [ -e "/tmp/$reint_attr" ] && [ -n "$remote_host" ]; then ++ #The file exist, try to set the reintegration attribute ++ crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever ++ cib_value=$(crm_attribute -n "$reint_attr" -N "$remote_host" -G | awk -v FS=' value=' '{print $2}') ++ ++ if [ "$cib_value" = "1" ]; then ++ ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value', reintegration flag file will now be deleted." ++ rm -f "/tmp/$reint_attr" ++ else ++ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The reintegration flag file exists, but its attribute failed to set." ++ fi ++ fi ++ + hadr=$(db2_hadr_status $db) + rc=$? + ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr" +@@ -804,6 +874,14 @@ db2_monitor() { + ;; + + STANDBY/*PEER/*|Standby/*Peer) ++ # If db is in standby peer, then it has already reintegrated. ++ # If the reintegrate flag is still set, remove it ++ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}') ++ if [ "$cib_value" = "1" ]; then ++ ocf_log info "$__OCF_ACTION: $LINENO: Reintegrate flag detected for $db, but it has already reintegrated as standby. Removing reintegration flag." ++ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever ++ fi ++ + master_score -v 8000 -l reboot + ;; + +@@ -812,6 +890,34 @@ db2_monitor() { + master_score -D -l reboot + ;; + ++ Down/Off) ++ # If db is a deactivated primary and it has a reintegration flag, then reintegrate as standby. ++ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}') ++ if [ "$cib_value" = "1" ]; then ++ output=$(runasdb2 "db2 get db cfg for $db" | grep 'HADR database role' | awk '{print $5}') ++ if [ "PRIMARY" = "$output" ]; then ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Database is deactivated with Primary role and the reintegration flag is set. Role: $output, Reintegration flag: $reint_attr = $cib_value" ++ # Reintegrate as the standby database. ++ if reintegrateAsStandby "$db" 'db2_monitor' $LINENO; then ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration succeeded." ++ # Setting slave state here will cause rc to be OCF_SUCCESS below. ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Echoing SLAVE into $STATE_FILE" ++ echo SLAVE >"$STATE_FILE" ++ # Update master score to reflect standby state. ++ master_score -v 8000 -l reboot ++ else ++ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration failed." ++ return "$OCF_ERR_GENERIC" ++ fi ++ fi ++ else ++ rc="$OCF_NOT_RUNNING" ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database has HADR status $hadr." ++ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: db2_monitor() exit with rc=$rc." ++ return "$rc" ++ fi ++ ;; ++ + *) + return $OCF_ERR_GENERIC + esac +@@ -875,8 +981,6 @@ db2_promote() { + # update pacemaker's view + echo MASTER > $STATE_FILE + +- # turn the log so we rapidly get a new FAL +- logasdb2 "db2 archive log for db $db" + return $OCF_SUCCESS + fi + +@@ -914,26 +1018,6 @@ db2_demote() { + return $? + } + +-# +-# handle pre start notification +-# We record our first active log on the other nodes. +-# If two primaries come up after a crash they can safely determine who is +-# the outdated one. +-# +-db2_notify() { +- local node +- +- # only interested in pre-start +- [ $OCF_RESKEY_CRM_meta_notify_type = pre \ +- -a $OCF_RESKEY_CRM_meta_notify_operation = start ] || return $OCF_SUCCESS +- +- # gets FIRST_ACTIVE_LOG +- db2_get_cfg $dblist || return $? +- +- db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC +- exit $OCF_SUCCESS +-} +- + ######## + # Main # + ######## +@@ -947,50 +1031,54 @@ case "$__OCF_ACTION" in + db2_usage + exit $OCF_SUCCESS + ;; ++esac + ++local_host=$(ocf_local_nodename) ++inst1=$(echo "$OCF_RESKEY_instance" | cut -d"," -f1) ++inst2=$(echo "$OCF_RESKEY_instance" | cut -d"," -f2) ++host1=$(crm_node -l | sort | awk '{print $2;}' | sed -n 1p) ++ ++if [ "$host1" = "$local_host" ]; then ++ remote_host=$(crm_node -l | sort | awk '{print $2;}' | sed -n 2p) ++else ++ remote_host="$host1" ++fi ++ ++db2_validate; validate_rc=$? ++ ++case "$__OCF_ACTION" in + start) +- db2_validate + db2_start || exit $? + db2_monitor +- exit $? + ;; + + stop) +- db2_validate + db2_stop +- exit $? + ;; + + promote) +- db2_validate + db2_promote +- exit $? + ;; + + demote) +- db2_validate + db2_demote +- exit $? + ;; + + notify) +- db2_validate +- db2_notify +- exit $? ++ ocf_log debug "notify-action has been DEPRECATED, and should be removed" + ;; + + monitor) +- db2_validate + db2_monitor_retry +- exit $? + ;; + + validate-all) +- db2_validate +- exit $? ++ exit $validate_rc + ;; + + *) + db2_usage + exit $OCF_ERR_UNIMPLEMENTED + esac ++ ++exit $? diff --git a/resource-agents.spec b/resource-agents.spec index 7ea61fc..d4138fe 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -73,7 +73,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.9.0 -Release: 54%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.18 +Release: 54%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.19 License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} @@ -174,8 +174,7 @@ Patch77: RHEL-85048-tomcat-fix-CATALINA_PID-not-set-and-parameter-defaults.patc Patch78: RHEL-91257-Filesystem-add-support-for-aznfs.patch Patch79: RHEL-102731-ocf-shellfuncs-remove-extra-sleep-from-curl_retry.patch Patch80: RHEL-115783-RHEL-115781-db2-add-skip_basic_sql_health_check-and-monitor-parameters.patch -Patch81: RHEL-116150-1-ocf-shellfuncs-add-ocf_promotion_score.patch -Patch82: RHEL-116150-2-portblock-add-promotable-support.patch +Patch81: RHEL-118625-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch # bundle patches Patch1000: 7-gcp-bundled.patch @@ -441,8 +440,7 @@ exit 1 %patch -p1 -P 78 -F2 %patch -p1 -P 79 %patch -p1 -P 80 -%patch -p1 -P 81 -%patch -p1 -P 82 +%patch -p1 -P 81 -F2 chmod 755 heartbeat/nova-compute-wait chmod 755 heartbeat/NovaEvacuate @@ -1033,10 +1031,11 @@ ccs_update_schema > /dev/null 2>&1 ||: %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog -* Thu Oct 2 2025 Oyvind Albrigtsen - 4.9.0-54.18 -- portblock: add promotable support +* Mon Oct 20 2025 Oyvind Albrigtsen - 4.9.0-54.19 +- db2: use reintegration flag to avoid race condition on cluster + reintegration - Resolves: RHEL-116150 + Resolves: RHEL-118625 * Thu Sep 18 2025 Oyvind Albrigtsen - 4.9.0-54.17 - db2: add "skip_basic_sql_health_check" parameter to avoid failing on