Compare commits

...

No commits in common. "c8" and "c8s" have entirely different histories.
c8 ... c8s

106 changed files with 1152 additions and 25 deletions

19
.gitignore vendored
View File

@ -1,12 +1,7 @@
SOURCES/ClusterLabs-resource-agents-55a4e2c9.tar.gz
SOURCES/aliyun-cli-2.1.10.tar.gz
SOURCES/aliyun-python-sdk-core-2.13.1.tar.gz
SOURCES/aliyun-python-sdk-ecs-4.9.3.tar.gz
SOURCES/aliyun-python-sdk-vpc-3.0.2.tar.gz
SOURCES/colorama-0.3.3.tar.gz
SOURCES/google-cloud-sdk-360.0.0-linux-x86_64.tar.gz
SOURCES/httplib2-0.20.4.tar.gz
SOURCES/pycryptodome-3.20.0.tar.gz
SOURCES/pyparsing-2.4.7-py2.py3-none-any.whl
SOURCES/pyroute2-0.4.13.tar.gz
SOURCES/urllib3-1.26.18.tar.gz
/*.tar.gz
/*.rpm
/*.whl
/.*
/*/
!/tests/
/tests/*.retry

View File

@ -1,12 +0,0 @@
dfc65f4cac3f95026b2f5674019814a527333004 SOURCES/ClusterLabs-resource-agents-55a4e2c9.tar.gz
306e131d8908ca794276bfe3a0b55ccc3bbd482f SOURCES/aliyun-cli-2.1.10.tar.gz
0a56f6d9ed2014a363486d33b63eca094379be06 SOURCES/aliyun-python-sdk-core-2.13.1.tar.gz
c2a98b9a1562d223a76514f05028488ca000c395 SOURCES/aliyun-python-sdk-ecs-4.9.3.tar.gz
f14647a4d37a9a254c4e711b95a7654fc418e41e SOURCES/aliyun-python-sdk-vpc-3.0.2.tar.gz
0fe5bd8bca54dd71223778a1e0bcca9af324abb1 SOURCES/colorama-0.3.3.tar.gz
81f039cf075e9c8b70d5af99c189296a9e031de3 SOURCES/google-cloud-sdk-360.0.0-linux-x86_64.tar.gz
7caf4412d9473bf17352316249a8133fa70b7e37 SOURCES/httplib2-0.20.4.tar.gz
c55d177e9484d974c95078d4ae945f89ba2c7251 SOURCES/pycryptodome-3.20.0.tar.gz
c8307f47e3b75a2d02af72982a2dfefa3f56e407 SOURCES/pyparsing-2.4.7-py2.py3-none-any.whl
147149db11104c06d405fd077dcd2aa1c345f109 SOURCES/pyroute2-0.4.13.tar.gz
84e2852d8da1655373f7ce5e7d5d3e256b62b4e4 SOURCES/urllib3-1.26.18.tar.gz

View File

@ -0,0 +1,19 @@
--- a/heartbeat/ocf-shellfuncs.in 2025-09-29 14:01:55.762931795 +0200
+++ b/heartbeat/ocf-shellfuncs.in 2025-09-29 14:09:28.651731793 +0200
@@ -1093,6 +1093,16 @@
echo $1
}
+ocf_promotion_score() { # forwards all of its arguments unchanged to crm_attribute -p or crm_master
+ ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.10.0"
+ res=$?
+ if [ $res -eq 2 ] || [ $res -eq 1 ] || ! have_binary "crm_master"; then # also taken when crm_master is missing
+ "${HA_SBIN_DIR}/crm_attribute" -p ${OCF_RESOURCE_INSTANCE} "$@"
+ else
+ "${HA_SBIN_DIR}/crm_master" -l reboot "$@"
+ fi
+}
+
__ocf_set_defaults "$@"
: ${OCF_TRACE_RA:=$OCF_RESKEY_trace_ra}

View File

@ -0,0 +1,362 @@
--- a/heartbeat/portblock 2025-09-30 09:52:13.967530030 +0200
+++ b/heartbeat/portblock 2025-09-30 09:52:49.018382542 +0200
@@ -4,6 +4,7 @@
#
# Author: Sun Jiang Dong (initial version)
# Philipp Reisner (per-IP filtering)
+# Sebastian Baszczyj (nftables code)
#
# License: GNU General Public License (GPL)
#
@@ -43,11 +44,15 @@
#######################################################################
CMD=`basename $0`
TICKLETCP=$HA_BIN/tickle_tcp
+TABLE="portblock"
+# Promotion scores
+SCORE_UNPROMOTED=5
+SCORE_PROMOTED=10
usage()
{
cat <<END >&2
- usage: $CMD {start|stop|status|monitor|meta-data|validate-all}
+ usage: $CMD {start|stop|promote|demote|status|monitor|meta-data|validate-all}
$CMD is used to temporarily block ports using iptables.
@@ -86,8 +91,8 @@
NOTE: iptables is Linux-specific.
An additional feature in the portblock RA is the tickle ACK function
- enabled by specifying the tickle_dir parameter. The tickle ACK
- triggers the clients to faster reconnect their TCP connections to the
+ enabled by specifying the tickle_dir parameter. The tickle ACK
+ triggers the clients to faster reconnect their TCP connections to the
fail-overed server.
Please note that this feature is often used for the floating IP fail-
@@ -95,7 +100,7 @@
It doesn't support the cluster alias IP scenario.
When using the tickle ACK function, in addition to the normal usage
- of portblock RA, the parameter tickle_dir must be specified in the
+ of portblock RA, the parameter tickle_dir must be specified in the
action=unblock instance of the portblock resources.
For example, you may stack resources like below:
portblock action=block
@@ -103,18 +108,18 @@
portblock action=unblock tickle_dir=/tickle/state/dir
If you want to tickle all the TCP connections which connected to _one_
- floating IP but different ports, no matter how many portblock resources
- you have defined, you should enable tickles for _one_ portblock
+ floating IP but different ports, no matter how many portblock resources
+ you have defined, you should enable tickles for _one_ portblock
resource(action=unblock) only.
-
- The tickle_dir is a location which stores the established TCP
- connections. It can be a shared directory(which is cluster-visible to
+
+ The tickle_dir is a location which stores the established TCP
+ connections. It can be a shared directory(which is cluster-visible to
all nodes) or a local directory.
If you use the shared directory, you needn't do any other things.
If you use the local directory, you must also specify the sync_script
paramater. We recommend you to use csync2 as the sync_script.
- For example, if you use the local directory /tmp/tickle as tickle_dir,
- you could setup the csync2 as the csync2 documentation says and
+ For example, if you use the local directory /tmp/tickle as tickle_dir,
+ you could setup the csync2 as the csync2 documentation says and
configure your /etc/csync2/csync2.cfg like:
group ticklegroup {
host node1;
@@ -137,15 +142,19 @@
<version>1.0</version>
<longdesc lang="en">
-Resource script for portblock. It is used to temporarily block ports
+Resource script for portblock. It is used to block ports
using iptables. In addition, it may allow for faster TCP reconnects
for clients on failover. Use that if there are long lived TCP
connections to an HA service. This feature is enabled by setting the
tickle_dir parameter and only in concert with action set to unblock.
Note that the tickle ACK function is new as of version 3.0.2 and
hasn't yet seen widespread use.
+
+In Promotable mode, the promote action unblocks the port(s) on the Promoted node
+and blocks the port(s) on the Unpromoted node(s) when action=unblock, and vice versa
+when action=block.
</longdesc>
-<shortdesc lang="en">Block and unblocks access to TCP and UDP ports</shortdesc>
+<shortdesc lang="en">Blocks and unblocks access to TCP and UDP ports</shortdesc>
<parameters>
<parameter name="protocol" unique="0" required="1">
@@ -167,6 +176,10 @@
<parameter name="action" unique="0" required="1">
<longdesc lang="en">
The action (block/unblock) to be done on the protocol::portno.
+
+In Promotable mode it is the action for the promote action,
+and the opposite action will be used for the start and demote
+actions.
</longdesc>
<shortdesc lang="en">action</shortdesc>
<content type="string" default="${OCF_RESKEY_action_default}" />
@@ -202,7 +215,7 @@
<parameter name="tickle_dir" unique="0" required="0">
<longdesc lang="en">
-The shared or local directory (_must_ be absolute path) which
+The shared or local directory (_must_ be absolute path) which
stores the established TCP connections.
</longdesc>
<shortdesc lang="en">Tickle directory</shortdesc>
@@ -236,6 +249,8 @@
<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
+<action name="promote" timeout="10s"/>
+<action name="demote" timeout="10s"/>
<action name="status" depth="0" timeout="10s" interval="10s" />
<action name="monitor" depth="0" timeout="10s" interval="10s" />
<action name="meta-data" timeout="5s" />
@@ -269,9 +284,9 @@
# iptables 1.8.9 briefly broke the output format, returning the
# numeric protocol value instead of a string. Support both variants.
if [ "$1" = "tcp" ]; then
- local prot="(tcp|6)"
+ local prot="\(tcp\|6\)"
else
- local prot="(udp|17)"
+ local prot="\(udp\|17\)"
fi
echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$"
}
@@ -281,7 +296,7 @@
{
[ "$4" = "OUTPUT" ] && ds="s" || ds="d"
PAT=$(active_grep_pat "$1" "$2" "$3" "$ds")
- $IPTABLES $wait -n -L "$4" | grep -qE "$PAT"
+ $IPTABLES $wait -n -L "$4" | grep -q "$PAT"
}
# netstat -tn and ss -Htn, split on whitespace and colon,
@@ -397,6 +412,17 @@
rc=$OCF_NOT_RUNNING
;;
esac
+ elif ocf_is_ms; then
+ case $5 in
+ block)
+ SayInactive $*
+ rc=$OCF_NOT_RUNNING
+ ;;
+ *)
+ SayActive $*
+ rc=$OCF_SUCCESS
+ ;;
+ esac
else
case $5 in
block)
@@ -493,18 +519,21 @@
{
ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start
case $5 in
- block) IptablesBLOCK "$@";;
+ block) IptablesBLOCK "$@"
+ rc=$?
+ ;;
unblock)
IptablesUNBLOCK "$@"
rc=$?
tickle_remote
#ignore run_tickle_tcp exit code!
- return $rc
;;
- *) usage; return 1;
+ *) usage; return $OCF_ERR_CONFIGURED ;
esac
- return $?
+ ocf_is_ms && ocf_promotion_score -v $SCORE_UNPROMOTED -N $nodename
+
+ return $rc
}
#IptablesStop {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
@@ -512,17 +541,73 @@
{
ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop
case $5 in
- block) IptablesUNBLOCK "$@";;
+ block) IptablesUNBLOCK "$@"
+ rc=$?
+ ;;
unblock)
save_tcp_connections
IptablesBLOCK "$@"
+ rc=$?
;;
- *) usage; return 1;;
+ *) usage; return $OCF_ERR_CONFIGURED ;;
esac
+ ocf_is_ms && ocf_promotion_score -D -N $nodename
+
+ return $rc
+}
+
+IptablesPromote() { # args: {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
+ IptablesStatus "$@"
+ rc=$?
+ if [ $rc -eq $OCF_SUCCESS ] && [ "$promotion_score" = "$SCORE_PROMOTED" ]; then # "=" tolerates an empty score
+ ocf_log info "Promote: resource already promoted."
+ return $rc
+ elif [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then
+ ocf_exit_reason "Promote: IptablesStatus failed with rc: $rc."
+ return $rc
+ fi
+ case $5 in
+ block) IptablesBLOCK "$@"
+ rc=$?
+ ;;
+ unblock)
+ IptablesUNBLOCK "$@"
+ rc=$?
+ tickle_remote
+ #ignore run_tickle_tcp exit code!
+ ;;
+ *) usage; return $OCF_ERR_CONFIGURED ;
+ esac
+ ocf_promotion_score -v $SCORE_PROMOTED -N $nodename # its exit status becomes the function's return value
return $?
}
+IptablesDemote() { # args: {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
+ IptablesStatus "$@"
+ rc=$?
+ if [ $rc -eq $OCF_SUCCESS ] && [ "$promotion_score" = "$SCORE_UNPROMOTED" ]; then # "=" tolerates an empty score
+ ocf_log info "Demote: resource already demoted."
+ return $rc
+ elif [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then
+ ocf_exit_reason "Demote: IptablesStatus failed with rc: $rc."
+ return $rc
+ fi
+ case $5 in
+ block)
+ save_tcp_connections
+ IptablesBLOCK "$@"
+ rc=$?
+ ;;
+ unblock) IptablesUNBLOCK "$@"
+ rc=$?
+ ;;
+ *) usage; return $OCF_ERR_CONFIGURED ;;
+ esac
+ ocf_promotion_score -v $SCORE_UNPROMOTED -N $nodename # score update result is ignored; rc of the rule change is returned
+ return $rc
+}
+
#
# Check if the port is valid, this function code is not decent, but works
#
@@ -558,17 +643,17 @@
fi
if [ ! -d "$OCF_RESKEY_tickle_dir" ]; then
ocf_log err "The tickle dir doesn't exist!"
- exit $OCF_ERR_INSTALLED
+ exit $OCF_ERR_INSTALLED
fi
fi
case $action in
- block|unblock)
+ block|unblock)
;;
- *)
+ *)
ocf_log err "Invalid action $action!"
exit $OCF_ERR_CONFIGURED
- ;;
+ ;;
esac
if ocf_is_true $reset_local_on_unblock_stop; then
@@ -591,7 +676,7 @@
exit $OCF_ERR_ARGS
fi
-case $1 in
+case $__OCF_ACTION in
meta-data) meta_data
exit $OCF_SUCCESS
;;
@@ -605,12 +690,12 @@
if [ -z "$OCF_RESKEY_protocol" ]; then
ocf_log err "Please set OCF_RESKEY_protocol"
exit $OCF_ERR_CONFIGURED
-fi
+fi
if [ -z "$OCF_RESKEY_portno" ]; then
ocf_log err "Please set OCF_RESKEY_portno"
exit $OCF_ERR_CONFIGURED
-fi
+fi
if [ -z "$OCF_RESKEY_action" ]; then
ocf_log err "Please set OCF_RESKEY_action"
@@ -632,6 +717,7 @@
action=$OCF_RESKEY_action
ip=$OCF_RESKEY_ip
reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop
+nodename=$(ocf_local_nodename)
# If "tickle" is enabled, we need to record the list of currently established
@@ -647,17 +733,35 @@
fi
fi
-case $1 in
- start)
- IptablesStart $protocol $portno $ip $direction $action
+if ocf_is_ms; then
+ promotion_score=$(ocf_promotion_score -G -N $nodename -q 2> /dev/null)
+ if { [ "$__OCF_ACTION" = "monitor" ] && [ "$promotion_score" = "$SCORE_UNPROMOTED" ]; } || [ "$__OCF_ACTION" = "demote" ] || [ "$__OCF_ACTION" = "start" ]; then
+ case $action in
+ block) action="unblock" ;;
+ unblock) action="block" ;;
+ esac
+ fi
+fi
+
+case $__OCF_ACTION in
+ start)
+ IptablesStart "$protocol" "$portno" "$ip" "$direction" "$action"
+ ;;
+
+ stop)
+ IptablesStop "$protocol" "$portno" "$ip" "$direction" "$action"
+ ;;
+
+ promote)
+ IptablesPromote "$protocol" "$portno" "$ip" "$direction" "$action"
;;
- stop)
- IptablesStop $protocol $portno $ip $direction $action
+ demote)
+ IptablesDemote "$protocol" "$portno" "$ip" "$direction" "$action"
;;
- status|monitor)
- IptablesStatus $protocol $portno $ip $direction $action
+ status|monitor)
+ IptablesStatus "$protocol" "$portno" "$ip" "$direction" "$action"
;;
validate-all)

View File

@ -0,0 +1,180 @@
--- a/heartbeat/portblock 2025-10-21 09:27:41.753028260 +0200
+++ b/heartbeat/portblock 2025-10-21 09:28:55.573855995 +0200
@@ -28,6 +28,8 @@
OCF_RESKEY_portno_default=""
OCF_RESKEY_direction_default="in"
OCF_RESKEY_action_default=""
+OCF_RESKEY_method_default="drop"
+OCF_RESKEY_status_check_default="rule"
OCF_RESKEY_ip_default="0.0.0.0/0"
OCF_RESKEY_reset_local_on_unblock_stop_default="false"
OCF_RESKEY_tickle_dir_default=""
@@ -37,6 +39,8 @@
: ${OCF_RESKEY_portno=${OCF_RESKEY_portno_default}}
: ${OCF_RESKEY_direction=${OCF_RESKEY_direction_default}}
: ${OCF_RESKEY_action=${OCF_RESKEY_action_default}}
+: ${OCF_RESKEY_method=${OCF_RESKEY_method_default}}
+: ${OCF_RESKEY_status_check=${OCF_RESKEY_status_check_default}}
: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}}
: ${OCF_RESKEY_reset_local_on_unblock_stop=${OCF_RESKEY_reset_local_on_unblock_stop_default}}
: ${OCF_RESKEY_tickle_dir=${OCF_RESKEY_tickle_dir_default}}
@@ -185,6 +189,26 @@
<content type="string" default="${OCF_RESKEY_action_default}" />
</parameter>
+<parameter name="method" unique="0" required="0">
+<longdesc lang="en">
+Block method:
+drop: Use DROP rule.
+reject: Use REJECT rule w/conntrack to clear connections when blocking.
+</longdesc>
+<shortdesc lang="en">Block method</shortdesc>
+<content type="string" default="${OCF_RESKEY_method_default}" />
+</parameter>
+
+<parameter name="status_check" unique="0" required="0">
+<longdesc lang="en">
+Status check:
+rule: Check rule.
+pseudo: Check pseudo status when rule is absent.
+</longdesc>
+<shortdesc lang="en">Status check</shortdesc>
+<content type="string" default="${OCF_RESKEY_status_check_default}" />
+</parameter>
+
<parameter name="reset_local_on_unblock_stop" unique="0" required="0">
<longdesc lang="en">
If for some reason the long lived server side TCP sessions won't be cleaned up
@@ -253,6 +277,7 @@
<action name="demote" timeout="10s"/>
<action name="status" depth="0" timeout="10s" interval="10s" />
<action name="monitor" depth="0" timeout="10s" interval="10s" />
+<action name="monitor" depth="0" timeout="10s" interval="9s" role="Promoted" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="5s" />
</actions>
@@ -288,7 +313,11 @@
else
local prot="\(udp\|17\)"
fi
- echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$"
+ if [ "$method" = "DROP" ]; then
+ echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$"
+ else
+ echo "^REJECT${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}${w}ctstate${w}NEW,RELATED,ESTABLISHED${w}reject-with${w}tcp-reset$"
+ fi
}
#chain_isactive {udp|tcp} portno,portno ip chain
@@ -374,17 +403,17 @@
SayActive()
{
- ocf_log debug "$CMD DROP rule [$*] is running (OK)"
+ ocf_log debug "$CMD $method rule [$*] is running (OK)"
}
SayConsideredActive()
{
- ocf_log debug "$CMD DROP rule [$*] considered to be running (OK)"
+ ocf_log debug "$CMD $method rule [$*] considered to be running (OK)"
}
SayInactive()
{
- ocf_log debug "$CMD DROP rule [$*] is inactive"
+ ocf_log debug "$CMD $method rule [$*] is inactive"
}
#IptablesStatus {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
@@ -405,14 +434,18 @@
case $5 in
block)
SayActive $*
- rc=$OCF_SUCCESS
+ if [ "$__OCF_ACTION" = "monitor" ] && [ "$promotion_score" = "$SCORE_PROMOTED" ]; then
+ rc=$OCF_RUNNING_MASTER
+ else
+ rc=$OCF_SUCCESS
+ fi
;;
*)
SayInactive $*
rc=$OCF_NOT_RUNNING
;;
esac
- elif ocf_is_ms; then
+ elif [ "$OCF_RESKEY_status_check" = "rule" ]; then
case $5 in
block)
SayInactive $*
@@ -420,7 +453,11 @@
;;
*)
SayActive $*
- rc=$OCF_SUCCESS
+ if [ "$__OCF_ACTION" = "monitor" ] && [ "$promotion_score" = "$SCORE_PROMOTED" ]; then
+ rc=$OCF_RUNNING_MASTER
+ else
+ rc=$OCF_SUCCESS
+ fi
;;
esac
else
@@ -461,7 +498,11 @@
: Chain already in desired state
else
[ "$chain" = "OUTPUT" ] && ds="s" || ds="d"
- $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" -m multiport --${ds}ports "$ports" -j DROP
+ if [ "$method" = "DROP" ]; then
+ $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" -m multiport --${ds}ports "$ports" -j DROP
+ else
+ $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" -m multiport --${ds}ports "$ports" -m conntrack --ctstate NEW,ESTABLISHED,RELATED -j REJECT --reject-with tcp-reset
+ fi
fi
}
@@ -486,7 +527,11 @@
$IPTABLES $wait -I OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset
tickle_local
fi
- $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP
+ if [ "$method" = "DROP" ]; then
+ $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP
+ else
+ $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -m conntrack --ctstate NEW,ESTABLISHED,RELATED -j REJECT --reject-with tcp-reset
+ fi
rc_in=$?
if $try_reset ; then
$IPTABLES $wait -D OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset
@@ -718,6 +763,13 @@
ip=$OCF_RESKEY_ip
reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop
nodename=$(ocf_local_nodename)
+case "$OCF_RESKEY_method" in
+ drop) method="DROP" ;;
+ reject) method="REJECT" ;;
+ *) ocf_log err "method: $OCF_RESKEY_method not supported"
+ exit $OCF_ERR_CONFIGURED
+ ;;
+esac
# If "tickle" is enabled, we need to record the list of currently established
@@ -743,6 +795,8 @@
fi
fi
+IptablesValidateAll
+
case $__OCF_ACTION in
start)
IptablesStart "$protocol" "$portno" "$ip" "$direction" "$action"
@@ -765,7 +819,6 @@
;;
validate-all)
- IptablesValidateAll
;;
*) usage

View File

@ -0,0 +1,481 @@
From dbc0d2647d73bed986bf7208df33f092f56e8523 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Thu, 25 Sep 2025 14:23:20 +0200
Subject: [PATCH] db2: use reintegration flag to avoid race condition on
cluster reintegration, and removed FAL, as it's no longer needed
---
heartbeat/db2 | 306 ++++++++++++++++++++++++++++++++------------------
1 file changed, 197 insertions(+), 109 deletions(-)
diff --git a/heartbeat/db2 b/heartbeat/db2
index fe1d9b892..83020fc70 100755
--- a/heartbeat/db2
+++ b/heartbeat/db2
@@ -37,6 +37,13 @@
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+# Use runuser if available for SELinux.
+if [ -x "/sbin/runuser" ]; then
+ SU="runuser"
+else
+ SU="su"
+fi
+
# Parameter defaults
OCF_RESKEY_instance_default=""
@@ -55,11 +62,12 @@ OCF_RESKEY_dbpartitionnum_default="0"
: ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}}
: ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}}
+POSIX_UNICODE_LOCALE="C.UTF-8"
#######################################################################
db2_usage() {
- echo "db2 start|stop|monitor|promote|demote|notify|validate-all|meta-data"
+ echo "db2 start|stop|monitor|promote|demote|validate-all|meta-data"
}
db2_meta_data() {
@@ -162,7 +170,6 @@ The number of the partition (DBPARTITIONNUM) to be managed.
<action name="stop" timeout="120s"/>
<action name="promote" timeout="120s"/>
<action name="demote" timeout="120s"/>
-<action name="notify" timeout="10s"/>
<action name="monitor" depth="0" timeout="60s" interval="20s"/>
<action name="monitor" depth="0" timeout="60s" role="Promoted" interval="22s"/>
<action name="validate-all" timeout="5s"/>
@@ -273,7 +280,18 @@ master_score()
# Run the given command as db2 instance user
#
runasdb2() {
- su $instance -c ". $db2profile; $*"
+ $SU $instance -c ". $db2profile; $*"
+}
+
+#
+# Run the given command as db2 instance user using $SU, wrapped in "ksh -c '...'".
+# We run this function as opposed to runasdb2 whenever we have to issue commands
+# that leave processes running on the system, such as db2start
+# We do not want these processes to hog the resources as they were run with elevated privileges
+# NOTE: the command must not contain single quotes, they would end the ksh -c string.
+runasdb2_session() {
+ # Overriding the db2profile locale with a unicode locale is required to maintain compatibility with unicode CODEPAGE
+ $SU "$instance" -c "ksh -c '. $db2profile; export LC_ALL=\"$POSIX_UNICODE_LOCALE\"; export LANG=\"$POSIX_UNICODE_LOCALE\"; $*'"
}
#
@@ -294,48 +312,6 @@ logasdb2() {
}
-#
-# maintain the fal (first active log) attribute
-# db2_fal_attrib DB {set val|get}
-#
-db2_fal_attrib() {
- local db=$1
- local attr val rc id node member me
-
- attr=db2hadr_${instance}_${db}_fal
-
- case "$2" in
- set)
- me=$(ocf_local_nodename)
-
- # loop over all member nodes and set attribute
- crm_node -l |
- while read id node member
- do
- [ "$member" = member -a "$node" != "$me" ] || continue
- crm_attribute -l forever --node=$node -n $attr -v "$3"
- rc=$?
- ocf_log info "DB2 instance $instance($db2node/$db: setting attrib for FAL to $FIRST_ACTIVE_LOG @ $node"
- [ $rc != 0 ] && break
- done
- ;;
-
- get)
- crm_attribute -l forever -n $attr -G --quiet 2>&1
- rc=$?
- if ! ocf_is_true "$OCF_RESKEY_CRM_meta_notify" && [ $rc != 0 ]
- then
- ocf_log warn "DB2 instance $instance($db2node/$db: can't retrieve attribute $attr, are you sure notifications are enabled ?"
- fi
- ;;
-
- *)
- exit $OCF_ERR_CONFIGURED
- esac
-
- return $rc
-}
-
#
# unfortunately a first connect after a crash may need several minutes
# for some internal cleanup stuff in DB2.
@@ -429,6 +405,42 @@ db2_check_config_compatibility() {
}
+#
+# Start HADR as standby; the reintegration attribute is reset to 0 afterwards.
+#
+# Parameters
+# 1 - Database name
+# 2 - Calling function
+# 3 - Calling functions line number
+# Return codes:
+# 0 - Start as standby successful
+# 1 - Start as standby failed
+#
+reintegrateAsStandby() {
+ db=$1
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
+ ocf_log info "$__OCF_ACTION: $LINENO: reintegrateAsStandby called by $2 at $3. Attempting to reintegrate $db as standby."
+ if output=$(runasdb2_session "db2 start hadr on db $db as standby"); then
+ rc=0
+ ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated"
+ else
+ case $output in
+ SQL1777N*)
+ # SQL1777N: HADR is already started in given state.
+ ocf_log info "$__OCF_ACTION: $LINENO: $output"
+ rc=0
+ ;;
+
+ *)
+ rc=1
+ ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. Please reintegrate manually: $output, return with rc=$rc"
+ ;;
+ esac
+ fi
+ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever # runs on both the success and the failure path
+ return $rc
+}
+
#
# Start instance and DB.
# Standard mode is through "db2 activate" in order to start in previous
@@ -478,6 +490,8 @@ db2_start() {
for db in $dblist
do
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
+
# sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG
db2_get_cfg $db || return $?
@@ -488,20 +502,13 @@ db2_start() {
if [ $HADR_ROLE = PRIMARY ]
then
- local master_fal
-
- # communicate our FAL to other nodes the might start concurrently
- db2_fal_attrib $db set $FIRST_ACTIVE_LOG
-
- # ignore false positive:
- # error: Can't use > in [ ]. Escape it or use [[..]]. [SC2073]
- # see https://github.com/koalaman/shellcheck/issues/691
- # shellcheck disable=SC2073
- if master_fal=$(db2_fal_attrib $db get) && [ "$master_fal" '>' $FIRST_ACTIVE_LOG ]
- then
+ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
+ ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value'"
+ if [ "$cib_value" = "1" ]; then
ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary"
start_cmd="db2 start hadr on db $db as standby"
HADR_ROLE=STANDBY
+ standby_reintegration=1
fi
fi
@@ -511,27 +518,65 @@ db2_start() {
[ $HADR_ROLE != STANDBY ] && db2_run_connect $db &
else
case $output in
- SQL1490W*|SQL1494W*|SQL1497W*|SQL1777N*)
- ocf_log info "DB2 database $instance($db2node)/$db already activated: $output"
+ SQL1490W* | SQL1494W* | SQL1497W* | SQL1777N*)
+ # SQL1490W Activate database is successful, however, the database has already been activated on one or more nodes.
+ # SQL1494W Activate database is successful, however, there is already a connection to the database.
+ # SQL1497W Activate/Deactivate database was successful, however, an error occurred on some nodes.
+ # SQL1777N HADR is already started.
+
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is already activated: $output"
;;
- SQL1768N*"Reason code = \"7\""*)
- ocf_log err "DB2 database $instance($db2node)/$db is a Primary and the Standby is down"
- ocf_log err "Possible split brain ! Manual intervention required."
+ SQL1768N*"Reason code = \"7\""*)
+ rc="$OCF_ERR_GENERIC"
+
+ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database is a Primary and the Standby is down"
+ ocf_log err "Possible split brain! Manual intervention required."
ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\""
- ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\""
+ ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\". db2_start() exit with rc=$rc."
- # might be the Standby is not yet there
- # might be a timing problem because "First active log" is delayed
- # on the next start attempt we might succeed when FAL was advanced
- # might be manual intervention is required
- # ... so let pacemaker give it another try and we will succeed then
- return $OCF_ERR_GENERIC
+ # let pacemaker give it another try and we will succeed then
+ return "$rc"
;;
- *)
- ocf_log err "DB2 database $instance($db2node)/$db didn't start: $output"
- return $OCF_ERR_GENERIC
+ SQL1776N*"Reason code = \"6\""*)
+ # SQL1776N The command cannot be issued on an HADR database.
+ # Reason code 6:
+ # This database is an old primary database. It cannot be started
+ # because the standby has become the new primary through forced
+ # takeover.
+
+ rc="$OCF_ERR_GENERIC"
+ ocf_log err "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db didn't start: $output, return with rc=$rc"
+ ocf_log err "$__OCF_ACTION: $LINENO: This database is an old primary database. Trying start again as standby"
+
+ start_cmd="db2 start hadr on db $db as standby"
+ if output=$(runasdb2_session "$start_cmd"); then
+ rc="$OCF_SUCCESS"
+ ocf_log info "$__OCF_ACTION: $LINENO: Db2 database $instance($db2node)/$db started/activated"
+ else
+ case $output in
+ SQL1777N*)
+ # SQL1777N: HADR is already started.
+ ocf_log info "$__OCF_ACTION: $LINENO: $output"
+ rc="$OCF_SUCCESS"
+ ;;
+
+ *)
+ rc="$OCF_ERR_GENERIC"
+ ocf_log err "$__OCF_ACTION: $LINENO: Unable to reintegrate Db2 database $instance($db2node)/$db. Please reintegrate manually: $output, return with rc=$rc"
+ ;;
+ esac
+ fi
+
+ return "$rc"
+ ;;
+
+ *)
+ rc="$OCF_ERR_GENERIC"
+ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database didn't start: $output, db2_start() exit with rc=$rc."
+ return "$rc"
+ ;;
esac
fi
done
@@ -539,6 +584,15 @@ db2_start() {
# come here with success
# Even if we are a db2 Primary pacemaker requires start to end up in slave mode
echo SLAVE > $STATE_FILE
+
+ # Unset primary failover attribute as host was successfully reintegrated as standby
+ if [ "$standby_reintegration" = "1" ]; then
+ for db in $dblist; do
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
+ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever
+ done
+ fi
+
return $OCF_SUCCESS
}
@@ -737,7 +791,7 @@ db2_monitor_retry() {
#
# Monitor the db
-# And as side effect set crm_master / FAL attribute
+# And as side effect set crm_master
#
db2_monitor() {
local CMD output hadr db
@@ -754,6 +808,22 @@ db2_monitor() {
for db in $dblist
do
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
+
+ #Check for the reintegration file, then set the flag if it exists and delete the file
+ if [ -e "/tmp/$reint_attr" ] && [ -n "$remote_host" ]; then
+ #The file exists, try to set the reintegration attribute
+ crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
+ cib_value=$(crm_attribute -n "$reint_attr" -N "$remote_host" -G | awk -v FS=' value=' '{print $2}')
+
+ if [ "$cib_value" = "1" ]; then
+ ocf_log info "$__OCF_ACTION: $LINENO: CIB attribute $reint_attr is set to '$cib_value', reintegration flag file will now be deleted."
+ rm -f "/tmp/$reint_attr"
+ else
+ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The reintegration flag file exists, but its attribute failed to set."
+ fi
+ fi
+
hadr=$(db2_hadr_status $db)
rc=$?
ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr"
@@ -804,6 +874,14 @@ db2_monitor() {
;;
STANDBY/*PEER/*|Standby/*Peer)
+ # If db is in standby peer, then it has already reintegrated.
+ # If the reintegrate flag is still set, remove it
+ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
+ if [ "$cib_value" = "1" ]; then
+ ocf_log info "$__OCF_ACTION: $LINENO: Reintegrate flag detected for $db, but it has already reintegrated as standby. Removing reintegration flag."
+ crm_attribute -n "$reint_attr" -N "$local_host" -v "0" -l forever
+ fi
+
master_score -v 8000 -l reboot
;;
@@ -812,6 +890,34 @@ db2_monitor() {
master_score -D -l reboot
;;
+ Down/Off)
+ # If db is a deactivated primary and it has a reintegration flag, then reintegrate as standby.
+ cib_value=$(crm_attribute -n "$reint_attr" -N "$local_host" -G | awk -v FS=' value=' '{print $2}')
+ if [ "$cib_value" = "1" ]; then
+ output=$(runasdb2 "db2 get db cfg for $db" | grep 'HADR database role' | awk '{print $5}')
+ if [ "PRIMARY" = "$output" ]; then
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Database is deactivated with Primary role and the reintegration flag is set. Role: $output, Reintegration flag: $reint_attr = $cib_value"
+ # Reintegrate as the standby database.
+ if reintegrateAsStandby "$db" 'db2_monitor' $LINENO; then
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration succeeded."
+ # Setting slave state here will cause rc to be OCF_SUCCESS below.
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: Echoing SLAVE into $STATE_FILE"
+ echo SLAVE >"$STATE_FILE"
+ # Update master score to reflect standby state.
+ master_score -v 8000 -l reboot
+ else
+ ocf_log err "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database reintegration failed."
+ return "$OCF_ERR_GENERIC"
+ fi
+ fi
+ else
+ rc="$OCF_NOT_RUNNING"
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: The database has HADR status $hadr."
+ ocf_log info "$__OCF_ACTION: $LINENO: $instance: $db2node: $db: db2_monitor() exit with rc=$rc."
+ return "$rc"
+ fi
+ ;;
+
*)
return $OCF_ERR_GENERIC
esac
@@ -875,8 +981,6 @@ db2_promote() {
# update pacemaker's view
echo MASTER > $STATE_FILE
- # turn the log so we rapidly get a new FAL
- logasdb2 "db2 archive log for db $db"
return $OCF_SUCCESS
fi
@@ -914,26 +1018,6 @@ db2_demote() {
return $?
}
-#
-# handle pre start notification
-# We record our first active log on the other nodes.
-# If two primaries come up after a crash they can safely determine who is
-# the outdated one.
-#
-db2_notify() {
- local node
-
- # only interested in pre-start
- [ $OCF_RESKEY_CRM_meta_notify_type = pre \
- -a $OCF_RESKEY_CRM_meta_notify_operation = start ] || return $OCF_SUCCESS
-
- # gets FIRST_ACTIVE_LOG
- db2_get_cfg $dblist || return $?
-
- db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC
- exit $OCF_SUCCESS
-}
-
########
# Main #
########
@@ -947,50 +1031,54 @@ case "$__OCF_ACTION" in
db2_usage
exit $OCF_SUCCESS
;;
+esac
+local_host=$(ocf_local_nodename)
+inst1=$(echo "$OCF_RESKEY_instance" | cut -d"," -f1)
+inst2=$(echo "$OCF_RESKEY_instance" | cut -d"," -f2)
+host1=$(crm_node -l | sort | awk '{print $2;}' | sed -n 1p)
+
+if [ "$host1" = "$local_host" ]; then
+ remote_host=$(crm_node -l | sort | awk '{print $2;}' | sed -n 2p)
+else
+ remote_host="$host1"
+fi
+
+db2_validate; validate_rc=$?
+
+case "$__OCF_ACTION" in
start)
- db2_validate
db2_start || exit $?
db2_monitor
- exit $?
;;
stop)
- db2_validate
db2_stop
- exit $?
;;
promote)
- db2_validate
db2_promote
- exit $?
;;
demote)
- db2_validate
db2_demote
- exit $?
;;
notify)
- db2_validate
- db2_notify
- exit $?
+ ocf_log debug "notify-action has been DEPRECATED, and should be removed"
;;
monitor)
- db2_validate
db2_monitor_retry
- exit $?
;;
validate-all)
- db2_validate
- exit $?
+ exit $validate_rc
;;
*)
db2_usage
exit $OCF_ERR_UNIMPLEMENTED
esac
+
+exit $?

6
gating.yaml Normal file
View File

@ -0,0 +1,6 @@
--- !Policy
product_versions:
- rhel-8
decision_context: osci_compose_gate
rules:
- !PassingTestCaseRule {test_case_name: osci.brew-build.tier0.functional}

Some files were not shown because too many files have changed in this diff Show More