resource-agents/SOURCES/bz1891855-galera-recover-2-...

--- a/heartbeat/galera 2020-10-28 16:28:48.125700714 +0100
+++ b/heartbeat/galera 2020-10-28 16:31:14.932820752 +0100
@@ -81,6 +81,11 @@
. /etc/default/clustercheck
fi
+# Parameter defaults
+
+OCF_RESKEY_two_node_mode_default="false"
+: ${OCF_RESKEY_two_node_mode=${OCF_RESKEY_two_node_mode_default}}
+
#######################################################################
usage() {
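
The ": ${VAR=default}" expansion used above assigns the default only when
the variable is unset, so a value passed in by pacemaker is never
overwritten. A minimal standalone sketch of the same idiom (the parameter
name here is illustrative, not part of the patch):

    #!/bin/sh
    # Assign the default only if the caller did not already set the variable.
    OCF_RESKEY_example_default="false"
    : ${OCF_RESKEY_example=${OCF_RESKEY_example_default}}
    echo "$OCF_RESKEY_example"    # prints "false" unless exported beforehand
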
@@ -249,6 +254,16 @@
<content type="string" default="" />
</parameter>
+<parameter name="two_node_mode" unique="0" required="0">
+<longdesc lang="en">
+If running in a 2-node pacemaker cluster, rely on pacemaker quorum
+to allow automatic recovery even when the other node is unreachable.
+Use it with caution, and only in combination with fencing!
+</longdesc>
+<shortdesc lang="en">Special recovery when running on a 2-node cluster</shortdesc>
+<content type="boolean" default="${OCF_RESKEY_two_node_mode_default}"/>
+</parameter>
+
</parameters>
<actions>
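
With the patched agent installed, the new option is set like any other
resource parameter. For instance, assuming a galera resource named
"galera" managed through pcs (the resource name is illustrative):

    pcs resource update galera two_node_mode=true
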
@@ -400,6 +415,27 @@
return 0
}
+is_two_node_mode_active()
+{
+ # Neither crm_node nor corosync-quorumtool can access the various
+ # corosync flags when running inside a bundle, so just count the
+ # cluster members instead
+ ocf_is_true "$OCF_RESKEY_two_node_mode" && ${HA_SBIN_DIR}/crm_mon -1X | xmllint --xpath "count(//nodes/node[@type='member'])" - | grep -q -w 2
+}
+
+is_last_node_in_quorate_partition()
+{
+ # When a network split occurs in a 2-node cluster, pacemaker
+ # fences the other node and tries to retain quorum. So until
+ # the fencing is resolved (and the status of the peer node
+ # is clean), we shouldn't consider ourselves quorate.
+ local partition_members=$(${HA_SBIN_DIR}/crm_node -p | wc -w)
+ local quorate=$(${HA_SBIN_DIR}/crm_node -q)
+ local clean_members=$(${HA_SBIN_DIR}/crm_mon -1X | xmllint --xpath 'count(//nodes/node[@type="member" and @unclean="false"])' -)
+
+ [ "$partition_members" = 1 ] && [ "$quorate" = 1 ] && [ "$clean_members" = 2 ]
+}
+
master_exists()
{
if [ "$__OCF_ACTION" = "demote" ]; then
@@ -518,8 +554,20 @@
done
for node in $nodes_recovered $nodes; do
+ # On clean shutdown, galera marks the last stopped node as 'safe to bootstrap',
+ # so use this hint when we can
safe_to_bootstrap=$(get_safe_to_bootstrap $node)
+ # Special case for 2-node clusters: during a network split, rely on
+ # pacemaker's quorum to check whether we can restart galera
+ if [ "$safe_to_bootstrap" != "1" ] && [ "$node" = "$NODENAME" ] && is_two_node_mode_active; then
+ is_last_node_in_quorate_partition
+ if [ $? -eq 0 ]; then
+ ocf_log warn "Survived a split in a 2-node cluster, considering ourselves safe to bootstrap"
+ safe_to_bootstrap=1
+ fi
+ fi
+
if [ "$safe_to_bootstrap" = "1" ]; then
# Galera marked the node as safe to bootstrap during shutdown. Let's just
# pick it as our bootstrap node.
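
Taken together, the recovery decision added in this last hunk reduces to
the following simplified sketch (helper and variable names reused from
the patch; the surrounding detection loop is omitted):

    safe_to_bootstrap=$(get_safe_to_bootstrap $node)
    # Override the on-disk hint only for ourselves, only in 2-node mode,
    # and only when we are the sole clean, quorate survivor.
    if [ "$safe_to_bootstrap" != "1" ] && [ "$node" = "$NODENAME" ] \
        && is_two_node_mode_active && is_last_node_in_quorate_partition; then
        safe_to_bootstrap=1
    fi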