resource-agents/SOURCES/bz1745713-rabbitmq-cluster-2-fail-when-in-minority-partition.patch

97 lines
2.9 KiB
Diff
Raw Normal View History

2020-01-21 19:47:07 +00:00
From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001
From: John Eckersberg <jeckersb@redhat.com>
Date: Tue, 16 Oct 2018 16:21:25 -0400
Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority
partition
It's possible for mnesia to still be running, but for mnesia to be
partitioned. It's also possible to get into this state without
pacemaker seeing the node go down, so no corrective action is taken.
When monitoring, check the number of nodes that pacemaker thinks are
running, and compare to the number of nodes that mnesia thinks are
running. If mnesia only sees a minority of the total nodes, fail it
so corrective action can be taken to rejoin the cluster.
This also adds a new function, rmq_app_running, which simply checks
whether the app is running or not and does not care about the
partition status. This is now used instead of the full monitor in a
few places where we don't care about partition state.
Resolves: RHBZ#1639826
---
heartbeat/rabbitmq-cluster | 28 +++++++++++++++++++++++++---
1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
index 204917475..78b2bbadf 100755
--- a/heartbeat/rabbitmq-cluster
+++ b/heartbeat/rabbitmq-cluster
@@ -178,10 +178,31 @@ remove_pid () {
rm -f ${RMQ_PID_FILE} > /dev/null 2>&1
}
+rmq_app_running() {
+ if $RMQ_CTL eval 'application:which_applications().' | grep -q '{rabbit,'; then
+ ocf_log debug "RabbitMQ application is running"
+ return $OCF_SUCCESS
+ else
+ ocf_log debug "RabbitMQ application is stopped"
+ return $OCF_NOT_RUNNING
+ fi
+}
+
rmq_monitor() {
local rc
if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' | grep -q '^{ok'; then
+ pcs_running=$(rmq_join_list | wc -w)
+ ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running"
+ rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).')
+ ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running"
+
+ if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then
+ ocf_log info "RabbitMQ is a minority partition, failing monitor"
+ rmq_delete_nodename
+ return $OCF_ERR_GENERIC
+ fi
+
ocf_log debug "RabbitMQ server is running normally"
rmq_write_nodename
@@ -215,7 +236,7 @@ rmq_init_and_wait()
return $OCF_ERR_GENERIC
fi
- rmq_monitor
+ rmq_app_running
return $?
}
@@ -236,6 +257,7 @@ rmq_start_first()
if [ $rc -eq 0 ]; then
rc=$OCF_SUCCESS
ocf_log info "cluster bootstrapped"
+ rmq_write_nodename
if [ -n "$OCF_RESKEY_set_policy" ]; then
# do not quote set_policy, we are passing in arguments
@@ -492,7 +514,7 @@ rmq_stop() {
end.
"
- rmq_monitor
+ rmq_app_running
if [ $? -eq $OCF_NOT_RUNNING ]; then
return $OCF_SUCCESS
fi
@@ -508,7 +530,7 @@ rmq_stop() {
#TODO add kill logic
stop_wait=1
while [ $stop_wait = 1 ]; do
- rmq_monitor
+ rmq_app_running
rc=$?
if [ "$rc" -eq $OCF_NOT_RUNNING ]; then
stop_wait=0