97 lines
2.9 KiB
Diff
97 lines
2.9 KiB
Diff
|
From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001
|
||
|
From: John Eckersberg <jeckersb@redhat.com>
|
||
|
Date: Tue, 16 Oct 2018 16:21:25 -0400
|
||
|
Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority
|
||
|
partition
|
||
|
|
||
|
It's possible for mnesia to still be running, but for mnesia to be
|
||
|
partitioned. And it's also possible to get into this state without
|
||
|
pacemaker seeing the node go down so no corrective action is taken.
|
||
|
|
||
|
When monitoring, check the number of nodes that pacemaker thinks are
|
||
|
running, and compare to the number of nodes that mnesia thinks are
|
||
|
running. If mnesia only sees a minority of the total nodes, fail the monitor
|
||
|
so corrective action can be taken to rejoin the cluster.
|
||
|
|
||
|
This also adds a new function, rmq_app_running, which simply checks
|
||
|
whether the app is running or not and does not care about the
|
||
|
partition status. This is now used instead of the full monitor in a
|
||
|
few places where we don't care about partition state.
|
||
|
|
||
|
Resolves: RHBZ#1639826
|
||
|
---
|
||
|
heartbeat/rabbitmq-cluster | 28 +++++++++++++++++++++++++---
|
||
|
1 file changed, 25 insertions(+), 3 deletions(-)
|
||
|
|
||
|
diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster
|
||
|
index 204917475..78b2bbadf 100755
|
||
|
--- a/heartbeat/rabbitmq-cluster
|
||
|
+++ b/heartbeat/rabbitmq-cluster
|
||
|
@@ -178,10 +178,31 @@ remove_pid () {
|
||
|
rm -f ${RMQ_PID_FILE} > /dev/null 2>&1
|
||
|
}
|
||
|
|
||
|
+rmq_app_running() {
|
||
|
+ if $RMQ_CTL eval 'application:which_applications().' | grep -q '{rabbit,'; then
|
||
|
+ ocf_log debug "RabbitMQ application is running"
|
||
|
+ return $OCF_SUCCESS
|
||
|
+ else
|
||
|
+ ocf_log debug "RabbitMQ application is stopped"
|
||
|
+ return $OCF_NOT_RUNNING
|
||
|
+ fi
|
||
|
+}
|
||
|
+
|
||
|
rmq_monitor() {
|
||
|
local rc
|
||
|
|
||
|
if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' | grep -q '^{ok'; then
|
||
|
+ pcs_running=$(rmq_join_list | wc -w)
|
||
|
+ ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running"
|
||
|
+ rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).')
|
||
|
+ ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running"
|
||
|
+
|
||
|
+ if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then
|
||
|
+ ocf_log info "RabbitMQ is a minority partition, failing monitor"
|
||
|
+ rmq_delete_nodename
|
||
|
+ return $OCF_ERR_GENERIC
|
||
|
+ fi
|
||
|
+
|
||
|
ocf_log debug "RabbitMQ server is running normally"
|
||
|
rmq_write_nodename
|
||
|
|
||
|
@@ -215,7 +236,7 @@ rmq_init_and_wait()
|
||
|
return $OCF_ERR_GENERIC
|
||
|
fi
|
||
|
|
||
|
- rmq_monitor
|
||
|
+ rmq_app_running
|
||
|
return $?
|
||
|
}
|
||
|
|
||
|
@@ -236,6 +257,7 @@ rmq_start_first()
|
||
|
if [ $rc -eq 0 ]; then
|
||
|
rc=$OCF_SUCCESS
|
||
|
ocf_log info "cluster bootstrapped"
|
||
|
+ rmq_write_nodename
|
||
|
|
||
|
if [ -n "$OCF_RESKEY_set_policy" ]; then
|
||
|
# do not quote set_policy, we are passing in arguments
|
||
|
@@ -492,7 +514,7 @@ rmq_stop() {
|
||
|
end.
|
||
|
"
|
||
|
|
||
|
- rmq_monitor
|
||
|
+ rmq_app_running
|
||
|
if [ $? -eq $OCF_NOT_RUNNING ]; then
|
||
|
return $OCF_SUCCESS
|
||
|
fi
|
||
|
@@ -508,7 +530,7 @@ rmq_stop() {
|
||
|
#TODO add kill logic
|
||
|
stop_wait=1
|
||
|
while [ $stop_wait = 1 ]; do
|
||
|
- rmq_monitor
|
||
|
+ rmq_app_running
|
||
|
rc=$?
|
||
|
if [ "$rc" -eq $OCF_NOT_RUNNING ]; then
|
||
|
stop_wait=0
|