From cc23c5523a0185fa557a5ab9056d50a60300d12a Mon Sep 17 00:00:00 2001 From: John Eckersberg Date: Tue, 16 Oct 2018 16:21:25 -0400 Subject: [PATCH] rabbitmq-cluster: fail monitor when node is in minority partition It's possible for mnesia to still be running, but for mnesia to be partitioned. And it's also possible to get into this state without pacemaker seeing the node go down so no corrective action is taken. When monitoring, check the number of nodes that pacemaker thinks are running, and compare to the number of nodes that mnesia thinks are running. If mnesia only sees a minority of the total nodes, fail it so corrective action can be taken to rejoin the cluster. This also adds a new function, rmq_app_running, which simply checks whether the app is running or not and does not care about the partition status. This is now used instead of the full monitor in a few places where we don't care about partition state. Resolves: RHBZ#1639826 --- heartbeat/rabbitmq-cluster | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster index 204917475..78b2bbadf 100755 --- a/heartbeat/rabbitmq-cluster +++ b/heartbeat/rabbitmq-cluster @@ -178,10 +178,31 @@ remove_pid () { rm -f ${RMQ_PID_FILE} > /dev/null 2>&1 } +rmq_app_running() { + if $RMQ_CTL eval 'application:which_applications().' | grep -q '{rabbit,'; then + ocf_log debug "RabbitMQ application is running" + return $OCF_SUCCESS + else + ocf_log debug "RabbitMQ application is stopped" + return $OCF_NOT_RUNNING + fi +} + rmq_monitor() { local rc if $RMQ_CTL eval 'rabbit_mnesia:cluster_status_from_mnesia().' 
| grep -q '^{ok'; then + pcs_running=$(rmq_join_list | wc -w) + ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running" + rmq_running=$($RMQ_CTL eval 'length(mnesia:system_info(running_db_nodes)).') + ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running" + + if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then + ocf_log info "RabbitMQ is a minority partition, failing monitor" + rmq_delete_nodename + return $OCF_ERR_GENERIC + fi + ocf_log debug "RabbitMQ server is running normally" rmq_write_nodename @@ -215,7 +236,7 @@ rmq_init_and_wait() return $OCF_ERR_GENERIC fi - rmq_monitor + rmq_app_running return $? } @@ -236,6 +257,7 @@ rmq_start_first() if [ $rc -eq 0 ]; then rc=$OCF_SUCCESS ocf_log info "cluster bootstrapped" + rmq_write_nodename if [ -n "$OCF_RESKEY_set_policy" ]; then # do not quote set_policy, we are passing in arguments @@ -492,7 +514,7 @@ rmq_stop() { end. " - rmq_monitor + rmq_app_running if [ $? -eq $OCF_NOT_RUNNING ]; then return $OCF_SUCCESS fi @@ -508,7 +530,7 @@ rmq_stop() { #TODO add kill logic stop_wait=1 while [ $stop_wait = 1 ]; do - rmq_monitor + rmq_app_running rc=$? if [ "$rc" -eq $OCF_NOT_RUNNING ]; then stop_wait=0