glusterfs/SOURCES/0550-common-ha-stability-fixes-for-ganesha_grace-and-gane.patch
2022-03-30 15:45:22 +00:00

185 lines
7.5 KiB
Diff

From 053bb9c7356eae82b1089582bb2844388ae4df57 Mon Sep 17 00:00:00 2001
From: "Kaleb S. KEITHLEY" <kkeithle@redhat.com>
Date: Wed, 2 Jun 2021 07:49:12 -0400
Subject: [PATCH 550/584] common-ha: stability fixes for ganesha_grace and
ganesha_mon RAs
Include fixes suggested by ClusterHA devs.
1) It turns out that crm_attribute attrs and attrd_updater attrs really
are one and the same, despite what I was told years ago.
attrs created with crm_attribute ... --lifetime=reboot ... or
attrd_updater are one and the same. As per the ClusterHA devs, having an
attr created with crm_attribute ... --lifetime=forever and also
creating/updating the same attr with attrd_updater is a recipe for
weird things to happen that will be difficult to debug.
2) Using hostname -s or hostname for node names in crm_attribute and
attrd_updater could potentially use the wrong name if the host has
been renamed; use ocf_local_nodename() (in ocf-shellfuncs) instead.
https://github.com/gluster/glusterfs/issues/2276
https://github.com/gluster/glusterfs/pull/2283
commit 9bd2c697686ec40e2c4f711df961860c8a735baa
Change-Id:If572d396fae9206628714fb2ce00f72e94f2258f
BUG: 1945143
Signed-off-by: Kaleb S. KEITHLEY <kkeithle@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244593
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
extras/ganesha/ocf/ganesha_grace | 28 +++++++++---------------
extras/ganesha/ocf/ganesha_mon | 47 ++++++++++++++--------------------------
2 files changed, 26 insertions(+), 49 deletions(-)
diff --git a/extras/ganesha/ocf/ganesha_grace b/extras/ganesha/ocf/ganesha_grace
index 825f716..edc6fa2 100644
--- a/extras/ganesha/ocf/ganesha_grace
+++ b/extras/ganesha/ocf/ganesha_grace
@@ -94,25 +94,21 @@ esac
ganesha_grace_start()
{
local rc=${OCF_ERR_GENERIC}
- local host=$(hostname -s)
+ local host=$(ocf_local_nodename)
- ocf_log debug "ganesha_grace_start()"
- # give ganesha_mon RA a chance to set the crm_attr first
+ ocf_log debug "ganesha_grace_start ${host}"
+ # give ganesha_mon RA a chance to set the attr first
# I mislike the sleep, but it's not clear that looping
# with a small sleep is necessarily better
# start has a 40sec timeout, so a 5sec sleep here is okay
sleep 5
- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
+ attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
if [ $? -ne 0 ]; then
- host=$(hostname)
- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null )
- if [ $? -ne 0 ]; then
- ocf_log info "grace start: crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
- fi
+ ocf_log info "grace start: attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
fi
# Three possibilities:
- # 1. There is no attribute at all and attr_updater returns
+ # 1. There is no attribute at all and attrd_updater returns
# a zero length string. This happens when
# ganesha_mon::monitor hasn't run at least once to set
# the attribute. The assumption here is that the system
@@ -164,17 +160,13 @@ ganesha_grace_notify()
ganesha_grace_monitor()
{
- local host=$(hostname -s)
+ local host=$(ocf_local_nodename)
- ocf_log debug "monitor"
+ ocf_log debug "ganesha_grace monitor ${host}"
- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
+ attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
if [ $? -ne 0 ]; then
- host=$(hostname)
- attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
- if [ $? -ne 0 ]; then
- ocf_log info "crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
- fi
+ ocf_log info "attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
fi
# if there is no attribute (yet), maybe it's because
diff --git a/extras/ganesha/ocf/ganesha_mon b/extras/ganesha/ocf/ganesha_mon
index 2b4a9d6..7fbbf70 100644
--- a/extras/ganesha/ocf/ganesha_mon
+++ b/extras/ganesha/ocf/ganesha_mon
@@ -124,7 +124,6 @@ ganesha_mon_stop()
ganesha_mon_monitor()
{
- local host=$(hostname -s)
local pid_file="/var/run/ganesha.pid"
local rhel6_pid_file="/var/run/ganesha.nfsd.pid"
local proc_pid="/proc/"
@@ -141,31 +140,27 @@ ganesha_mon_monitor()
if [ "x${proc_pid}" != "x/proc/" -a -d ${proc_pid} ]; then
- attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1
+ attrd_updater --name ${OCF_RESKEY_ganesha_active} -v 1
if [ $? -ne 0 ]; then
- ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1 failed"
+ ocf_log info "warning: attrd_updater --name ${OCF_RESKEY_ganesha_active} -v 1 failed"
fi
# ganesha_grace (nfs-grace) RA follows grace-active attr
# w/ constraint location
- attrd_updater -n ${OCF_RESKEY_grace_active} -v 1
+ attrd_updater --name ${OCF_RESKEY_grace_active} -v 1
if [ $? -ne 0 ]; then
- ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_grace_active} -v 1 failed"
+ ocf_log info "warning: attrd_updater --name ${OCF_RESKEY_grace_active} -v 1 failed"
fi
# ganesha_mon (nfs-mon) and ganesha_grace (nfs-grace)
- # track grace-active crm_attr (attr != crm_attr)
- # we can't just use the attr as there's no way to query
- # its value in RHEL6 pacemaker
-
- crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
- if [ $? -ne 0 ]; then
- host=$(hostname)
- crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
- if [ $? -ne 0 ]; then
- ocf_log info "mon monitor warning: crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 failed"
- fi
- fi
+ # track grace-active attr.
+ #
+ # Originally we were told that attrs set with attrd_updater
+ # are different/distinct than attrs set with crm_attribute.
+ # Now, years later, we are told that they are the same and
+ # that the values of attrs set with attrd_updater can be
+ # retrieved with crm_attribute. Or with attrd_updater -Q
+ # now that we no longer have to deal with rhel6.
return ${OCF_SUCCESS}
fi
@@ -182,26 +177,16 @@ ganesha_mon_monitor()
# the remaining ganesha.nfsds into grace before
# initiating the VIP fail-over.
- attrd_updater -D -n ${OCF_RESKEY_grace_active}
- if [ $? -ne 0 ]; then
- ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_grace_active} failed"
- fi
-
- host=$(hostname -s)
- crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
+ attrd_updater --delete --name ${OCF_RESKEY_grace_active}
if [ $? -ne 0 ]; then
- host=$(hostname)
- crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
- if [ $? -ne 0 ]; then
- ocf_log info "mon monitor warning: crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 failed"
- fi
+ ocf_log info "warning: attrd_updater --delete --name ${OCF_RESKEY_grace_active} failed"
fi
sleep ${OCF_RESKEY_grace_delay}
- attrd_updater -D -n ${OCF_RESKEY_ganesha_active}
+ attrd_updater --delete --name ${OCF_RESKEY_ganesha_active}
if [ $? -ne 0 ]; then
- ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_ganesha_active} failed"
+ ocf_log info "warning: attrd_updater --delete --name ${OCF_RESKEY_ganesha_active} failed"
fi
return ${OCF_SUCCESS}
--
1.8.3.1