import resource-agents-4.9.0-40.el8

This commit is contained in:
CentOS Sources 2023-03-28 08:23:40 +00:00 committed by Stepan Oksanichenko
parent 4325103a5f
commit 5c6552b5d5
17 changed files with 2883 additions and 2 deletions

View File

@ -0,0 +1,195 @@
From 640c2b57f0f3e7256d587ddd5960341cb38b1982 Mon Sep 17 00:00:00 2001
From: Reid Wahl <nrwahl@protonmail.com>
Date: Sun, 13 Dec 2020 14:58:34 -0800
Subject: [PATCH] LVM-activate: Fix return codes
OCF_ERR_ARGS should be used when the configuration isn't valid for the
**local** node, and so the resource should not attempt to start again
locally until the issue is corrected.
OCF_ERR_CONFIGURED should be used when the configuration isn't valid on
**any** node, and so the resource should not attempt to start again
anywhere until the issue is corrected.
One remaining gray area: Should lvmlockd/lvmetad/clvmd improperly
running (or improperly not running) be an OCF_ERR_GENERIC or
OCF_ERR_ARGS? The fact that it's a state issue rather than a config
issue suggests OCF_ERR_GENERIC. The fact that it won't be fixed without
user intervention suggests OCF_ERR_ARGS. The approach here is to use
GENERIC for all of these. One can make the case that "improperly
running" should use ARGS, since a process must be manually stopped to
fix the issue, and that "improperly not running" should use GENERIC,
since there's a small chance the process died and will be recovered in
some way.
More info about return code meanings:
- https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Administration/html/agents.html#how-are-ocf-return-codes-interpreted
Resolves: RHBZ#1905820
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
---
heartbeat/LVM-activate | 47 +++++++++++++++++++++---------------------
1 file changed, 23 insertions(+), 24 deletions(-)
diff --git a/heartbeat/LVM-activate b/heartbeat/LVM-activate
index c86606637..e951a08e9 100755
--- a/heartbeat/LVM-activate
+++ b/heartbeat/LVM-activate
@@ -333,8 +333,7 @@ config_verify()
real=$(lvmconfig "$name" | cut -d'=' -f2)
if [ "$real" != "$expect" ]; then
ocf_exit_reason "config item $name: expect=$expect but real=$real"
- exit $OCF_ERR_CONFIGURED
-
+ exit $OCF_ERR_ARGS
fi
return $OCF_SUCCESS
@@ -366,12 +365,12 @@ lvmlockd_check()
fi
ocf_exit_reason "lvmlockd daemon is not running!"
- exit $OCF_ERR_CONFIGURED
+ exit $OCF_ERR_GENERIC
fi
if pgrep clvmd >/dev/null 2>&1 ; then
ocf_exit_reason "clvmd daemon is running unexpectedly."
- exit $OCF_ERR_CONFIGURED
+ exit $OCF_ERR_GENERIC
fi
return $OCF_SUCCESS
@@ -402,17 +401,17 @@ clvmd_check()
# Good: clvmd is running, and lvmlockd is not running
if ! pgrep clvmd >/dev/null 2>&1 ; then
ocf_exit_reason "clvmd daemon is not running!"
- exit $OCF_ERR_CONFIGURED
+ exit $OCF_ERR_GENERIC
fi
if pgrep lvmetad >/dev/null 2>&1 ; then
ocf_exit_reason "Please stop lvmetad daemon when clvmd is running."
- exit $OCF_ERR_CONFIGURED
+ exit $OCF_ERR_GENERIC
fi
if pgrep lvmlockd >/dev/null 2>&1 ; then
ocf_exit_reason "lvmlockd daemon is running unexpectedly."
- exit $OCF_ERR_CONFIGURED
+ exit $OCF_ERR_GENERIC
fi
return $OCF_SUCCESS
@@ -424,12 +423,12 @@ systemid_check()
source=$(lvmconfig 'global/system_id_source' 2>/dev/null | cut -d"=" -f2)
if [ "$source" = "" ] || [ "$source" = "none" ]; then
ocf_exit_reason "system_id_source in lvm.conf is not set correctly!"
- exit $OCF_ERR_CONFIGURED
+ exit $OCF_ERR_ARGS
fi
if [ -z ${SYSTEM_ID} ]; then
ocf_exit_reason "local/system_id is not set!"
- exit $OCF_ERR_CONFIGURED
+ exit $OCF_ERR_ARGS
fi
return $OCF_SUCCESS
@@ -441,18 +440,18 @@ tagging_check()
# The volume_list must be initialized to something in order to
# guarantee our tag will be filtered on startup
if ! lvm dumpconfig activation/volume_list; then
- ocf_log err "LVM: Improper setup detected"
+ ocf_log err "LVM: Improper setup detected"
ocf_exit_reason "The volume_list filter must be initialized in lvm.conf for exclusive activation without clvmd"
- exit $OCF_ERR_CONFIGURED
+ exit $OCF_ERR_ARGS
fi
# Our tag must _NOT_ be in the volume_list. This agent
# overrides the volume_list during activation using the
# special tag reserved for cluster activation
if lvm dumpconfig activation/volume_list | grep -e "\"@${OUR_TAG}\"" -e "\"${VG}\""; then
- ocf_log err "LVM: Improper setup detected"
+ ocf_log err "LVM: Improper setup detected"
ocf_exit_reason "The volume_list in lvm.conf must not contain the cluster tag, \"${OUR_TAG}\", or volume group, ${VG}"
- exit $OCF_ERR_CONFIGURED
+ exit $OCF_ERR_ARGS
fi
return $OCF_SUCCESS
@@ -463,13 +462,13 @@ read_parameters()
if [ -z "$VG" ]
then
ocf_exit_reason "You must identify the volume group name!"
- exit $OCF_ERR_ARGS
+ exit $OCF_ERR_CONFIGURED
fi
if [ "$LV_activation_mode" != "shared" ] && [ "$LV_activation_mode" != "exclusive" ]
then
ocf_exit_reason "Invalid value for activation_mode: $LV_activation_mode"
- exit $OCF_ERR_ARGS
+ exit $OCF_ERR_CONFIGURED
fi
# Convert VG_access_mode from string to index
@@ -519,8 +518,10 @@ lvm_validate() {
exit $OCF_NOT_RUNNING
fi
+ # Could be a transient error (e.g., iSCSI connection
+ # issue) so use OCF_ERR_GENERIC
ocf_exit_reason "Volume group[${VG}] doesn't exist, or not visible on this node!"
- exit $OCF_ERR_CONFIGURED
+ exit $OCF_ERR_GENERIC
fi
# Inconsistency might be due to missing physical volumes, which doesn't
@@ -549,7 +550,7 @@ lvm_validate() {
mode=$?
if [ $VG_access_mode_num -ne 4 ] && [ $mode -ne $VG_access_mode_num ]; then
ocf_exit_reason "The specified vg_access_mode doesn't match the lock_type on VG metadata!"
- exit $OCF_ERR_ARGS
+ exit $OCF_ERR_CONFIGURED
fi
# Nothing to do if the VG has no logical volume
@@ -561,11 +562,11 @@ lvm_validate() {
# Check if the given $LV is in the $VG
if [ -n "$LV" ]; then
- OUT=$(lvs --foreign --noheadings ${VG}/${LV} 2>&1)
+ output=$(lvs --foreign --noheadings ${VG}/${LV} 2>&1)
if [ $? -ne 0 ]; then
- ocf_log err "lvs: ${OUT}"
+ ocf_log err "lvs: ${output}"
ocf_exit_reason "LV ($LV) is not in the given VG ($VG)."
- exit $OCF_ERR_ARGS
+ exit $OCF_ERR_CONFIGURED
fi
fi
@@ -580,7 +581,6 @@ lvm_validate() {
3)
systemid_check
;;
-
4)
tagging_check
;;
@@ -808,10 +808,9 @@ lvm_status() {
dd if=${dm_name} of=/dev/null bs=1 count=1 >/dev/null \
2>&1
if [ $? -ne 0 ]; then
- return $OCF_NOT_RUNNING
- else
- return $OCF_SUCCESS
+ return $OCF_ERR_GENERIC
fi
+ return $OCF_SUCCESS
;;
*)
ocf_exit_reason "unsupported monitor level $OCF_CHECK_LEVEL"

View File

@ -0,0 +1,903 @@
From 5dcd5153f0318e4766f7f4d3e61dfdb4b352c39c Mon Sep 17 00:00:00 2001
From: MSSedusch <sedusch@microsoft.com>
Date: Mon, 30 May 2022 15:08:10 +0200
Subject: [PATCH 1/2] add new Azure Events AZ resource agent
---
.gitignore | 1 +
configure.ac | 8 +
doc/man/Makefile.am | 4 +
heartbeat/Makefile.am | 4 +
heartbeat/azure-events-az.in | 782 +++++++++++++++++++++++++++++++++++
5 files changed, 799 insertions(+)
create mode 100644 heartbeat/azure-events-az.in
diff --git a/.gitignore b/.gitignore
index 0c259b5cf..e2b7c039c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,6 +54,7 @@ heartbeat/Squid
heartbeat/SysInfo
heartbeat/aws-vpc-route53
heartbeat/azure-events
+heartbeat/azure-events-az
heartbeat/clvm
heartbeat/conntrackd
heartbeat/dnsupdate
diff --git a/configure.ac b/configure.ac
index eeecfad0e..5716a2be2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -523,6 +523,13 @@ if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then
fi
AM_CONDITIONAL(BUILD_AZURE_EVENTS, test $BUILD_AZURE_EVENTS -eq 1)
+BUILD_AZURE_EVENTS_AZ=1
+if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then
+ BUILD_AZURE_EVENTS_AZ=0
+ AC_MSG_WARN("Not building azure-events-az")
+fi
+AM_CONDITIONAL(BUILD_AZURE_EVENTS_AZ, test $BUILD_AZURE_EVENTS_AZ -eq 1)
+
BUILD_GCP_PD_MOVE=1
if test -z "$PYTHON" || test "x${HAVE_PYMOD_GOOGLEAPICLIENT}" != xyes || test $BUILD_OCF_PY -eq 0; then
BUILD_GCP_PD_MOVE=0
@@ -976,6 +983,7 @@ rgmanager/Makefile \
dnl Files we output that need to be executable
AC_CONFIG_FILES([heartbeat/azure-events], [chmod +x heartbeat/azure-events])
+AC_CONFIG_FILES([heartbeat/azure-events-az], [chmod +x heartbeat/azure-events-az])
AC_CONFIG_FILES([heartbeat/AoEtarget], [chmod +x heartbeat/AoEtarget])
AC_CONFIG_FILES([heartbeat/ManageRAID], [chmod +x heartbeat/ManageRAID])
AC_CONFIG_FILES([heartbeat/ManageVE], [chmod +x heartbeat/ManageVE])
diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
index cd8fd16bf..658c700ac 100644
--- a/doc/man/Makefile.am
+++ b/doc/man/Makefile.am
@@ -219,6 +219,10 @@ if BUILD_AZURE_EVENTS
man_MANS += ocf_heartbeat_azure-events.7
endif
+if BUILD_AZURE_EVENTS_AZ
+man_MANS += ocf_heartbeat_azure-events-az.7
+endif
+
if BUILD_GCP_PD_MOVE
man_MANS += ocf_heartbeat_gcp-pd-move.7
endif
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
index 20d41e36a..1133dc13e 100644
--- a/heartbeat/Makefile.am
+++ b/heartbeat/Makefile.am
@@ -188,6 +188,10 @@ if BUILD_AZURE_EVENTS
ocf_SCRIPTS += azure-events
endif
+if BUILD_AZURE_EVENTS_AZ
+ocf_SCRIPTS += azure-events-az
+endif
+
if BUILD_GCP_PD_MOVE
ocf_SCRIPTS += gcp-pd-move
endif
diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in
new file mode 100644
index 000000000..616fc8d9e
--- /dev/null
+++ b/heartbeat/azure-events-az.in
@@ -0,0 +1,782 @@
+#!@PYTHON@ -tt
+#
+# Resource agent for monitoring Azure Scheduled Events
+#
+# License: GNU General Public License (GPL)
+# (c) 2018 Tobias Niekamp, Microsoft Corp.
+# and Linux-HA contributors
+
+import os
+import sys
+import time
+import subprocess
+import json
+try:
+ import urllib2
+ from urllib2 import URLError
+except ImportError:
+ import urllib.request as urllib2
+ from urllib.error import URLError
+import socket
+from collections import defaultdict
+
+OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT"))
+sys.path.append(OCF_FUNCTIONS_DIR)
+import ocf
+
+##############################################################################
+
+
+VERSION = "0.10"
+USER_AGENT = "Pacemaker-ResourceAgent/%s %s" % (VERSION, ocf.distro())
+
+attr_globalPullState = "azure-events-az_globalPullState"
+attr_lastDocVersion = "azure-events-az_lastDocVersion"
+attr_curNodeState = "azure-events-az_curNodeState"
+attr_pendingEventIDs = "azure-events-az_pendingEventIDs"
+attr_healthstate = "#health-azure"
+
+default_loglevel = ocf.logging.INFO
+default_relevantEventTypes = set(["Reboot", "Redeploy"])
+
+global_pullMaxAttempts = 3
+global_pullDelaySecs = 1
+
+##############################################################################
+
+class attrDict(defaultdict):
+ """
+ A wrapper for accessing dict keys like an attribute
+ """
+ def __init__(self, data):
+ super(attrDict, self).__init__(attrDict)
+ for d in data.keys():
+ self.__setattr__(d, data[d])
+
+ def __getattr__(self, key):
+ try:
+ return self[key]
+ except KeyError:
+ raise AttributeError(key)
+
+ def __setattr__(self, key, value):
+ self[key] = value
+
+##############################################################################
+
+class azHelper:
+ """
+ Helper class for Azure's metadata API (including Scheduled Events)
+ """
+ metadata_host = "http://169.254.169.254/metadata"
+ instance_api = "instance"
+ events_api = "scheduledevents"
+ api_version = "2019-08-01"
+
+ @staticmethod
+ def _sendMetadataRequest(endpoint, postData=None):
+ """
+ Send a request to Azure's Azure Metadata Service API
+ """
+ url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, azHelper.api_version)
+ data = ""
+ ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s" % (endpoint, postData))
+ ocf.logger.debug("_sendMetadataRequest: url = %s" % url)
+
+ if postData and type(postData) != bytes:
+ postData = postData.encode()
+
+ req = urllib2.Request(url, postData)
+ req.add_header("Metadata", "true")
+ req.add_header("User-Agent", USER_AGENT)
+ try:
+ resp = urllib2.urlopen(req)
+ except URLError as e:
+ if hasattr(e, 'reason'):
+ ocf.logger.warning("Failed to reach the server: %s" % e.reason)
+ clusterHelper.setAttr(attr_globalPullState, "IDLE")
+ elif hasattr(e, 'code'):
+ ocf.logger.warning("The server couldn\'t fulfill the request. Error code: %s" % e.code)
+ clusterHelper.setAttr(attr_globalPullState, "IDLE")
+ else:
+ data = resp.read()
+ ocf.logger.debug("_sendMetadataRequest: response = %s" % data)
+
+ if data:
+ data = json.loads(data)
+
+ ocf.logger.debug("_sendMetadataRequest: finished")
+ return data
+
+ @staticmethod
+ def getInstanceInfo():
+ """
+ Fetch details about the current VM from Azure's Azure Metadata Service API
+ """
+ ocf.logger.debug("getInstanceInfo: begin")
+
+ jsondata = azHelper._sendMetadataRequest(azHelper.instance_api)
+ ocf.logger.debug("getInstanceInfo: json = %s" % jsondata)
+
+ if jsondata:
+ ocf.logger.debug("getInstanceInfo: finished, returning {}".format(jsondata["compute"]))
+ return attrDict(jsondata["compute"])
+ else:
+ ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info")
+ sys.exit(ocf.OCF_ERR_GENERIC)
+
+ @staticmethod
+ def pullScheduledEvents():
+ """
+ Retrieve all currently scheduled events via Azure Metadata Service API
+ """
+ ocf.logger.debug("pullScheduledEvents: begin")
+
+ jsondata = azHelper._sendMetadataRequest(azHelper.events_api)
+ ocf.logger.debug("pullScheduledEvents: json = %s" % jsondata)
+
+ ocf.logger.debug("pullScheduledEvents: finished")
+ return attrDict(jsondata)
+
+ @staticmethod
+ def forceEvents(eventIDs):
+ """
+ Force a set of events to start immediately
+ """
+ ocf.logger.debug("forceEvents: begin")
+
+ events = []
+ for e in eventIDs:
+ events.append({
+ "EventId": e,
+ })
+ postData = {
+ "StartRequests" : events
+ }
+ ocf.logger.info("forceEvents: postData = %s" % postData)
+ resp = azHelper._sendMetadataRequest(azHelper.events_api, postData=json.dumps(postData))
+
+ ocf.logger.debug("forceEvents: finished")
+ return
+
+##############################################################################
+
+class clusterHelper:
+ """
+ Helper functions for Pacemaker control via crm
+ """
+ @staticmethod
+ def _getLocation(node):
+ """
+ Helper function to retrieve local/global attributes
+ """
+ if node:
+ return ["--node", node]
+ else:
+ return ["--type", "crm_config"]
+
+ @staticmethod
+ def _exec(command, *args):
+ """
+ Helper function to execute a UNIX command
+ """
+ args = list(args)
+ ocf.logger.debug("_exec: begin; command = %s, args = %s" % (command, str(args)))
+
+ def flatten(*n):
+ return (str(e) for a in n
+ for e in (flatten(*a) if isinstance(a, (tuple, list)) else (str(a),)))
+ command = list(flatten([command] + args))
+ ocf.logger.debug("_exec: cmd = %s" % " ".join(command))
+ try:
+ ret = subprocess.check_output(command)
+ if type(ret) != str:
+ ret = ret.decode()
+ ocf.logger.debug("_exec: return = %s" % ret)
+ return ret.rstrip()
+ except Exception as err:
+ ocf.logger.exception(err)
+ return None
+
+ @staticmethod
+ def setAttr(key, value, node=None):
+ """
+ Set the value of a specific global/local attribute in the Pacemaker cluster
+ """
+ ocf.logger.debug("setAttr: begin; key = %s, value = %s, node = %s" % (key, value, node))
+
+ if value:
+ ret = clusterHelper._exec("crm_attribute",
+ "--name", key,
+ "--update", value,
+ clusterHelper._getLocation(node))
+ else:
+ ret = clusterHelper._exec("crm_attribute",
+ "--name", key,
+ "--delete",
+ clusterHelper._getLocation(node))
+
+ ocf.logger.debug("setAttr: finished")
+ return len(ret) == 0
+
+ @staticmethod
+ def getAttr(key, node=None):
+ """
+ Retrieve a global/local attribute from the Pacemaker cluster
+ """
+ ocf.logger.debug("getAttr: begin; key = %s, node = %s" % (key, node))
+
+ val = clusterHelper._exec("crm_attribute",
+ "--name", key,
+ "--query", "--quiet",
+ "--default", "",
+ clusterHelper._getLocation(node))
+ ocf.logger.debug("getAttr: finished")
+ if not val:
+ return None
+ return val if not val.isdigit() else int(val)
+
+ @staticmethod
+ def getAllNodes():
+ """
+ Get a list of hostnames for all nodes in the Pacemaker cluster
+ """
+ ocf.logger.debug("getAllNodes: begin")
+
+ nodes = []
+ nodeList = clusterHelper._exec("crm_node", "--list")
+ for n in nodeList.split("\n"):
+ nodes.append(n.split()[1])
+ ocf.logger.debug("getAllNodes: finished; return %s" % str(nodes))
+
+ return nodes
+
+ @staticmethod
+ def getHostNameFromAzName(azName):
+ """
+ Helper function to get the actual host name from an Azure node name
+ """
+ return clusterHelper.getAttr("hostName_%s" % azName)
+
+ @staticmethod
+ def removeHoldFromNodes():
+ """
+ Remove the ON_HOLD state from all nodes in the Pacemaker cluster
+ """
+ ocf.logger.debug("removeHoldFromNodes: begin")
+
+ for n in clusterHelper.getAllNodes():
+ if clusterHelper.getAttr(attr_curNodeState, node=n) == "ON_HOLD":
+ clusterHelper.setAttr(attr_curNodeState, "AVAILABLE", node=n)
+ ocf.logger.info("removeHoldFromNodes: removed ON_HOLD from node %s" % n)
+
+ ocf.logger.debug("removeHoldFromNodes: finished")
+ return False
+
+ @staticmethod
+ def otherNodesAvailable(exceptNode):
+ """
+ Check if there are any nodes (except a given node) in the Pacemaker cluster that have state AVAILABLE
+ """
+ ocf.logger.debug("otherNodesAvailable: begin; exceptNode = %s" % exceptNode)
+
+ for n in clusterHelper.getAllNodes():
+ state = clusterHelper.getAttr(attr_curNodeState, node=n)
+ state = stringToNodeState(state) if state else AVAILABLE
+ if state == AVAILABLE and n != exceptNode.hostName:
+ ocf.logger.info("otherNodesAvailable: at least %s is available" % n)
+ ocf.logger.debug("otherNodesAvailable: finished")
+ return True
+ ocf.logger.info("otherNodesAvailable: no other nodes are available")
+ ocf.logger.debug("otherNodesAvailable: finished")
+
+ return False
+
+ @staticmethod
+ def transitionSummary():
+ """
+ Get the current Pacemaker transition summary (used to check if all resources are stopped when putting a node standby)
+ """
+ # <tniek> Is a global crm_simulate "too much"? Or would it be sufficient it there are no planned transitions for a particular node?
+ # # crm_simulate -Ls
+ # Transition Summary:
+ # * Promote rsc_SAPHana_HN1_HDB03:0 (Slave -> Master hsr3-db1)
+ # * Stop rsc_SAPHana_HN1_HDB03:1 (hsr3-db0)
+ # * Move rsc_ip_HN1_HDB03 (Started hsr3-db0 -> hsr3-db1)
+ # * Start rsc_nc_HN1_HDB03 (hsr3-db1)
+ # # Excepted result when there are no pending actions:
+ # Transition Summary:
+ ocf.logger.debug("transitionSummary: begin")
+
+ summary = clusterHelper._exec("crm_simulate", "-Ls")
+ if not summary:
+ ocf.logger.warning("transitionSummary: could not load transition summary")
+ return False
+ if summary.find("Transition Summary:") < 0:
+ ocf.logger.warning("transitionSummary: received unexpected transition summary: %s" % summary)
+ return False
+ summary = summary.split("Transition Summary:")[1]
+ ret = summary.split("\n").pop(0)
+
+ ocf.logger.debug("transitionSummary: finished; return = %s" % str(ret))
+ return ret
+
+ @staticmethod
+ def listOperationsOnNode(node):
+ """
+ Get a list of all current operations for a given node (used to check if any resources are pending)
+ """
+ # hsr3-db1:/home/tniek # crm_resource --list-operations -N hsr3-db0
+ # rsc_azure-events-az (ocf::heartbeat:azure-events-az): Started: rsc_azure-events-az_start_0 (node=hsr3-db0, call=91, rc=0, last-rc-change=Fri Jun 8 22:37:46 2018, exec=115ms): complete
+ # rsc_azure-events-az (ocf::heartbeat:azure-events-az): Started: rsc_azure-events-az_monitor_10000 (node=hsr3-db0, call=93, rc=0, last-rc-change=Fri Jun 8 22:37:47 2018, exec=197ms): complete
+ # rsc_SAPHana_HN1_HDB03 (ocf::suse:SAPHana): Master: rsc_SAPHana_HN1_HDB03_start_0 (node=hsr3-db0, call=-1, rc=193, last-rc-change=Fri Jun 8 22:37:46 2018, exec=0ms): pending
+ # rsc_SAPHanaTopology_HN1_HDB03 (ocf::suse:SAPHanaTopology): Started: rsc_SAPHanaTopology_HN1_HDB03_start_0 (node=hsr3-db0, call=90, rc=0, last-rc-change=Fri Jun 8 22:37:46 2018, exec=3214ms): complete
+ ocf.logger.debug("listOperationsOnNode: begin; node = %s" % node)
+
+ resources = clusterHelper._exec("crm_resource", "--list-operations", "-N", node)
+ if len(resources) == 0:
+ ret = []
+ else:
+ ret = resources.split("\n")
+
+ ocf.logger.debug("listOperationsOnNode: finished; return = %s" % str(ret))
+ return ret
+
+ @staticmethod
+ def noPendingResourcesOnNode(node):
+ """
+ Check that there are no pending resources on a given node
+ """
+ ocf.logger.debug("noPendingResourcesOnNode: begin; node = %s" % node)
+
+ for r in clusterHelper.listOperationsOnNode(node):
+ ocf.logger.debug("noPendingResourcesOnNode: * %s" % r)
+ resource = r.split()[-1]
+ if resource == "pending":
+ ocf.logger.info("noPendingResourcesOnNode: found resource %s that is still pending" % resource)
+ ocf.logger.debug("noPendingResourcesOnNode: finished; return = False")
+ return False
+ ocf.logger.info("noPendingResourcesOnNode: no pending resources on node %s" % node)
+ ocf.logger.debug("noPendingResourcesOnNode: finished; return = True")
+
+ return True
+
+ @staticmethod
+ def allResourcesStoppedOnNode(node):
+ """
+ Check that all resources on a given node are stopped
+ """
+ ocf.logger.debug("allResourcesStoppedOnNode: begin; node = %s" % node)
+
+ if clusterHelper.noPendingResourcesOnNode(node):
+ if len(clusterHelper.transitionSummary()) == 0:
+ ocf.logger.info("allResourcesStoppedOnNode: no pending resources on node %s and empty transition summary" % node)
+ ocf.logger.debug("allResourcesStoppedOnNode: finished; return = True")
+ return True
+ ocf.logger.info("allResourcesStoppedOnNode: transition summary is not empty")
+ ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False")
+ return False
+
+ ocf.logger.info("allResourcesStoppedOnNode: still pending resources on node %s" % node)
+ ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False")
+ return False
+
+##############################################################################
+
+AVAILABLE = 0 # Node is online and ready to handle events
+STOPPING = 1 # Standby has been triggered, but some resources are still running
+IN_EVENT = 2 # All resources are stopped, and event has been initiated via Azure Metadata Service
+ON_HOLD = 3 # Node has a pending event that cannot be started there are no other nodes available
+
+def stringToNodeState(name):
+ if type(name) == int: return name
+ if name == "STOPPING": return STOPPING
+ if name == "IN_EVENT": return IN_EVENT
+ if name == "ON_HOLD": return ON_HOLD
+ return AVAILABLE
+
+def nodeStateToString(state):
+ if state == STOPPING: return "STOPPING"
+ if state == IN_EVENT: return "IN_EVENT"
+ if state == ON_HOLD: return "ON_HOLD"
+ return "AVAILABLE"
+
+##############################################################################
+
+class Node:
+ """
+ Core class implementing logic for a cluster node
+ """
+ def __init__(self, ra):
+ self.raOwner = ra
+ self.azInfo = azHelper.getInstanceInfo()
+ self.azName = self.azInfo.name
+ self.hostName = socket.gethostname()
+ self.setAttr("azName", self.azName)
+ clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName)
+
+ def getAttr(self, key):
+ """
+ Get a local attribute
+ """
+ return clusterHelper.getAttr(key, node=self.hostName)
+
+ def setAttr(self, key, value):
+ """
+ Set a local attribute
+ """
+ return clusterHelper.setAttr(key, value, node=self.hostName)
+
+ def selfOrOtherNode(self, node):
+ """
+ Helper function to distinguish self/other node
+ """
+ return node if node else self.hostName
+
+ def setState(self, state, node=None):
+ """
+ Set the state for a given node (or self)
+ """
+ node = self.selfOrOtherNode(node)
+ ocf.logger.debug("setState: begin; node = %s, state = %s" % (node, nodeStateToString(state)))
+
+ clusterHelper.setAttr(attr_curNodeState, nodeStateToString(state), node=node)
+
+ ocf.logger.debug("setState: finished")
+
+ def getState(self, node=None):
+ """
+ Get the state for a given node (or self)
+ """
+ node = self.selfOrOtherNode(node)
+ ocf.logger.debug("getState: begin; node = %s" % node)
+
+ state = clusterHelper.getAttr(attr_curNodeState, node=node)
+ ocf.logger.debug("getState: state = %s" % state)
+ ocf.logger.debug("getState: finished")
+ if not state:
+ return AVAILABLE
+ return stringToNodeState(state)
+
+ def setEventIDs(self, eventIDs, node=None):
+ """
+ Set pending EventIDs for a given node (or self)
+ """
+ node = self.selfOrOtherNode(node)
+ ocf.logger.debug("setEventIDs: begin; node = %s, eventIDs = %s" % (node, str(eventIDs)))
+
+ if eventIDs:
+ eventIDStr = ",".join(eventIDs)
+ else:
+ eventIDStr = None
+ clusterHelper.setAttr(attr_pendingEventIDs, eventIDStr, node=node)
+
+ ocf.logger.debug("setEventIDs: finished")
+ return
+
+ def getEventIDs(self, node=None):
+ """
+ Get pending EventIDs for a given node (or self)
+ """
+ node = self.selfOrOtherNode(node)
+ ocf.logger.debug("getEventIDs: begin; node = %s" % node)
+
+ eventIDStr = clusterHelper.getAttr(attr_pendingEventIDs, node=node)
+ if eventIDStr:
+ eventIDs = eventIDStr.split(",")
+ else:
+ eventIDs = None
+
+ ocf.logger.debug("getEventIDs: finished; eventIDs = %s" % str(eventIDs))
+ return eventIDs
+
+ def updateNodeStateAndEvents(self, state, eventIDs, node=None):
+ """
+ Set the state and pending EventIDs for a given node (or self)
+ """
+ ocf.logger.debug("updateNodeStateAndEvents: begin; node = %s, state = %s, eventIDs = %s" % (node, nodeStateToString(state), str(eventIDs)))
+
+ self.setState(state, node=node)
+ self.setEventIDs(eventIDs, node=node)
+
+ ocf.logger.debug("updateNodeStateAndEvents: finished")
+ return state
+
+ def putNodeStandby(self, node=None):
+ """
+ Put self to standby
+ """
+ node = self.selfOrOtherNode(node)
+ ocf.logger.debug("putNodeStandby: begin; node = %s" % node)
+
+ clusterHelper._exec("crm_attribute",
+ "--node", node,
+ "--name", attr_healthstate,
+ "--update", "-1000000",
+ "--lifetime=forever")
+
+ ocf.logger.debug("putNodeStandby: finished")
+
+ def isNodeInStandby(self, node=None):
+ """
+ check if node is in standby
+ """
+ node = self.selfOrOtherNode(node)
+ ocf.logger.debug("isNodeInStandby: begin; node = %s" % node)
+ isInStandy = False
+
+ healthAttributeStr = clusterHelper.getAttr(attr_healthstate, node)
+ if healthAttributeStr is not None:
+ try:
+ healthAttribute = int(healthAttributeStr)
+ isInStandy = healthAttribute < 0
+ except ValueError:
+ # Handle the exception
+ ocf.logger.warn("Health attribute %s on node %s cannot be converted to an integer value" % (healthAttributeStr, node))
+
+ ocf.logger.debug("isNodeInStandby: finished - result %s" % isInStandy)
+ return isInStandy
+
+ def putNodeOnline(self, node=None):
+ """
+ Put self back online
+ """
+ node = self.selfOrOtherNode(node)
+ ocf.logger.debug("putNodeOnline: begin; node = %s" % node)
+
+ clusterHelper._exec("crm_attribute",
+ "--node", node,
+ "--name", "#health-azure",
+ "--update", "0",
+ "--lifetime=forever")
+
+ ocf.logger.debug("putNodeOnline: finished")
+
+ def separateEvents(self, events):
+ """
+ Split own/other nodes' events
+ """
+ ocf.logger.debug("separateEvents: begin; events = %s" % str(events))
+
+ localEvents = []
+ remoteEvents = []
+ for e in events:
+ e = attrDict(e)
+ if e.EventType not in self.raOwner.relevantEventTypes:
+ continue
+ if self.azName in e.Resources:
+ localEvents.append(e)
+ else:
+ remoteEvents.append(e)
+ ocf.logger.debug("separateEvents: finished; localEvents = %s, remoteEvents = %s" % (str(localEvents), str(remoteEvents)))
+ return (localEvents, remoteEvents)
+
+##############################################################################
+
+class raAzEvents:
+ """
+ Main class for resource agent
+ """
+ def __init__(self, relevantEventTypes):
+ self.node = Node(self)
+ self.relevantEventTypes = relevantEventTypes
+
+ def monitor(self):
+ ocf.logger.debug("monitor: begin")
+
+ events = azHelper.pullScheduledEvents()
+
+ # get current document version
+ curDocVersion = events.DocumentIncarnation
+ lastDocVersion = self.node.getAttr(attr_lastDocVersion)
+ ocf.logger.debug("monitor: lastDocVersion = %s; curDocVersion = %s" % (lastDocVersion, curDocVersion))
+
+ # split events local/remote
+ (localEvents, remoteEvents) = self.node.separateEvents(events.Events)
+
+ # ensure local events are only executing once
+ if curDocVersion == lastDocVersion:
+ ocf.logger.info("monitor: already handled curDocVersion, skip")
+ return ocf.OCF_SUCCESS
+
+ localAzEventIDs = set()
+ for e in localEvents:
+ localAzEventIDs.add(e.EventId)
+
+ curState = self.node.getState()
+ clusterEventIDs = self.node.getEventIDs()
+
+ ocf.logger.debug("monitor: curDocVersion has not been handled yet")
+
+ if clusterEventIDs:
+ # there are pending events set, so our state must be STOPPING or IN_EVENT
+ i = 0; touchedEventIDs = False
+ while i < len(clusterEventIDs):
+ # clean up pending events that are already finished according to AZ
+ if clusterEventIDs[i] not in localAzEventIDs:
+ ocf.logger.info("monitor: remove finished local clusterEvent %s" % (clusterEventIDs[i]))
+ clusterEventIDs.pop(i)
+ touchedEventIDs = True
+ else:
+ i += 1
+ if len(clusterEventIDs) > 0:
+ # there are still pending events (either because we're still stopping, or because the event is still in place)
+ # either way, we need to wait
+ if touchedEventIDs:
+ ocf.logger.info("monitor: added new local clusterEvent %s" % str(clusterEventIDs))
+ self.node.setEventIDs(clusterEventIDs)
+ else:
+ ocf.logger.info("monitor: no local clusterEvents were updated")
+ else:
+ # there are no more pending events left after cleanup
+ if clusterHelper.noPendingResourcesOnNode(self.node.hostName):
+ # and no pending resources on the node -> set it back online
+ ocf.logger.info("monitor: all local events finished -> clean up, put node online and AVAILABLE")
+ curState = self.node.updateNodeStateAndEvents(AVAILABLE, None)
+ self.node.putNodeOnline()
+ clusterHelper.removeHoldFromNodes()
+ # If Azure Scheduled Events are not used for 24 hours (e.g. because the cluster was asleep), it will be disabled for a VM.
+ # When the cluster wakes up and starts using it again, the DocumentIncarnation is reset.
+ # We need to remove it during cleanup, otherwise azure-events-az will not process the event after wakeup
+ self.node.setAttr(attr_lastDocVersion, None)
+ else:
+ ocf.logger.info("monitor: all local events finished, but some resources have not completed startup yet -> wait")
+ else:
+ if curState == AVAILABLE:
+ if len(localAzEventIDs) > 0:
+ if clusterHelper.otherNodesAvailable(self.node):
+ ocf.logger.info("monitor: can handle local events %s -> set state STOPPING" % (str(localAzEventIDs)))
+ curState = self.node.updateNodeStateAndEvents(STOPPING, localAzEventIDs)
+ else:
+ ocf.logger.info("monitor: cannot handle azEvents %s (only node available) -> set state ON_HOLD" % str(localAzEventIDs))
+ self.node.setState(ON_HOLD)
+ else:
+ ocf.logger.debug("monitor: no local azEvents to handle")
+
+ if curState == STOPPING:
+ eventIDsForNode = {}
+ if clusterHelper.noPendingResourcesOnNode(self.node.hostName):
+ if not self.node.isNodeInStandby():
+ ocf.logger.info("monitor: all local resources are started properly -> put node standby and exit")
+ self.node.putNodeStandby()
+ return ocf.OCF_SUCCESS
+
+ for e in localEvents:
+ ocf.logger.info("monitor: handling remote event %s (%s; nodes = %s)" % (e.EventId, e.EventType, str(e.Resources)))
+ # before we can force an event to start, we need to ensure all nodes involved have stopped their resources
+ if e.EventStatus == "Scheduled":
+ allNodesStopped = True
+ for azName in e.Resources:
+ hostName = clusterHelper.getHostNameFromAzName(azName)
+ state = self.node.getState(node=hostName)
+ if state == STOPPING:
+ # the only way we can continue is when node state is STOPPING, but all resources have been stopped
+ if not clusterHelper.allResourcesStoppedOnNode(hostName):
+ ocf.logger.info("monitor: (at least) node %s has still resources running -> wait" % hostName)
+ allNodesStopped = False
+ break
+ elif state in (AVAILABLE, IN_EVENT, ON_HOLD):
+ ocf.logger.info("monitor: node %s is still %s -> remote event needs to be picked up locally" % (hostName, nodeStateToString(state)))
+ allNodesStopped = False
+ break
+ if allNodesStopped:
+ ocf.logger.info("monitor: nodes %s are stopped -> add remote event %s to force list" % (str(e.Resources), e.EventId))
+ for n in e.Resources:
+ hostName = clusterHelper.getHostNameFromAzName(n)
+ if hostName in eventIDsForNode:
+ eventIDsForNode[hostName].append(e.EventId)
+ else:
+ eventIDsForNode[hostName] = [e.EventId]
+ elif e.EventStatus == "Started":
+ ocf.logger.info("monitor: remote event already started")
+
+ # force the start of all events whose nodes are ready (i.e. have no more resources running)
+ if len(eventIDsForNode.keys()) > 0:
+ eventIDsToForce = set([item for sublist in eventIDsForNode.values() for item in sublist])
+ ocf.logger.info("monitor: set nodes %s to IN_EVENT; force remote events %s" % (str(eventIDsForNode.keys()), str(eventIDsToForce)))
+ for node, eventId in eventIDsForNode.items():
+ self.node.updateNodeStateAndEvents(IN_EVENT, eventId, node=node)
+ azHelper.forceEvents(eventIDsToForce)
+ self.node.setAttr(attr_lastDocVersion, curDocVersion)
+ else:
+ ocf.logger.info("monitor: some local resources are not clean yet -> wait")
+
+ ocf.logger.debug("monitor: finished")
+ return ocf.OCF_SUCCESS
+
+##############################################################################
+
+def setLoglevel(verbose):
+ # set up writing into syslog
+ loglevel = default_loglevel
+ if verbose:
+ opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1))
+ urllib2.install_opener(opener)
+ loglevel = ocf.logging.DEBUG
+ ocf.log.setLevel(loglevel)
+
+description = (
+ "Microsoft Azure Scheduled Events monitoring agent",
+ """This resource agent implements a monitor for scheduled
+(maintenance) events for a Microsoft Azure VM.
+
+If any relevant events are found, it moves all Pacemaker resources
+away from the affected node to allow for a graceful shutdown.
+
+ Usage:
+ [OCF_RESKEY_eventTypes=VAL] [OCF_RESKEY_verbose=VAL] azure-events-az ACTION
+
+ action (required): Supported values: monitor, help, meta-data
+ eventTypes (optional): List of event types to be considered
+ relevant by the resource agent (comma-separated).
+ Supported values: Freeze,Reboot,Redeploy
+ Default = Reboot,Redeploy
+/ verbose (optional): If set to true, displays debug info.
+ Default = false
+
+ Deployment:
+ crm configure primitive rsc_azure-events-az ocf:heartbeat:azure-events-az \
+ op monitor interval=10s
+ crm configure clone cln_azure-events-az rsc_azure-events-az
+
+For further information on Microsoft Azure Scheduled Events, please
+refer to the following documentation:
+https://docs.microsoft.com/en-us/azure/virtual-machines/linux/scheduled-events
+""")
+
+def monitor_action(eventTypes):
+ relevantEventTypes = set(eventTypes.split(",") if eventTypes else [])
+ ra = raAzEvents(relevantEventTypes)
+ return ra.monitor()
+
+def validate_action(eventTypes):
+ if eventTypes:
+ for event in eventTypes.split(","):
+ if event not in ("Freeze", "Reboot", "Redeploy"):
+ ocf.ocf_exit_reason("Event type not one of Freeze, Reboot, Redeploy: " + eventTypes)
+ return ocf.OCF_ERR_CONFIGURED
+ return ocf.OCF_SUCCESS
+
+def main():
+ agent = ocf.Agent("azure-events-az", shortdesc=description[0], longdesc=description[1])
+ agent.add_parameter(
+ "eventTypes",
+ shortdesc="List of resources to be considered",
+ longdesc="A comma-separated list of event types that will be handled by this resource agent. (Possible values: Freeze,Reboot,Redeploy)",
+ content_type="string",
+ default="Reboot,Redeploy")
+ agent.add_parameter(
+ "verbose",
+ shortdesc="Enable verbose agent logging",
+ longdesc="Set to true to enable verbose logging",
+ content_type="boolean",
+ default="false")
+ agent.add_action("start", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
+ agent.add_action("stop", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
+ agent.add_action("validate-all", timeout=20, handler=validate_action)
+ agent.add_action("monitor", timeout=240, interval=10, handler=monitor_action)
+ setLoglevel(ocf.is_true(ocf.get_parameter("verbose", "false")))
+ agent.run()
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
From a95337d882c7cc69d604b050159ad50b679f18be Mon Sep 17 00:00:00 2001
From: MSSedusch <sedusch@microsoft.com>
Date: Thu, 2 Jun 2022 14:10:33 +0200
Subject: [PATCH 2/2] Remove developer documentation
---
heartbeat/azure-events-az.in | 11 -----------
1 file changed, 11 deletions(-)
diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in
index 616fc8d9e..59d095306 100644
--- a/heartbeat/azure-events-az.in
+++ b/heartbeat/azure-events-az.in
@@ -723,17 +723,6 @@ description = (
If any relevant events are found, it moves all Pacemaker resources
away from the affected node to allow for a graceful shutdown.
- Usage:
- [OCF_RESKEY_eventTypes=VAL] [OCF_RESKEY_verbose=VAL] azure-events-az ACTION
-
- action (required): Supported values: monitor, help, meta-data
- eventTypes (optional): List of event types to be considered
- relevant by the resource agent (comma-separated).
- Supported values: Freeze,Reboot,Redeploy
- Default = Reboot,Redeploy
-/ verbose (optional): If set to true, displays debug info.
- Default = false
-
Deployment:
crm configure primitive rsc_azure-events-az ocf:heartbeat:azure-events-az \
op monitor interval=10s

View File

@ -0,0 +1,175 @@
From cab190c737fdf58268aa5c009f6089b754862b22 Mon Sep 17 00:00:00 2001
From: Reid Wahl <nrwahl@protonmail.com>
Date: Tue, 1 Feb 2022 16:32:50 -0800
Subject: [PATCH 1/3] Filesystem: Fix OpenBSD check in fstype_supported()
fstype_supported() is supposed to skip the /proc/filesystems check if
the OS is OpenBSD. Instead, it skips the check if the OS is **not**
OpenBSD. That means the function has been a no-op for all other distros.
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
---
heartbeat/Filesystem | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index 010c1dcfc..8b4792152 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -440,7 +440,7 @@ fstype_supported()
local support="$FSTYPE"
local rc
- if [ "X${HOSTOS}" != "XOpenBSD" ];then
+ if [ "X${HOSTOS}" = "XOpenBSD" ];then
# skip checking /proc/filesystems for obsd
return $OCF_SUCCESS
fi
From 5d38b87daa9cfffa89a193df131d6ebd87cd05aa Mon Sep 17 00:00:00 2001
From: Reid Wahl <nrwahl@protonmail.com>
Date: Tue, 1 Feb 2022 18:26:32 -0800
Subject: [PATCH 2/3] Filesystem: Improve fstype_supported logs for fuse
Make it more clear when we have to use a different name to check for
support of a particular filesystem. Currently only used for fuse-type
filesystems.
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
---
heartbeat/Filesystem | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index 8b4792152..4d84846c1 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -455,6 +455,10 @@ fstype_supported()
fuse.*|glusterfs|rozofs) support="fuse";;
esac
+ if [ "$support" != "$FSTYPE" ]; then
+ ocf_log info "Checking support for $FSTYPE as \"$support\""
+ fi
+
grep -w "$support"'$' /proc/filesystems >/dev/null
if [ $? -eq 0 ]; then
# found the fs type
@@ -465,7 +469,7 @@ fstype_supported()
# check the if the filesystem support exists again.
$MODPROBE $support >/dev/null
if [ $? -ne 0 ]; then
- ocf_exit_reason "Couldn't find filesystem $FSTYPE in /proc/filesystems and failed to load kernel module"
+ ocf_exit_reason "Couldn't find filesystem $support in /proc/filesystems and failed to load kernel module"
return $OCF_ERR_INSTALLED
fi
@@ -478,11 +482,11 @@ fstype_supported()
# yes. found the filesystem after doing the modprobe
return $OCF_SUCCESS
fi
- ocf_log debug "Unable to find support for $FSTYPE in /proc/filesystems after modprobe, trying again"
+ ocf_log debug "Unable to find support for $support in /proc/filesystems after modprobe, trying again"
sleep 1
done
- ocf_exit_reason "Couldn't find filesystem $FSTYPE in /proc/filesystems"
+ ocf_exit_reason "Couldn't find filesystem $support in /proc/filesystems"
return $OCF_ERR_INSTALLED
}
@@ -837,6 +841,9 @@ Filesystem_monitor()
# VALIDATE_ALL: Are the instance parameters valid?
# FIXME!! The only part that's useful is the return code.
# This code always returns $OCF_SUCCESS (!)
+# FIXME!! Needs some tuning to match fstype_supported() (e.g., for
+# fuse). Can we just call fstype_supported() with a flag like
+# "no_modprobe" instead?
#
Filesystem_validate_all()
{
From e2174244067b02d798e0f12437f0f499c80f91fe Mon Sep 17 00:00:00 2001
From: Reid Wahl <nrwahl@protonmail.com>
Date: Tue, 1 Feb 2022 18:55:47 -0800
Subject: [PATCH 3/3] Filesystem: Add support for Amazon EFS mount helper
mount.efs, the mount helper for Amazon Elastic File System (EFS)
provided by amazon-efs-utils [1], is a wrapper for mount.nfs4. It offers
a number of AWS-specific mount options and some security improvements
like encryption of data in transit.
This commit adds support by treating an fstype=efs like fstype=nfs4 for
the most part.
Resolves: RHBZ#2049319
[1] https://docs.aws.amazon.com/efs/latest/ug/efs-mount-helper.html
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
---
heartbeat/Filesystem | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index 4d84846c1..1a90d6a42 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -341,7 +341,7 @@ determine_blockdevice() {
# Get the current real device name, if possible.
# (specified devname could be -L or -U...)
case "$FSTYPE" in
- nfs4|nfs|smbfs|cifs|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|none|lustre)
+ nfs4|nfs|efs|smbfs|cifs|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|none|lustre)
: ;;
*)
match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}"
@@ -423,7 +423,7 @@ is_fsck_needed() {
no) false;;
""|auto)
case "$FSTYPE" in
- ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs)
+ ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|efs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs)
false;;
*)
true;;
@@ -450,9 +450,11 @@ fstype_supported()
return $OCF_SUCCESS
fi
- # support fuse-filesystems (e.g. GlusterFS)
+ # support fuse-filesystems (e.g. GlusterFS) and Amazon Elastic File
+ # System (EFS)
case "$FSTYPE" in
fuse.*|glusterfs|rozofs) support="fuse";;
+ efs) support="nfs4";;
esac
if [ "$support" != "$FSTYPE" ]; then
@@ -701,7 +703,7 @@ Filesystem_stop()
# For networked filesystems, there's merit in trying -f:
case "$FSTYPE" in
- nfs4|nfs|cifs|smbfs) umount_force="-f" ;;
+ nfs4|nfs|efs|cifs|smbfs) umount_force="-f" ;;
esac
# Umount all sub-filesystems mounted under $MOUNTPOINT/ too.
@@ -892,7 +894,7 @@ set_blockdevice_var() {
# these are definitely not block devices
case "$FSTYPE" in
- nfs4|nfs|smbfs|cifs|none|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|lustre) return;;
+ nfs4|nfs|efs|smbfs|cifs|none|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|lustre) return;;
esac
if $(is_option "loop"); then
@@ -1013,7 +1015,7 @@ is_option "ro" &&
CLUSTERSAFE=2
case "$FSTYPE" in
-nfs4|nfs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs|cvfs|lustre)
+nfs4|nfs|efs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs|cvfs|lustre)
CLUSTERSAFE=1 # this is kind of safe too
;;
# add here CLUSTERSAFE=0 for all filesystems which are not

View File

@ -0,0 +1,79 @@
From b3eadb8523b599af800a7c772606aa0e90cf142f Mon Sep 17 00:00:00 2001
From: Fujii Masao <fujii@postgresql.org>
Date: Tue, 19 Jul 2022 17:03:02 +0900
Subject: [PATCH 1/2] Make storage_mon -h exit just after printing help
messages.
Previously, when -h or an invalid option was specified, storage_mon
printed the help messages, proceeded processing and then could
throw an error. This was not the behavior that, e.g., users who want
to specify -h option to see the help messages are expecting. To fix
this issue, this commit changes storage_mon so that it exits just
after printing the help messages when -h or an invalid option is
specified.
---
tools/storage_mon.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index 7b65bb419..1303371f7 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -28,7 +28,7 @@ static void usage(char *name, FILE *f)
fprintf(f, " --timeout <n> max time to wait for a device test to come back. in seconds (default %d)\n", DEFAULT_TIMEOUT);
fprintf(f, " --inject-errors-percent <n> Generate EIO errors <n>%% of the time (for testing only)\n");
fprintf(f, " --verbose emit extra output to stdout\n");
- fprintf(f, " --help print this messages\n");
+ fprintf(f, " --help print this messages, then exit\n");
}
/* Check one device */
@@ -178,9 +178,11 @@ int main(int argc, char *argv[])
break;
case 'h':
usage(argv[0], stdout);
+ exit(0);
break;
default:
usage(argv[0], stderr);
+ exit(-1);
break;
}
From e62795f02d25a772a239e0a4f9eb9d6470c134ee Mon Sep 17 00:00:00 2001
From: Fujii Masao <fujii@postgresql.org>
Date: Tue, 19 Jul 2022 17:56:32 +0900
Subject: [PATCH 2/2] Fix typo in help message.
---
tools/storage_mon.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index 1303371f7..3c82d5ee8 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -28,7 +28,7 @@ static void usage(char *name, FILE *f)
fprintf(f, " --timeout <n> max time to wait for a device test to come back. in seconds (default %d)\n", DEFAULT_TIMEOUT);
fprintf(f, " --inject-errors-percent <n> Generate EIO errors <n>%% of the time (for testing only)\n");
fprintf(f, " --verbose emit extra output to stdout\n");
- fprintf(f, " --help print this messages, then exit\n");
+ fprintf(f, " --help print this message\n");
}
/* Check one device */
@@ -178,11 +178,11 @@ int main(int argc, char *argv[])
break;
case 'h':
usage(argv[0], stdout);
- exit(0);
+ return 0;
break;
default:
usage(argv[0], stderr);
- exit(-1);
+ return -1;
break;
}

View File

@ -0,0 +1,36 @@
From a68957e8f1e8169438acf5a4321f47ed7d8ceec1 Mon Sep 17 00:00:00 2001
From: Fujii Masao <fujii@postgresql.org>
Date: Tue, 19 Jul 2022 20:28:38 +0900
Subject: [PATCH] storage_mon: Fix bug in checking of number of specified
scores.
Previously specifying the maximum allowed number (MAX_DEVICES, currently 25)
of devices and scores as arguments could cause storage_mon to fail unexpectedly
with the error message "too many scores, max is 25". This issue happened
because storage_mon checked whether the number of specified scores
exceeded the upper limit by using the local variable "device_count" indicating
the number of specified devices (not scores). So after the maximum number
of devices arguments were interpreted, the appearance of next score argument
caused the error even when the number of interpreted scores arguments had
not exceeded the maximum.
This patch fixes storage_mon so that it uses the local variable "score_count"
indicating the number of specified scores, to check whether arguments for
scores are specified more than the upper limit.
---
tools/storage_mon.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index 3c82d5ee8..c749076c2 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -154,7 +154,7 @@ int main(int argc, char *argv[])
}
break;
case 's':
- if (device_count < MAX_DEVICES) {
+ if (score_count < MAX_DEVICES) {
int score = atoi(optarg);
if (score < 1 || score > 10) {
fprintf(stderr, "Score must be between 1 and 10 inclusive\n");

View File

@ -0,0 +1,43 @@
From c6ea93fcb499c84c3d8e9aad2ced65065a3f6d51 Mon Sep 17 00:00:00 2001
From: Fujii Masao <fujii@postgresql.org>
Date: Tue, 19 Jul 2022 22:34:08 +0900
Subject: [PATCH] Fix bug in handling of child process exit.
When storage_mon detects that a child process exits with zero,
it resets the test_forks[] entry for the child process to 0, to avoid
waitpid() for the process again in the loop. But, previously,
storage_mon didn't do that when it detected that a child process
exited with non-zero. Which caused waitpid() to be called again
for the process already gone and to report an error like
"waitpid on XXX failed: No child processes" unexpectedly.
In this case, basically storage_mon should wait until all the child
processes exit and return the final score, instead.
This patch fixes this issue by making storage_mon reset test_works[]
entry even when a child process exits with non-zero.
---
tools/storage_mon.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index 3c82d5ee8..83a48ca36 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -232,13 +232,13 @@ int main(int argc, char *argv[])
if (w == test_forks[i]) {
if (WIFEXITED(wstatus)) {
- if (WEXITSTATUS(wstatus) == 0) {
- finished_count++;
- test_forks[i] = 0;
- } else {
+ if (WEXITSTATUS(wstatus) != 0) {
syslog(LOG_ERR, "Error reading from device %s", devices[i]);
final_score += scores[i];
}
+
+ finished_count++;
+ test_forks[i] = 0;
}
}
}

View File

@ -0,0 +1,417 @@
From 0bb52cf9985bda47e13940761b3d8e2eaddf377c Mon Sep 17 00:00:00 2001
From: Kazunori INOUE <kazunori_inoue@newson.co.jp>
Date: Wed, 10 Aug 2022 17:35:54 +0900
Subject: [PATCH 1/4] storage_mon: Use the O_DIRECT flag in open() to eliminate
cache effects
---
tools/Makefile.am | 1 +
tools/storage_mon.c | 82 +++++++++++++++++++++++++++++++++------------
2 files changed, 61 insertions(+), 22 deletions(-)
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 1309223b4..08323fee3 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -74,6 +74,7 @@ sfex_stat_LDADD = $(GLIBLIB) -lplumb -lplumbgpl
findif_SOURCES = findif.c
storage_mon_SOURCES = storage_mon.c
+storage_mon_CFLAGS = -D_GNU_SOURCE
if BUILD_TICKLE
halib_PROGRAMS += tickle_tcp
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index 930ead41c..ba87492fc 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -31,23 +31,27 @@ static void usage(char *name, FILE *f)
fprintf(f, " --help print this message\n");
}
-/* Check one device */
-static void *test_device(const char *device, int verbose, int inject_error_percent)
+static int open_device(const char *device, int verbose)
{
- uint64_t devsize;
int device_fd;
int res;
+ uint64_t devsize;
off_t seek_spot;
- char buffer[512];
- if (verbose) {
- printf("Testing device %s\n", device);
+#if defined(__linux__) || defined(__FreeBSD__)
+ device_fd = open(device, O_RDONLY|O_DIRECT);
+ if (device_fd >= 0) {
+ return device_fd;
+ } else if (errno != EINVAL) {
+ fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
+ return -1;
}
+#endif
device_fd = open(device, O_RDONLY);
if (device_fd < 0) {
fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
- exit(-1);
+ return -1;
}
#ifdef __FreeBSD__
res = ioctl(device_fd, DIOCGMEDIASIZE, &devsize);
@@ -57,11 +61,12 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
if (res != 0) {
fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno));
close(device_fd);
- exit(-1);
+ return -1;
}
if (verbose) {
fprintf(stderr, "%s: size=%zu\n", device, devsize);
}
+
/* Don't fret about real randomness */
srand(time(NULL) + getpid());
/* Pick a random place on the device - sector aligned */
@@ -70,35 +75,64 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
if (res < 0) {
fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno));
close(device_fd);
- exit(-1);
+ return -1;
}
-
if (verbose) {
printf("%s: reading from pos %ld\n", device, seek_spot);
}
+ return device_fd;
+}
+
+/* Check one device */
+static void *test_device(const char *device, int verbose, int inject_error_percent)
+{
+ int device_fd;
+ int sec_size = 0;
+ int res;
+ void *buffer;
+
+ if (verbose) {
+ printf("Testing device %s\n", device);
+ }
+
+ device_fd = open_device(device, verbose);
+ if (device_fd < 0) {
+ exit(-1);
+ }
+
+ ioctl(device_fd, BLKSSZGET, &sec_size);
+ if (sec_size == 0) {
+ fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno));
+ goto error;
+ }
- res = read(device_fd, buffer, sizeof(buffer));
+ if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) {
+ fprintf(stderr, "Failed to allocate aligned memory: %s\n", strerror(errno));
+ goto error;
+ }
+
+ res = read(device_fd, buffer, sec_size);
+ free(buffer);
if (res < 0) {
fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno));
- close(device_fd);
- exit(-1);
+ goto error;
}
- if (res < (int)sizeof(buffer)) {
- fprintf(stderr, "Failed to read %ld bytes from %s, got %d\n", sizeof(buffer), device, res);
- close(device_fd);
- exit(-1);
+ if (res < sec_size) {
+ fprintf(stderr, "Failed to read %d bytes from %s, got %d\n", sec_size, device, res);
+ goto error;
}
/* Fake an error */
- if (inject_error_percent && ((rand() % 100) < inject_error_percent)) {
- fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n");
- close(device_fd);
- exit(-1);
+ if (inject_error_percent) {
+ srand(time(NULL) + getpid());
+ if ((rand() % 100) < inject_error_percent) {
+ fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n");
+ goto error;
+ }
}
res = close(device_fd);
if (res != 0) {
fprintf(stderr, "Failed to close %s: %s\n", device, strerror(errno));
- close(device_fd);
exit(-1);
}
@@ -106,6 +140,10 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
printf("%s: done\n", device);
}
exit(0);
+
+error:
+ close(device_fd);
+ exit(-1);
}
int main(int argc, char *argv[])
From ce4e632f29ed6b86b82a959eac5844655baed153 Mon Sep 17 00:00:00 2001
From: Kazunori INOUE <kazunori_inoue@newson.co.jp>
Date: Mon, 15 Aug 2022 19:17:21 +0900
Subject: [PATCH 2/4] storage_mon: fix build-related issues
---
tools/storage_mon.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index ba87492fc..e34d1975a 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -38,7 +38,6 @@ static int open_device(const char *device, int verbose)
uint64_t devsize;
off_t seek_spot;
-#if defined(__linux__) || defined(__FreeBSD__)
device_fd = open(device, O_RDONLY|O_DIRECT);
if (device_fd >= 0) {
return device_fd;
@@ -46,7 +45,6 @@ static int open_device(const char *device, int verbose)
fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
return -1;
}
-#endif
device_fd = open(device, O_RDONLY);
if (device_fd < 0) {
@@ -100,7 +98,11 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
exit(-1);
}
+#ifdef __FreeBSD__
+ ioctl(device_fd, DIOCGSECTORSIZE, &sec_size);
+#else
ioctl(device_fd, BLKSSZGET, &sec_size);
+#endif
if (sec_size == 0) {
fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno));
goto error;
From 7a0aaa0dfdebeab3fae9fe9ddc412c3d1f610273 Mon Sep 17 00:00:00 2001
From: Kazunori INOUE <kazunori_inoue@newson.co.jp>
Date: Wed, 24 Aug 2022 17:36:23 +0900
Subject: [PATCH 3/4] storage_mon: do random lseek even with O_DIRECT, etc
---
tools/storage_mon.c | 118 ++++++++++++++++++++++----------------------
1 file changed, 58 insertions(+), 60 deletions(-)
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index e34d1975a..0bdb48649 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -31,38 +31,43 @@ static void usage(char *name, FILE *f)
fprintf(f, " --help print this message\n");
}
-static int open_device(const char *device, int verbose)
+/* Check one device */
+static void *test_device(const char *device, int verbose, int inject_error_percent)
{
+ uint64_t devsize;
+ int flags = O_RDONLY | O_DIRECT;
int device_fd;
int res;
- uint64_t devsize;
off_t seek_spot;
- device_fd = open(device, O_RDONLY|O_DIRECT);
- if (device_fd >= 0) {
- return device_fd;
- } else if (errno != EINVAL) {
- fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
- return -1;
+ if (verbose) {
+ printf("Testing device %s\n", device);
}
- device_fd = open(device, O_RDONLY);
+ device_fd = open(device, flags);
if (device_fd < 0) {
- fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
- return -1;
+ if (errno != EINVAL) {
+ fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
+ exit(-1);
+ }
+ flags &= ~O_DIRECT;
+ device_fd = open(device, flags);
+ if (device_fd < 0) {
+ fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
+ exit(-1);
+ }
}
#ifdef __FreeBSD__
res = ioctl(device_fd, DIOCGMEDIASIZE, &devsize);
#else
res = ioctl(device_fd, BLKGETSIZE64, &devsize);
#endif
- if (res != 0) {
+ if (res < 0) {
fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno));
- close(device_fd);
- return -1;
+ goto error;
}
if (verbose) {
- fprintf(stderr, "%s: size=%zu\n", device, devsize);
+ printf("%s: opened %s O_DIRECT, size=%zu\n", device, (flags & O_DIRECT)?"with":"without", devsize);
}
/* Don't fret about real randomness */
@@ -72,65 +77,58 @@ static int open_device(const char *device, int verbose)
res = lseek(device_fd, seek_spot, SEEK_SET);
if (res < 0) {
fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno));
- close(device_fd);
- return -1;
+ goto error;
}
if (verbose) {
printf("%s: reading from pos %ld\n", device, seek_spot);
}
- return device_fd;
-}
-
-/* Check one device */
-static void *test_device(const char *device, int verbose, int inject_error_percent)
-{
- int device_fd;
- int sec_size = 0;
- int res;
- void *buffer;
-
- if (verbose) {
- printf("Testing device %s\n", device);
- }
- device_fd = open_device(device, verbose);
- if (device_fd < 0) {
- exit(-1);
- }
+ if (flags & O_DIRECT) {
+ int sec_size = 0;
+ void *buffer;
#ifdef __FreeBSD__
- ioctl(device_fd, DIOCGSECTORSIZE, &sec_size);
+ res = ioctl(device_fd, DIOCGSECTORSIZE, &sec_size);
#else
- ioctl(device_fd, BLKSSZGET, &sec_size);
+ res = ioctl(device_fd, BLKSSZGET, &sec_size);
#endif
- if (sec_size == 0) {
- fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno));
- goto error;
- }
+ if (res < 0) {
+ fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno));
+ goto error;
+ }
- if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) {
- fprintf(stderr, "Failed to allocate aligned memory: %s\n", strerror(errno));
- goto error;
- }
+ if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) {
+ fprintf(stderr, "Failed to allocate aligned memory: %s\n", strerror(errno));
+ goto error;
+ }
+ res = read(device_fd, buffer, sec_size);
+ free(buffer);
+ if (res < 0) {
+ fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno));
+ goto error;
+ }
+ if (res < sec_size) {
+ fprintf(stderr, "Failed to read %d bytes from %s, got %d\n", sec_size, device, res);
+ goto error;
+ }
+ } else {
+ char buffer[512];
- res = read(device_fd, buffer, sec_size);
- free(buffer);
- if (res < 0) {
- fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno));
- goto error;
- }
- if (res < sec_size) {
- fprintf(stderr, "Failed to read %d bytes from %s, got %d\n", sec_size, device, res);
- goto error;
+ res = read(device_fd, buffer, sizeof(buffer));
+ if (res < 0) {
+ fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno));
+ goto error;
+ }
+ if (res < (int)sizeof(buffer)) {
+ fprintf(stderr, "Failed to read %ld bytes from %s, got %d\n", sizeof(buffer), device, res);
+ goto error;
+ }
}
/* Fake an error */
- if (inject_error_percent) {
- srand(time(NULL) + getpid());
- if ((rand() % 100) < inject_error_percent) {
- fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n");
- goto error;
- }
+ if (inject_error_percent && ((rand() % 100) < inject_error_percent)) {
+ fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n");
+ goto error;
}
res = close(device_fd);
if (res != 0) {
From db97e055a17526cec056c595844a9d8851e3ee19 Mon Sep 17 00:00:00 2001
From: Kazunori INOUE <kazunori_inoue@newson.co.jp>
Date: Thu, 25 Aug 2022 16:03:46 +0900
Subject: [PATCH 4/4] storage_mon: improve error messages when ioctl() fails
---
tools/storage_mon.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index 0bdb48649..f829c5081 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -63,7 +63,7 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
res = ioctl(device_fd, BLKGETSIZE64, &devsize);
#endif
if (res < 0) {
- fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno));
+ fprintf(stderr, "Failed to get device size for %s: %s\n", device, strerror(errno));
goto error;
}
if (verbose) {
@@ -93,7 +93,7 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
res = ioctl(device_fd, BLKSSZGET, &sec_size);
#endif
if (res < 0) {
- fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno));
+ fprintf(stderr, "Failed to get block device sector size for %s: %s\n", device, strerror(errno));
goto error;
}

View File

@ -0,0 +1,298 @@
From 764757380af19d3a21d40f3c9624e4135ff074e1 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 2 Nov 2022 10:26:31 +0100
Subject: [PATCH] nfsserver: add nfsv4_only parameter to make it run without
rpc-statd/rpcbind services
---
heartbeat/nfsserver | 200 +++++++++++++++++++++++++-------------------
1 file changed, 114 insertions(+), 86 deletions(-)
diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver
index 9bbd603e5..cb2d43ab1 100755
--- a/heartbeat/nfsserver
+++ b/heartbeat/nfsserver
@@ -79,6 +79,16 @@ Init script for nfsserver
<content type="string" default="auto detected" />
</parameter>
+<parameter name="nfsv4_only" unique="0" required="0">
+<longdesc lang="en">
+Run in NFSv4 only mode (rpc-statd and rpcbind services masked).
+</longdesc>
+<shortdesc lang="en">
+NFSv4 only mode.
+</shortdesc>
+<content type="boolean" default="false" />
+</parameter>
+
<parameter name="nfs_no_notify" unique="0" required="0">
<longdesc lang="en">
Do not send reboot notifications to NFSv3 clients during server startup.
@@ -332,7 +342,7 @@ v3locking_exec()
if [ $EXEC_MODE -eq 2 ]; then
nfs_exec $cmd nfs-lock.service
elif [ $EXEC_MODE -eq 3 ]; then
- nfs_exec $cmd rpc-statd.service
+ nfs_exec $cmd rpc-statd.service
else
case $cmd in
start) locking_start;;
@@ -348,20 +358,22 @@ nfsserver_systemd_monitor()
local rc
local fn
- ocf_log debug "Status: rpcbind"
- rpcinfo > /dev/null 2>&1
- rc=$?
- if [ "$rc" -ne "0" ]; then
- ocf_exit_reason "rpcbind is not running"
- return $OCF_NOT_RUNNING
- fi
+ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then
+ ocf_log debug "Status: rpcbind"
+ rpcinfo > /dev/null 2>&1
+ rc=$?
+ if [ "$rc" -ne "0" ]; then
+ ocf_exit_reason "rpcbind is not running"
+ return $OCF_NOT_RUNNING
+ fi
- ocf_log debug "Status: nfs-mountd"
- ps axww | grep -q "[r]pc.mountd"
- rc=$?
- if [ "$rc" -ne "0" ]; then
- ocf_exit_reason "nfs-mountd is not running"
- return $OCF_NOT_RUNNING
+ ocf_log debug "Status: nfs-mountd"
+ ps axww | grep -q "[r]pc.mountd"
+ rc=$?
+ if [ "$rc" -ne "0" ]; then
+ ocf_exit_reason "nfs-mountd is not running"
+ return $OCF_NOT_RUNNING
+ fi
fi
ocf_log debug "Status: nfs-idmapd"
@@ -375,12 +387,14 @@ nfsserver_systemd_monitor()
return $OCF_NOT_RUNNING
fi
- ocf_log debug "Status: rpc-statd"
- rpcinfo -t localhost 100024 > /dev/null 2>&1
- rc=$?
- if [ "$rc" -ne "0" ]; then
- ocf_exit_reason "rpc-statd is not running"
- return $OCF_NOT_RUNNING
+ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then
+ ocf_log debug "Status: rpc-statd"
+ rpcinfo -t localhost 100024 > /dev/null 2>&1
+ rc=$?
+ if [ "$rc" -ne "0" ]; then
+ ocf_exit_reason "rpc-statd is not running"
+ return $OCF_NOT_RUNNING
+ fi
fi
nfs_exec is-active nfs-server
@@ -424,7 +438,7 @@ nfsserver_monitor ()
if [ $rc -eq 0 ]; then
# don't report success if nfs servers are up
# without locking daemons.
- v3locking_exec "status"
+ ocf_is_true "$OCF_RESKEY_nfsv4_only" || v3locking_exec "status"
rc=$?
if [ $rc -ne 0 ]; then
ocf_exit_reason "NFS server is up, but the locking daemons are down"
@@ -786,48 +800,54 @@ nfsserver_start ()
# systemd
case $EXEC_MODE in
- [23]) nfs_exec start rpcbind
- local i=1
- while : ; do
- ocf_log info "Start: rpcbind i: $i"
- rpcinfo > /dev/null 2>&1
- rc=$?
- if [ "$rc" -eq "0" ]; then
- break;
- fi
- sleep 1
- i=$((i + 1))
- done
+ [23]) if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then
+ nfs_exec start rpcbind
+ local i=1
+ while : ; do
+ ocf_log info "Start: rpcbind i: $i"
+ rpcinfo > /dev/null 2>&1
+ rc=$?
+ if [ "$rc" -eq "0" ]; then
+ break
+ fi
+ sleep 1
+ i=$((i + 1))
+ done
+ fi
;;
esac
- # check to see if we need to start rpc.statd
- v3locking_exec "status"
- if [ $? -ne $OCF_SUCCESS ]; then
- v3locking_exec "start"
- rc=$?
- if [ $rc -ne 0 ]; then
- ocf_exit_reason "Failed to start NFS server locking daemons"
- return $rc
+ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then
+ # check to see if we need to start rpc.statd
+ v3locking_exec "status"
+ if [ $? -ne $OCF_SUCCESS ]; then
+ v3locking_exec "start"
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ ocf_exit_reason "Failed to start NFS server locking daemons"
+ return $rc
+ fi
+ else
+ ocf_log info "rpc.statd already up"
fi
- else
- ocf_log info "rpc.statd already up"
fi
# systemd
case $EXEC_MODE in
- [23]) nfs_exec start nfs-mountd
- local i=1
- while : ; do
- ocf_log info "Start: nfs-mountd i: $i"
- ps axww | grep -q "[r]pc.mountd"
- rc=$?
- if [ "$rc" -eq "0" ]; then
- break;
- fi
- sleep 1
- i=$((i + 1))
- done
+ [23]) if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then
+ nfs_exec start nfs-mountd
+ local i=1
+ while : ; do
+ ocf_log info "Start: nfs-mountd i: $i"
+ ps axww | grep -q "[r]pc.mountd"
+ rc=$?
+ if [ "$rc" -eq "0" ]; then
+ break
+ fi
+ sleep 1
+ i=$((i + 1))
+ done
+ fi
nfs_exec start nfs-idmapd
local i=1
@@ -839,24 +859,26 @@ nfsserver_start ()
ocf_log debug "$(cat $fn)"
rm -f $fn
if [ "$rc" -eq "0" ]; then
- break;
+ break
fi
sleep 1
i=$((i + 1))
done
- nfs_exec start rpc-statd
- local i=1
- while : ; do
- ocf_log info "Start: rpc-statd i: $i"
- rpcinfo -t localhost 100024 > /dev/null 2>&1
- rc=$?
- if [ "$rc" -eq "0" ]; then
- break;
- fi
- sleep 1
- i=$((i + 1))
- done
+ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then
+ nfs_exec start rpc-statd
+ local i=1
+ while : ; do
+ ocf_log info "Start: rpc-statd i: $i"
+ rpcinfo -t localhost 100024 > /dev/null 2>&1
+ rc=$?
+ if [ "$rc" -eq "0" ]; then
+ break
+ fi
+ sleep 1
+ i=$((i + 1))
+ done
+ fi
esac
@@ -914,13 +936,15 @@ nfsserver_stop ()
sleep 1
done
- nfs_exec stop rpc-statd > /dev/null 2>&1
- ocf_log info "Stop: rpc-statd"
- rpcinfo -t localhost 100024 > /dev/null 2>&1
- rc=$?
- if [ "$rc" -eq "0" ]; then
- ocf_exit_reason "Failed to stop rpc-statd"
- return $OCF_ERR_GENERIC
+ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then
+ nfs_exec stop rpc-statd > /dev/null 2>&1
+ ocf_log info "Stop: rpc-statd"
+ rpcinfo -t localhost 100024 > /dev/null 2>&1
+ rc=$?
+ if [ "$rc" -eq "0" ]; then
+ ocf_exit_reason "Failed to stop rpc-statd"
+ return $OCF_ERR_GENERIC
+ fi
fi
nfs_exec stop nfs-idmapd > /dev/null 2>&1
@@ -935,13 +959,15 @@ nfsserver_stop ()
return $OCF_ERR_GENERIC
fi
- nfs_exec stop nfs-mountd > /dev/null 2>&1
- ocf_log info "Stop: nfs-mountd"
- ps axww | grep -q "[r]pc.mountd"
- rc=$?
- if [ "$rc" -eq "0" ]; then
- ocf_exit_reason "Failed to stop nfs-mountd"
- return $OCF_ERR_GENERIC
+ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then
+ nfs_exec stop nfs-mountd > /dev/null 2>&1
+ ocf_log info "Stop: nfs-mountd"
+ ps axww | grep -q "[r]pc.mountd"
+ rc=$?
+ if [ "$rc" -eq "0" ]; then
+ ocf_exit_reason "Failed to stop nfs-mountd"
+ return $OCF_ERR_GENERIC
+ fi
fi
if systemctl --no-legend list-unit-files "nfsdcld*" | grep -q nfsdcld; then
@@ -960,10 +986,12 @@ nfsserver_stop ()
esac
- v3locking_exec "stop"
- if [ $? -ne 0 ]; then
- ocf_exit_reason "Failed to stop NFS locking daemons"
- rc=$OCF_ERR_GENERIC
+ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then
+ v3locking_exec "stop"
+ if [ $? -ne 0 ]; then
+ ocf_exit_reason "Failed to stop NFS locking daemons"
+ rc=$OCF_ERR_GENERIC
+ fi
fi
# systemd

View File

@ -0,0 +1,147 @@
From 237d55120a7c8d761f839c96651e722b3bb3bc88 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 12 Oct 2022 13:57:30 +0200
Subject: [PATCH 1/4] IPsrcaddr: fix PROTO regex
---
heartbeat/IPsrcaddr | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr
index 7dbf65ff5..24406d296 100755
--- a/heartbeat/IPsrcaddr
+++ b/heartbeat/IPsrcaddr
@@ -188,7 +188,7 @@ IPADDR="\($OCTET\.\)\{3\}$OCTET"
SRCCLAUSE="src$WS$WS*\($IPADDR\)"
MATCHROUTE="\(.*${WS}\)\($SRCCLAUSE\)\($WS.*\|$\)"
METRICCLAUSE=".*\(metric$WS[^ ]\+\)"
-PROTOCLAUSE=".*\(proto$WS[^ ]\+\)"
+PROTOCLAUSE=".*\(proto$WS[^ ]\+\).*"
FINDIF=findif
# findif needs that to be set
From c70ba457851a401cb201cb87d23bdbc5f4fcd2b3 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 12 Oct 2022 14:00:30 +0200
Subject: [PATCH 2/4] IPsrcaddr: detect metric for main table only, and allow
specifying metric if necessary
---
heartbeat/IPsrcaddr | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr
index 24406d296..4745eb8a7 100755
--- a/heartbeat/IPsrcaddr
+++ b/heartbeat/IPsrcaddr
@@ -59,12 +59,14 @@ OCF_RESKEY_ipaddress_default=""
OCF_RESKEY_cidr_netmask_default=""
OCF_RESKEY_destination_default="0.0.0.0/0"
OCF_RESKEY_proto_default=""
+OCF_RESKEY_metric_default=""
OCF_RESKEY_table_default=""
: ${OCF_RESKEY_ipaddress=${OCF_RESKEY_ipaddress_default}}
: ${OCF_RESKEY_cidr_netmask=${OCF_RESKEY_cidr_netmask_default}}
: ${OCF_RESKEY_destination=${OCF_RESKEY_destination_default}}
: ${OCF_RESKEY_proto=${OCF_RESKEY_proto_default}}
+: ${OCF_RESKEY_metric=${OCF_RESKEY_metric_default}}
: ${OCF_RESKEY_table=${OCF_RESKEY_table_default}}
#######################################################################
@@ -143,6 +145,14 @@ Proto to match when finding network. E.g. "kernel".
<content type="string" default="${OCF_RESKEY_proto_default}" />
</parameter>
+<parameter name="metric">
+<longdesc lang="en">
+Metric. Only needed if incorrect metric value is used.
+</longdesc>
+<shortdesc lang="en">Metric</shortdesc>
+<content type="string" default="${OCF_RESKEY_metric_default}" />
+</parameter>
+
<parameter name="table">
<longdesc lang="en">
Table to modify. E.g. "local".
@@ -548,8 +558,14 @@ rc=$?
INTERFACE=`echo $findif_out | awk '{print $1}'`
LISTROUTE=`$IP2UTIL route list dev $INTERFACE scope link $PROTO match $ipaddress`
-METRIC=`echo $LISTROUTE | sed -n "s/$METRICCLAUSE/\1/p"`
[ -z "$PROTO" ] && PROTO=`echo $LISTROUTE | sed -n "s/$PROTOCLAUSE/\1/p"`
+if [ -n "$OCF_RESKEY_metric" ]; then
+ METRIC="metric $OCF_RESKEY_metric"
+elif [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ]; then
+ METRIC=`echo $LISTROUTE | sed -n "s/$METRICCLAUSE/\1/p"`
+else
+ METRIC=""
+fi
if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then
NETWORK=`echo $LISTROUTE | grep -m 1 -o '^[^ ]*'`
From c514f12f7a19440f475938f2a4659e5e9667fa25 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 12 Oct 2022 14:01:26 +0200
Subject: [PATCH 3/4] IPsrcaddr: use scope host when using non-main tables
---
heartbeat/IPsrcaddr | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr
index 4745eb8a7..926246008 100755
--- a/heartbeat/IPsrcaddr
+++ b/heartbeat/IPsrcaddr
@@ -279,8 +279,14 @@ srca_stop() {
[ $rc = 2 ] && errorexit "The address you specified to stop does not match the preferred source address"
+ if [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ]; then
+ SCOPE="link"
+ else
+ SCOPE="host"
+ fi
+
PRIMARY_IP="$($IP2UTIL -4 -o addr show dev $INTERFACE primary | awk '{split($4,a,"/");print a[1]}')"
- OPTS="proto kernel scope link src $PRIMARY_IP"
+ OPTS="proto kernel scope $SCOPE src $PRIMARY_IP"
$IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC || \
errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC' failed"
From 1f387ac8017b3eee23b41eadafd58ce21a29eb21 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Thu, 13 Oct 2022 13:11:28 +0200
Subject: [PATCH 4/4] IPsrcaddr: fix monitor/status for default route not being
equal to src IP before start, and change route src correctly in stop-action
---
heartbeat/IPsrcaddr | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr
index 926246008..1bd41a930 100755
--- a/heartbeat/IPsrcaddr
+++ b/heartbeat/IPsrcaddr
@@ -229,6 +229,7 @@ srca_read() {
[ -z "$SRCIP" ] && return 1
[ $SRCIP = $1 ] && return 0
+ [ "$__OCF_ACTION" = "monitor" ] || [ "$__OCF_ACTION" = "status" ] && [ "${ROUTE%% *}" = "default" ] && return 1
return 2
}
@@ -292,8 +293,8 @@ srca_stop() {
errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC' failed"
if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then
- $CMDCHANGE $ROUTE_WO_SRC || \
- errorexit "command '$CMDCHANGE $ROUTE_WO_SRC' failed"
+ $CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP || \
+ errorexit "command '$CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP' failed"
fi
return $?

View File

@ -0,0 +1,25 @@
From 97a05e0e662ed922c9ecd016b39ab90ee233d5c9 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Thu, 24 Nov 2022 10:36:56 +0100
Subject: [PATCH] mysql-common: return error in stop-action if kill fails to
stop the process, so the node can get fenced
---
heartbeat/mysql-common.sh | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/heartbeat/mysql-common.sh b/heartbeat/mysql-common.sh
index 34e1c6748..8104019b0 100755
--- a/heartbeat/mysql-common.sh
+++ b/heartbeat/mysql-common.sh
@@ -318,6 +318,10 @@ mysql_common_stop()
if [ $? != $OCF_NOT_RUNNING ]; then
ocf_log info "MySQL failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL..."
/bin/kill -KILL $pid > /dev/null
+ mysql_common_status info $pid
+ if [ $? != $OCF_NOT_RUNNING ]; then
+ return $OCF_ERR_GENERIC
+ fi
fi
ocf_log info "MySQL stopped";

View File

@ -0,0 +1,27 @@
From 739e6ce9096facd6d37dffd524c79c961e3fae38 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Fri, 11 Nov 2022 14:17:39 +0100
Subject: [PATCH] vdo-vol: dont fail probe action when the underlying device
doesnt exist
---
heartbeat/vdo-vol | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/heartbeat/vdo-vol b/heartbeat/vdo-vol
index 94822cb82..29bd7b8fd 100755
--- a/heartbeat/vdo-vol
+++ b/heartbeat/vdo-vol
@@ -148,6 +148,12 @@ vdo_monitor(){
MODE=$(vdostats --verbose ${OCF_RESKEY_volume} | grep "operating mode" | awk '{print $NF}')
case "$status" in
+ *"ERROR - vdodumpconfig: Failed to make FileLayer from"*)
+ if ocf_is_probe; then
+ return $OCF_NOT_RUNNING
+ fi
+ return $OCF_ERR_GENERIC
+ ;;
*"Device mapper status: not available"*)
return $OCF_NOT_RUNNING
;;

View File

@ -0,0 +1,137 @@
From bf89ad06d5da5c05533c80a37a37c8dbbcd123aa Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Thu, 8 Dec 2022 15:40:07 +0100
Subject: [PATCH] galera/mpathpersist/sg_persist/IPsrcaddr: only check notify
and promotable when OCF_CHECK_LEVEL=10
Pacemaker has started running validate-all action before creating the
resource. It doesnt provide notify/promotable settings while doing so,
so this patch moves these checks to OCF_CHECK_LEVEL 10 and runs the
validate action at OCF_CHECK_LEVEL 10 in the start-action.
---
heartbeat/IPsrcaddr | 13 ++++++++-----
heartbeat/galera.in | 9 ++++++---
heartbeat/mpathpersist.in | 13 +++++++++----
heartbeat/sg_persist.in | 13 +++++++++----
4 files changed, 32 insertions(+), 16 deletions(-)
diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr
index 1bd41a930..66e2ad8cd 100755
--- a/heartbeat/IPsrcaddr
+++ b/heartbeat/IPsrcaddr
@@ -510,11 +510,13 @@ srca_validate_all() {
fi
# We should serve this IP address of course
- if ip_status "$ipaddress"; then
- :
- else
- ocf_exit_reason "We are not serving [$ipaddress], hence can not make it a preferred source address"
- return $OCF_ERR_INSTALLED
+ if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then
+ if ip_status "$ipaddress"; then
+ :
+ else
+ ocf_exit_reason "We are not serving [$ipaddress], hence can not make it a preferred source address"
+ return $OCF_ERR_INSTALLED
+ fi
fi
return $OCF_SUCCESS
}
@@ -540,6 +542,7 @@ esac
ipaddress="$OCF_RESKEY_ipaddress"
+[ "$__OCF_ACTION" != "validate-all" ] && OCF_CHECK_LEVEL=10
srca_validate_all
rc=$?
if [ $rc -ne $OCF_SUCCESS ]; then
diff --git a/heartbeat/galera.in b/heartbeat/galera.in
index cd2fee7c0..6aed3e4b6 100755
--- a/heartbeat/galera.in
+++ b/heartbeat/galera.in
@@ -1015,9 +1015,11 @@ galera_stop()
galera_validate()
{
- if ! ocf_is_ms; then
- ocf_exit_reason "Galera must be configured as a multistate Master/Slave resource."
- return $OCF_ERR_CONFIGURED
+ if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then
+ if ! ocf_is_ms; then
+ ocf_exit_reason "Galera must be configured as a multistate Master/Slave resource."
+ return $OCF_ERR_CONFIGURED
+ fi
fi
if [ -z "$OCF_RESKEY_wsrep_cluster_address" ]; then
@@ -1035,6 +1037,7 @@ case "$1" in
exit $OCF_SUCCESS;;
esac
+[ "$__OCF_ACTION" = "start" ] && OCF_CHECK_LEVEL=10
galera_validate
rc=$?
LSB_STATUS_STOPPED=3
diff --git a/heartbeat/mpathpersist.in b/heartbeat/mpathpersist.in
index 0e2c2a4a0..8a46b9930 100644
--- a/heartbeat/mpathpersist.in
+++ b/heartbeat/mpathpersist.in
@@ -630,10 +630,11 @@ mpathpersist_action_notify() {
}
mpathpersist_action_validate_all () {
-
- if [ "$OCF_RESKEY_CRM_meta_master_max" != "1" ] && [ "$RESERVATION_TYPE" != "7" ] && [ "$RESERVATION_TYPE" != "8" ]; then
- ocf_log err "Master options misconfigured."
- exit $OCF_ERR_CONFIGURED
+ if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then
+ if [ "$OCF_RESKEY_CRM_meta_master_max" != "1" ] && [ "$RESERVATION_TYPE" != "7" ] && [ "$RESERVATION_TYPE" != "8" ]; then
+ ocf_log err "Master options misconfigured."
+ exit $OCF_ERR_CONFIGURED
+ fi
fi
return $OCF_SUCCESS
@@ -659,6 +660,10 @@ case $ACTION in
start|promote|monitor|stop|demote)
ocf_log debug "$RESOURCE: starting action \"$ACTION\""
mpathpersist_init
+ if [ "$__OCF_ACTION" = "start" ]; then
+ OCF_CHECK_LEVEL=10
+ mpathpersist_action_validate_all
+ fi
mpathpersist_action_$ACTION
exit $?
;;
diff --git a/heartbeat/sg_persist.in b/heartbeat/sg_persist.in
index 16048ea6f..620c02f4a 100644
--- a/heartbeat/sg_persist.in
+++ b/heartbeat/sg_persist.in
@@ -643,10 +643,11 @@ sg_persist_action_notify() {
}
sg_persist_action_validate_all () {
-
- if [ "$OCF_RESKEY_CRM_meta_master_max" != "1" ] && [ "$RESERVATION_TYPE" != "7" ] && [ "$RESERVATION_TYPE" != "8" ]; then
- ocf_log err "Master options misconfigured."
- exit $OCF_ERR_CONFIGURED
+ if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then
+ if [ "$OCF_RESKEY_CRM_meta_master_max" != "1" ] && [ "$RESERVATION_TYPE" != "7" ] && [ "$RESERVATION_TYPE" != "8" ]; then
+ ocf_log err "Master options misconfigured."
+ exit $OCF_ERR_CONFIGURED
+ fi
fi
return $OCF_SUCCESS
@@ -672,6 +673,10 @@ case $ACTION in
start|promote|monitor|stop|demote)
ocf_log debug "$RESOURCE: starting action \"$ACTION\""
sg_persist_init
+ if [ "$__OCF_ACTION" = "start" ]; then
+ OCF_CHECK_LEVEL=10
+ sg_persist_action_validate_all
+ fi
sg_persist_action_$ACTION
exit $?
;;

View File

@ -0,0 +1,49 @@
From 21666c5c842b8a6028699ee78db75a1d7134fad0 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 4 Jan 2023 10:39:16 +0100
Subject: [PATCH 1/2] Filesystem: remove validate-all mountpoint warning as it
is auto-created during start-action if it doesnt exist
---
heartbeat/Filesystem | 4 ----
1 file changed, 4 deletions(-)
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index 44270ad98..65088029e 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -851,10 +851,6 @@ Filesystem_monitor()
#
Filesystem_validate_all()
{
- if [ -n "$MOUNTPOINT" ] && [ ! -d "$MOUNTPOINT" ]; then
- ocf_log warn "Mountpoint $MOUNTPOINT does not exist"
- fi
-
# Check if the $FSTYPE is workable
# NOTE: Without inserting the $FSTYPE module, this step may be imprecise
# TODO: This is Linux specific crap.
From 8a7f40b6ab93d8d39230d864ab06a57ff48d6f1f Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Thu, 5 Jan 2023 13:09:48 +0100
Subject: [PATCH 2/2] CTDB: change public_addresses validate-all warning to
info
---
heartbeat/CTDB.in | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/heartbeat/CTDB.in b/heartbeat/CTDB.in
index 46f56cfac..b4af66bc1 100755
--- a/heartbeat/CTDB.in
+++ b/heartbeat/CTDB.in
@@ -940,7 +940,7 @@ ctdb_validate() {
fi
if [ -f "${OCF_RESKEY_ctdb_config_dir}/public_addresses" ]; then
- ocf_log warn "CTDB file '${OCF_RESKEY_ctdb_config_dir}/public_addresses' exists - CTDB will try to manage IP failover!"
+ ocf_log info "CTDB file '${OCF_RESKEY_ctdb_config_dir}/public_addresses' exists - CTDB will try to manage IP failover!"
fi
if [ ! -f "$OCF_RESKEY_ctdb_config_dir/nodes" ]; then

View File

@ -0,0 +1,68 @@
--- a/heartbeat/pgsqlms 2023-01-04 14:42:36.093258702 +0100
+++ b/heartbeat/pgsqlms 2023-01-04 14:40:52.403994545 +0100
@@ -66,6 +66,7 @@
my $maxlag = $ENV{'OCF_RESKEY_maxlag'} || $maxlag_default;
my $recovery_tpl = $ENV{'OCF_RESKEY_recovery_template'}
|| "$pgdata/recovery.conf.pcmk";
+my $ocf_check_level = $ENV{'OCF_CHECK_LEVEL'} || 0;
# PostgreSQL commands path
@@ -1304,26 +1305,28 @@
return $OCF_ERR_INSTALLED;
}
- # check notify=true
- $ans = qx{ $CRM_RESOURCE --resource "$OCF_RESOURCE_INSTANCE" \\
- --meta --get-parameter notify 2>/dev/null };
- chomp $ans;
- unless ( lc($ans) =~ /^true$|^on$|^yes$|^y$|^1$/ ) {
- ocf_exit_reason(
- 'You must set meta parameter notify=true for your master resource'
- );
- return $OCF_ERR_INSTALLED;
- }
+ if ( $ocf_check_level == 10 ) {
+ # check notify=true
+ $ans = qx{ $CRM_RESOURCE --resource "$OCF_RESOURCE_INSTANCE" \\
+ --meta --get-parameter notify 2>/dev/null };
+ chomp $ans;
+ unless ( lc($ans) =~ /^true$|^on$|^yes$|^y$|^1$/ ) {
+ ocf_exit_reason(
+ 'You must set meta parameter notify=true for your "master" resource'
+ );
+ return $OCF_ERR_INSTALLED;
+ }
- # check master-max=1
- unless (
- defined $ENV{'OCF_RESKEY_CRM_meta_master_max'}
- and $ENV{'OCF_RESKEY_CRM_meta_master_max'} eq '1'
- ) {
- ocf_exit_reason(
- 'You must set meta parameter master-max=1 for your master resource'
- );
- return $OCF_ERR_INSTALLED;
+ # check master-max=1
+ unless (
+ defined $ENV{'OCF_RESKEY_CRM_meta_master_max'}
+ and $ENV{'OCF_RESKEY_CRM_meta_master_max'} eq '1'
+ ) {
+ ocf_exit_reason(
+ 'You must set meta parameter master-max=1 for your "master" resource'
+ );
+ return $OCF_ERR_INSTALLED;
+ }
}
if ( $PGVERNUM >= $PGVER_12 ) {
@@ -2242,6 +2245,9 @@
# Set current node name.
$nodename = ocf_local_nodename();
+if ( $__OCF_ACTION ne 'validate-all' ) {
+ $ocf_check_level = 10;
+}
$exit_code = pgsql_validate_all();
exit $exit_code if $exit_code != $OCF_SUCCESS or $__OCF_ACTION eq 'validate-all';

View File

@ -0,0 +1,187 @@
From 81f9e1a04dfd2274ccb906310b4f191485e342ab Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 11 Jan 2023 13:22:24 +0100
Subject: [PATCH 1/2] exportfs: move testdir() to start-action to avoid failing
during resource creation (validate-all) and make it create the directory if
it doesnt exist
---
heartbeat/exportfs | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)
diff --git a/heartbeat/exportfs b/heartbeat/exportfs
index c10777fa9..2307a9e67 100755
--- a/heartbeat/exportfs
+++ b/heartbeat/exportfs
@@ -301,6 +301,16 @@ exportfs_monitor ()
fi
}
+testdir() {
+ if [ ! -d $1 ]; then
+ mkdir -p "$1"
+ if [ $? -ne 0 ]; then
+ ocf_exit_reason "Unable to create directory $1"
+ return 1
+ fi
+ fi
+ return 0
+}
export_one() {
local dir=$1
local opts sep
@@ -331,6 +341,10 @@ export_one() {
}
exportfs_start ()
{
+ if ! forall testdir; then
+ return $OCF_ERR_INSTALLED
+ fi
+
if exportfs_monitor; then
ocf_log debug "already exported"
return $OCF_SUCCESS
@@ -428,14 +442,6 @@ exportfs_stop ()
fi
}
-testdir() {
- if [ ! -d $1 ]; then
- ocf_is_probe ||
- ocf_log err "$1 does not exist or is not a directory"
- return 1
- fi
- return 0
-}
exportfs_validate_all ()
{
if echo "$OCF_RESKEY_fsid" | grep -q -F ','; then
@@ -447,9 +453,6 @@ exportfs_validate_all ()
ocf_exit_reason "use integer fsid when exporting multiple directories"
return $OCF_ERR_CONFIGURED
fi
- if ! forall testdir; then
- return $OCF_ERR_INSTALLED
- fi
}
for dir in $OCF_RESKEY_directory; do
@@ -466,7 +469,7 @@ for dir in $OCF_RESKEY_directory; do
fi
else
case "$__OCF_ACTION" in
- stop|monitor)
+ stop|monitor|validate-all)
canonicalized_dir="$dir"
ocf_log debug "$dir does not exist"
;;
From 8ee41af82cda35149f8e0cfede6a8ddef3e221e1 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 11 Jan 2023 13:25:57 +0100
Subject: [PATCH 2/2] pgsql: dont run promotable and file checks that could be
on shared storage during validate-all action
---
heartbeat/pgsql | 53 +++++++++++++++++++++++++++++--------------------
1 file changed, 32 insertions(+), 21 deletions(-)
diff --git a/heartbeat/pgsql b/heartbeat/pgsql
index aa8a13a84..532063ac5 100755
--- a/heartbeat/pgsql
+++ b/heartbeat/pgsql
@@ -1835,7 +1835,7 @@ check_config() {
if [ ! -f "$1" ]; then
if ocf_is_probe; then
- ocf_log info "Configuration file is $1 not readable during probe."
+ ocf_log info "Unable to read $1 during probe."
rc=1
else
ocf_exit_reason "Configuration file $1 doesn't exist"
@@ -1846,8 +1846,7 @@ check_config() {
return $rc
}
-# Validate most critical parameters
-pgsql_validate_all() {
+validate_ocf_check_level_10() {
local version
local check_config_rc
local rep_mode_string
@@ -1883,12 +1882,6 @@ pgsql_validate_all() {
fi
fi
- getent passwd $OCF_RESKEY_pgdba >/dev/null 2>&1
- if [ ! $? -eq 0 ]; then
- ocf_exit_reason "User $OCF_RESKEY_pgdba doesn't exist";
- return $OCF_ERR_INSTALLED;
- fi
-
if ocf_is_probe; then
ocf_log info "Don't check $OCF_RESKEY_pgdata during probe"
else
@@ -1898,18 +1891,6 @@ pgsql_validate_all() {
fi
fi
- if [ -n "$OCF_RESKEY_monitor_user" -a ! -n "$OCF_RESKEY_monitor_password" ]
- then
- ocf_exit_reason "monitor password can't be empty"
- return $OCF_ERR_CONFIGURED
- fi
-
- if [ ! -n "$OCF_RESKEY_monitor_user" -a -n "$OCF_RESKEY_monitor_password" ]
- then
- ocf_exit_reason "monitor_user has to be set if monitor_password is set"
- return $OCF_ERR_CONFIGURED
- fi
-
if is_replication || [ "$OCF_RESKEY_rep_mode" = "slave" ]; then
if [ `printf "$version\n9.1" | sort -n | head -1` != "9.1" ]; then
ocf_exit_reason "Replication mode needs PostgreSQL 9.1 or higher."
@@ -2027,6 +2008,35 @@ pgsql_validate_all() {
return $OCF_SUCCESS
}
+# Validate most critical parameters
+pgsql_validate_all() {
+ local rc
+
+ getent passwd $OCF_RESKEY_pgdba >/dev/null 2>&1
+ if [ ! $? -eq 0 ]; then
+ ocf_exit_reason "User $OCF_RESKEY_pgdba doesn't exist";
+ return $OCF_ERR_INSTALLED;
+ fi
+
+ if [ -n "$OCF_RESKEY_monitor_user" ] && [ -z "$OCF_RESKEY_monitor_password" ]; then
+ ocf_exit_reason "monitor password can't be empty"
+ return $OCF_ERR_CONFIGURED
+ fi
+
+ if [ -z "$OCF_RESKEY_monitor_user" ] && [ -n "$OCF_RESKEY_monitor_password" ]; then
+ ocf_exit_reason "monitor_user has to be set if monitor_password is set"
+ return $OCF_ERR_CONFIGURED
+ fi
+
+ if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then
+ validate_ocf_check_level_10
+ rc=$?
+ [ $rc -ne "$OCF_SUCCESS" ] && exit $rc
+ fi
+
+ return $OCF_SUCCESS
+}
+
#
# Check if we need to create a log file
@@ -2163,6 +2173,7 @@ case "$1" in
exit $OCF_SUCCESS;;
esac
+[ "$__OCF_ACTION" != "validate-all" ] && OCF_CHECK_LEVEL=10
pgsql_validate_all
rc=$?

View File

@ -0,0 +1,23 @@
--- ClusterLabs-resource-agents-fd0720f7/heartbeat/pgsqlms 2023-01-16 10:54:30.897188238 +0100
+++ pgsqlms 2023-01-10 14:21:19.281286242 +0100
@@ -1351,12 +1351,14 @@
return $OCF_ERR_ARGS;
}
- $guc = qx{ $POSTGRES -C primary_conninfo -D "$pgdata" $start_opts};
- unless ($guc =~ /\bapplication_name='?$nodename'?\b/) {
- ocf_exit_reason(
- q{Parameter "primary_conninfo" MUST contain 'application_name=%s'. }.
- q{It is currently set to '%s'}, $nodename, $guc );
- return $OCF_ERR_ARGS;
+ if ( $ocf_check_level == 10 ) {
+ $guc = qx{ $POSTGRES -C primary_conninfo -D "$pgdata" $start_opts};
+ unless ($guc =~ /\bapplication_name='?$nodename'?\b/) {
+ ocf_exit_reason(
+ q{Parameter "primary_conninfo" MUST contain 'application_name=%s'. }.
+ q{It is currently set to '%s'}, $nodename, $guc );
+ return $OCF_ERR_ARGS;
+ }
}
}
else {

View File

@ -69,7 +69,7 @@
Name: resource-agents Name: resource-agents
Summary: Open Source HA Reusable Cluster Resource Scripts Summary: Open Source HA Reusable Cluster Resource Scripts
Version: 4.9.0 Version: 4.9.0
Release: 29%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} Release: 40%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
License: GPLv2+ and LGPLv2+ License: GPLv2+ and LGPLv2+
URL: https://github.com/ClusterLabs/resource-agents URL: https://github.com/ClusterLabs/resource-agents
%if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel}
@ -120,6 +120,22 @@ Patch28: bz2103370-ocf-tester-2-remove-deprecated-lrmd-lrmadmin-code.patch
Patch29: bz1908146-bz1908147-bz1908148-bz1949114-openstack-agents-set-domain-parameters-default.patch Patch29: bz1908146-bz1908147-bz1908148-bz1949114-openstack-agents-set-domain-parameters-default.patch
Patch30: bz2090370-CTDB-move-process-to-root-cgroup-if-rt-enabled.patch Patch30: bz2090370-CTDB-move-process-to-root-cgroup-if-rt-enabled.patch
Patch31: bz2116941-ethmonitor-ovsmonitor-pgsql-fix-attrd_updater-q.patch Patch31: bz2116941-ethmonitor-ovsmonitor-pgsql-fix-attrd_updater-q.patch
Patch32: bz2109159-storage_mon-1-exit-after-help.patch
Patch33: bz2109159-storage_mon-2-fix-specified-scores-count.patch
Patch34: bz2109159-storage_mon-3-fix-child-process-exit.patch
Patch35: bz2109159-storage_mon-4-fix-possible-false-negatives.patch
Patch36: bz1905820-LVM-activate-fix-return-codes.patch
Patch37: bz1977012-azure-events-az-new-ra.patch
Patch38: bz2133682-IPsrcaddr-proto-metric-scope-default-route-fixes.patch
Patch39: bz2141836-vdo-vol-dont-fail-probe-action.patch
Patch40: bz2049319-Filesystem-add-support-for-Amazon-EFS.patch
Patch41: bz2127117-nfsserver-nfsv4_only-parameter.patch
Patch42: bz2139131-mysql-common-return-error-if-kill-fails.patch
Patch43: bz2157873-1-all-ras-validate-all-OCF_CHECK_LEVEL-10.patch
Patch44: bz2157873-2-Filesystem-CTDB-validate-all-improvements.patch
Patch45: bz2157873-3-pgsqlms-validate-all-OCF_CHECK_LEVEL-10.patch
Patch46: bz2157873-4-exportfs-pgsql-validate-all-fixes.patch
Patch47: bz2157873-5-pgsqlms-alidate-all-OCF_CHECK_LEVEL-10.patch
# bundle patches # bundle patches
Patch1000: 7-gcp-bundled.patch Patch1000: 7-gcp-bundled.patch
@ -331,6 +347,22 @@ exit 1
%patch29 -p1 %patch29 -p1
%patch30 -p1 %patch30 -p1
%patch31 -p1 %patch31 -p1
%patch32 -p1
%patch33 -p1
%patch34 -p1
%patch35 -p1
%patch36 -p1
%patch37 -p1
%patch38 -p1
%patch39 -p1
%patch40 -p1
%patch41 -p1
%patch42 -p1
%patch43 -p1
%patch44 -p1
%patch45 -p1
%patch46 -p1
%patch47 -p1
chmod 755 heartbeat/nova-compute-wait chmod 755 heartbeat/nova-compute-wait
chmod 755 heartbeat/NovaEvacuate chmod 755 heartbeat/NovaEvacuate
@ -350,7 +382,7 @@ tar -xzf %SOURCE1 -C %{bundled_lib_dir}/gcp
# gcloud support info # gcloud support info
%patch1002 -p1 %patch1002 -p1
# configure: skip bundled gcp lib checks # configure: skip bundled gcp lib checks
%patch1003 -p1 %patch1003 -p1 -F1
# gcloud remove python 2 detection # gcloud remove python 2 detection
%patch1004 -p1 %patch1004 -p1
# rename gcloud # rename gcloud
@ -906,6 +938,46 @@ ccs_update_schema > /dev/null 2>&1 ||:
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
%changelog %changelog
* Tue Jan 17 2023 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.9.0-40
- all agents: dont check notify/promotable settings during
validate-action
Resolves: rhbz#2157873
* Thu Nov 24 2022 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.9.0-35
- mysql-common: return error in stop-action if kill fails to stop
the process, so the node can get fenced
Resolves: rhbz#2139131
* Tue Nov 22 2022 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.9.0-34
- nfsserver: add nfsv4_only parameter to make it run without
rpc-statd/rpcbind services
Resolves: rhbz#2127117
* Mon Nov 14 2022 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.9.0-33
- Filesystem: add support for Amazon EFS (Elastic File System)
- vdo-vol: dont fail probe action when the underlying device doesnt
exist
Resolves: rhbz#2049319
Resolves: rhbz#2141836
* Fri Oct 14 2022 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.9.0-31
- IPsrcaddr: proto, metric, scope and default route fixes
Resolves: rhbz#2133682
* Thu Sep 8 2022 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.9.0-30
- storage_mon: fix specified scores count and possible false negatives
- LVM-activate: use correct return codes to fix unexpected behaviour
- azure-events-az: new resource agent
Resolves: rhbz#2109159
Resolves: rhbz#1905820
Resolves: rhbz#1977012
* Wed Aug 10 2022 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.9.0-29 * Wed Aug 10 2022 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.9.0-29
- ethmonitor/pgsql: remove attrd_updater "-q" parameter to solve issue - ethmonitor/pgsql: remove attrd_updater "-q" parameter to solve issue
with Pacemaker 2.1.3+ not ignoring it with Pacemaker 2.1.3+ not ignoring it