- fence_sbd: improve error handling

Resolves: rhbz#2087176
2022-08-01 16:17:31 +02:00 · 2022-08-01 16:17:31 +02:00 · 6a75d52e40
commit 6a75d52e40
parent db22f2ea02
2 changed files with 303 additions and 1 deletions
--- a/bz2087176-fence_sbd-improve-error-handling.patch
+++ b/bz2087176-fence_sbd-improve-error-handling.patch
@ -0,0 +1,296 @@
 From ab1de07902d9f380c10405d6ddac3aeb43838c86 Mon Sep 17 00:00:00 2001
 From: Klaus Wenninger <klaus.wenninger@aon.at>
 Date: Thu, 28 Jul 2022 15:33:12 +0200
 Subject: [PATCH] fence_sbd: improve error handling
 Basically, when using 3 disks, be happy with 2 answers,
 but give it at least 5s to collect all answers.
 Increase the default power-timeout to 30s so that waiting
 those 5s still leaves enough time to finish sending the
 reboot.
 RHBZ#2033671
 ---
 agents/sbd/fence_sbd.py           |  77 +++++++++++++--------
 lib/fencing.py.py                 | 109 ++++++++++++++++++++++++++++++
 tests/data/metadata/fence_sbd.xml |   2 +-
 3 files changed, 158 insertions(+), 30 deletions(-)
 diff --git a/agents/sbd/fence_sbd.py b/agents/sbd/fence_sbd.py
 index 0c876b16e..2b0127d55 100644
 --- a/agents/sbd/fence_sbd.py
 +++ b/agents/sbd/fence_sbd.py
@@ -5,7 +5,7 @@
 import os
 import atexit
 sys.path.append("@FENCEAGENTSLIBDIR@")
 -from fencing import fail_usage, run_command, fence_action, all_opt
 +from fencing import fail_usage, run_commands, fence_action, all_opt
 from fencing import atexit_handler, check_input, process_input, show_docs
 from fencing import run_delay
 import itertools
@@ -81,7 +81,7 @@ def check_sbd_device(options, device_path):
     cmd = "%s -d %s dump" % (options["--sbd-path"], device_path)
 -    (return_code, out, err) = run_command(options, cmd)
 +    (return_code, out, err) = run_commands(options, [ cmd ])
     for line in itertools.chain(out.split("\n"), err.split("\n")):
         if len(line) == 0:
@@ -94,21 +94,35 @@ def check_sbd_device(options, device_path):
     return DEVICE_INIT
 +
 def generate_sbd_command(options, command, arguments=None):
     """Generates a sbd command based on given arguments.
     Return Value:
 -    generated sbd command (string)
 +    generated list of sbd commands (strings); depending
 +    on the command, either one command per device
 +    or a single command covering all devices
     """
 -    cmd = options["--sbd-path"]
 +    cmds = []
 +
 +    if not command in ["list", "dump"]:
 +        cmd = options["--sbd-path"]
 -    # add "-d" for each sbd device
 -    for device in parse_sbd_devices(options):
 -        cmd += " -d %s" % device
 +        # add "-d" for each sbd device
 +        for device in parse_sbd_devices(options):
 +            cmd += " -d %s" % device
 -    cmd += " %s %s" % (command, arguments)
 +        cmd += " %s %s" % (command, arguments)
 +        cmds.append(cmd)
 +
 +    else:
 +        for device in parse_sbd_devices(options):
 +            cmd = options["--sbd-path"]
 +            cmd += " -d %s" % device
 +            cmd += " %s %s" % (command, arguments)
 +            cmds.append(cmd)
 -    return cmd
 +    return cmds
 def send_sbd_message(conn, options, plug, message):
     """Sends a message to all sbd devices.
@@ -128,7 +142,7 @@ def send_sbd_message(conn, options, plug, message):
     arguments = "%s %s" % (plug, message)
     cmd = generate_sbd_command(options, "message", arguments)
 -    (return_code, out, err) = run_command(options, cmd)
 +    (return_code, out, err) = run_commands(options, cmd)
     return (return_code, out, err)
@@ -147,7 +161,7 @@ def get_msg_timeout(options):
     cmd = generate_sbd_command(options, "dump")
 -    (return_code, out, err) = run_command(options, cmd)
 +    (return_code, out, err) = run_commands(options, cmd)
     for line in itertools.chain(out.split("\n"), err.split("\n")):
         if len(line) == 0:
@@ -288,7 +302,7 @@ def get_node_list(conn, options):
     cmd = generate_sbd_command(options, "list")
 -    (return_code, out, err) = run_command(options, cmd)
 +    (return_code, out, err) = run_commands(options, cmd)
     for line in out.split("\n"):
         if len(line) == 0:
@@ -356,6 +370,7 @@ def main():
     all_opt["method"]["default"] = "cycle"
     all_opt["method"]["help"] = "-m, --method=[method]          Method to fence (onoff|cycle) (Default: cycle)"
 +    all_opt["power_timeout"]["default"] = "30"
     options = check_input(device_opt, process_input(device_opt))
@@ -376,23 +391,27 @@ def main():
     # We need to check if the provided sbd_devices exists. We need to do
     # that for every given device.
 -    for device_path in parse_sbd_devices(options):
 -        logging.debug("check device \"%s\"", device_path)
 -
 -        return_code = check_sbd_device(options, device_path)
 -        if PATH_NOT_EXISTS == return_code:
 -            logging.error("\"%s\" does not exist", device_path)
 -        elif PATH_NOT_BLOCK == return_code:
 -            logging.error("\"%s\" is not a valid block device", device_path)
 -        elif DEVICE_NOT_INIT == return_code:
 -            logging.error("\"%s\" is not initialized", device_path)
 -        elif DEVICE_INIT != return_code:
 -            logging.error("UNKNOWN error while checking \"%s\"", device_path)
 -
 -        # If we get any error while checking the device we need to exit at this
 -        # point.
 -        if DEVICE_INIT != return_code:
 -            exit(return_code)
 +    # Only when we are really rebooting / powering off a device, or listing
 +    # (pacemaker also uses the list command to generate a dynamic list),
 +    # we skip these checks and leave it to sbd to try and decide if it was successful
 +    if not options["--action"] in ["reboot", "off", "list"]:
 +        for device_path in parse_sbd_devices(options):
 +            logging.debug("check device \"%s\"", device_path)
 +
 +            return_code = check_sbd_device(options, device_path)
 +            if PATH_NOT_EXISTS == return_code:
 +                logging.error("\"%s\" does not exist", device_path)
 +            elif PATH_NOT_BLOCK == return_code:
 +                logging.error("\"%s\" is not a valid block device", device_path)
 +            elif DEVICE_NOT_INIT == return_code:
 +                logging.error("\"%s\" is not initialized", device_path)
 +            elif DEVICE_INIT != return_code:
 +                logging.error("UNKNOWN error while checking \"%s\"", device_path)
 +
 +            # If we get any error while checking the device we need to exit at this
 +            # point.
 +            if DEVICE_INIT != return_code:
 +                exit(return_code)
     # we check against the defined timeouts. If the pacemaker timeout is smaller
     # then that defined within sbd we should report this.
 diff --git a/lib/fencing.py.py b/lib/fencing.py.py
 index b746ede8b..fc3679e33 100644
 --- a/lib/fencing.py.py
 +++ b/lib/fencing.py.py
@@ -1088,6 +1088,115 @@ def is_executable(path):
 			return True
 	return False
 +def run_commands(options, commands, timeout=None, env=None, log_command=None):
 +	# inspired by psutil.wait_procs (BSD License)
 +	def check_gone(proc, timeout):
 +		try:
 +			returncode = proc.wait(timeout=timeout)
 +		except subprocess.TimeoutExpired:
 +			pass
 +		else:
 +			if returncode is not None or not proc.is_running():
 +				proc.returncode = returncode
 +				gone.add(proc)
 +
 +	if timeout is None and "--power-timeout" in options:
 +		timeout = options["--power-timeout"]
 +	if timeout == 0:
 +		timeout = None
 +	if timeout is not None:
 +		timeout = float(timeout)
 +
 +	time_start = time.time()
 +	procs = []
 +	status = None
 +	pipe_stdout = ""
 +	pipe_stderr = ""
 +
 +	for command in commands:
 +		logging.info("Executing: %s\n", log_command or command)
 +
 +		try:
 +			process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env,
 +					# decodes newlines and in python3 also converts bytes to str
 +					universal_newlines=(sys.version_info[0] > 2))
 +		except OSError:
 +			fail_usage("Unable to run %s\n" % command)
 +
 +		procs.append(process)
 +
 +	gone = set()
 +	alive = set(procs)
 +
 +	while True:
 +		if alive:
 +			max_timeout = 2.0 / len(alive)
 +			for proc in alive:
 +				if timeout is not None:
 +					if time.time()-time_start >= timeout:
 +						# quickly go over the rest
 +						max_timeout = 0
 +				check_gone(proc, max_timeout)
 +			alive = alive - gone
 +
 +		if not alive:
 +			break
 +
 +		if time.time()-time_start < 5.0:
 +			# give it at least 5s to get a complete answer
 +			# afterwards we're OK with a quorate answer
 +			continue
 +
 +		if len(gone) > len(alive):
 +			good_cnt = 0
 +			for proc in gone:
 +				if proc.returncode == 0:
 +					good_cnt += 1
 +			# a positive result from more than half is fine
 +			if good_cnt > len(procs)/2:
 +				break
 +
 +		if timeout is not None:
 +			if time.time() - time_start >= timeout:
 +				logging.debug("Stop waiting after %s\n", str(timeout))
 +				break
 +
 +	logging.debug("Done: %d gone, %d alive\n", len(gone), len(alive))
 +
 +	for proc in gone:
 +		if (status != 0):
 +			status = proc.returncode
 +		# hand over the best status we have
 +		# but still collect as much stdout/stderr feedback as possible;
 +		# avoid communicate() as we already know the process
 +		# is gone and it seems to block when there
 +		# are D-state children we don't get rid of
 +		os.set_blocking(proc.stdout.fileno(), False)
 +		os.set_blocking(proc.stderr.fileno(), False)
 +		try:
 +			pipe_stdout += proc.stdout.read()
 +		except:
 +			pass
 +		try:
 +			pipe_stderr += proc.stderr.read()
 +		except:
 +			pass
 +		proc.stdout.close()
 +		proc.stderr.close()
 +
 +	for proc in alive:
 +		proc.kill()
 +
 +	if status is None:
 +		fail(EC_TIMED_OUT, stop=(int(options.get("retry", 0)) < 1))
 +		status = EC_TIMED_OUT
 +		pipe_stdout = ""
 +		pipe_stderr = "timed out"
 +
 +	logging.debug("%s %s %s\n", str(status), str(pipe_stdout), str(pipe_stderr))
 +
 +	return (status, pipe_stdout, pipe_stderr)
 +
 def run_command(options, command, timeout=None, env=None, log_command=None):
 	if timeout is None and "--power-timeout" in options:
 		timeout = options["--power-timeout"]
 diff --git a/tests/data/metadata/fence_sbd.xml b/tests/data/metadata/fence_sbd.xml
 index 516370c40..7248b864a 100644
 --- a/tests/data/metadata/fence_sbd.xml
 +++ b/tests/data/metadata/fence_sbd.xml
@@ -87,7 +87,7 @@
 	</parameter>
 	<parameter name="power_timeout" unique="0" required="0">
 		<getopt mixed="--power-timeout=[seconds]" />
 -		<content type="second" default="20"  />
 +		<content type="second" default="30"  />
 		<shortdesc lang="en">Test X seconds for status change after ON/OFF</shortdesc>
 	</parameter>
 	<parameter name="power_wait" unique="0" required="0">
--- a/fence-agents.spec
+++ b/fence-agents.spec
@ -59,7 +59,7 @@
 Name: fence-agents
 Summary: Set of unified programs capable of host isolation ("fencing")
 Version: 4.10.0
-Release: 27%{?alphatag:.%{alphatag}}%{?dist}
+Release: 28%{?alphatag:.%{alphatag}}%{?dist}
 License: GPLv2+ and LGPLv2+
 URL: https://github.com/ClusterLabs/fence-agents
 Source0: https://fedorahosted.org/releases/f/e/fence-agents/%{name}-%{version}.tar.gz
@ -241,6 +241,7 @@ Patch24: bz2086559-fence_apc-fence_ilo_moonshot-import-logging.patch
 Patch25: bz2072420-2-fence_zvmip-connect-error.patch
 Patch26: bz2092385-fence_ibm_vpc-add-proxy-support.patch
 Patch27: bz2093216-fence_ibm_powervs-proxy-private-api-servers.patch
 Patch28: bz2087176-fence_sbd-improve-error-handling.patch
 %global supportedagents amt_ws apc apc_snmp bladecenter brocade cisco_mds cisco_ucs compute drac5 eaton_snmp emerson eps evacuate hpblade ibmblade ibm_powervs ibm_vpc ifmib ilo ilo_moonshot ilo_mp ilo_ssh intelmodular ipdu ipmilan kdump kubevirt lpar mpath redfish rhevm rsa rsb sbd scsi vmware_rest vmware_soap wti
 %ifarch x86_64
@ -379,6 +380,7 @@ BuildRequires: %{systemd_units}
 %patch25 -p1
 %patch26 -p1
 %patch27 -p1
 %patch28 -p1
 # prevent compilation of something that won't get used anyway
 sed -i.orig 's|FENCE_ZVM=1|FENCE_ZVM=0|' configure.ac
@ -1470,6 +1472,10 @@ are located on corosync cluster nodes.
 %endif
 %changelog
 * Mon Aug  1 2022 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-28
 - fence_sbd: improve error handling
  Resolves: rhbz#2087176
 * Wed Jun 22 2022 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-27
 - fence_ibm_powervs: add support for proxy, private API servers and
  get token via API key