From db7f5a06386bd742e5f2a2e8ef24980bc5da4822 Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Wed, 3 Aug 2022 10:13:06 +0000 Subject: [PATCH] import fence-agents-4.2.1-100.el8 --- ...671-fence_sbd-improve-error-handling.patch | 296 ++++++++++++++++++ SPECS/fence-agents.spec | 8 +- 2 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 SOURCES/bz2033671-fence_sbd-improve-error-handling.patch diff --git a/SOURCES/bz2033671-fence_sbd-improve-error-handling.patch b/SOURCES/bz2033671-fence_sbd-improve-error-handling.patch new file mode 100644 index 0000000..c9ab5ad --- /dev/null +++ b/SOURCES/bz2033671-fence_sbd-improve-error-handling.patch @@ -0,0 +1,296 @@ +From ab1de07902d9f380c10405d6ddac3aeb43838c86 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Thu, 28 Jul 2022 15:33:12 +0200 +Subject: [PATCH] fence_sbd: improve error handling + +basically when using 3 disks be happy with 2 answers +but give it 5s at least to collect all answers +increase default power-timeout to 30s so that waiting +those 5s still allows us to get done sending the +reboot + +RHBZ#2033671 +--- + agents/sbd/fence_sbd.py | 77 +++++++++++++-------- + lib/fencing.py.py | 109 ++++++++++++++++++++++++++++++ + tests/data/metadata/fence_sbd.xml | 2 +- + 3 files changed, 158 insertions(+), 30 deletions(-) + +diff --git a/agents/sbd/fence_sbd.py b/agents/sbd/fence_sbd.py +index 0c876b16e..2b0127d55 100644 +--- a/agents/sbd/fence_sbd.py ++++ b/agents/sbd/fence_sbd.py +@@ -5,7 +5,7 @@ + import os + import atexit + sys.path.append("@FENCEAGENTSLIBDIR@") +-from fencing import fail_usage, run_command, fence_action, all_opt ++from fencing import fail_usage, run_commands, fence_action, all_opt + from fencing import atexit_handler, check_input, process_input, show_docs + from fencing import run_delay + import itertools +@@ -81,7 +81,7 @@ def check_sbd_device(options, device_path): + + cmd = "%s -d %s dump" % (options["--sbd-path"], device_path) + +- (return_code, out, err) = run_command(options, cmd) ++ (return_code, out, err) = run_commands(options, [ cmd ]) + + for line in itertools.chain(out.split("\n"), err.split("\n")): + if len(line) == 0: +@@ -94,21 +94,35 @@ def check_sbd_device(options, device_path): + + return DEVICE_INIT + ++ + def generate_sbd_command(options, command, arguments=None): + """Generates a sbd command based on given arguments. + + Return Value: +- generated sbd command (string) ++ generated list of sbd commands (strings) depending ++ on command multiple commands with a device each ++ or a single command with multiple devices + """ +- cmd = options["--sbd-path"] ++ cmds = [] ++ ++ if not command in ["list", "dump"]: ++ cmd = options["--sbd-path"] + +- # add "-d" for each sbd device +- for device in parse_sbd_devices(options): +- cmd += " -d %s" % device ++ # add "-d" for each sbd device ++ for device in parse_sbd_devices(options): ++ cmd += " -d %s" % device + +- cmd += " %s %s" % (command, arguments) ++ cmd += " %s %s" % (command, arguments) ++ cmds.append(cmd) ++ ++ else: ++ for device in parse_sbd_devices(options): ++ cmd = options["--sbd-path"] ++ cmd += " -d %s" % device ++ cmd += " %s %s" % (command, arguments) ++ cmds.append(cmd) + +- return cmd ++ return cmds + + def send_sbd_message(conn, options, plug, message): + """Sends a message to all sbd devices. +@@ -128,7 +142,7 @@ def send_sbd_message(conn, options, plug, message): + arguments = "%s %s" % (plug, message) + cmd = generate_sbd_command(options, "message", arguments) + +- (return_code, out, err) = run_command(options, cmd) ++ (return_code, out, err) = run_commands(options, cmd) + + return (return_code, out, err) + +@@ -147,7 +161,7 @@ def get_msg_timeout(options): + + cmd = generate_sbd_command(options, "dump") + +- (return_code, out, err) = run_command(options, cmd) ++ (return_code, out, err) = run_commands(options, cmd) + + for line in itertools.chain(out.split("\n"), err.split("\n")): + if len(line) == 0: +@@ -288,7 +302,7 @@ def get_node_list(conn, options): + + cmd = generate_sbd_command(options, "list") + +- (return_code, out, err) = run_command(options, cmd) ++ (return_code, out, err) = run_commands(options, cmd) + + for line in out.split("\n"): + if len(line) == 0: +@@ -356,6 +370,7 @@ def main(): + + all_opt["method"]["default"] = "cycle" + all_opt["method"]["help"] = "-m, --method=[method] Method to fence (onoff|cycle) (Default: cycle)" ++ all_opt["power_timeout"]["default"] = "30" + + options = check_input(device_opt, process_input(device_opt)) + +@@ -376,23 +391,27 @@ def main(): + + # We need to check if the provided sbd_devices exists. We need to do + # that for every given device. +- for device_path in parse_sbd_devices(options): +- logging.debug("check device \"%s\"", device_path) +- +- return_code = check_sbd_device(options, device_path) +- if PATH_NOT_EXISTS == return_code: +- logging.error("\"%s\" does not exist", device_path) +- elif PATH_NOT_BLOCK == return_code: +- logging.error("\"%s\" is not a valid block device", device_path) +- elif DEVICE_NOT_INIT == return_code: +- logging.error("\"%s\" is not initialized", device_path) +- elif DEVICE_INIT != return_code: +- logging.error("UNKNOWN error while checking \"%s\"", device_path) +- +- # If we get any error while checking the device we need to exit at this +- # point. +- if DEVICE_INIT != return_code: +- exit(return_code) ++ # Just for the case we are really rebooting / powering off a device ++ # (pacemaker as well uses the list command to generate a dynamic list) ++ # we leave it to sbd to try and decide if it was successful ++ if not options["--action"] in ["reboot", "off", "list"]: ++ for device_path in parse_sbd_devices(options): ++ logging.debug("check device \"%s\"", device_path) ++ ++ return_code = check_sbd_device(options, device_path) ++ if PATH_NOT_EXISTS == return_code: ++ logging.error("\"%s\" does not exist", device_path) ++ elif PATH_NOT_BLOCK == return_code: ++ logging.error("\"%s\" is not a valid block device", device_path) ++ elif DEVICE_NOT_INIT == return_code: ++ logging.error("\"%s\" is not initialized", device_path) ++ elif DEVICE_INIT != return_code: ++ logging.error("UNKNOWN error while checking \"%s\"", device_path) ++ ++ # If we get any error while checking the device we need to exit at this ++ # point. ++ if DEVICE_INIT != return_code: ++ exit(return_code) + + # we check against the defined timeouts. If the pacemaker timeout is smaller + # then that defined within sbd we should report this. +diff --git a/lib/fencing.py.py b/lib/fencing.py.py +index b746ede8b..fc3679e33 100644 +--- a/lib/fencing.py.py ++++ b/lib/fencing.py.py +@@ -1088,6 +1088,115 @@ def is_executable(path): + return True + return False + ++def run_commands(options, commands, timeout=None, env=None, log_command=None): ++ # inspired by psutils.wait_procs (BSD License) ++ def check_gone(proc, timeout): ++ try: ++ returncode = proc.wait(timeout=timeout) ++ except subprocess.TimeoutExpired: ++ pass ++ else: ++ if returncode is not None or not proc.is_running(): ++ proc.returncode = returncode ++ gone.add(proc) ++ ++ if timeout is None and "--power-timeout" in options: ++ timeout = options["--power-timeout"] ++ if timeout == 0: ++ timeout = None ++ if timeout is not None: ++ timeout = float(timeout) ++ ++ time_start = time.time() ++ procs = [] ++ status = None ++ pipe_stdout = "" ++ pipe_stderr = "" ++ ++ for command in commands: ++ logging.info("Executing: %s\n", log_command or command) ++ ++ try: ++ process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, ++ # decodes newlines and in python3 also converts bytes to str ++ universal_newlines=(sys.version_info[0] > 2)) ++ except OSError: ++ fail_usage("Unable to run %s\n" % command) ++ ++ procs.append(process) ++ ++ gone = set() ++ alive = set(procs) ++ ++ while True: ++ if alive: ++ max_timeout = 2.0 / len(alive) ++ for proc in alive: ++ if timeout is not None: ++ if time.time()-time_start >= timeout: ++ # quickly go over the rest ++ max_timeout = 0 ++ check_gone(proc, max_timeout) ++ alive = alive - gone ++ ++ if not alive: ++ break ++ ++ if time.time()-time_start < 5.0: ++ # give it at least 5s to get a complete answer ++ # afterwards we're OK with a quorate answer ++ continue ++ ++ if len(gone) > len(alive): ++ good_cnt = 0 ++ for proc in gone: ++ if proc.returncode == 0: ++ good_cnt += 1 ++ # a positive result from more than half is fine ++ if good_cnt > len(procs)/2: ++ break ++ ++ if timeout is not None: ++ if time.time() - time_start >= timeout: ++ logging.debug("Stop waiting after %s\n", str(timeout)) ++ break ++ ++ logging.debug("Done: %d gone, %d alive\n", len(gone), len(alive)) ++ ++ for proc in gone: ++ if (status != 0): ++ status = proc.returncode ++ # hand over the best status we have ++ # but still collect as much stdout/stderr feedback ++ # avoid communicate as we know already process ++ # is gone and it seems to block when there ++ # are D state children we don't get rid off ++ os.set_blocking(proc.stdout.fileno(), False) ++ os.set_blocking(proc.stderr.fileno(), False) ++ try: ++ pipe_stdout += proc.stdout.read() ++ except: ++ pass ++ try: ++ pipe_stderr += proc.stderr.read() ++ except: ++ pass ++ proc.stdout.close() ++ proc.stderr.close() ++ ++ for proc in alive: ++ proc.kill() ++ ++ if status is None: ++ fail(EC_TIMED_OUT, stop=(int(options.get("retry", 0)) < 1)) ++ status = EC_TIMED_OUT ++ pipe_stdout = "" ++ pipe_stderr = "timed out" ++ ++ logging.debug("%s %s %s\n", str(status), str(pipe_stdout), str(pipe_stderr)) ++ ++ return (status, pipe_stdout, pipe_stderr) ++ + def run_command(options, command, timeout=None, env=None, log_command=None): + if timeout is None and "--power-timeout" in options: + timeout = options["--power-timeout"] +diff --git a/tests/data/metadata/fence_sbd.xml b/tests/data/metadata/fence_sbd.xml +index 516370c40..7248b864a 100644 +--- a/tests/data/metadata/fence_sbd.xml ++++ b/tests/data/metadata/fence_sbd.xml +@@ -87,7 +87,7 @@ + + + +- ++ + Test X seconds for status change after ON/OFF + + diff --git a/SPECS/fence-agents.spec b/SPECS/fence-agents.spec index 3e147e7..fa57032 100644 --- a/SPECS/fence-agents.spec +++ b/SPECS/fence-agents.spec @@ -87,7 +87,7 @@ Name: fence-agents Summary: Set of unified programs capable of host isolation ("fencing") Version: 4.2.1 -Release: 99%{?alphatag:.%{alphatag}}%{?dist} +Release: 100%{?alphatag:.%{alphatag}}%{?dist} License: GPLv2+ and LGPLv2+ Group: System Environment/Base URL: https://github.com/ClusterLabs/fence-agents @@ -291,6 +291,7 @@ Patch116: bz2080729-2-fence_lpar-fix-import-fail_usage.patch Patch117: bz2072421-2-fence_zvmip-connect-error.patch Patch118: bz2091826-fence_ibm_vpc-add-proxy-support.patch Patch119: bz2092921-fence_ibm_powervs-proxy-private-api-servers.patch +Patch120: bz2033671-fence_sbd-improve-error-handling.patch %if 0%{?fedora} || 0%{?rhel} > 7 %global supportedagents amt_ws apc apc_snmp bladecenter brocade cisco_mds cisco_ucs compute drac5 eaton_snmp emerson eps evacuate hds_cb hpblade ibmblade ibm_powervs ibm_vpc ifmib ilo ilo_moonshot ilo_mp ilo_ssh intelmodular ipdu ipmilan kdump kubevirt lpar mpath redfish rhevm rsa rsb sbd scsi vmware_rest vmware_soap wti @@ -488,6 +489,7 @@ BuildRequires: python3-google-api-client python3-pip python3-wheel python3-jinja %patch117 -p1 %patch118 -p1 %patch119 -p1 +%patch120 -p1 # prevent compilation of something that won't get used anyway sed -i.orig 's|FENCE_ZVM=1|FENCE_ZVM=0|' configure.ac @@ -1512,6 +1514,10 @@ Fence agent for IBM z/VM over IP. %endif %changelog +* Mon Aug 1 2022 Oyvind Albrigtsen - 4.2.1-100 +- fence_sbd: improve error handling + Resolves: rhbz#2033671 + * Wed Jun 22 2022 Oyvind Albrigtsen - 4.2.1-99 - fence_ibm_powervs: add support for proxy, private API servers and get token via API key