diff --git a/006-watchdog-fencing-topology.patch b/006-watchdog-fencing-topology.patch new file mode 100644 index 0000000..7651584 --- /dev/null +++ b/006-watchdog-fencing-topology.patch @@ -0,0 +1,142 @@ +From 17cc49e1564b0ae55cc8212d14c5c055f88040da Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Tue, 14 Feb 2023 15:35:37 +0100 +Subject: [PATCH] Fix: watchdog-fencing: terminate dangling timer before + watchdog-waiting + +--- + daemons/fenced/fenced_remote.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index 5c3fe25e3..aab185adb 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2009-2022 the Pacemaker project contributors ++ * Copyright 2009-2023 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -1702,6 +1702,10 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) + "client %s " CRM_XS " id=%.8s", + (stonith_watchdog_timeout_ms / 1000), + op->target, op->action, op->client_name, op->id); ++ ++ if (op->op_timer_one) { ++ g_source_remove(op->op_timer_one); ++ } + op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, + remote_op_watchdog_done, op); + return TRUE; +-- +2.39.0 + +From f2cc2a4277124230903a18713e50604a8f1842cd Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Wed, 1 Mar 2023 15:00:15 +0100 +Subject: [PATCH] Refactor: watchdog-fencing: convenience function + pcmk__is_fencing_action + +for consistency and add comment making clear why this block exits +with new timer set in any case +--- + daemons/fenced/fenced_remote.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index aab185adb..e0f8de057 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -1834,7 +1834,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer) + if (!((stonith_watchdog_timeout_ms > 0) + && (pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none) + || (pcmk__str_eq(peer->host, op->target, pcmk__str_casei) +- && !pcmk__str_eq(op->action, "on", pcmk__str_none))) ++ && pcmk__is_fencing_action(op->action))) + && check_watchdog_fencing_and_wait(op))) { + + /* Some thoughts about self-fencing cases reaching this point: +@@ -1854,6 +1854,9 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer) + Otherwise the selection of stonith-watchdog-timeout at + least is questionable. + */ ++ ++ /* coming here we're not waiting for watchdog timeout - ++ thus engage timer with timout evaluated before */ + op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op); + } + +-- +2.39.0 + +From c4eb45a986f8865fc5e69350fd5b9f4b056d9d69 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Tue, 14 Feb 2023 11:57:17 +0100 +Subject: [PATCH] Fix: watchdog-fencing: correctly derive timeout with topology + +up to now the timeout for watchdog-fencing was just added to +the overall timeout if the node to be fenced was visible and +reported back to the query. +--- + daemons/fenced/fenced_remote.c | 28 +++++++++++++++++++++++++--- + 1 file changed, 25 insertions(+), 3 deletions(-) + +diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c +index e0f8de057..3b7ab05e9 100644 +--- a/daemons/fenced/fenced_remote.c ++++ b/daemons/fenced/fenced_remote.c +@@ -969,8 +969,9 @@ advance_topology_level(remote_fencing_op_t *op, bool empty_ok) + return pcmk_rc_ok; + } + +- crm_info("All fencing options targeting %s for client %s@%s failed " ++ crm_info("All %sfencing options targeting %s for client %s@%s failed " + CRM_XS " id=%.8s", ++ (stonith_watchdog_timeout_ms > 0)?"non-watchdog ":"", + op->target, op->client_name, op->originator, op->id); + return ENODEV; + } +@@ -1434,8 +1435,17 @@ stonith_choose_peer(remote_fencing_op_t * op) + && pcmk_is_set(op->call_options, st_opt_topology) + && (advance_topology_level(op, false) == pcmk_rc_ok)); + +- crm_notice("Couldn't find anyone to fence (%s) %s using %s", +- op->action, op->target, (device? device : "any device")); ++ if ((stonith_watchdog_timeout_ms > 0) ++ && pcmk__is_fencing_action(op->action) ++ && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none) ++ && node_does_watchdog_fencing(op->target)) { ++ crm_info("Couldn't contact watchdog-fencing target-node (%s)", ++ op->target); ++ /* check_watchdog_fencing_and_wait will log additional info */ ++ } else { ++ crm_notice("Couldn't find anyone to fence (%s) %s using %s", ++ op->action, op->target, (device? device : "any device")); ++ } + return NULL; + } + +@@ -1531,6 +1541,18 @@ get_op_total_timeout(const remote_fencing_op_t *op, + continue; + } + for (device_list = tp->levels[i]; device_list; device_list = device_list->next) { ++ /* in case of watchdog-device we add the timeout to the budget ++ regardless of if we got a reply or not ++ */ ++ if ((stonith_watchdog_timeout_ms > 0) ++ && pcmk__is_fencing_action(op->action) ++ && pcmk__str_eq(device_list->data, STONITH_WATCHDOG_ID, ++ pcmk__str_none) ++ && node_does_watchdog_fencing(op->target)) { ++ total_timeout += stonith_watchdog_timeout_ms / 1000; ++ continue; ++ } ++ + for (iter = op->query_results; iter != NULL; iter = iter->next) { + const peer_device_info_t *peer = iter->data; + +-- +2.39.0 + diff --git a/pacemaker.spec b/pacemaker.spec index 2be7c1f..61e409c 100644 --- a/pacemaker.spec +++ b/pacemaker.spec @@ -36,7 +36,7 @@ ## can be incremented to build packages reliably considered "newer" ## than previously built packages with the same pcmkversion) %global pcmkversion 2.1.5 -%global specversion 7 +%global specversion 8 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build %global commit a3f44794f94e1571c6ba0042915ade369b4ce4b1 @@ -253,6 +253,7 @@ Patch002: 002-remote-regression.patch Patch003: 003-history-cleanup.patch Patch004: 004-g_source_remove.patch Patch005: 005-query-null.patch +Patch006: 006-watchdog-fencing-topology.patch Requires: resource-agents Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} @@ -855,6 +856,11 @@ exit 0 %license %{nagios_name}-%{nagios_hash}/COPYING %changelog +* Fri May 5 2023 Klaus Wenninger - 2.1.5-8 +- Fix overall timeout calculation if watchdog and another fencing + device share a topology level +- Resolves: rhbz2187424 + * Wed Feb 22 2023 Chris Lumens - 2.1.5-7 - Additional fixes for SIGABRT during pacemaker-fenced shutdown - Backport fix for attrd_updater -QA not displaying all nodes