import sbd-1.4.1-7.el8
This commit is contained in:
parent
3f12f554ed
commit
7cb75f49d2
@ -0,0 +1,71 @@
|
||||
From 3048119bf4a0ddb2da01d4ca827ae659a089b622 Mon Sep 17 00:00:00 2001
|
||||
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
||||
Date: Wed, 24 Jun 2020 14:33:21 +0200
|
||||
Subject: [PATCH] Fix: sbd-pacemaker: handle new no_quorum_demote
|
||||
|
||||
and be robust against unknown no-quorum-policies handling them
|
||||
as would be done with no_quorum_suicide
|
||||
---
|
||||
configure.ac | 17 ++++++++++++++++-
|
||||
src/sbd-pacemaker.c | 11 ++++++++++-
|
||||
2 files changed, 26 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index 02e2678..3391c5f 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -89,7 +89,22 @@ AC_CHECK_LIB(cib, cib_apply_patch_event, , missing="yes")
|
||||
|
||||
dnl pacemaker-2.0 removed support for corosync 1 cluster layer
|
||||
AC_CHECK_DECLS([pcmk_cluster_classic_ais, pcmk_cluster_cman],,,
|
||||
- [#include <pacemaker/crm/cluster.h>])
|
||||
+ [#include <pacemaker/crm/cluster.h>])
|
||||
+
|
||||
+dnl check for additional no-quorum-policies
|
||||
+dnl AC_TEST_NO_QUORUM_POLICY(POLICY)
|
||||
+AC_DEFUN([AC_TEST_NO_QUORUM_POLICY],[
|
||||
+ AC_MSG_CHECKING([whether enum pe_quorum_policy defines value $1])
|
||||
+ AC_LANG_PUSH([C])
|
||||
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
|
||||
+ [#include <pacemaker/crm/pengine/pe_types.h>],
|
||||
+ [enum pe_quorum_policy policy = $1; return policy;])],
|
||||
+ AC_DEFINE_UNQUOTED(m4_toupper(HAVE_ENUM_$1), 1,
|
||||
+ [Does pe_types.h have $1 value in enum pe_quorum_policy?])
|
||||
+ AC_MSG_RESULT([yes]), AC_MSG_RESULT([no]))
|
||||
+ AC_LANG_POP([C])
|
||||
+])
|
||||
+AC_TEST_NO_QUORUM_POLICY(no_quorum_demote)
|
||||
|
||||
dnl check for new pe-API
|
||||
AC_CHECK_FUNCS(pe_new_working_set)
|
||||
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
|
||||
index 11e104d..6e53557 100644
|
||||
--- a/src/sbd-pacemaker.c
|
||||
+++ b/src/sbd-pacemaker.c
|
||||
@@ -321,13 +321,22 @@ compute_status(pe_working_set_t * data_set)
|
||||
case no_quorum_freeze:
|
||||
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Freeze resources");
|
||||
break;
|
||||
+#if HAVE_ENUM_NO_QUORUM_DEMOTE
|
||||
+ case no_quorum_demote:
|
||||
+ set_servant_health(pcmk_health_transient, LOG_INFO,
|
||||
+ "Quorum lost: Demote promotable resources and stop others");
|
||||
+ break;
|
||||
+#endif
|
||||
case no_quorum_stop:
|
||||
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Stop ALL resources");
|
||||
break;
|
||||
case no_quorum_ignore:
|
||||
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Ignore");
|
||||
break;
|
||||
- case no_quorum_suicide:
|
||||
+ default:
|
||||
+ /* immediate reboot is the most excessive action we take
|
||||
+ use for no_quorum_suicide and everything we don't know yet
|
||||
+ */
|
||||
set_servant_health(pcmk_health_unclean, LOG_INFO, "Quorum lost: Self-fence");
|
||||
break;
|
||||
}
|
||||
--
|
||||
1.8.3.1
|
||||
|
@ -0,0 +1,399 @@
|
||||
From 4c3e4049b08799094a64dac289a48deef4d3d916 Mon Sep 17 00:00:00 2001
|
||||
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
||||
Date: Fri, 24 Jul 2020 14:31:01 +0200
|
||||
Subject: [PATCH] Fix: sbd-cluster: match qdevice-sync_timeout against
|
||||
wd-timeout
|
||||
|
||||
---
|
||||
configure.ac | 13 +++
|
||||
src/sbd-cluster.c | 252 +++++++++++++++++++++++++++++++++++++++++++++---------
|
||||
2 files changed, 223 insertions(+), 42 deletions(-)
|
||||
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index 3391c5f..23547cf 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -109,6 +109,12 @@ AC_TEST_NO_QUORUM_POLICY(no_quorum_demote)
|
||||
dnl check for new pe-API
|
||||
AC_CHECK_FUNCS(pe_new_working_set)
|
||||
|
||||
+dnl check if votequorum comes with default for qdevice-sync_timeout
|
||||
+AC_CHECK_DECLS([VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT],
|
||||
+ HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=1,
|
||||
+ HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=0,
|
||||
+ [#include <corosync/votequorum.h>])
|
||||
+
|
||||
if test "$missing" = "yes"; then
|
||||
AC_MSG_ERROR([Missing required libraries or functions.])
|
||||
fi
|
||||
@@ -140,6 +146,13 @@ AM_CONDITIONAL(CHECK_TWO_NODE, test "$HAVE_cmap" = "1")
|
||||
AC_DEFINE_UNQUOTED(CHECK_VOTEQUORUM_HANDLE, $HAVE_votequorum, Turn on periodic checking of votequorum-handle)
|
||||
AM_CONDITIONAL(CHECK_VOTEQUORUM_HANDLE, test "$HAVE_votequorum" = "1")
|
||||
|
||||
+AC_DEFINE_UNQUOTED(CHECK_QDEVICE_SYNC_TIMEOUT,
|
||||
+ ($HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT && $HAVE_cmap),
|
||||
+ Turn on checking if watchdog-timeout and qdevice-sync_timeout are matching)
|
||||
+AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT,
|
||||
+ test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" &&
|
||||
+ test "$HAVE_cmap" = "1")
|
||||
+
|
||||
CONFIGDIR=""
|
||||
AC_ARG_WITH(configdir,
|
||||
[ --with-configdir=DIR
|
||||
diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c
|
||||
index 13fa580..b6c5512 100644
|
||||
--- a/src/sbd-cluster.c
|
||||
+++ b/src/sbd-cluster.c
|
||||
@@ -33,7 +33,7 @@
|
||||
#include <crm/cluster.h>
|
||||
#include <crm/common/mainloop.h>
|
||||
|
||||
-#if CHECK_TWO_NODE
|
||||
+#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
#include <glib-unix.h>
|
||||
#endif
|
||||
|
||||
@@ -86,11 +86,20 @@ sbd_plugin_membership_dispatch(cpg_handle_t handle,
|
||||
static votequorum_handle_t votequorum_handle = 0;
|
||||
#endif
|
||||
|
||||
+#if CHECK_TWO_NODE
|
||||
static bool two_node = false;
|
||||
+#endif
|
||||
static bool ever_seen_both = false;
|
||||
static int cpg_membership_entries = -1;
|
||||
|
||||
-#if CHECK_TWO_NODE
|
||||
+#if CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
+#include <corosync/votequorum.h>
|
||||
+static bool using_qdevice = false;
|
||||
+static uint32_t qdevice_sync_timeout = /* in seconds */
|
||||
+ VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
|
||||
+#endif
|
||||
+
|
||||
+#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
#include <corosync/cmap.h>
|
||||
|
||||
static cmap_handle_t cmap_handle = 0;
|
||||
@@ -102,28 +111,59 @@ void
|
||||
sbd_cpg_membership_health_update()
|
||||
{
|
||||
if(cpg_membership_entries > 0) {
|
||||
- bool quorum_is_suspect =
|
||||
+#if CHECK_TWO_NODE
|
||||
+ bool quorum_is_suspect_two_node =
|
||||
(two_node && ever_seen_both && cpg_membership_entries == 1);
|
||||
+#endif
|
||||
+#if CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
+ bool quorum_is_suspect_qdevice_timing =
|
||||
+ using_qdevice && (qdevice_sync_timeout > timeout_watchdog);
|
||||
+#endif
|
||||
|
||||
- if (!quorum_is_suspect) {
|
||||
+ do {
|
||||
+#if CHECK_TWO_NODE
|
||||
+ if (quorum_is_suspect_two_node) {
|
||||
+ /* Alternative would be asking votequorum for number of votes.
|
||||
+ * Using pacemaker's cpg as source for number of active nodes
|
||||
+ * avoids binding to an additional library, is definitely
|
||||
+ * less code to write and we wouldn't have to combine data
|
||||
+ * from 3 sources (cmap, cpg & votequorum) in a potentially
|
||||
+ * racy environment.
|
||||
+ */
|
||||
+ set_servant_health(pcmk_health_noquorum, LOG_WARNING,
|
||||
+ "Connected to %s but requires both nodes present",
|
||||
+ name_for_cluster_type(get_cluster_type())
|
||||
+ );
|
||||
+ break;
|
||||
+ }
|
||||
+#endif
|
||||
+#if CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
+ if (quorum_is_suspect_qdevice_timing) {
|
||||
+ /* We can't really trust quorum info as qdevice-sync_timeout
|
||||
+ * makes reaction of quorum too sluggish for our
|
||||
+ * watchdog-timeout.
|
||||
+ */
|
||||
+ set_servant_health(pcmk_health_noquorum, LOG_WARNING,
|
||||
+ "Connected to %s but quorum using qdevice is distrusted "
|
||||
+ "for SBD as qdevice-sync_timeout (%ds) > watchdog-timeout "
|
||||
+ "(%lus).",
|
||||
+ name_for_cluster_type(get_cluster_type()),
|
||||
+ qdevice_sync_timeout, timeout_watchdog
|
||||
+ );
|
||||
+ break;
|
||||
+ }
|
||||
+#endif
|
||||
set_servant_health(pcmk_health_online, LOG_INFO,
|
||||
- "Connected to %s (%u members)",
|
||||
- name_for_cluster_type(get_cluster_type()),
|
||||
- cpg_membership_entries
|
||||
- );
|
||||
- } else {
|
||||
- /* Alternative would be asking votequorum for number of votes.
|
||||
- * Using pacemaker's cpg as source for number of active nodes
|
||||
- * avoids binding to an additional library, is definitely
|
||||
- * less code to write and we wouldn't have to combine data
|
||||
- * from 3 sources (cmap, cpq & votequorum) in a potentially
|
||||
- * racy environment.
|
||||
- */
|
||||
- set_servant_health(pcmk_health_noquorum, LOG_WARNING,
|
||||
- "Connected to %s but requires both nodes present",
|
||||
- name_for_cluster_type(get_cluster_type())
|
||||
- );
|
||||
- }
|
||||
+ "Connected to %s (%u members)%s",
|
||||
+ name_for_cluster_type(get_cluster_type()),
|
||||
+ cpg_membership_entries,
|
||||
+#if CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
+ using_qdevice?" using qdevice for quorum":""
|
||||
+#else
|
||||
+ ""
|
||||
+#endif
|
||||
+ );
|
||||
+ } while (false);
|
||||
|
||||
if (cpg_membership_entries > 1) {
|
||||
ever_seen_both = true;
|
||||
@@ -146,7 +186,7 @@ sbd_cpg_membership_dispatch(cpg_handle_t handle,
|
||||
notify_parent();
|
||||
}
|
||||
|
||||
-#if CHECK_TWO_NODE
|
||||
+#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
static void sbd_cmap_notify_fn(
|
||||
cmap_handle_t cmap_handle,
|
||||
cmap_track_handle_t cmap_track_handle,
|
||||
@@ -156,21 +196,99 @@ static void sbd_cmap_notify_fn(
|
||||
struct cmap_notify_value old_val,
|
||||
void *user_data)
|
||||
{
|
||||
- if (new_val.type == CMAP_VALUETYPE_UINT8) {
|
||||
- switch (event) {
|
||||
- case CMAP_TRACK_ADD:
|
||||
- case CMAP_TRACK_MODIFY:
|
||||
- two_node = *((uint8_t *) new_val.data);
|
||||
- break;
|
||||
- case CMAP_TRACK_DELETE:
|
||||
- two_node = false;
|
||||
- break;
|
||||
- default:
|
||||
- return;
|
||||
- }
|
||||
- sbd_cpg_membership_health_update();
|
||||
- notify_parent();
|
||||
+ switch (event) {
|
||||
+ case CMAP_TRACK_ADD:
|
||||
+ case CMAP_TRACK_MODIFY:
|
||||
+ switch (new_val.type) {
|
||||
+ case CMAP_VALUETYPE_UINT8:
|
||||
+#if CHECK_TWO_NODE
|
||||
+ if (!strcmp(key_name, "quorum.two_node")) {
|
||||
+ two_node = *((uint8_t *) new_val.data);
|
||||
+ } else {
|
||||
+ return;
|
||||
+ }
|
||||
+ break;
|
||||
+#else
|
||||
+ return;
|
||||
+#endif
|
||||
+ case CMAP_VALUETYPE_STRING:
|
||||
+#if CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
+ if (!strcmp(key_name, "quorum.device.model")) {
|
||||
+ using_qdevice =
|
||||
+ ((new_val.data) && strlen((char *) new_val.data));
|
||||
+ } else {
|
||||
+ return;
|
||||
+ }
|
||||
+ break;
|
||||
+#else
|
||||
+ return;
|
||||
+#endif
|
||||
+ case CMAP_VALUETYPE_UINT32:
|
||||
+#if CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
+ if (!strcmp(key_name, "quorum.device.sync_timeout")) {
|
||||
+ if (new_val.data) {
|
||||
+ qdevice_sync_timeout =
|
||||
+ *((uint32_t *) new_val.data) / 1000;
|
||||
+ } else {
|
||||
+ qdevice_sync_timeout =
|
||||
+ VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
|
||||
+ }
|
||||
+ } else {
|
||||
+ return;
|
||||
+ }
|
||||
+ break;
|
||||
+#else
|
||||
+ return;
|
||||
+#endif
|
||||
+ default:
|
||||
+ return;
|
||||
+ }
|
||||
+ break;
|
||||
+ case CMAP_TRACK_DELETE:
|
||||
+ switch (new_val.type) {
|
||||
+ case CMAP_VALUETYPE_UINT8:
|
||||
+#if CHECK_TWO_NODE
|
||||
+ if (!strcmp(key_name, "quorum.two_node")) {
|
||||
+ two_node = false;
|
||||
+ } else {
|
||||
+ return;
|
||||
+ }
|
||||
+ break;
|
||||
+#else
|
||||
+ return;
|
||||
+#endif
|
||||
+ case CMAP_VALUETYPE_STRING:
|
||||
+#if CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
+ if (!strcmp(key_name, "quorum.device.model")) {
|
||||
+ using_qdevice = false;
|
||||
+ } else {
|
||||
+ return;
|
||||
+ }
|
||||
+ break;
|
||||
+#else
|
||||
+ return;
|
||||
+#endif
|
||||
+ case CMAP_VALUETYPE_UINT32:
|
||||
+#if CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
+ if (!strcmp(key_name, "quorum.device.sync_timeout")) {
|
||||
+ qdevice_sync_timeout =
|
||||
+ VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
|
||||
+ } else {
|
||||
+ return;
|
||||
+ }
|
||||
+ break;
|
||||
+#else
|
||||
+ return;
|
||||
+#endif
|
||||
+ default:
|
||||
+ return;
|
||||
+ }
|
||||
+ break;
|
||||
+ default:
|
||||
+ return;
|
||||
}
|
||||
+ sbd_cpg_membership_health_update();
|
||||
+ notify_parent();
|
||||
}
|
||||
|
||||
static gboolean
|
||||
@@ -200,9 +318,14 @@ cmap_destroy(void)
|
||||
}
|
||||
|
||||
static gboolean
|
||||
-sbd_get_two_node(void)
|
||||
+verify_against_cmap_config(void)
|
||||
{
|
||||
+#if CHECK_TWO_NODE
|
||||
uint8_t two_node_u8 = 0;
|
||||
+#endif
|
||||
+#if CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
+ char *qdevice_model = NULL;
|
||||
+#endif
|
||||
int cmap_fd;
|
||||
|
||||
if (!track_handle) {
|
||||
@@ -211,12 +334,31 @@ sbd_get_two_node(void)
|
||||
goto out;
|
||||
}
|
||||
|
||||
+#if CHECK_TWO_NODE
|
||||
if (cmap_track_add(cmap_handle, "quorum.two_node",
|
||||
CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
|
||||
sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
|
||||
cl_log(LOG_WARNING, "Failed adding CMAP tracker for 2Node-mode\n");
|
||||
goto out;
|
||||
}
|
||||
+#endif
|
||||
+
|
||||
+#if CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
+ if (cmap_track_add(cmap_handle, "quorum.device.model",
|
||||
+ CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
|
||||
+ sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
|
||||
+ cl_log(LOG_WARNING, "Failed adding CMAP tracker for qdevice-model\n");
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
+ if (cmap_track_add(cmap_handle, "quorum.device.sync_timeout",
|
||||
+ CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
|
||||
+ sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
|
||||
+ cl_log(LOG_WARNING,
|
||||
+ "Failed adding CMAP tracker for qdevice-sync_timeout\n");
|
||||
+ goto out;
|
||||
+ }
|
||||
+#endif
|
||||
|
||||
/* add the tracker to mainloop */
|
||||
if (cmap_fd_get(cmap_handle, &cmap_fd) != CS_OK) {
|
||||
@@ -232,13 +374,39 @@ sbd_get_two_node(void)
|
||||
g_source_attach(cmap_source, NULL);
|
||||
}
|
||||
|
||||
- if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8) == CS_OK) {
|
||||
+#if CHECK_TWO_NODE
|
||||
+ if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8)
|
||||
+ == CS_OK) {
|
||||
cl_log(two_node_u8? LOG_NOTICE : LOG_INFO,
|
||||
"Corosync is%s in 2Node-mode", two_node_u8?"":" not");
|
||||
two_node = two_node_u8;
|
||||
} else {
|
||||
cl_log(LOG_INFO, "quorum.two_node not present in cmap\n");
|
||||
}
|
||||
+#endif
|
||||
+
|
||||
+#if CHECK_QDEVICE_SYNC_TIMEOUT
|
||||
+ if (cmap_get_string(cmap_handle, "quorum.device.model",
|
||||
+ &qdevice_model) == CS_OK) {
|
||||
+ using_qdevice = qdevice_model && strlen(qdevice_model);
|
||||
+ cl_log(using_qdevice? LOG_NOTICE : LOG_INFO,
|
||||
+ "Corosync is%s using qdevice", using_qdevice?"":" not");
|
||||
+ } else {
|
||||
+ cl_log(LOG_INFO, "quorum.device.model not present in cmap\n");
|
||||
+ }
|
||||
+
|
||||
+ if (cmap_get_uint32(cmap_handle, "quorum.device.sync_timeout",
|
||||
+ &qdevice_sync_timeout) == CS_OK) {
|
||||
+ qdevice_sync_timeout /= 1000;
|
||||
+ cl_log(LOG_INFO,
|
||||
+ "Corosync is using qdevice-sync_timeout=%ds",
|
||||
+ qdevice_sync_timeout);
|
||||
+ } else {
|
||||
+ cl_log(LOG_INFO,
|
||||
+ "quorum.device.sync_timeout not present in cmap\n");
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
return TRUE;
|
||||
|
||||
out:
|
||||
@@ -331,15 +499,15 @@ sbd_membership_connect(void)
|
||||
} else {
|
||||
cl_log(LOG_INFO, "Attempting connection to %s", name_for_cluster_type(stack));
|
||||
|
||||
-#if SUPPORT_COROSYNC && CHECK_TWO_NODE
|
||||
- if (sbd_get_two_node()) {
|
||||
+#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
|
||||
+ if (verify_against_cmap_config()) {
|
||||
#endif
|
||||
|
||||
if(crm_cluster_connect(&cluster)) {
|
||||
connected = true;
|
||||
}
|
||||
|
||||
-#if SUPPORT_COROSYNC && CHECK_TWO_NODE
|
||||
+#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -362,7 +530,7 @@ sbd_membership_destroy(gpointer user_data)
|
||||
cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type()));
|
||||
|
||||
if (get_cluster_type() != pcmk_cluster_unknown) {
|
||||
-#if SUPPORT_COROSYNC && CHECK_TWO_NODE
|
||||
+#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
|
||||
cmap_destroy();
|
||||
#endif
|
||||
}
|
||||
--
|
||||
1.8.3.1
|
||||
|
@ -0,0 +1,231 @@
|
||||
From 5b5ffac4cce861f3621267a73d2ad29f6d807335 Mon Sep 17 00:00:00 2001
|
||||
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
||||
Date: Tue, 10 Dec 2019 13:16:45 +0100
|
||||
Subject: [PATCH] Fix: sbd-pacemaker: sync with pacemakerd for robustness
|
||||
|
||||
State query ping of pacemakerd prevents pacemakerd from
|
||||
starting any sub-daemons (and thus services) if sbd can't
|
||||
reach it via ipc. As a health-check get timestamp from
|
||||
pacemakerd. On shutdown fetch info about graceful
|
||||
shutdown from pacemakerd.
|
||||
Use new pacemakerd-api provided by pacemaker.
|
||||
---
|
||||
configure.ac | 4 ++
|
||||
src/sbd-pacemaker.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++----
|
||||
2 files changed, 126 insertions(+), 10 deletions(-)
|
||||
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index 23547cf..11d12f0 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -81,6 +81,7 @@ AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes")
|
||||
AC_CHECK_LIB(uuid, uuid_unparse, , missing="yes")
|
||||
AC_CHECK_LIB(cmap, cmap_initialize, , HAVE_cmap=0)
|
||||
AC_CHECK_LIB(votequorum, votequorum_getinfo, , HAVE_votequorum=0)
|
||||
+AC_CHECK_LIB(crmcommon, pcmk_pacemakerd_api_ping, HAVE_pacemakerd_api=1, HAVE_pacemakerd_api=0)
|
||||
|
||||
dnl pacemaker >= 1.1.8
|
||||
AC_CHECK_HEADERS(crm/cluster.h)
|
||||
@@ -153,6 +154,9 @@ AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT,
|
||||
test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" &&
|
||||
test "$HAVE_cmap" = "1")
|
||||
|
||||
+AC_DEFINE_UNQUOTED(USE_PACEMAKERD_API, $HAVE_pacemakerd_api, Turn on synchronization between sbd & pacemakerd)
|
||||
+AM_CONDITIONAL(USE_PACEMAKERD_API, test "$HAVE_pacemakerd_api" = "1")
|
||||
+
|
||||
CONFIGDIR=""
|
||||
AC_ARG_WITH(configdir,
|
||||
[ --with-configdir=DIR
|
||||
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
|
||||
index 6e53557..1243bfc 100644
|
||||
--- a/src/sbd-pacemaker.c
|
||||
+++ b/src/sbd-pacemaker.c
|
||||
@@ -83,6 +83,62 @@ pe_free_working_set(pe_working_set_t *data_set)
|
||||
|
||||
#endif
|
||||
|
||||
+static void clean_up(int rc);
|
||||
+
|
||||
+#if USE_PACEMAKERD_API
|
||||
+#include <crm/common/ipc_pacemakerd.h>
|
||||
+
|
||||
+static pcmk_ipc_api_t *pacemakerd_api = NULL;
|
||||
+static time_t last_ok = (time_t) 0;
|
||||
+
|
||||
+static void
|
||||
+pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api,
|
||||
+ enum pcmk_ipc_event event_type, crm_exit_t status,
|
||||
+ void *event_data, void *user_data)
|
||||
+{
|
||||
+ pcmk_pacemakerd_api_reply_t *reply = event_data;
|
||||
+
|
||||
+ switch (event_type) {
|
||||
+ case pcmk_ipc_event_disconnect:
|
||||
+ /* Unexpected */
|
||||
+ cl_log(LOG_ERR, "Lost connection to pacemakerd\n");
|
||||
+ return;
|
||||
+
|
||||
+ case pcmk_ipc_event_reply:
|
||||
+ break;
|
||||
+
|
||||
+ default:
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ if (status != CRM_EX_OK) {
|
||||
+ cl_log(LOG_ERR, "Bad reply from pacemakerd: %s",
|
||||
+ crm_exit_str(status));
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ if (reply->reply_type != pcmk_pacemakerd_reply_ping) {
|
||||
+ cl_log(LOG_ERR, "Unknown reply type %d from pacemakerd\n",
|
||||
+ reply->reply_type);
|
||||
+ } else {
|
||||
+ if ((reply->data.ping.last_good != (time_t) 0) &&
|
||||
+ (reply->data.ping.status == pcmk_rc_ok)) {
|
||||
+ switch (reply->data.ping.state) {
|
||||
+ case pcmk_pacemakerd_state_running:
|
||||
+ case pcmk_pacemakerd_state_shutting_down:
|
||||
+ last_ok = reply->data.ping.last_good;
|
||||
+ break;
|
||||
+ case pcmk_pacemakerd_state_shutdown_complete:
|
||||
+ clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
|
||||
+ break;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
extern int disk_count;
|
||||
|
||||
static void clean_up(int rc);
|
||||
@@ -133,10 +189,13 @@ mon_cib_connection_destroy(gpointer user_data)
|
||||
cib->cmds->signoff(cib);
|
||||
/* retrigger as last one might have been skipped */
|
||||
mon_refresh_state(NULL);
|
||||
+
|
||||
+#if !USE_PACEMAKERD_API
|
||||
if (pcmk_clean_shutdown) {
|
||||
/* assume a graceful pacemaker-shutdown */
|
||||
clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
|
||||
}
|
||||
+#endif
|
||||
/* getting here we aren't sure about the pacemaker-state
|
||||
so try to use the timeout to reconnect and get
|
||||
everything sorted out again
|
||||
@@ -196,6 +255,13 @@ mon_timer_notify(gpointer data)
|
||||
g_source_remove(timer_id_notify);
|
||||
}
|
||||
|
||||
+#if USE_PACEMAKERD_API
|
||||
+ {
|
||||
+ time_t now = time(NULL);
|
||||
+
|
||||
+ if ((last_ok <= now) && (now - last_ok < timeout_watchdog)) {
|
||||
+#endif
|
||||
+
|
||||
if (cib_connected) {
|
||||
if (counter == counter_max) {
|
||||
mon_retrieve_current_cib();
|
||||
@@ -207,6 +273,16 @@ mon_timer_notify(gpointer data)
|
||||
counter++;
|
||||
}
|
||||
}
|
||||
+
|
||||
+#if USE_PACEMAKERD_API
|
||||
+ }
|
||||
+ }
|
||||
+ if (pcmk_connect_ipc(pacemakerd_api,
|
||||
+ pcmk_ipc_dispatch_main) == pcmk_rc_ok) {
|
||||
+ pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
|
||||
return FALSE;
|
||||
}
|
||||
@@ -526,6 +602,14 @@ clean_up(int rc)
|
||||
cib = NULL;
|
||||
}
|
||||
|
||||
+#if USE_PACEMAKERD_API
|
||||
+ if (pacemakerd_api != NULL) {
|
||||
+ pcmk_ipc_api_t *capi = pacemakerd_api;
|
||||
+ pacemakerd_api = NULL; // Ensure we can't free this twice
|
||||
+ pcmk_free_ipc_api(capi);
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
if (rc >= 0) {
|
||||
exit(rc);
|
||||
}
|
||||
@@ -535,11 +619,11 @@ clean_up(int rc)
|
||||
int
|
||||
servant_pcmk(const char *diskname, int mode, const void* argp)
|
||||
{
|
||||
- int exit_code = 0;
|
||||
+ int exit_code = 0;
|
||||
|
||||
- crm_system_name = strdup("sbd:pcmk");
|
||||
- cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
|
||||
- set_proc_title("sbd: watcher: Pacemaker");
|
||||
+ crm_system_name = strdup("sbd:pcmk");
|
||||
+ cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
|
||||
+ set_proc_title("sbd: watcher: Pacemaker");
|
||||
setenv("PCMK_watchdog", "true", 1);
|
||||
|
||||
if(debug == 0) {
|
||||
@@ -548,12 +632,40 @@ servant_pcmk(const char *diskname, int mode, const void* argp)
|
||||
}
|
||||
|
||||
|
||||
- if (data_set == NULL) {
|
||||
- data_set = pe_new_working_set();
|
||||
- }
|
||||
- if (data_set == NULL) {
|
||||
- return -1;
|
||||
- }
|
||||
+ if (data_set == NULL) {
|
||||
+ data_set = pe_new_working_set();
|
||||
+ }
|
||||
+ if (data_set == NULL) {
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+#if USE_PACEMAKERD_API
|
||||
+ {
|
||||
+ int rc;
|
||||
+
|
||||
+ rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd);
|
||||
+ if (pacemakerd_api == NULL) {
|
||||
+ cl_log(LOG_ERR, "Could not connect to pacemakerd: %s\n",
|
||||
+ pcmk_rc_str(rc));
|
||||
+ return -1;
|
||||
+ }
|
||||
+ pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, NULL);
|
||||
+ do {
|
||||
+ rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main);
|
||||
+ if (rc != pcmk_rc_ok) {
|
||||
+ cl_log(LOG_DEBUG, "Could not connect to pacemakerd: %s\n",
|
||||
+ pcmk_rc_str(rc));
|
||||
+ sleep(reconnect_msec / 1000);
|
||||
+ }
|
||||
+ } while (rc != pcmk_rc_ok);
|
||||
+ /* send a ping to pacemakerd to wake it up */
|
||||
+ pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
|
||||
+ /* cib should come up now as well so it's time
|
||||
+ * to have the inquisitor have a closer look
|
||||
+ */
|
||||
+ notify_parent();
|
||||
+ }
|
||||
+#endif
|
||||
|
||||
if (current_cib == NULL) {
|
||||
cib = cib_new();
|
||||
--
|
||||
1.8.3.1
|
||||
|
@ -0,0 +1,110 @@
|
||||
From f4d38a073ce3bfa2078792f1cc85229457430292 Mon Sep 17 00:00:00 2001
|
||||
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
||||
Date: Tue, 21 Jul 2020 18:30:30 +0200
|
||||
Subject: [PATCH] Fix: make syncing of pacemaker resource startup configurable
|
||||
|
||||
---
|
||||
src/sbd-inquisitor.c | 20 ++++++++++++++++++++
|
||||
src/sbd-pacemaker.c | 6 +++---
|
||||
src/sbd.h | 1 +
|
||||
src/sbd.sysconfig | 14 ++++++++++++++
|
||||
4 files changed, 38 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c
|
||||
index 52ede8a..962725e 100644
|
||||
--- a/src/sbd-inquisitor.c
|
||||
+++ b/src/sbd-inquisitor.c
|
||||
@@ -35,6 +35,7 @@ bool do_flush = true;
|
||||
char timeout_sysrq_char = 'b';
|
||||
bool move_to_root_cgroup = true;
|
||||
bool enforce_moving_to_root_cgroup = false;
|
||||
+bool sync_resource_startup = false;
|
||||
|
||||
int parse_device_line(const char *line);
|
||||
|
||||
@@ -964,6 +965,25 @@ int main(int argc, char **argv, char **envp)
|
||||
}
|
||||
}
|
||||
|
||||
+ value = getenv("SBD_SYNC_RESOURCE_STARTUP");
|
||||
+ if(value) {
|
||||
+ sync_resource_startup = crm_is_true(value);
|
||||
+ }
|
||||
+#if !USE_PACEMAKERD_API
|
||||
+ if (sync_resource_startup) {
|
||||
+ fprintf(stderr, "Failed to sync resource-startup as "
|
||||
+ "SBD was built against pacemaker not supporting pacemakerd-API.\n");
|
||||
+ exit_status = -1;
|
||||
+ goto out;
|
||||
+ }
|
||||
+#else
|
||||
+ if (!sync_resource_startup) {
|
||||
+ cl_log(LOG_WARNING, "SBD built against pacemaker supporting "
|
||||
+ "pacemakerd-API. Should think about enabling "
|
||||
+ "SBD_SYNC_RESOURCE_STARTUP.");
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) {
|
||||
switch (c) {
|
||||
case 'D':
|
||||
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
|
||||
index 1243bfc..aa1fb57 100644
|
||||
--- a/src/sbd-pacemaker.c
|
||||
+++ b/src/sbd-pacemaker.c
|
||||
@@ -190,12 +190,12 @@ mon_cib_connection_destroy(gpointer user_data)
|
||||
/* retrigger as last one might have been skipped */
|
||||
mon_refresh_state(NULL);
|
||||
|
||||
-#if !USE_PACEMAKERD_API
|
||||
- if (pcmk_clean_shutdown) {
|
||||
+
|
||||
+ if ((pcmk_clean_shutdown) && (!sync_resource_startup)) {
|
||||
/* assume a graceful pacemaker-shutdown */
|
||||
clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
|
||||
}
|
||||
-#endif
|
||||
+
|
||||
/* getting here we aren't sure about the pacemaker-state
|
||||
so try to use the timeout to reconnect and get
|
||||
everything sorted out again
|
||||
diff --git a/src/sbd.h b/src/sbd.h
|
||||
index 382e553..3b6647c 100644
|
||||
--- a/src/sbd.h
|
||||
+++ b/src/sbd.h
|
||||
@@ -161,6 +161,7 @@ extern bool do_flush;
|
||||
extern char timeout_sysrq_char;
|
||||
extern bool move_to_root_cgroup;
|
||||
extern bool enforce_moving_to_root_cgroup;
|
||||
+extern bool sync_resource_startup;
|
||||
|
||||
/* Global, non-tunable variables: */
|
||||
extern int sector_size;
|
||||
diff --git a/src/sbd.sysconfig b/src/sbd.sysconfig
|
||||
index 33b50d0..b32e826 100644
|
||||
--- a/src/sbd.sysconfig
|
||||
+++ b/src/sbd.sysconfig
|
||||
@@ -106,6 +106,20 @@ SBD_TIMEOUT_ACTION=flush,reboot
|
||||
#
|
||||
SBD_MOVE_TO_ROOT_CGROUP=auto
|
||||
|
||||
+## Type: yesno
|
||||
+## Default: no
|
||||
+#
|
||||
+# If resource startup syncing is enabled then pacemakerd is
|
||||
+# going to wait to be pinged via IPC before it starts resources.
|
||||
+# On shutdown pacemakerd is going to wait in a state where it
|
||||
+# has cleanly shut down resources until sbd fetches that state.
|
||||
+#
|
||||
+# Default is 'no' to prevent pacemaker from waiting for a
|
||||
+# ping that will never come when working together with an sbd
|
||||
+# version that doesn't support the feature.
|
||||
+#
|
||||
+SBD_SYNC_RESOURCE_STARTUP=no
|
||||
+
|
||||
## Type: string
|
||||
## Default: ""
|
||||
#
|
||||
--
|
||||
1.8.3.1
|
||||
|
@ -18,7 +18,7 @@
|
||||
%global commit 25fce8a7d5e8cd5abc2379077381b10bd6cec183
|
||||
%global shortcommit %(c=%{commit}; echo ${c:0:7})
|
||||
%global github_owner Clusterlabs
|
||||
%global buildnum 3
|
||||
%global buildnum 7
|
||||
|
||||
Name: sbd
|
||||
Summary: Storage-based death
|
||||
@ -30,6 +30,10 @@ Url: https://github.com/%{github_owner}/%{name}
|
||||
Source0: https://github.com/%{github_owner}/%{name}/archive/%{commit}/%{name}-%{commit}.tar.gz
|
||||
Patch1: 0001-Fix-regressions.sh-make-parameter-passing-consistent.patch
|
||||
Patch2: 0002-Doc-add-environment-section-to-man-page.patch
|
||||
Patch3: 0003-Fix-sbd-pacemaker-handle-new-no_quorum_demote.patch
|
||||
Patch4: 0004-Fix-sbd-cluster-match-qdevice-sync_timeout-against-w.patch
|
||||
Patch5: 0005-Fix-sbd-pacemaker-sync-with-pacemakerd-for-robustnes.patch
|
||||
Patch6: 0006-Fix-make-syncing-of-pacemaker-resource-startup-confi.patch
|
||||
BuildRoot: %{_tmppath}/%{name}-%{version}-build
|
||||
BuildRequires: autoconf
|
||||
BuildRequires: automake
|
||||
@ -45,6 +49,7 @@ BuildRequires: pkgconfig
|
||||
BuildRequires: systemd
|
||||
BuildRequires: make
|
||||
Conflicts: fence-agents-sbd < 4.2.1-38
|
||||
Conflicts: pacemaker-libs < 2.0.4-5
|
||||
|
||||
%if 0%{?rhel} > 0
|
||||
ExclusiveArch: i686 x86_64 s390x ppc64le aarch64
|
||||
@ -75,6 +80,7 @@ regression-testing sbd.
|
||||
sed -i src/sbd.sysconfig -e "s/Default: 5/Default: 15/"
|
||||
sed -i src/sbd.sysconfig -e "s/SBD_WATCHDOG_TIMEOUT=5/SBD_WATCHDOG_TIMEOUT=15/"
|
||||
%endif
|
||||
sed -i src/sbd.sysconfig -e "s/SBD_SYNC_RESOURCE_STARTUP=no/SBD_SYNC_RESOURCE_STARTUP=yes/"
|
||||
|
||||
###########################################################
|
||||
|
||||
@ -155,6 +161,29 @@ fi
|
||||
%{_libdir}/libsbdtestbed*
|
||||
|
||||
%changelog
|
||||
* Thu Jul 30 2020 Klaus Wenninger <kwenning@redhat.com> - 1.4.1-7
|
||||
- conflict with pacemaker-libs < 2.0.4-5 instead of requiring
|
||||
a minimum pacemaker version
|
||||
|
||||
Resolves: rhbz#1861713
|
||||
|
||||
* Mon Jul 27 2020 Klaus Wenninger <kwenning@redhat.com> - 1.4.1-6
|
||||
- match qdevice-sync_timeout against wd-timeout
|
||||
- sync startup/shutdown via pacemakerd-api
|
||||
|
||||
Resolves: rhbz#1703128
|
||||
Resolves: rhbz#1743726
|
||||
|
||||
* Wed Jun 24 2020 Klaus Wenninger <kwenning@redhat.com> - 1.4.1-5
|
||||
- rebuild against pacemaker having new no_quorum_demote
|
||||
|
||||
Resolves: rhbz#1850078
|
||||
|
||||
* Wed Jun 24 2020 Klaus Wenninger <kwenning@redhat.com> - 1.4.1-4
|
||||
- handle new no_quorum_demote in pacemaker
|
||||
|
||||
Resolves: rhbz#1850078
|
||||
|
||||
* Mon Feb 17 2020 Klaus Wenninger <kwenning@redhat.com> - 1.4.1-3
|
||||
- append the man-page by a section auto-generated from
|
||||
sbd.sysconfig
|
||||
|
Loading…
Reference in New Issue
Block a user