sbd/SOURCES/0004-Fix-sbd-cluster-match-qdevice-sync_timeout-against-w.patch
2021-09-10 04:16:13 +00:00

400 lines
14 KiB
Diff

From 4c3e4049b08799094a64dac289a48deef4d3d916 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Fri, 24 Jul 2020 14:31:01 +0200
Subject: [PATCH] Fix: sbd-cluster: match qdevice-sync_timeout against
wd-timeout
---
configure.ac | 13 +++
src/sbd-cluster.c | 252 +++++++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 223 insertions(+), 42 deletions(-)
diff --git a/configure.ac b/configure.ac
index 3391c5f..23547cf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -109,6 +109,12 @@ AC_TEST_NO_QUORUM_POLICY(no_quorum_demote)
dnl check for new pe-API
AC_CHECK_FUNCS(pe_new_working_set)
+dnl check if votequorum comes with default for qdevice-sync_timeout
+AC_CHECK_DECLS([VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT],
+ HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=1,
+ HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=0,
+ [#include <corosync/votequorum.h>])
+
if test "$missing" = "yes"; then
AC_MSG_ERROR([Missing required libraries or functions.])
fi
@@ -140,6 +146,13 @@ AM_CONDITIONAL(CHECK_TWO_NODE, test "$HAVE_cmap" = "1")
AC_DEFINE_UNQUOTED(CHECK_VOTEQUORUM_HANDLE, $HAVE_votequorum, Turn on periodic checking of votequorum-handle)
AM_CONDITIONAL(CHECK_VOTEQUORUM_HANDLE, test "$HAVE_votequorum" = "1")
+AC_DEFINE_UNQUOTED(CHECK_QDEVICE_SYNC_TIMEOUT,
+ ($HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT && $HAVE_cmap),
+ Turn on checking if watchdog-timeout and qdevice-sync_timeout are matching)
+AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT,
+ test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" &&
+ test "$HAVE_cmap" = "1")
+
CONFIGDIR=""
AC_ARG_WITH(configdir,
[ --with-configdir=DIR
diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c
index 13fa580..b6c5512 100644
--- a/src/sbd-cluster.c
+++ b/src/sbd-cluster.c
@@ -33,7 +33,7 @@
#include <crm/cluster.h>
#include <crm/common/mainloop.h>
-#if CHECK_TWO_NODE
+#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
#include <glib-unix.h>
#endif
@@ -86,11 +86,20 @@ sbd_plugin_membership_dispatch(cpg_handle_t handle,
static votequorum_handle_t votequorum_handle = 0;
#endif
+#if CHECK_TWO_NODE
static bool two_node = false;
+#endif
static bool ever_seen_both = false;
static int cpg_membership_entries = -1;
-#if CHECK_TWO_NODE
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+#include <corosync/votequorum.h>
+static bool using_qdevice = false;
+static uint32_t qdevice_sync_timeout = /* in seconds */
+ VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
+#endif
+
+#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
#include <corosync/cmap.h>
static cmap_handle_t cmap_handle = 0;
@@ -102,28 +111,59 @@ void
sbd_cpg_membership_health_update()
{
if(cpg_membership_entries > 0) {
- bool quorum_is_suspect =
+#if CHECK_TWO_NODE
+ bool quorum_is_suspect_two_node =
(two_node && ever_seen_both && cpg_membership_entries == 1);
+#endif
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ bool quorum_is_suspect_qdevice_timing =
+ using_qdevice && (qdevice_sync_timeout > timeout_watchdog);
+#endif
- if (!quorum_is_suspect) {
+ do {
+#if CHECK_TWO_NODE
+ if (quorum_is_suspect_two_node) {
+ /* Alternative would be asking votequorum for number of votes.
+ * Using pacemaker's cpg as source for number of active nodes
+ * avoids binding to an additional library, is definitely
+ * less code to write and we wouldn't have to combine data
+ * from 3 sources (cmap, cpg & votequorum) in a potentially
+ * racy environment.
+ */
+ set_servant_health(pcmk_health_noquorum, LOG_WARNING,
+ "Connected to %s but requires both nodes present",
+ name_for_cluster_type(get_cluster_type())
+ );
+ break;
+ }
+#endif
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (quorum_is_suspect_qdevice_timing) {
+ /* We can't really trust quorum info as qdevice-sync_timeout
+ * makes reaction of quorum too sluggish for our
+ * watchdog-timeout.
+ */
+ set_servant_health(pcmk_health_noquorum, LOG_WARNING,
+ "Connected to %s but quorum using qdevice is distrusted "
+ "for SBD as qdevice-sync_timeout (%ds) > watchdog-timeout "
+ "(%lus).",
+ name_for_cluster_type(get_cluster_type()),
+ qdevice_sync_timeout, timeout_watchdog
+ );
+ break;
+ }
+#endif
set_servant_health(pcmk_health_online, LOG_INFO,
- "Connected to %s (%u members)",
- name_for_cluster_type(get_cluster_type()),
- cpg_membership_entries
- );
- } else {
- /* Alternative would be asking votequorum for number of votes.
- * Using pacemaker's cpg as source for number of active nodes
- * avoids binding to an additional library, is definitely
- * less code to write and we wouldn't have to combine data
- * from 3 sources (cmap, cpq & votequorum) in a potentially
- * racy environment.
- */
- set_servant_health(pcmk_health_noquorum, LOG_WARNING,
- "Connected to %s but requires both nodes present",
- name_for_cluster_type(get_cluster_type())
- );
- }
+ "Connected to %s (%u members)%s",
+ name_for_cluster_type(get_cluster_type()),
+ cpg_membership_entries,
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ using_qdevice?" using qdevice for quorum":""
+#else
+ ""
+#endif
+ );
+ } while (false);
if (cpg_membership_entries > 1) {
ever_seen_both = true;
@@ -146,7 +186,7 @@ sbd_cpg_membership_dispatch(cpg_handle_t handle,
notify_parent();
}
-#if CHECK_TWO_NODE
+#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
static void sbd_cmap_notify_fn(
cmap_handle_t cmap_handle,
cmap_track_handle_t cmap_track_handle,
@@ -156,21 +196,99 @@ static void sbd_cmap_notify_fn(
struct cmap_notify_value old_val,
void *user_data)
{
- if (new_val.type == CMAP_VALUETYPE_UINT8) {
- switch (event) {
- case CMAP_TRACK_ADD:
- case CMAP_TRACK_MODIFY:
- two_node = *((uint8_t *) new_val.data);
- break;
- case CMAP_TRACK_DELETE:
- two_node = false;
- break;
- default:
- return;
- }
- sbd_cpg_membership_health_update();
- notify_parent();
+ switch (event) {
+ case CMAP_TRACK_ADD:
+ case CMAP_TRACK_MODIFY:
+ switch (new_val.type) {
+ case CMAP_VALUETYPE_UINT8:
+#if CHECK_TWO_NODE
+ if (!strcmp(key_name, "quorum.two_node")) {
+ two_node = *((uint8_t *) new_val.data);
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ case CMAP_VALUETYPE_STRING:
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (!strcmp(key_name, "quorum.device.model")) {
+ using_qdevice =
+ ((new_val.data) && strlen((char *) new_val.data));
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ case CMAP_VALUETYPE_UINT32:
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (!strcmp(key_name, "quorum.device.sync_timeout")) {
+ if (new_val.data) {
+ qdevice_sync_timeout =
+ *((uint32_t *) new_val.data) / 1000;
+ } else {
+ qdevice_sync_timeout =
+ VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
+ }
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ default:
+ return;
+ }
+ break;
+ case CMAP_TRACK_DELETE:
+ switch (new_val.type) {
+ case CMAP_VALUETYPE_UINT8:
+#if CHECK_TWO_NODE
+ if (!strcmp(key_name, "quorum.two_node")) {
+ two_node = false;
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ case CMAP_VALUETYPE_STRING:
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (!strcmp(key_name, "quorum.device.model")) {
+ using_qdevice = false;
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ case CMAP_VALUETYPE_UINT32:
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (!strcmp(key_name, "quorum.device.sync_timeout")) {
+ qdevice_sync_timeout =
+ VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ default:
+ return;
+ }
+ break;
+ default:
+ return;
}
+ sbd_cpg_membership_health_update();
+ notify_parent();
}
static gboolean
@@ -200,9 +318,14 @@ cmap_destroy(void)
}
static gboolean
-sbd_get_two_node(void)
+verify_against_cmap_config(void)
{
+#if CHECK_TWO_NODE
uint8_t two_node_u8 = 0;
+#endif
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ char *qdevice_model = NULL;
+#endif
int cmap_fd;
if (!track_handle) {
@@ -211,12 +334,31 @@ sbd_get_two_node(void)
goto out;
}
+#if CHECK_TWO_NODE
if (cmap_track_add(cmap_handle, "quorum.two_node",
CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
cl_log(LOG_WARNING, "Failed adding CMAP tracker for 2Node-mode\n");
goto out;
}
+#endif
+
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (cmap_track_add(cmap_handle, "quorum.device.model",
+ CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
+ sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
+ cl_log(LOG_WARNING, "Failed adding CMAP tracker for qdevice-model\n");
+ goto out;
+ }
+
+ if (cmap_track_add(cmap_handle, "quorum.device.sync_timeout",
+ CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
+ sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
+ cl_log(LOG_WARNING,
+ "Failed adding CMAP tracker for qdevice-sync_timeout\n");
+ goto out;
+ }
+#endif
/* add the tracker to mainloop */
if (cmap_fd_get(cmap_handle, &cmap_fd) != CS_OK) {
@@ -232,13 +374,39 @@ sbd_get_two_node(void)
g_source_attach(cmap_source, NULL);
}
- if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8) == CS_OK) {
+#if CHECK_TWO_NODE
+ if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8)
+ == CS_OK) {
cl_log(two_node_u8? LOG_NOTICE : LOG_INFO,
"Corosync is%s in 2Node-mode", two_node_u8?"":" not");
two_node = two_node_u8;
} else {
cl_log(LOG_INFO, "quorum.two_node not present in cmap\n");
}
+#endif
+
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (cmap_get_string(cmap_handle, "quorum.device.model",
+ &qdevice_model) == CS_OK) {
+ using_qdevice = qdevice_model && strlen(qdevice_model);
+ cl_log(using_qdevice? LOG_NOTICE : LOG_INFO,
+ "Corosync is%s using qdevice", using_qdevice?"":" not");
+ } else {
+ cl_log(LOG_INFO, "quorum.device.model not present in cmap\n");
+ }
+
+ if (cmap_get_uint32(cmap_handle, "quorum.device.sync_timeout",
+ &qdevice_sync_timeout) == CS_OK) {
+ qdevice_sync_timeout /= 1000;
+ cl_log(LOG_INFO,
+ "Corosync is using qdevice-sync_timeout=%ds",
+ qdevice_sync_timeout);
+ } else {
+ cl_log(LOG_INFO,
+ "quorum.device.sync_timeout not present in cmap\n");
+ }
+#endif
+
return TRUE;
out:
@@ -331,15 +499,15 @@ sbd_membership_connect(void)
} else {
cl_log(LOG_INFO, "Attempting connection to %s", name_for_cluster_type(stack));
-#if SUPPORT_COROSYNC && CHECK_TWO_NODE
- if (sbd_get_two_node()) {
+#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
+ if (verify_against_cmap_config()) {
#endif
if(crm_cluster_connect(&cluster)) {
connected = true;
}
-#if SUPPORT_COROSYNC && CHECK_TWO_NODE
+#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
}
#endif
}
@@ -362,7 +530,7 @@ sbd_membership_destroy(gpointer user_data)
cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type()));
if (get_cluster_type() != pcmk_cluster_unknown) {
-#if SUPPORT_COROSYNC && CHECK_TWO_NODE
+#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
cmap_destroy();
#endif
}
--
1.8.3.1