import sbd-1.4.1-7.el8

This commit is contained in:
CentOS Sources 2020-11-03 07:11:00 -05:00 committed by Andrew Lukoshko
parent 3f12f554ed
commit 7cb75f49d2
5 changed files with 841 additions and 1 deletions

View File

@ -0,0 +1,71 @@
From 3048119bf4a0ddb2da01d4ca827ae659a089b622 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Wed, 24 Jun 2020 14:33:21 +0200
Subject: [PATCH] Fix: sbd-pacemaker: handle new no_quorum_demote
and be robust against unknown no-quorum-policies handling them
as would be done with no_quorum_suicide
---
configure.ac | 17 ++++++++++++++++-
src/sbd-pacemaker.c | 11 ++++++++++-
2 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/configure.ac b/configure.ac
index 02e2678..3391c5f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -89,7 +89,22 @@ AC_CHECK_LIB(cib, cib_apply_patch_event, , missing="yes")
dnl pacemaker-2.0 removed support for corosync 1 cluster layer
AC_CHECK_DECLS([pcmk_cluster_classic_ais, pcmk_cluster_cman],,,
- [#include <pacemaker/crm/cluster.h>])
+ [#include <pacemaker/crm/cluster.h>])
+
+dnl check for additional no-quorum-policies
+dnl AC_TEST_NO_QUORUM_POLICY(POLICY)
+AC_DEFUN([AC_TEST_NO_QUORUM_POLICY],[
+ AC_MSG_CHECKING([whether enum pe_quorum_policy defines value $1])
+ AC_LANG_PUSH([C])
+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+ [#include <pacemaker/crm/pengine/pe_types.h>],
+ [enum pe_quorum_policy policy = $1; return policy;])],
+ AC_DEFINE_UNQUOTED(m4_toupper(HAVE_ENUM_$1), 1,
+ [Does pe_types.h have $1 value in enum pe_quorum_policy?])
+ AC_MSG_RESULT([yes]), AC_MSG_RESULT([no]))
+ AC_LANG_POP([C])
+])
+AC_TEST_NO_QUORUM_POLICY(no_quorum_demote)
dnl check for new pe-API
AC_CHECK_FUNCS(pe_new_working_set)
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
index 11e104d..6e53557 100644
--- a/src/sbd-pacemaker.c
+++ b/src/sbd-pacemaker.c
@@ -321,13 +321,22 @@ compute_status(pe_working_set_t * data_set)
case no_quorum_freeze:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Freeze resources");
break;
+#if HAVE_ENUM_NO_QUORUM_DEMOTE
+ case no_quorum_demote:
+ set_servant_health(pcmk_health_transient, LOG_INFO,
+ "Quorum lost: Demote promotable resources and stop others");
+ break;
+#endif
case no_quorum_stop:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Stop ALL resources");
break;
case no_quorum_ignore:
set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Ignore");
break;
- case no_quorum_suicide:
+ default:
+ /* immediate reboot is the most excessive action we take;
+ use it for no_quorum_suicide and everything we don't know yet
+ */
set_servant_health(pcmk_health_unclean, LOG_INFO, "Quorum lost: Self-fence");
break;
}
--
1.8.3.1

View File

@ -0,0 +1,399 @@
From 4c3e4049b08799094a64dac289a48deef4d3d916 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Fri, 24 Jul 2020 14:31:01 +0200
Subject: [PATCH] Fix: sbd-cluster: match qdevice-sync_timeout against
wd-timeout
---
configure.ac | 13 +++
src/sbd-cluster.c | 252 +++++++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 223 insertions(+), 42 deletions(-)
diff --git a/configure.ac b/configure.ac
index 3391c5f..23547cf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -109,6 +109,12 @@ AC_TEST_NO_QUORUM_POLICY(no_quorum_demote)
dnl check for new pe-API
AC_CHECK_FUNCS(pe_new_working_set)
+dnl check if votequorum comes with default for qdevice-sync_timeout
+AC_CHECK_DECLS([VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT],
+ HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=1,
+ HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=0,
+ [#include <corosync/votequorum.h>])
+
if test "$missing" = "yes"; then
AC_MSG_ERROR([Missing required libraries or functions.])
fi
@@ -140,6 +146,13 @@ AM_CONDITIONAL(CHECK_TWO_NODE, test "$HAVE_cmap" = "1")
AC_DEFINE_UNQUOTED(CHECK_VOTEQUORUM_HANDLE, $HAVE_votequorum, Turn on periodic checking of votequorum-handle)
AM_CONDITIONAL(CHECK_VOTEQUORUM_HANDLE, test "$HAVE_votequorum" = "1")
+AC_DEFINE_UNQUOTED(CHECK_QDEVICE_SYNC_TIMEOUT,
+ ($HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT && $HAVE_cmap),
+ Turn on checking if watchdog-timeout and qdevice-sync_timeout are matching)
+AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT,
+ test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" &&
+ test "$HAVE_cmap" = "1")
+
CONFIGDIR=""
AC_ARG_WITH(configdir,
[ --with-configdir=DIR
diff --git a/src/sbd-cluster.c b/src/sbd-cluster.c
index 13fa580..b6c5512 100644
--- a/src/sbd-cluster.c
+++ b/src/sbd-cluster.c
@@ -33,7 +33,7 @@
#include <crm/cluster.h>
#include <crm/common/mainloop.h>
-#if CHECK_TWO_NODE
+#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
#include <glib-unix.h>
#endif
@@ -86,11 +86,20 @@ sbd_plugin_membership_dispatch(cpg_handle_t handle,
static votequorum_handle_t votequorum_handle = 0;
#endif
+#if CHECK_TWO_NODE
static bool two_node = false;
+#endif
static bool ever_seen_both = false;
static int cpg_membership_entries = -1;
-#if CHECK_TWO_NODE
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+#include <corosync/votequorum.h>
+static bool using_qdevice = false;
+static uint32_t qdevice_sync_timeout = /* in seconds */
+ VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
+#endif
+
+#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
#include <corosync/cmap.h>
static cmap_handle_t cmap_handle = 0;
@@ -102,28 +111,59 @@ void
sbd_cpg_membership_health_update()
{
if(cpg_membership_entries > 0) {
- bool quorum_is_suspect =
+#if CHECK_TWO_NODE
+ bool quorum_is_suspect_two_node =
(two_node && ever_seen_both && cpg_membership_entries == 1);
+#endif
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ bool quorum_is_suspect_qdevice_timing =
+ using_qdevice && (qdevice_sync_timeout > timeout_watchdog);
+#endif
- if (!quorum_is_suspect) {
+ do {
+#if CHECK_TWO_NODE
+ if (quorum_is_suspect_two_node) {
+ /* Alternative would be asking votequorum for number of votes.
+ * Using pacemaker's cpg as source for number of active nodes
+ * avoids binding to an additional library, is definitely
+ * less code to write and we wouldn't have to combine data
+ * from 3 sources (cmap, cpg & votequorum) in a potentially
+ * racy environment.
+ */
+ set_servant_health(pcmk_health_noquorum, LOG_WARNING,
+ "Connected to %s but requires both nodes present",
+ name_for_cluster_type(get_cluster_type())
+ );
+ break;
+ }
+#endif
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (quorum_is_suspect_qdevice_timing) {
+ /* We can't really trust quorum info as qdevice-sync_timeout
+ * makes reaction of quorum too sluggish for our
+ * watchdog-timeout.
+ */
+ set_servant_health(pcmk_health_noquorum, LOG_WARNING,
+ "Connected to %s but quorum using qdevice is distrusted "
+ "for SBD as qdevice-sync_timeout (%ds) > watchdog-timeout "
+ "(%lus).",
+ name_for_cluster_type(get_cluster_type()),
+ qdevice_sync_timeout, timeout_watchdog
+ );
+ break;
+ }
+#endif
set_servant_health(pcmk_health_online, LOG_INFO,
- "Connected to %s (%u members)",
- name_for_cluster_type(get_cluster_type()),
- cpg_membership_entries
- );
- } else {
- /* Alternative would be asking votequorum for number of votes.
- * Using pacemaker's cpg as source for number of active nodes
- * avoids binding to an additional library, is definitely
- * less code to write and we wouldn't have to combine data
- * from 3 sources (cmap, cpq & votequorum) in a potentially
- * racy environment.
- */
- set_servant_health(pcmk_health_noquorum, LOG_WARNING,
- "Connected to %s but requires both nodes present",
- name_for_cluster_type(get_cluster_type())
- );
- }
+ "Connected to %s (%u members)%s",
+ name_for_cluster_type(get_cluster_type()),
+ cpg_membership_entries,
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ using_qdevice?" using qdevice for quorum":""
+#else
+ ""
+#endif
+ );
+ } while (false);
if (cpg_membership_entries > 1) {
ever_seen_both = true;
@@ -146,7 +186,7 @@ sbd_cpg_membership_dispatch(cpg_handle_t handle,
notify_parent();
}
-#if CHECK_TWO_NODE
+#if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT
static void sbd_cmap_notify_fn(
cmap_handle_t cmap_handle,
cmap_track_handle_t cmap_track_handle,
@@ -156,21 +196,99 @@ static void sbd_cmap_notify_fn(
struct cmap_notify_value old_val,
void *user_data)
{
- if (new_val.type == CMAP_VALUETYPE_UINT8) {
- switch (event) {
- case CMAP_TRACK_ADD:
- case CMAP_TRACK_MODIFY:
- two_node = *((uint8_t *) new_val.data);
- break;
- case CMAP_TRACK_DELETE:
- two_node = false;
- break;
- default:
- return;
- }
- sbd_cpg_membership_health_update();
- notify_parent();
+ switch (event) {
+ case CMAP_TRACK_ADD:
+ case CMAP_TRACK_MODIFY:
+ switch (new_val.type) {
+ case CMAP_VALUETYPE_UINT8:
+#if CHECK_TWO_NODE
+ if (!strcmp(key_name, "quorum.two_node")) {
+ two_node = *((uint8_t *) new_val.data);
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ case CMAP_VALUETYPE_STRING:
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (!strcmp(key_name, "quorum.device.model")) {
+ using_qdevice =
+ ((new_val.data) && strlen((char *) new_val.data));
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ case CMAP_VALUETYPE_UINT32:
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (!strcmp(key_name, "quorum.device.sync_timeout")) {
+ if (new_val.data) {
+ qdevice_sync_timeout =
+ *((uint32_t *) new_val.data) / 1000;
+ } else {
+ qdevice_sync_timeout =
+ VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
+ }
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ default:
+ return;
+ }
+ break;
+ case CMAP_TRACK_DELETE:
+ switch (new_val.type) {
+ case CMAP_VALUETYPE_UINT8:
+#if CHECK_TWO_NODE
+ if (!strcmp(key_name, "quorum.two_node")) {
+ two_node = false;
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ case CMAP_VALUETYPE_STRING:
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (!strcmp(key_name, "quorum.device.model")) {
+ using_qdevice = false;
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ case CMAP_VALUETYPE_UINT32:
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (!strcmp(key_name, "quorum.device.sync_timeout")) {
+ qdevice_sync_timeout =
+ VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000;
+ } else {
+ return;
+ }
+ break;
+#else
+ return;
+#endif
+ default:
+ return;
+ }
+ break;
+ default:
+ return;
}
+ sbd_cpg_membership_health_update();
+ notify_parent();
}
static gboolean
@@ -200,9 +318,14 @@ cmap_destroy(void)
}
static gboolean
-sbd_get_two_node(void)
+verify_against_cmap_config(void)
{
+#if CHECK_TWO_NODE
uint8_t two_node_u8 = 0;
+#endif
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ char *qdevice_model = NULL;
+#endif
int cmap_fd;
if (!track_handle) {
@@ -211,12 +334,31 @@ sbd_get_two_node(void)
goto out;
}
+#if CHECK_TWO_NODE
if (cmap_track_add(cmap_handle, "quorum.two_node",
CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
cl_log(LOG_WARNING, "Failed adding CMAP tracker for 2Node-mode\n");
goto out;
}
+#endif
+
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (cmap_track_add(cmap_handle, "quorum.device.model",
+ CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
+ sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
+ cl_log(LOG_WARNING, "Failed adding CMAP tracker for qdevice-model\n");
+ goto out;
+ }
+
+ if (cmap_track_add(cmap_handle, "quorum.device.sync_timeout",
+ CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD,
+ sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) {
+ cl_log(LOG_WARNING,
+ "Failed adding CMAP tracker for qdevice-sync_timeout\n");
+ goto out;
+ }
+#endif
/* add the tracker to mainloop */
if (cmap_fd_get(cmap_handle, &cmap_fd) != CS_OK) {
@@ -232,13 +374,39 @@ sbd_get_two_node(void)
g_source_attach(cmap_source, NULL);
}
- if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8) == CS_OK) {
+#if CHECK_TWO_NODE
+ if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8)
+ == CS_OK) {
cl_log(two_node_u8? LOG_NOTICE : LOG_INFO,
"Corosync is%s in 2Node-mode", two_node_u8?"":" not");
two_node = two_node_u8;
} else {
cl_log(LOG_INFO, "quorum.two_node not present in cmap\n");
}
+#endif
+
+#if CHECK_QDEVICE_SYNC_TIMEOUT
+ if (cmap_get_string(cmap_handle, "quorum.device.model",
+ &qdevice_model) == CS_OK) {
+ using_qdevice = qdevice_model && strlen(qdevice_model);
+ cl_log(using_qdevice? LOG_NOTICE : LOG_INFO,
+ "Corosync is%s using qdevice", using_qdevice?"":" not");
+ } else {
+ cl_log(LOG_INFO, "quorum.device.model not present in cmap\n");
+ }
+
+ if (cmap_get_uint32(cmap_handle, "quorum.device.sync_timeout",
+ &qdevice_sync_timeout) == CS_OK) {
+ qdevice_sync_timeout /= 1000;
+ cl_log(LOG_INFO,
+ "Corosync is using qdevice-sync_timeout=%ds",
+ qdevice_sync_timeout);
+ } else {
+ cl_log(LOG_INFO,
+ "quorum.device.sync_timeout not present in cmap\n");
+ }
+#endif
+
return TRUE;
out:
@@ -331,15 +499,15 @@ sbd_membership_connect(void)
} else {
cl_log(LOG_INFO, "Attempting connection to %s", name_for_cluster_type(stack));
-#if SUPPORT_COROSYNC && CHECK_TWO_NODE
- if (sbd_get_two_node()) {
+#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
+ if (verify_against_cmap_config()) {
#endif
if(crm_cluster_connect(&cluster)) {
connected = true;
}
-#if SUPPORT_COROSYNC && CHECK_TWO_NODE
+#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
}
#endif
}
@@ -362,7 +530,7 @@ sbd_membership_destroy(gpointer user_data)
cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type()));
if (get_cluster_type() != pcmk_cluster_unknown) {
-#if SUPPORT_COROSYNC && CHECK_TWO_NODE
+#if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT)
cmap_destroy();
#endif
}
--
1.8.3.1

View File

@ -0,0 +1,231 @@
From 5b5ffac4cce861f3621267a73d2ad29f6d807335 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Tue, 10 Dec 2019 13:16:45 +0100
Subject: [PATCH] Fix: sbd-pacemaker: sync with pacemakerd for robustness
State query ping of pacemakerd prevents pacemakerd from
starting any sub-daemons (and thus services) if sbd can't
reach it via ipc. As a health-check get timestamp from
pacemakerd. On shutdown fetch info about graceful
shutdown from pacemakerd.
Use new pacemakerd-api provided by pacemaker.
---
configure.ac | 4 ++
src/sbd-pacemaker.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 126 insertions(+), 10 deletions(-)
diff --git a/configure.ac b/configure.ac
index 23547cf..11d12f0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,7 @@ AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes")
AC_CHECK_LIB(uuid, uuid_unparse, , missing="yes")
AC_CHECK_LIB(cmap, cmap_initialize, , HAVE_cmap=0)
AC_CHECK_LIB(votequorum, votequorum_getinfo, , HAVE_votequorum=0)
+AC_CHECK_LIB(crmcommon, pcmk_pacemakerd_api_ping, HAVE_pacemakerd_api=1, HAVE_pacemakerd_api=0)
dnl pacemaker >= 1.1.8
AC_CHECK_HEADERS(crm/cluster.h)
@@ -153,6 +154,9 @@ AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT,
test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" &&
test "$HAVE_cmap" = "1")
+AC_DEFINE_UNQUOTED(USE_PACEMAKERD_API, $HAVE_pacemakerd_api, Turn on synchronization between sbd & pacemakerd)
+AM_CONDITIONAL(USE_PACEMAKERD_API, test "$HAVE_pacemakerd_api" = "1")
+
CONFIGDIR=""
AC_ARG_WITH(configdir,
[ --with-configdir=DIR
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
index 6e53557..1243bfc 100644
--- a/src/sbd-pacemaker.c
+++ b/src/sbd-pacemaker.c
@@ -83,6 +83,62 @@ pe_free_working_set(pe_working_set_t *data_set)
#endif
+static void clean_up(int rc);
+
+#if USE_PACEMAKERD_API
+#include <crm/common/ipc_pacemakerd.h>
+
+static pcmk_ipc_api_t *pacemakerd_api = NULL;
+static time_t last_ok = (time_t) 0;
+
+static void
+pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api,
+ enum pcmk_ipc_event event_type, crm_exit_t status,
+ void *event_data, void *user_data)
+{
+ pcmk_pacemakerd_api_reply_t *reply = event_data;
+
+ switch (event_type) {
+ case pcmk_ipc_event_disconnect:
+ /* Unexpected */
+ cl_log(LOG_ERR, "Lost connection to pacemakerd\n");
+ return;
+
+ case pcmk_ipc_event_reply:
+ break;
+
+ default:
+ return;
+ }
+
+ if (status != CRM_EX_OK) {
+ cl_log(LOG_ERR, "Bad reply from pacemakerd: %s",
+ crm_exit_str(status));
+ return;
+ }
+
+ if (reply->reply_type != pcmk_pacemakerd_reply_ping) {
+ cl_log(LOG_ERR, "Unknown reply type %d from pacemakerd\n",
+ reply->reply_type);
+ } else {
+ if ((reply->data.ping.last_good != (time_t) 0) &&
+ (reply->data.ping.status == pcmk_rc_ok)) {
+ switch (reply->data.ping.state) {
+ case pcmk_pacemakerd_state_running:
+ case pcmk_pacemakerd_state_shutting_down:
+ last_ok = reply->data.ping.last_good;
+ break;
+ case pcmk_pacemakerd_state_shutdown_complete:
+ clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+}
+#endif
+
extern int disk_count;
static void clean_up(int rc);
@@ -133,10 +189,13 @@ mon_cib_connection_destroy(gpointer user_data)
cib->cmds->signoff(cib);
/* retrigger as last one might have been skipped */
mon_refresh_state(NULL);
+
+#if !USE_PACEMAKERD_API
if (pcmk_clean_shutdown) {
/* assume a graceful pacemaker-shutdown */
clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
}
+#endif
/* getting here we aren't sure about the pacemaker-state
so try to use the timeout to reconnect and get
everything sorted out again
@@ -196,6 +255,13 @@ mon_timer_notify(gpointer data)
g_source_remove(timer_id_notify);
}
+#if USE_PACEMAKERD_API
+ {
+ time_t now = time(NULL);
+
+ if ((last_ok <= now) && (now - last_ok < timeout_watchdog)) {
+#endif
+
if (cib_connected) {
if (counter == counter_max) {
mon_retrieve_current_cib();
@@ -207,6 +273,16 @@ mon_timer_notify(gpointer data)
counter++;
}
}
+
+#if USE_PACEMAKERD_API
+ }
+ }
+ if (pcmk_connect_ipc(pacemakerd_api,
+ pcmk_ipc_dispatch_main) == pcmk_rc_ok) {
+ pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
+ }
+#endif
+
timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
return FALSE;
}
@@ -526,6 +602,14 @@ clean_up(int rc)
cib = NULL;
}
+#if USE_PACEMAKERD_API
+ if (pacemakerd_api != NULL) {
+ pcmk_ipc_api_t *capi = pacemakerd_api;
+ pacemakerd_api = NULL; // Ensure we can't free this twice
+ pcmk_free_ipc_api(capi);
+ }
+#endif
+
if (rc >= 0) {
exit(rc);
}
@@ -535,11 +619,11 @@ clean_up(int rc)
int
servant_pcmk(const char *diskname, int mode, const void* argp)
{
- int exit_code = 0;
+ int exit_code = 0;
- crm_system_name = strdup("sbd:pcmk");
- cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
- set_proc_title("sbd: watcher: Pacemaker");
+ crm_system_name = strdup("sbd:pcmk");
+ cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
+ set_proc_title("sbd: watcher: Pacemaker");
setenv("PCMK_watchdog", "true", 1);
if(debug == 0) {
@@ -548,12 +632,40 @@ servant_pcmk(const char *diskname, int mode, const void* argp)
}
- if (data_set == NULL) {
- data_set = pe_new_working_set();
- }
- if (data_set == NULL) {
- return -1;
- }
+ if (data_set == NULL) {
+ data_set = pe_new_working_set();
+ }
+ if (data_set == NULL) {
+ return -1;
+ }
+
+#if USE_PACEMAKERD_API
+ {
+ int rc;
+
+ rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd);
+ if (pacemakerd_api == NULL) {
+ cl_log(LOG_ERR, "Could not connect to pacemakerd: %s\n",
+ pcmk_rc_str(rc));
+ return -1;
+ }
+ pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, NULL);
+ do {
+ rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main);
+ if (rc != pcmk_rc_ok) {
+ cl_log(LOG_DEBUG, "Could not connect to pacemakerd: %s\n",
+ pcmk_rc_str(rc));
+ sleep(reconnect_msec / 1000);
+ }
+ } while (rc != pcmk_rc_ok);
+ /* send a ping to pacemakerd to wake it up */
+ pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
+ /* cib should come up now as well so it's time
+ * to have the inquisitor have a closer look
+ */
+ notify_parent();
+ }
+#endif
if (current_cib == NULL) {
cib = cib_new();
--
1.8.3.1

View File

@ -0,0 +1,110 @@
From f4d38a073ce3bfa2078792f1cc85229457430292 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Tue, 21 Jul 2020 18:30:30 +0200
Subject: [PATCH] Fix: make syncing of pacemaker resource startup configurable
---
src/sbd-inquisitor.c | 20 ++++++++++++++++++++
src/sbd-pacemaker.c | 6 +++---
src/sbd.h | 1 +
src/sbd.sysconfig | 14 ++++++++++++++
4 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c
index 52ede8a..962725e 100644
--- a/src/sbd-inquisitor.c
+++ b/src/sbd-inquisitor.c
@@ -35,6 +35,7 @@ bool do_flush = true;
char timeout_sysrq_char = 'b';
bool move_to_root_cgroup = true;
bool enforce_moving_to_root_cgroup = false;
+bool sync_resource_startup = false;
int parse_device_line(const char *line);
@@ -964,6 +965,25 @@ int main(int argc, char **argv, char **envp)
}
}
+ value = getenv("SBD_SYNC_RESOURCE_STARTUP");
+ if(value) {
+ sync_resource_startup = crm_is_true(value);
+ }
+#if !USE_PACEMAKERD_API
+ if (sync_resource_startup) {
+ fprintf(stderr, "Failed to sync resource-startup as "
+ "SBD was built against pacemaker not supporting pacemakerd-API.\n");
+ exit_status = -1;
+ goto out;
+ }
+#else
+ if (!sync_resource_startup) {
+ cl_log(LOG_WARNING, "SBD built against pacemaker supporting "
+ "pacemakerd-API. Should think about enabling "
+ "SBD_SYNC_RESOURCE_STARTUP.");
+ }
+#endif
+
while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) {
switch (c) {
case 'D':
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
index 1243bfc..aa1fb57 100644
--- a/src/sbd-pacemaker.c
+++ b/src/sbd-pacemaker.c
@@ -190,12 +190,12 @@ mon_cib_connection_destroy(gpointer user_data)
/* retrigger as last one might have been skipped */
mon_refresh_state(NULL);
-#if !USE_PACEMAKERD_API
- if (pcmk_clean_shutdown) {
+
+ if ((pcmk_clean_shutdown) && (!sync_resource_startup)) {
/* assume a graceful pacemaker-shutdown */
clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
}
-#endif
+
/* getting here we aren't sure about the pacemaker-state
so try to use the timeout to reconnect and get
everything sorted out again
diff --git a/src/sbd.h b/src/sbd.h
index 382e553..3b6647c 100644
--- a/src/sbd.h
+++ b/src/sbd.h
@@ -161,6 +161,7 @@ extern bool do_flush;
extern char timeout_sysrq_char;
extern bool move_to_root_cgroup;
extern bool enforce_moving_to_root_cgroup;
+extern bool sync_resource_startup;
/* Global, non-tunable variables: */
extern int sector_size;
diff --git a/src/sbd.sysconfig b/src/sbd.sysconfig
index 33b50d0..b32e826 100644
--- a/src/sbd.sysconfig
+++ b/src/sbd.sysconfig
@@ -106,6 +106,20 @@ SBD_TIMEOUT_ACTION=flush,reboot
#
SBD_MOVE_TO_ROOT_CGROUP=auto
+## Type: yesno
+## Default: no
+#
+# If resource startup syncing is enabled then pacemakerd is
+# going to wait to be pinged via IPC before it starts resources.
+# On shutdown pacemakerd is going to wait in a state where it
+# has cleanly shutdown resources till sbd fetches that state.
+#
+# Default is 'no' to prevent pacemaker from waiting for a
+# ping that will never come when working together with an sbd
+# version that doesn't support the feature.
+#
+SBD_SYNC_RESOURCE_STARTUP=no
+
## Type: string
## Default: ""
#
--
1.8.3.1

View File

@ -18,7 +18,7 @@
%global commit 25fce8a7d5e8cd5abc2379077381b10bd6cec183
%global shortcommit %(c=%{commit}; echo ${c:0:7})
%global github_owner Clusterlabs
%global buildnum 3
%global buildnum 7
Name: sbd
Summary: Storage-based death
@ -30,6 +30,10 @@ Url: https://github.com/%{github_owner}/%{name}
Source0: https://github.com/%{github_owner}/%{name}/archive/%{commit}/%{name}-%{commit}.tar.gz
Patch1: 0001-Fix-regressions.sh-make-parameter-passing-consistent.patch
Patch2: 0002-Doc-add-environment-section-to-man-page.patch
Patch3: 0003-Fix-sbd-pacemaker-handle-new-no_quorum_demote.patch
Patch4: 0004-Fix-sbd-cluster-match-qdevice-sync_timeout-against-w.patch
Patch5: 0005-Fix-sbd-pacemaker-sync-with-pacemakerd-for-robustnes.patch
Patch6: 0006-Fix-make-syncing-of-pacemaker-resource-startup-confi.patch
BuildRoot: %{_tmppath}/%{name}-%{version}-build
BuildRequires: autoconf
BuildRequires: automake
@ -45,6 +49,7 @@ BuildRequires: pkgconfig
BuildRequires: systemd
BuildRequires: make
Conflicts: fence-agents-sbd < 4.2.1-38
Conflicts: pacemaker-libs < 2.0.4-5
%if 0%{?rhel} > 0
ExclusiveArch: i686 x86_64 s390x ppc64le aarch64
@ -75,6 +80,7 @@ regression-testing sbd.
sed -i src/sbd.sysconfig -e "s/Default: 5/Default: 15/"
sed -i src/sbd.sysconfig -e "s/SBD_WATCHDOG_TIMEOUT=5/SBD_WATCHDOG_TIMEOUT=15/"
%endif
sed -i src/sbd.sysconfig -e "s/SBD_SYNC_RESOURCE_STARTUP=no/SBD_SYNC_RESOURCE_STARTUP=yes/"
###########################################################
@ -155,6 +161,29 @@ fi
%{_libdir}/libsbdtestbed*
%changelog
* Thu Jul 30 2020 Klaus Wenninger <kwenning@redhat.com> - 1.4.1-7
- conflict with pacemaker-libs < 2.0.4-5 instead of requiring
a minimum pacemaker version
Resolves: rhbz#1861713
* Mon Jul 27 2020 Klaus Wenninger <kwenning@redhat.com> - 1.4.1-6
- match qdevice-sync_timeout against wd-timeout
- sync startup/shutdown via pacemakerd-api
Resolves: rhbz#1703128
Resolves: rhbz#1743726
* Wed Jun 24 2020 Klaus Wenninger <kwenning@redhat.com> - 1.4.1-5
- rebuild against pacemaker having new no_quorum_demote
Resolves: rhbz#1850078
* Wed Jun 24 2020 Klaus Wenninger <kwenning@redhat.com> - 1.4.1-4
- handle new no_quorum_demote in pacemaker
Resolves: rhbz#1850078
* Mon Feb 17 2020 Klaus Wenninger <kwenning@redhat.com> - 1.4.1-3
- append the man-page by a section auto-generated from
sbd.sysconfig