232 lines
6.8 KiB
Diff
232 lines
6.8 KiB
Diff
From 5b5ffac4cce861f3621267a73d2ad29f6d807335 Mon Sep 17 00:00:00 2001
|
|
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
|
Date: Tue, 10 Dec 2019 13:16:45 +0100
|
|
Subject: [PATCH] Fix: sbd-pacemaker: sync with pacemakerd for robustness
|
|
|
|
State query ping of pacemakerd prevents pacemakerd from
|
|
starting any sub-daemons (and thus services) if sbd can't
|
|
reach it via ipc. As a health-check get timestamp from
|
|
pacemakerd. On shutdown fetch info about graceful
|
|
shutdown from pacemakerd.
|
|
Use new pacemakerd-api provided by pacemaker.
|
|
---
|
|
configure.ac | 4 ++
|
|
src/sbd-pacemaker.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++----
|
|
2 files changed, 126 insertions(+), 10 deletions(-)
|
|
|
|
diff --git a/configure.ac b/configure.ac
|
|
index 23547cf..11d12f0 100644
|
|
--- a/configure.ac
|
|
+++ b/configure.ac
|
|
@@ -81,6 +81,7 @@ AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes")
|
|
AC_CHECK_LIB(uuid, uuid_unparse, , missing="yes")
|
|
AC_CHECK_LIB(cmap, cmap_initialize, , HAVE_cmap=0)
|
|
AC_CHECK_LIB(votequorum, votequorum_getinfo, , HAVE_votequorum=0)
|
|
+AC_CHECK_LIB(crmcommon, pcmk_pacemakerd_api_ping, HAVE_pacemakerd_api=1, HAVE_pacemakerd_api=0)
|
|
|
|
dnl pacemaker >= 1.1.8
|
|
AC_CHECK_HEADERS(crm/cluster.h)
|
|
@@ -153,6 +154,9 @@ AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT,
|
|
test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" &&
|
|
test "$HAVE_cmap" = "1")
|
|
|
|
+AC_DEFINE_UNQUOTED(USE_PACEMAKERD_API, $HAVE_pacemakerd_api, Turn on synchronization between sbd & pacemakerd)
|
|
+AM_CONDITIONAL(USE_PACEMAKERD_API, test "$HAVE_pacemakerd_api" = "1")
|
|
+
|
|
CONFIGDIR=""
|
|
AC_ARG_WITH(configdir,
|
|
[ --with-configdir=DIR
|
|
diff --git a/src/sbd-pacemaker.c b/src/sbd-pacemaker.c
|
|
index 6e53557..1243bfc 100644
|
|
--- a/src/sbd-pacemaker.c
|
|
+++ b/src/sbd-pacemaker.c
|
|
@@ -83,6 +83,62 @@ pe_free_working_set(pe_working_set_t *data_set)
|
|
|
|
#endif
|
|
|
|
+static void clean_up(int rc);
|
|
+
|
|
+#if USE_PACEMAKERD_API
|
|
+#include <crm/common/ipc_pacemakerd.h>
|
|
+
|
|
+static pcmk_ipc_api_t *pacemakerd_api = NULL;
|
|
+static time_t last_ok = (time_t) 0;
|
|
+
|
|
+static void
|
|
+pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api,
|
|
+ enum pcmk_ipc_event event_type, crm_exit_t status,
|
|
+ void *event_data, void *user_data)
|
|
+{
|
|
+ pcmk_pacemakerd_api_reply_t *reply = event_data;
|
|
+
|
|
+ switch (event_type) {
|
|
+ case pcmk_ipc_event_disconnect:
|
|
+ /* Unexpected */
|
|
+ cl_log(LOG_ERR, "Lost connection to pacemakerd\n");
|
|
+ return;
|
|
+
|
|
+ case pcmk_ipc_event_reply:
|
|
+ break;
|
|
+
|
|
+ default:
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (status != CRM_EX_OK) {
|
|
+ cl_log(LOG_ERR, "Bad reply from pacemakerd: %s",
|
|
+ crm_exit_str(status));
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (reply->reply_type != pcmk_pacemakerd_reply_ping) {
|
|
+ cl_log(LOG_ERR, "Unknown reply type %d from pacemakerd\n",
|
|
+ reply->reply_type);
|
|
+ } else {
|
|
+ if ((reply->data.ping.last_good != (time_t) 0) &&
|
|
+ (reply->data.ping.status == pcmk_rc_ok)) {
|
|
+ switch (reply->data.ping.state) {
|
|
+ case pcmk_pacemakerd_state_running:
|
|
+ case pcmk_pacemakerd_state_shutting_down:
|
|
+ last_ok = reply->data.ping.last_good;
|
|
+ break;
|
|
+ case pcmk_pacemakerd_state_shutdown_complete:
|
|
+ clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
|
|
+ break;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+#endif
|
|
+
|
|
extern int disk_count;
|
|
|
|
static void clean_up(int rc);
|
|
@@ -133,10 +189,13 @@ mon_cib_connection_destroy(gpointer user_data)
|
|
cib->cmds->signoff(cib);
|
|
/* retrigger as last one might have been skipped */
|
|
mon_refresh_state(NULL);
|
|
+
|
|
+#if !USE_PACEMAKERD_API
|
|
if (pcmk_clean_shutdown) {
|
|
/* assume a graceful pacemaker-shutdown */
|
|
clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
|
|
}
|
|
+#endif
|
|
/* getting here we aren't sure about the pacemaker-state
|
|
so try to use the timeout to reconnect and get
|
|
everything sorted out again
|
|
@@ -196,6 +255,13 @@ mon_timer_notify(gpointer data)
|
|
g_source_remove(timer_id_notify);
|
|
}
|
|
|
|
+#if USE_PACEMAKERD_API
|
|
+ {
|
|
+ time_t now = time(NULL);
|
|
+
|
|
+ if ((last_ok <= now) && (now - last_ok < timeout_watchdog)) {
|
|
+#endif
|
|
+
|
|
if (cib_connected) {
|
|
if (counter == counter_max) {
|
|
mon_retrieve_current_cib();
|
|
@@ -207,6 +273,16 @@ mon_timer_notify(gpointer data)
|
|
counter++;
|
|
}
|
|
}
|
|
+
|
|
+#if USE_PACEMAKERD_API
|
|
+ }
|
|
+ }
|
|
+ if (pcmk_connect_ipc(pacemakerd_api,
|
|
+ pcmk_ipc_dispatch_main) == pcmk_rc_ok) {
|
|
+ pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
|
|
+ }
|
|
+#endif
|
|
+
|
|
timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL);
|
|
return FALSE;
|
|
}
|
|
@@ -526,6 +602,14 @@ clean_up(int rc)
|
|
cib = NULL;
|
|
}
|
|
|
|
+#if USE_PACEMAKERD_API
|
|
+ if (pacemakerd_api != NULL) {
|
|
+ pcmk_ipc_api_t *capi = pacemakerd_api;
|
|
+ pacemakerd_api = NULL; // Ensure we can't free this twice
|
|
+ pcmk_free_ipc_api(capi);
|
|
+ }
|
|
+#endif
|
|
+
|
|
if (rc >= 0) {
|
|
exit(rc);
|
|
}
|
|
@@ -535,11 +619,11 @@ clean_up(int rc)
|
|
int
|
|
servant_pcmk(const char *diskname, int mode, const void* argp)
|
|
{
|
|
- int exit_code = 0;
|
|
+ int exit_code = 0;
|
|
|
|
- crm_system_name = strdup("sbd:pcmk");
|
|
- cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
|
|
- set_proc_title("sbd: watcher: Pacemaker");
|
|
+ crm_system_name = strdup("sbd:pcmk");
|
|
+ cl_log(LOG_NOTICE, "Monitoring Pacemaker health");
|
|
+ set_proc_title("sbd: watcher: Pacemaker");
|
|
setenv("PCMK_watchdog", "true", 1);
|
|
|
|
if(debug == 0) {
|
|
@@ -548,12 +632,40 @@ servant_pcmk(const char *diskname, int mode, const void* argp)
|
|
}
|
|
|
|
|
|
- if (data_set == NULL) {
|
|
- data_set = pe_new_working_set();
|
|
- }
|
|
- if (data_set == NULL) {
|
|
- return -1;
|
|
- }
|
|
+ if (data_set == NULL) {
|
|
+ data_set = pe_new_working_set();
|
|
+ }
|
|
+ if (data_set == NULL) {
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+#if USE_PACEMAKERD_API
|
|
+ {
|
|
+ int rc;
|
|
+
|
|
+ rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd);
|
|
+ if (pacemakerd_api == NULL) {
|
|
+ cl_log(LOG_ERR, "Could not connect to pacemakerd: %s\n",
|
|
+ pcmk_rc_str(rc));
|
|
+ return -1;
|
|
+ }
|
|
+ pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, NULL);
|
|
+ do {
|
|
+ rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main);
|
|
+ if (rc != pcmk_rc_ok) {
|
|
+ cl_log(LOG_DEBUG, "Could not connect to pacemakerd: %s\n",
|
|
+ pcmk_rc_str(rc));
|
|
+ sleep(reconnect_msec / 1000);
|
|
+ }
|
|
+ } while (rc != pcmk_rc_ok);
|
|
+ /* send a ping to pacemakerd to wake it up */
|
|
+ pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name);
|
|
+ /* cib should come up now as well so it's time
|
|
+ * to have the inquisitor have a closer look
|
|
+ */
|
|
+ notify_parent();
|
|
+ }
|
|
+#endif
|
|
|
|
if (current_cib == NULL) {
|
|
cib = cib_new();
|
|
--
|
|
1.8.3.1
|
|
|