pacemaker/0012-Feature-crmd-Implement-reliable-event-notifications.patch
Jan Pokorný dff8d9929d
1.1.13-3: Update to Pacemaker-1.1.13 post-release + patches
Signed-off-by: Jan Pokorný <jpokorny@redhat.com>
2015-10-14 01:54:55 +02:00

566 lines
21 KiB
Diff

From: Andrew Beekhof <andrew@beekhof.net>
Date: Tue, 1 Sep 2015 13:17:45 +1000
Subject: [PATCH] Feature: crmd: Implement reliable event notifications
(cherry picked from commit 0cd1b8f02b403976afe106e0ca3a8a8a16864c6c)
---
crmd/Makefile.am | 2 +-
crmd/callbacks.c | 4 +
crmd/control.c | 67 +++++++++++++---
crmd/crmd_utils.h | 1 +
crmd/lrm.c | 2 +
crmd/notify.c | 188 ++++++++++++++++++++++++++++++++++++++++++++
crmd/notify.h | 30 +++++++
crmd/te_utils.c | 2 +
cts/CIB.py | 2 +
extra/pcmk_notify_sample.sh | 68 ++++++++++++++++
include/crm_internal.h | 1 +
lib/common/utils.c | 27 +++++++
12 files changed, 380 insertions(+), 14 deletions(-)
create mode 100644 crmd/notify.c
create mode 100644 crmd/notify.h
create mode 100755 extra/pcmk_notify_sample.sh
diff --git a/crmd/Makefile.am b/crmd/Makefile.am
index 8e5e1df..984f5d0 100644
--- a/crmd/Makefile.am
+++ b/crmd/Makefile.am
@@ -28,7 +28,7 @@ noinst_HEADERS = crmd.h crmd_fsa.h crmd_messages.h fsa_defines.h \
fsa_matrix.h fsa_proto.h crmd_utils.h crmd_callbacks.h \
crmd_lrm.h te_callbacks.h tengine.h
-crmd_SOURCES = main.c crmd.c corosync.c \
+crmd_SOURCES = main.c crmd.c corosync.c notify.c \
fsa.c control.c messages.c membership.c callbacks.c \
election.c join_client.c join_dc.c subsystems.c throttle.c \
cib.c pengine.c tengine.c lrm.c lrm_state.c remote_lrmd_ra.c \
diff --git a/crmd/callbacks.c b/crmd/callbacks.c
index f646927..38fb30b 100644
--- a/crmd/callbacks.c
+++ b/crmd/callbacks.c
@@ -126,6 +126,7 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
case crm_status_nstate:
crm_info("%s is now %s (was %s)",
node->uname, state_text(node->state), state_text(data));
+
if (safe_str_eq(data, node->state)) {
/* State did not change */
return;
@@ -147,7 +148,10 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
}
}
}
+
+ crmd_notify_node_event(node);
break;
+
case crm_status_processes:
if (data) {
old = *(const uint32_t *)data;
diff --git a/crmd/control.c b/crmd/control.c
index f4add49..d92f46b 100644
--- a/crmd/control.c
+++ b/crmd/control.c
@@ -873,28 +873,64 @@ do_recover(long long action,
/* *INDENT-OFF* */
pe_cluster_option crmd_opts[] = {
- /* name, old-name, validate, default, description */
- { "dc-version", NULL, "string", NULL, "none", NULL, "Version of Pacemaker on the cluster's DC.", "Includes the hash which identifies the exact Mercurial changeset it was built from. Used for diagnostic purposes." },
- { "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL, "The messaging stack on which Pacemaker is currently running.", "Used for informational and diagnostic purposes." },
- { XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time, "How long to wait for a response from other nodes during startup.", "The \"correct\" value will depend on the speed/load of your network and the type of switches used." },
+ /* name, old-name, validate, values, default, short description, long description */
+ { "dc-version", NULL, "string", NULL, "none", NULL,
+ "Version of Pacemaker on the cluster's DC.",
+ "Includes the hash which identifies the exact changeset it was built from. Used for diagnostic purposes."
+ },
+ { "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL,
+ "The messaging stack on which Pacemaker is currently running.",
+ "Used for informational and diagnostic purposes." },
+ { XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time,
+ "How long to wait for a response from other nodes during startup.",
+ "The \"correct\" value will depend on the speed/load of your network and the type of switches used."
+ },
{ XML_CONFIG_ATTR_RECHECK, "cluster_recheck_interval", "time",
- "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)", "15min", &check_timer,
+ "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)",
+ "15min", &check_timer,
"Polling interval for time based changes to options, resource parameters and constraints.",
"The Cluster is primarily event driven, however the configuration can have elements that change based on time."
- " To ensure these changes take effect, we can optionally poll the cluster's status for changes." },
+ " To ensure these changes take effect, we can optionally poll the cluster's status for changes."
+ },
+
+ { "notification-script", NULL, "string", NULL, "/dev/null", &check_script,
+ "Notification script to be called after significant cluster events",
+ "Full path to a script that will be invoked when resources start/stop/fail, fencing occurs or nodes join/leave the cluster.\n"
+ "Must exist on all nodes in the cluster."
+ },
+ { "notification-target", NULL, "string", NULL, "", NULL,
+ "Destination for notifications (Optional)",
+ "Where should the supplied script send notifications to. Useful to avoid hard-coding this in the script."
+ },
+
{ "load-threshold", NULL, "percentage", NULL, "80%", &check_utilization,
"The maximum amount of system resources that should be used by nodes in the cluster",
"The cluster will slow down its recovery process when the amount of system resources used"
- " (currently CPU) approaches this limit", },
+ " (currently CPU) approaches this limit",
+ },
{ "node-action-limit", NULL, "integer", NULL, "0", &check_number,
"The maximum number of jobs that can be scheduled per node. Defaults to 2x cores"},
- { XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." },
- { XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." },
- { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." },
- { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." },
- { "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, "*** Advanced Use Only ***\nEnabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\nUseful if your configuration is sensitive to the order in which ping updates arrive." },
+ { XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer,
+ "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug."
+ },
+ { XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer,
+ "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug."
+ },
+ { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer,
+ "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug."
+ },
+ { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer,
+ "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug."
+ },
+ { "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer,
+ "*** Advanced Use Only ***\n"
+ "Enabling this option will slow down cluster recovery under all conditions",
+ "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\n"
+ "Useful if your configuration is sensitive to the order in which ping updates arrive."
+ },
{ "stonith-watchdog-timeout", NULL, "time", NULL, NULL, &check_timer,
- "How long to wait before we can assume nodes are safely down", NULL },
+ "How long to wait before we can assume nodes are safely down", NULL
+ },
{ "no-quorum-policy", "no_quorum_policy", "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum, NULL, NULL },
#if SUPPORT_PLUGIN
@@ -927,6 +963,7 @@ crmd_pref(GHashTable * options, const char *name)
static void
config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
+ const char *script = NULL;
const char *value = NULL;
GHashTable *config_hash = NULL;
crm_time_t *now = crm_time_new(NULL);
@@ -955,6 +992,10 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
verify_crmd_options(config_hash);
+ script = crmd_pref(config_hash, "notification-script");
+ value = crmd_pref(config_hash, "notification-target");
+ crmd_enable_notifications(script, value);
+
value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME);
election_trigger->period_ms = crm_get_msec(value);
diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h
index 78214bf..7e8c3e6 100644
--- a/crmd/crmd_utils.h
+++ b/crmd/crmd_utils.h
@@ -21,6 +21,7 @@
# include <crm/crm.h>
# include <crm/common/xml.h>
# include <crm/cib/internal.h> /* For CIB_OP_MODIFY */
+# include "notify.h"
# define CLIENT_EXIT_WAIT 30
# define FAKE_TE_ID "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
diff --git a/crmd/lrm.c b/crmd/lrm.c
index 418e7cf..48195e8 100644
--- a/crmd/lrm.c
+++ b/crmd/lrm.c
@@ -2415,6 +2415,8 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr
free(prefix);
}
+ crmd_notify_resource_op(lrm_state->node_name, op);
+
if (op->rsc_deleted) {
crm_info("Deletion of resource '%s' complete after %s", op->rsc_id, op_key);
delete_rsc_entry(lrm_state, NULL, op->rsc_id, NULL, pcmk_ok, NULL);
diff --git a/crmd/notify.c b/crmd/notify.c
new file mode 100644
index 0000000..980bfa6
--- /dev/null
+++ b/crmd/notify.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2015 Andrew Beekhof <andrew@beekhof.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <crm_internal.h>
+#include <crm/crm.h>
+#include <crm/msg_xml.h>
+#include "notify.h"
+
+char *notify_script = NULL; /* script run for each event; NULL while notifications are disabled */
+char *notify_target = NULL; /* recipient handed to the script as CRM_notify_recipient */
+
+/* Environment variable names set_notify_key() will export; send_notification() unsets them all again */
+static const char *notify_keys[] =
+{
+ "CRM_notify_recipient",
+ "CRM_notify_node",
+ "CRM_notify_rsc",
+ "CRM_notify_task",
+ "CRM_notify_interval",
+ "CRM_notify_desc",
+ "CRM_notify_status",
+ "CRM_notify_target_rc",
+ "CRM_notify_rc",
+ "CRM_notify_kind",
+ "CRM_notify_version",
+};
+
+/* (Re)configure notifications; a NULL script or "/dev/null" switches them off. */
+void
+crmd_enable_notifications(const char *script, const char *target)
+{
+    free(notify_script);
+    notify_script = NULL;
+
+    free(notify_target);
+    notify_target = NULL;
+
+    if(script == NULL || safe_str_eq(script, "/dev/null")) {
+        crm_notice("Notifications disabled");
+        return;
+    }
+
+    notify_script = strdup(script);
+    notify_target = strdup(target ? target : ""); /* strdup(NULL) is undefined */
+    crm_notice("Notifications enabled");
+}
+/* Export name=cvalue (or name=value when cvalue is NULL); value is always freed. */
+static void
+set_notify_key(const char *name, const char *cvalue, char *value)
+{
+    int lpc;
+    bool found = 0;
+
+    if(cvalue == NULL) {
+        cvalue = value ? value : ""; /* setenv() must never be handed NULL */
+    }
+
+    for(lpc = 0; lpc < DIMOF(notify_keys); lpc++) {
+        if(safe_str_eq(name, notify_keys[lpc])) {
+            found = 1;
+            crm_trace("Setting notify key %s = '%s'", name, cvalue);
+            setenv(name, cvalue, 1);
+            break;
+        }
+    }
+
+    CRM_ASSERT(found != 0); /* only names listed in notify_keys may be exported */
+    free(value);
+}
+
+/* Fork and exec notify_script with the CRM_notify_* environment; the parent then unsets those keys. */
+static void
+send_notification(const char *kind)
+{
+    int lpc;
+    pid_t pid;
+
+    crm_debug("Sending '%s' notification to '%s' via '%s'", kind, notify_target, notify_script);
+
+    set_notify_key("CRM_notify_recipient", notify_target, NULL);
+    set_notify_key("CRM_notify_kind", kind, NULL);
+    set_notify_key("CRM_notify_version", VERSION, NULL);
+
+    pid = fork();
+    if (pid == -1) {
+        crm_perror(LOG_ERR, "notification failed");
+    }
+
+    if (pid == 0) {
+        /* Child: replace this process image with the notification program */
+        execl(notify_script, notify_script, (char *) NULL);
+        _exit(EXIT_FAILURE); /* exec failed: leave without running the parent's atexit/stdio cleanup */
+
+    } else {
+        for(lpc = 0; lpc < DIMOF(notify_keys); lpc++) {
+            unsetenv(notify_keys[lpc]);
+        }
+    }
+}
+/* Tell the notification script about a node's current state (kind "node").
+ * Exports the node's uname and state as CRM_notify_node / CRM_notify_desc. */
+void crmd_notify_node_event(crm_node_t *node)
+{
+    /* Nothing to do unless a notification script is configured */
+    if(notify_script != NULL) {
+        set_notify_key("CRM_notify_node", node->uname, NULL);
+        set_notify_key("CRM_notify_desc", node->state, NULL);
+
+        send_notification("node");
+    }
+}
+/* Describe a fencing result to the notification script (kind "fencing"); no-op without a script. */
+void
+crmd_notify_fencing_op(stonith_event_t * e)
+{
+    char *desc = NULL;
+
+    if(notify_script == NULL) {
+        return;
+    }
+
+    desc = crm_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)",
+                             e->operation, e->origin, e->target, pcmk_strerror(e->result),
+                             e->id);
+
+    set_notify_key("CRM_notify_node", e->target, NULL);
+    set_notify_key("CRM_notify_task", e->operation, NULL);
+    set_notify_key("CRM_notify_desc", NULL, desc); /* desc is freed by set_notify_key() */
+    set_notify_key("CRM_notify_rc", NULL, crm_itoa(e->result));
+
+    send_notification("fencing");
+}
+/* Export the details of a resource operation result and run the script (kind "resource"). */
+void
+crmd_notify_resource_op(const char *node, lrmd_event_data_t * op)
+{
+ int target_rc = 0;
+
+ if(notify_script == NULL) {
+ return;
+ }
+
+ target_rc = rsc_op_expected_rc(op);
+ if(op->interval == 0 && target_rc == op->rc && safe_str_eq(op->op_type, RSC_STATUS)) {
+ /* Leave it up to the script if they want to notify for
+ * 'failed' probes, only swallow ones for which the result was
+ * expected (rc equals target_rc).
+ *
+ * Even if we find a resource running, it was probably because
+ * someone erased the status section.
+ */
+ return;
+ }
+
+ set_notify_key("CRM_notify_node", node, NULL);
+
+ set_notify_key("CRM_notify_rsc", op->rsc_id, NULL);
+ set_notify_key("CRM_notify_task", op->op_type, NULL);
+ set_notify_key("CRM_notify_interval", NULL, crm_itoa(op->interval)); /* third argument is freed by set_notify_key() */
+
+ set_notify_key("CRM_notify_target_rc", NULL, crm_itoa(target_rc));
+ set_notify_key("CRM_notify_status", NULL, crm_itoa(op->op_status));
+ set_notify_key("CRM_notify_rc", NULL, crm_itoa(op->rc));
+
+ if(op->op_status == PCMK_LRM_OP_DONE) {
+ set_notify_key("CRM_notify_desc", services_ocf_exitcode_str(op->rc), NULL); /* completed: describe the exit code */
+ } else {
+ set_notify_key("CRM_notify_desc", services_lrm_status_str(op->op_status), NULL); /* otherwise describe the op status */
+ }
+
+ send_notification("resource");
+}
+
diff --git a/crmd/notify.h b/crmd/notify.h
new file mode 100644
index 0000000..4b138ea
--- /dev/null
+++ b/crmd/notify.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2015 Andrew Beekhof <andrew@beekhof.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef CRMD_NOTIFY__H
+# define CRMD_NOTIFY__H
+
+# include <crm/crm.h>
+# include <crm/cluster.h>
+# include <crm/stonith-ng.h>
+
+void crmd_enable_notifications(const char *script, const char *target);
+void crmd_notify_node_event(crm_node_t *node);
+void crmd_notify_fencing_op(stonith_event_t * e);
+void crmd_notify_resource_op(const char *node, lrmd_event_data_t * op);
+
+#endif
diff --git a/crmd/te_utils.c b/crmd/te_utils.c
index a1d29f6..22551ba 100644
--- a/crmd/te_utils.c
+++ b/crmd/te_utils.c
@@ -124,6 +124,8 @@ tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
return;
}
+ crmd_notify_fencing_op(st_event);
+
if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) {
crm_notice("%s was successfully unfenced by %s (at the request of %s)",
st_event->target, st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin);
diff --git a/cts/CIB.py b/cts/CIB.py
index 8fbba6c..cd3a6a1 100644
--- a/cts/CIB.py
+++ b/cts/CIB.py
@@ -219,6 +219,8 @@ class CIB11(ConfigBase):
o["dc-deadtime"] = "5s"
o["no-quorum-policy"] = no_quorum
o["expected-quorum-votes"] = self.num_nodes
+ o["notification-script"] = "/var/lib/pacemaker/notify.sh"
+ o["notification-target"] = "/var/lib/pacemaker/notify.log"
if self.CM.Env["DoBSC"] == 1:
o["ident-string"] = "Linux-HA TEST configuration file - REMOVEME!!"
diff --git a/extra/pcmk_notify_sample.sh b/extra/pcmk_notify_sample.sh
new file mode 100755
index 0000000..83cf8e9
--- /dev/null
+++ b/extra/pcmk_notify_sample.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#
+# Copyright (C) 2015 Andrew Beekhof <andrew@beekhof.net>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This software is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+if [ -z "$CRM_notify_version" ]; then
+    echo "Pacemaker version 1.1.14 is required" >> "${CRM_notify_recipient}"
+    exit 0
+fi
+
+case "$CRM_notify_kind" in
+    node)
+        echo "Node '${CRM_notify_node}' is now '${CRM_notify_desc}'" >> "${CRM_notify_recipient}"
+        ;;
+    fencing)
+        # Other keys:
+        #
+        # CRM_notify_node
+        # CRM_notify_task
+        # CRM_notify_rc
+        #
+        echo "Fencing ${CRM_notify_desc}" >> "${CRM_notify_recipient}"
+        ;;
+    resource)
+        # Other keys:
+        #
+        # CRM_notify_target_rc
+        # CRM_notify_status
+        # CRM_notify_rc
+        #
+        if [ "${CRM_notify_interval}" = "0" ]; then
+            CRM_notify_interval=""
+        else
+            CRM_notify_interval=" (${CRM_notify_interval})"
+        fi
+
+        if [ "${CRM_notify_target_rc}" = "0" ]; then
+            CRM_notify_target_rc=""
+        else
+            CRM_notify_target_rc=" (target: ${CRM_notify_target_rc})"
+        fi
+
+        case "${CRM_notify_desc}" in
+            Cancelled) ;;   # cancelled operations are not logged
+            *)
+                echo "Resource operation '${CRM_notify_task}${CRM_notify_interval}' for '${CRM_notify_rsc}' on '${CRM_notify_node}': ${CRM_notify_desc}${CRM_notify_target_rc}" >> "${CRM_notify_recipient}"
+                ;;
+        esac
+        ;;
+    *)
+        echo "Unhandled $CRM_notify_kind notification" >> "${CRM_notify_recipient}"
+        env | grep CRM_notify >> "${CRM_notify_recipient}"
+        ;;
+
+esac
diff --git a/include/crm_internal.h b/include/crm_internal.h
index c13bc7b..fb03537 100644
--- a/include/crm_internal.h
+++ b/include/crm_internal.h
@@ -127,6 +127,7 @@ gboolean check_timer(const char *value);
gboolean check_boolean(const char *value);
gboolean check_number(const char *value);
gboolean check_quorum(const char *value);
+gboolean check_script(const char *value);
gboolean check_utilization(const char *value);
/* Shared PE/crmd functionality */
diff --git a/lib/common/utils.c b/lib/common/utils.c
index 6a234dc..628cf2f 100644
--- a/lib/common/utils.c
+++ b/lib/common/utils.c
@@ -180,6 +180,33 @@ check_quorum(const char *value)
}
gboolean
+check_script(const char *value)
+{
+    /* Option validator: accepts "/dev/null" or a regular file that
+     * carries an owner or group execute bit; logs why anything else fails. */
+    struct stat info;
+    gboolean usable = FALSE;
+
+    if(safe_str_eq(value, "/dev/null")) {
+        usable = TRUE;
+
+    } else if(stat(value, &info) != 0) {
+        crm_err("Script %s does not exist", value);
+
+    } else if(!S_ISREG(info.st_mode)) {
+        crm_err("Script %s is not a regular file", value);
+
+    } else if((info.st_mode & (S_IXUSR | S_IXGRP)) == 0) {
+        crm_err("Script %s is not executable", value);
+
+    } else {
+        usable = TRUE;
+    }
+
+    return usable;
+}
+
+gboolean
check_utilization(const char *value)
{
char *end = NULL;