import pacemaker-2.0.2-1.el8

commit 98ca009dce

.gitignore (vendored, new file, 2 lines)

SOURCES/nagios-agents-metadata-105ab8a.tar.gz
SOURCES/pacemaker-744a30d.tar.gz

.pacemaker.metadata (new file, 2 lines)

ea6c0a27fd0ae8ce02f84a11f08a0d79377041c3 SOURCES/nagios-agents-metadata-105ab8a.tar.gz
98d783c49fa894c5bdc30f907f5355539030578d SOURCES/pacemaker-744a30d.tar.gz
SOURCES/001-xmldiffs.patch (new file, 284 lines)

From 66e5e4d83e90be3cecab7bf5f50d0e10fbaa7cea Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Fri, 26 Apr 2019 11:52:59 +0200
Subject: [PATCH 1/3] Fix: libcrmcommon: correctly apply XML diffs with
 multiple move/create changes

Given a resource group:
```
<group id="dummies">
<primitive id="dummy0"/>
<primitive id="dummy1"/>
<primitive id="dummy2"/>
<primitive id="dummy3"/>
<primitive id="dummy4"/>
</group>
```

, if we'd like to change it to:
```
<group id="dummies">
<primitive id="dummy3"/>
<primitive id="dummy4"/>
<primitive id="dummy2"/>
<primitive id="dummy0"/>
<primitive id="dummy1"/>
</group>
```

, the generated XML diff would be like:
```
<diff format="2">
<change operation="move" path="//primitive[@id=dummy3]" position="0"/>
<change operation="move" path="//primitive[@id=dummy4]" position="1"/>
<change operation="move" path="//primitive[@id=dummy0]" position="3"/>
<change operation="move" path="//primitive[@id=dummy1]" position="4"/>
</diff>
```

Previously after applying the XML diff, the resulting XML would be a mess:
```
<group id="dummies">
<primitive id="dummy3"/>
<primitive id="dummy4"/>
<primitive id="dummy0"/>
<primitive id="dummy2"/>
<primitive id="dummy1"/>
</group>
```
It's because the positions of the already moved XML objects could be
affected by the later moved objects.

This commit fixes it by temporarily putting "move" objects after the
last sibling and also delaying the adding of any "create" objects, then
placing them to the target positions in the right order.
---
 lib/common/xml.c | 126 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 97 insertions(+), 29 deletions(-)

diff --git a/lib/common/xml.c b/lib/common/xml.c
index 66b5f66..d815a48 100644
--- a/lib/common/xml.c
+++ b/lib/common/xml.c
@@ -1466,11 +1466,40 @@ __xml_find_path(xmlNode *top, const char *key, int target_position)
return target;
}

+typedef struct xml_change_obj_s {
+ xmlNode *change;
+ xmlNode *match;
+} xml_change_obj_t;
+
+static gint
+sort_change_obj_by_position(gconstpointer a, gconstpointer b)
+{
+ const xml_change_obj_t *change_obj_a = a;
+ const xml_change_obj_t *change_obj_b = b;
+ int position_a = -1;
+ int position_b = -1;
+
+ crm_element_value_int(change_obj_a->change, XML_DIFF_POSITION, &position_a);
+ crm_element_value_int(change_obj_b->change, XML_DIFF_POSITION, &position_b);
+
+ if (position_a < position_b) {
+ return -1;
+
+ } else if (position_a > position_b) {
+ return 1;
+ }
+
+ return 0;
+}
+
static int
xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset)
{
int rc = pcmk_ok;
xmlNode *change = NULL;
+ GListPtr change_objs = NULL;
+ GListPtr gIter = NULL;
+
for (change = __xml_first_child(patchset); change != NULL; change = __xml_next(change)) {
xmlNode *match = NULL;
const char *op = crm_element_value(change, XML_DIFF_OP);
@@ -1482,6 +1511,7 @@ xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset)
continue;
}

+ // "delete" changes for XML comments are generated with "position"
if(strcmp(op, "delete") == 0) {
crm_element_value_int(change, XML_DIFF_POSITION, &position);
}
@@ -1497,7 +1527,71 @@ xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset)
rc = -pcmk_err_diff_failed;
continue;

- } else if(strcmp(op, "create") == 0) {
+ } else if (strcmp(op, "create") == 0 || strcmp(op, "move") == 0) {
+ // Delay the adding of a "create" object
+ xml_change_obj_t *change_obj = calloc(1, sizeof(xml_change_obj_t));
+
+ CRM_ASSERT(change_obj != NULL);
+
+ change_obj->change = change;
+ change_obj->match = match;
+
+ change_objs = g_list_append(change_objs, change_obj);
+
+ if (strcmp(op, "move") == 0) {
+ // Temporarily put the "move" object after the last sibling
+ if (match->parent != NULL && match->parent->last != NULL) {
+ xmlAddNextSibling(match->parent->last, match);
+ }
+ }
+
+ } else if(strcmp(op, "delete") == 0) {
+ free_xml(match);
+
+ } else if(strcmp(op, "modify") == 0) {
+ xmlAttr *pIter = pcmk__first_xml_attr(match);
+ xmlNode *attrs = __xml_first_child(first_named_child(change, XML_DIFF_RESULT));
+
+ if(attrs == NULL) {
+ rc = -ENOMSG;
+ continue;
+ }
+ while(pIter != NULL) {
+ const char *name = (const char *)pIter->name;
+
+ pIter = pIter->next;
+ xml_remove_prop(match, name);
+ }
+
+ for (pIter = pcmk__first_xml_attr(attrs); pIter != NULL; pIter = pIter->next) {
+ const char *name = (const char *)pIter->name;
+ const char *value = crm_element_value(attrs, name);
+
+ crm_xml_add(match, name, value);
+ }
+
+ } else {
+ crm_err("Unknown operation: %s", op);
+ }
+ }
+
+ // Changes should be generated in the right order. Double checking.
+ change_objs = g_list_sort(change_objs, sort_change_obj_by_position);
+
+ for (gIter = change_objs; gIter; gIter = gIter->next) {
+ xml_change_obj_t *change_obj = gIter->data;
+ xmlNode *match = change_obj->match;
+ const char *op = NULL;
+ const char *xpath = NULL;
+
+ change = change_obj->change;
+
+ op = crm_element_value(change, XML_DIFF_OP);
+ xpath = crm_element_value(change, XML_DIFF_PATH);
+
+ crm_trace("Continue performing %s on %s with %p", op, xpath, match);
+
+ if(strcmp(op, "create") == 0) {
int position = 0;
xmlNode *child = NULL;
xmlNode *match_child = NULL;
@@ -1565,36 +1659,10 @@ xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset)
match->name, ID(match), __xml_offset(match), position, match->prev);
rc = -pcmk_err_diff_failed;
}
-
- } else if(strcmp(op, "delete") == 0) {
- free_xml(match);
-
- } else if(strcmp(op, "modify") == 0) {
- xmlAttr *pIter = pcmk__first_xml_attr(match);
- xmlNode *attrs = __xml_first_child(first_named_child(change, XML_DIFF_RESULT));
-
- if(attrs == NULL) {
- rc = -ENOMSG;
- continue;
- }
- while(pIter != NULL) {
- const char *name = (const char *)pIter->name;
-
- pIter = pIter->next;
- xml_remove_prop(match, name);
- }
-
- for (pIter = pcmk__first_xml_attr(attrs); pIter != NULL; pIter = pIter->next) {
- const char *name = (const char *)pIter->name;
- const char *value = crm_element_value(attrs, name);
-
- crm_xml_add(match, name, value);
- }
-
- } else {
- crm_err("Unknown operation: %s", op);
}
}
+
+ g_list_free_full(change_objs, free);
return rc;
}

--
1.8.3.1


From f8d008d8d3a29900ee0c6decbb71a243fa4c2d8c Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Tue, 30 Apr 2019 00:15:03 +0200
Subject: [PATCH 2/3] Fix: libcrmcommon: avoid possible use-of-NULL when
 applying XML diffs

---
 lib/common/xml.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/common/xml.c b/lib/common/xml.c
index d815a48..fe87de6 100644
--- a/lib/common/xml.c
+++ b/lib/common/xml.c
@@ -1506,11 +1506,12 @@ xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset)
const char *xpath = crm_element_value(change, XML_DIFF_PATH);
int position = -1;

- crm_trace("Processing %s %s", change->name, op);
if(op == NULL) {
continue;
}

+ crm_trace("Processing %s %s", change->name, op);
+
// "delete" changes for XML comments are generated with "position"
if(strcmp(op, "delete") == 0) {
crm_element_value_int(change, XML_DIFF_POSITION, &position);
--
1.8.3.1


From e6b2bf0cf7e7ed839583d529b190a7a6cd1bd594 Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Tue, 30 Apr 2019 00:19:46 +0200
Subject: [PATCH 3/3] Fix: libcrmcommon: return error when applying XML diffs
 containing unknown operations

---
 lib/common/xml.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/common/xml.c b/lib/common/xml.c
index fe87de6..940c4b9 100644
--- a/lib/common/xml.c
+++ b/lib/common/xml.c
@@ -1573,6 +1573,7 @@ xml_apply_patchset_v2(xmlNode *xml, xmlNode *patchset)

} else {
crm_err("Unknown operation: %s", op);
+ rc = -pcmk_err_diff_failed;
}
}

--
1.8.3.1
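
The reordering strategy described in the commit message above can be shown in isolation. Below is a minimal, hypothetical sketch (plain GLib, not the libcrmcommon code) of the same two-step idea: detach every moved item first, then apply the deferred moves in ascending target position, so placing a later item can no longer shift one that was already placed. The IDs and positions are the ones from the dummies example.

```
/* reorder.c - build: gcc reorder.c $(pkg-config --cflags --libs glib-2.0) */
#include <glib.h>
#include <stdio.h>
#include <string.h>

typedef struct { const char *id; int position; } move_t;

/* same idea as sort_change_obj_by_position() in the patch */
static gint
by_position(gconstpointer a, gconstpointer b)
{
    return ((const move_t *) a)->position - ((const move_t *) b)->position;
}

int
main(void)
{
    const char *initial[] = { "dummy0", "dummy1", "dummy2", "dummy3", "dummy4" };
    move_t moves[] = { {"dummy3", 0}, {"dummy4", 1}, {"dummy0", 3}, {"dummy1", 4} };
    GPtrArray *group = g_ptr_array_new();
    GList *deferred = NULL;

    for (guint i = 0; i < 5; i++) {
        g_ptr_array_add(group, (gpointer) initial[i]);
    }

    /* step 1: pull each moved item out (the patch parks them after the
     * last sibling), deferring the actual placement */
    for (guint m = 0; m < 4; m++) {
        for (guint i = 0; i < group->len; i++) {
            if (strcmp(g_ptr_array_index(group, i), moves[m].id) == 0) {
                g_ptr_array_remove_index(group, i);
                break;
            }
        }
        deferred = g_list_append(deferred, &moves[m]);
    }

    /* step 2: place deferred items in ascending target position */
    deferred = g_list_sort(deferred, by_position);
    for (GList *iter = deferred; iter != NULL; iter = iter->next) {
        move_t *mv = iter->data;
        g_ptr_array_insert(group, mv->position, (gpointer) mv->id);
    }

    for (guint i = 0; i < group->len; i++) {
        printf("%s\n", (const char *) g_ptr_array_index(group, i));
    }
    /* prints dummy3 dummy4 dummy2 dummy0 dummy1: the intended order */
    g_list_free(deferred);
    g_ptr_array_free(group, TRUE);
    return 0;
}
```

Applying the moves in document order instead (0, 1, 3, 4 against a shifting list) reproduces the "mess" shown in the commit message, which is exactly the bug being fixed.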
SOURCES/002-failed-monitors.patch (new file, 273 lines)

From 5470f1d9c776dbf753e015fa96153b6a63c17b83 Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Thu, 9 May 2019 13:24:35 +0200
Subject: [PATCH] Fix: controller: confirm cancel of failed monitors

Usually after a monitor has been cancelled from executor, controller
erases the corresponding lrm_rsc_op from the cib, and DC will confirm
the cancel action by process_op_deletion() according to the cib diff.

But if a monitor has failed, the lrm_rsc_op will be recorded as
"last_failure". When cancelling it, the lrm_rsc_op won't get erased from
the cib given the logic on purpose in erase_lrm_history_by_op(). So that
the cancel action won't have a chance to get confirmed by DC with
process_op_deletion().

Previously cluster transition would get stuck waiting for the remaining
action timer to time out.

This commit fixes the issue by directly acknowledging the cancel action
in this case and enabling DC to be able to confirm it.

This also moves get_node_id() function into controld_utils.c for common
use.

Producer:
```
# Insert a 10s sleep in the monitor action of RA
# /usr/lib/ocf/resource.d/pacemaker/Stateful:

stateful_monitor() {
+ sleep 10
stateful_check_state "master"

# Add a promotable clone resource:

crm configure primitive stateful ocf:pacemaker:Stateful \
op monitor interval=5 role=Master \
op monitor interval=10 role=Slave
crm configure clone p-clone stateful \
meta promotable=true

# Wait for the resource instance to be started, promoted to be master,
# and monitor for master role to complete.

# Set is-managed=false for the promotable clone:
crm_resource --meta -p is-managed -v false -r p-clone

# Change the status of the master instance to be slave and immediately
# enforce refresh of it:
echo slave > /var/run/Stateful-stateful.state; crm_resource --refresh -r stateful --force

# Wait for probe to complete, and then monitor for slave role to be
# issued:
sleep 15

# While the monitor for slave role is still in progress, change the
# status to be master again:
echo master > /var/run/Stateful-stateful.state

# The monitor for slave role returns error. Cluster issues monitor for
# master role instead and tries to cancel the failed one for slave role.
# But cluster transition gets stuck. Depending on the monitor timeout
# configured for the slave role plus cluster-delay, only after that
# controller eventually says:

pacemaker-controld[21205] error: Node opensuse150 did not send cancel result (via controller) within 20000ms (action timeout plus cluster-delay)
pacemaker-controld[21205] error: [Action 1]: In-flight rsc op stateful_monitor_10000 on opensuse150 (priority: 0, waiting: none)
pacemaker-controld[21205] notice: Transition 6 aborted: Action lost

```
---
 daemons/controld/controld_execd.c | 38 ++++++++++++++++++++++++++++++++
 daemons/controld/controld_te_callbacks.c | 21 ++----------------
 daemons/controld/controld_te_events.c | 32 +++++++++++++++++++++++++++
 daemons/controld/controld_transition.h | 1 +
 daemons/controld/controld_utils.c | 13 +++++++++++
 daemons/controld/controld_utils.h | 2 ++
 6 files changed, 88 insertions(+), 19 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 976fed1..8282fed 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -2476,6 +2476,30 @@ unescape_newlines(const char *string)
return ret;
}

+static bool
+did_lrm_rsc_op_fail(lrm_state_t *lrm_state, const char * rsc_id,
+ const char * op_type, guint interval_ms)
+{
+ rsc_history_t *entry = NULL;
+
+ CRM_CHECK(lrm_state != NULL, return FALSE);
+ CRM_CHECK(rsc_id != NULL, return FALSE);
+ CRM_CHECK(op_type != NULL, return FALSE);
+
+ entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
+ if (entry == NULL || entry->failed == NULL) {
+ return FALSE;
+ }
+
+ if (crm_str_eq(entry->failed->rsc_id, rsc_id, TRUE)
+ && safe_str_eq(entry->failed->op_type, op_type)
+ && entry->failed->interval_ms == interval_ms) {
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
void
process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
struct recurring_op_s *pending, xmlNode *action_xml)
@@ -2605,6 +2629,20 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
erase_lrm_history_by_op(lrm_state, op);
}

+ /* If the recurring operation had failed, the lrm_rsc_op is recorded as
+ * "last_failure" which won't get erased from the cib given the logic on
+ * purpose in erase_lrm_history_by_op(). So that the cancel action won't
+ * have a chance to get confirmed by DC with process_op_deletion().
+ * Cluster transition would get stuck waiting for the remaining action
+ * timer to time out.
+ *
+ * Directly acknowledge the cancel operation in this case.
+ */
+ if (did_lrm_rsc_op_fail(lrm_state, pending->rsc_id,
+ pending->op_type, pending->interval_ms)) {
+ need_direct_ack = TRUE;
+ }
+
} else if (op->rsc_deleted) {
/* This recurring operation was cancelled (but not by us, and the
* executor does not have resource information, likely due to resource
diff --git a/daemons/controld/controld_te_callbacks.c b/daemons/controld/controld_te_callbacks.c
index 51d908e..22b5f4b 100644
--- a/daemons/controld/controld_te_callbacks.c
+++ b/daemons/controld/controld_te_callbacks.c
@@ -32,19 +32,6 @@ static unsigned long int stonith_max_attempts = 10;
/* #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */
#define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']"

-static const char *
-get_node_id(xmlNode * rsc_op)
-{
- xmlNode *node = rsc_op;
-
- while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) {
- node = node->parent;
- }
-
- CRM_CHECK(node != NULL, return NULL);
- return ID(node);
-}
-
void
update_stonith_max_attempts(const char* value)
{
@@ -374,12 +361,8 @@ process_op_deletion(const char *xpath, xmlNode *change)
node_uuid = extract_node_uuid(xpath);
cancel = get_cancel_action(key, node_uuid);
if (cancel) {
- crm_info("Cancellation of %s on %s confirmed (%d)",
- key, node_uuid, cancel->id);
- stop_te_timer(cancel->timer);
- te_action_confirmed(cancel);
- update_graph(transition_graph, cancel);
- trigger_graph();
+ confirm_cancel_action(cancel);
+
} else {
abort_transition(INFINITY, tg_restart, "Resource operation removal",
change);
diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c
index c0d096f..b7b48a4 100644
--- a/daemons/controld/controld_te_events.c
+++ b/daemons/controld/controld_te_events.c
@@ -355,6 +355,27 @@ get_cancel_action(const char *id, const char *node)
return NULL;
}

+void
+confirm_cancel_action(crm_action_t *cancel)
+{
+ const char *op_key = NULL;
+ const char *node_name = NULL;
+
+ CRM_ASSERT(cancel != NULL);
+
+ op_key = crm_element_value(cancel->xml, XML_LRM_ATTR_TASK_KEY);
+ node_name = crm_element_value(cancel->xml, XML_LRM_ATTR_TARGET);
+
+ stop_te_timer(cancel->timer);
+ te_action_confirmed(cancel);
+ update_graph(transition_graph, cancel);
+
+ crm_info("Cancellation of %s on %s confirmed (action %d)",
+ op_key, node_name, cancel->id);
+
+ trigger_graph();
+}
+
/* downed nodes are listed like: <downed> <node id="UUID1" /> ... </downed> */
#define XPATH_DOWNED "//" XML_GRAPH_TAG_DOWNED \
"/" XML_CIB_TAG_NODE "[@" XML_ATTR_UUID "='%s']"
@@ -471,6 +492,17 @@ process_graph_event(xmlNode *event, const char *event_node)
/* Recurring actions have the transition number they were first
* scheduled in.
*/
+
+ if (status == PCMK_LRM_OP_CANCELLED) {
+ const char *node_id = get_node_id(event);
+
+ action = get_cancel_action(id, node_id);
+ if (action) {
+ confirm_cancel_action(action);
+ }
+ goto bail;
+ }
+
desc = "arrived after initial scheduling";
abort_transition(INFINITY, tg_restart, "Change in recurring result",
event);
diff --git a/daemons/controld/controld_transition.h b/daemons/controld/controld_transition.h
index 0a33599..a162f99 100644
--- a/daemons/controld/controld_transition.h
+++ b/daemons/controld/controld_transition.h
@@ -25,6 +25,7 @@ void execute_stonith_cleanup(void);
/* tengine */
extern crm_action_t *match_down_event(const char *target);
extern crm_action_t *get_cancel_action(const char *id, const char *node);
+void confirm_cancel_action(crm_action_t *cancel);

void controld_record_action_timeout(crm_action_t *action);
extern gboolean fail_incompletable_actions(crm_graph_t * graph, const char *down_node);
diff --git a/daemons/controld/controld_utils.c b/daemons/controld/controld_utils.c
index ca7e15d..35922f0 100644
--- a/daemons/controld/controld_utils.c
+++ b/daemons/controld/controld_utils.c
@@ -1073,3 +1073,16 @@ feature_set_compatible(const char *dc_version, const char *join_version)
// DC's minor version must be the same or older
return dc_v <= join_v;
}
+
+const char *
+get_node_id(xmlNode *lrm_rsc_op)
+{
+ xmlNode *node = lrm_rsc_op;
+
+ while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) {
+ node = node->parent;
+ }
+
+ CRM_CHECK(node != NULL, return NULL);
+ return ID(node);
+}
diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h
index 2a92db5..68992f5 100644
--- a/daemons/controld/controld_utils.h
+++ b/daemons/controld/controld_utils.h
@@ -95,6 +95,8 @@ unsigned int cib_op_timeout(void);
bool feature_set_compatible(const char *dc_version, const char *join_version);
bool controld_action_is_recordable(const char *action);

+const char *get_node_id(xmlNode *lrm_rsc_op);
+
/* Convenience macro for registering a CIB callback
* (assumes that data can be freed with free())
*/
--
1.8.3.1
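
For reference, here is a standalone sketch of the ancestor walk that the relocated get_node_id() performs: climb from an operation element to the enclosing node_state and read its id. It uses plain libxml2 and a deliberately flattened one-level XML; this is illustrative, not the controller code (a real CIB nests lrm_rsc_op several levels deeper, which is why the loop walks upward).

```
/* node_id.c - build: gcc node_id.c $(xml2-config --cflags --libs) */
#include <libxml/parser.h>
#include <stdio.h>
#include <string.h>

/* Climb from an operation element to the enclosing <node_state> and
 * return its id attribute (caller frees); mirrors the idea of
 * get_node_id(), not its actual code. */
static xmlChar *
node_id_for(xmlNode *rsc_op)
{
    xmlNode *node = rsc_op;

    while ((node != NULL)
           && !xmlStrEqual(node->name, (const xmlChar *) "node_state")) {
        node = node->parent;
    }
    return (node != NULL)? xmlGetProp(node, (const xmlChar *) "id") : NULL;
}

int
main(void)
{
    const char *cib =
        "<node_state id='1' uname='node1'>"
        "<lrm_rsc_op id='stateful_monitor_10000'/>"
        "</node_state>";
    xmlDoc *doc = xmlReadMemory(cib, strlen(cib), NULL, NULL, 0);
    xmlNode *op = xmlDocGetRootElement(doc)->children;  /* the lrm_rsc_op */
    xmlChar *id = node_id_for(op);

    printf("cancelled op was reported by node id %s\n", (char *) id);  /* 1 */
    xmlFree(id);
    xmlFreeDoc(doc);
    return 0;
}
```

This is how process_graph_event() in the patch maps a cancelled-operation event back to the node whose cancel action should be confirmed.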
SOURCES/003-fencer-logs.patch (new file, 652 lines)

From 0a884f325e1049febc28bf0419ab307dd0bce5af Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 16 May 2019 20:04:57 -0500
Subject: [PATCH] Log: various: improve fencer connection messages

Previously, log messages around fencer connections were inconsistent.

This attempts to make them more consistent by: having stonith_api_signon() log
only at debug level, letting the callers log at a level appropriate to the
situation using the return code; functionizing retrying a connection; and
using similar wording across clients.

This also does a bit of refactoring for better error checking and improved
efficiency.
---
 daemons/controld/controld_control.c | 7 +-
 daemons/controld/controld_te_utils.c | 59 ++++++-----
 daemons/execd/pacemaker-execd.c | 28 ++---
 daemons/fenced/cts-fence-helper.c | 38 +++----
 include/crm/stonith-ng.h | 4 +
 lib/fencing/st_client.c | 195 ++++++++++++++++++++---------------
 tools/crm_mon.c | 1 -
 tools/stonith_admin.c | 29 +-----
 8 files changed, 181 insertions(+), 180 deletions(-)

diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c
index 89b5b5d..6d9f335 100644
--- a/daemons/controld/controld_control.c
+++ b/daemons/controld/controld_control.c
@@ -628,10 +628,11 @@ do_started(long long action,
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}

+ // Try connecting to fencer (retrying later in mainloop if failed)
if (stonith_reconnect == NULL) {
- int dummy;
-
- stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, te_connect_stonith, &dummy);
+ stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW,
+ te_connect_stonith,
+ GINT_TO_POINTER(TRUE));
}
set_bit(fsa_input_register, R_ST_REQUIRED);
mainloop_set_trigger(stonith_reconnect);
diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c
index 5606ed6..22f83ad 100644
--- a/daemons/controld/controld_te_utils.c
+++ b/daemons/controld/controld_te_utils.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
+ * Copyright 2004-2019 the Pacemaker project contributors
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
@@ -385,10 +385,18 @@ te_trigger_stonith_history_sync(void)
mainloop_timer_start(stonith_history_sync_timer);
}

+/*!
+ * \brief Connect to fencer
+ *
+ * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop
+ *
+ * \return TRUE
+ * \note If user_data is NULL, this will wait 2s between attempts, for up to
+ * 30 attempts, meaning the controller could be blocked as long as 58s.
+ */
gboolean
te_connect_stonith(gpointer user_data)
{
- int lpc = 0;
int rc = pcmk_ok;

if (stonith_api == NULL) {
@@ -396,42 +404,41 @@ te_connect_stonith(gpointer user_data)
}

if (stonith_api->state != stonith_disconnected) {
- crm_trace("Still connected");
+ crm_trace("Already connected to fencer, no need to retry");
return TRUE;
}

- for (lpc = 0; lpc < 30; lpc++) {
- crm_debug("Attempting connection to fencing daemon...");
-
- sleep(1);
- rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
-
- if (rc == pcmk_ok) {
- break;
+ if (user_data == NULL) {
+ // Blocking (retry failures now until successful)
+ rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
+ if (rc != pcmk_ok) {
+ crm_err("Could not connect to fencer in 30 attempts: %s "
+ CRM_XS " rc=%d", pcmk_strerror(rc), rc);
}
-
- if (user_data != NULL) {
+ } else {
+ // Non-blocking (retry failures later in main loop)
+ rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
+ if (rc != pcmk_ok) {
if (is_set(fsa_input_register, R_ST_REQUIRED)) {
- crm_err("Sign-in failed: triggered a retry");
+ crm_err("Fencer connection failed (will retry): %s "
+ CRM_XS " rc=%d", pcmk_strerror(rc), rc);
mainloop_set_trigger(stonith_reconnect);
} else {
- crm_info("Sign-in failed, but no longer required");
+ crm_info("Fencer connection failed (ignoring because no longer required): %s "
+ CRM_XS " rc=%d", pcmk_strerror(rc), rc);
}
return TRUE;
}
-
- crm_err("Sign-in failed: pausing and trying again in 2s...");
- sleep(1);
}

- CRM_CHECK(rc == pcmk_ok, return TRUE); /* If not, we failed 30 times... just get out */
- stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT,
- tengine_stonith_connection_destroy);
-
- stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_FENCE,
- tengine_stonith_notify);
-
- crm_trace("Connected");
+ if (rc == pcmk_ok) {
+ stonith_api->cmds->register_notification(stonith_api,
+ T_STONITH_NOTIFY_DISCONNECT,
+ tengine_stonith_connection_destroy);
+ stonith_api->cmds->register_notification(stonith_api,
+ T_STONITH_NOTIFY_FENCE,
+ tengine_stonith_notify);
+ }
return TRUE;
}

diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c
index 21bb0ed..e2fdfca 100644
--- a/daemons/execd/pacemaker-execd.c
+++ b/daemons/execd/pacemaker-execd.c
@@ -65,28 +65,20 @@ get_stonith_connection(void)
stonith_api = NULL;
}

- if (!stonith_api) {
- int rc = 0;
- int tries = 10;
+ if (stonith_api == NULL) {
+ int rc = pcmk_ok;

stonith_api = stonith_api_new();
- do {
- rc = stonith_api->cmds->connect(stonith_api, "pacemaker-execd", NULL);
- if (rc == pcmk_ok) {
- stonith_api->cmds->register_notification(stonith_api,
- T_STONITH_NOTIFY_DISCONNECT,
- stonith_connection_destroy_cb);
- break;
- }
- sleep(1);
- tries--;
- } while (tries);
-
- if (rc) {
- crm_err("Unable to connect to stonith daemon to execute command. error: %s",
- pcmk_strerror(rc));
+ rc = stonith_api_connect_retry(stonith_api, crm_system_name, 10);
+ if (rc != pcmk_ok) {
+ crm_err("Could not connect to fencer in 10 attempts: %s "
+ CRM_XS " rc=%d", pcmk_strerror(rc), rc);
stonith_api_delete(stonith_api);
stonith_api = NULL;
+ } else {
+ stonith_api->cmds->register_notification(stonith_api,
+ T_STONITH_NOTIFY_DISCONNECT,
+ stonith_connection_destroy_cb);
}
}
return stonith_api;
diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c
index c5ce1ab..4552fc1 100644
--- a/daemons/fenced/cts-fence-helper.c
+++ b/daemons/fenced/cts-fence-helper.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2009-2018 Andrew Beekhof <andrew@beekhof.net>
+ * Copyright 2009-2019 the Pacemaker project contributors
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
@@ -124,8 +124,10 @@ passive_test(void)
int rc = 0;

rc = st->cmds->connect(st, crm_system_name, &pollfd.fd);
- crm_debug("Connect: %d", rc);
-
+ if (rc != pcmk_ok) {
+ stonith_api_delete(st);
+ crm_exit(CRM_EX_DISCONNECT);
+ }
st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, st_callback);
st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, st_callback);
st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback);
@@ -271,8 +273,10 @@ sanity_tests(void)
int rc = 0;

rc = st->cmds->connect(st, crm_system_name, &pollfd.fd);
- crm_debug("Connect: %d", rc);
-
+ if (rc != pcmk_ok) {
+ stonith_api_delete(st);
+ crm_exit(CRM_EX_DISCONNECT);
+ }
st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, st_callback);
st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, st_callback);
st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback);
@@ -295,7 +299,10 @@ standard_dev_test(void)
stonith_key_value_t *params = NULL;

rc = st->cmds->connect(st, crm_system_name, &pollfd.fd);
- crm_debug("Connect: %d", rc);
+ if (rc != pcmk_ok) {
+ stonith_api_delete(st);
+ crm_exit(CRM_EX_DISCONNECT);
+ }

params = stonith_key_value_add(params, "pcmk_host_map", "some-host=pcmk-7 true_1_node1=3,4");

@@ -502,23 +509,12 @@ test_register_async_devices(int check_event)
static void
try_mainloop_connect(int check_event)
{
- int tries = 10;
- int i = 0;
- int rc = 0;
+ int rc = stonith_api_connect_retry(st, crm_system_name, 10);

- for (i = 0; i < tries; i++) {
- rc = st->cmds->connect(st, crm_system_name, NULL);
-
- if (!rc) {
- crm_info("stonith client connection established");
- mainloop_test_done(TRUE);
- return;
- } else {
- crm_info("stonith client connection failed");
- }
- sleep(1);
+ if (rc == pcmk_ok) {
+ mainloop_test_done(TRUE);
+ return;
}
-
crm_err("API CONNECTION FAILURE");
mainloop_test_done(FALSE);
}
diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h
index b7365a9..b640732 100644
--- a/include/crm/stonith-ng.h
+++ b/include/crm/stonith-ng.h
@@ -430,6 +430,10 @@ void stonith_key_value_freeall(stonith_key_value_t * kvp, int keys, int values);

void stonith_history_free(stonith_history_t *history);

+// Convenience functions
+int stonith_api_connect_retry(stonith_t *st, const char *name,
+ int max_attempts);
+
/* Basic helpers that allows nodes to be fenced and the history to be
* queried without mainloop or the caller understanding the full API
*
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 270ef8d..ceee944 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
+ * Copyright 2004-2019 the Pacemaker project contributors
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
@@ -1415,14 +1415,21 @@ static int
stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
{
int rc = pcmk_ok;
- stonith_private_t *native = stonith->st_private;
+ stonith_private_t *native = NULL;
+ const char *display_name = name? name : "client";

static struct ipc_client_callbacks st_callbacks = {
.dispatch = stonith_dispatch_internal,
.destroy = stonith_connection_destroy
};

- crm_trace("Connecting command channel");
+ CRM_CHECK(stonith != NULL, return -EINVAL);
+
+ native = stonith->st_private;
+ CRM_ASSERT(native != NULL);
+
+ crm_debug("Attempting fencer connection by %s with%s mainloop",
+ display_name, (stonith_fd? "out" : ""));

stonith->state = stonith_connected_command;
if (stonith_fd) {
@@ -1432,8 +1439,9 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
if (native->ipc && crm_ipc_connect(native->ipc)) {
*stonith_fd = crm_ipc_get_fd(native->ipc);
} else if (native->ipc) {
- crm_perror(LOG_ERR, "Connection to fencer failed");
- rc = -ENOTCONN;
+ crm_ipc_close(native->ipc);
+ crm_ipc_destroy(native->ipc);
+ native->ipc = NULL;
}

} else {
@@ -1444,11 +1452,8 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
}

if (native->ipc == NULL) {
- crm_debug("Could not connect to the Stonith API");
rc = -ENOTCONN;
- }
-
- if (rc == pcmk_ok) {
+ } else {
xmlNode *reply = NULL;
xmlNode *hello = create_xml_node(NULL, "stonith_command");

@@ -1458,11 +1463,12 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
rc = crm_ipc_send(native->ipc, hello, crm_ipc_client_response, -1, &reply);

if (rc < 0) {
- crm_perror(LOG_DEBUG, "Couldn't complete registration with the fencing API: %d", rc);
+ crm_debug("Couldn't register with the fencer: %s "
+ CRM_XS " rc=%d", pcmk_strerror(rc), rc);
rc = -ECOMM;

} else if (reply == NULL) {
- crm_err("Did not receive registration reply");
+ crm_debug("Couldn't register with the fencer: no reply");
rc = -EPROTO;

} else {
@@ -1470,18 +1476,23 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
const char *tmp_ticket = crm_element_value(reply, F_STONITH_CLIENTID);

if (safe_str_neq(msg_type, CRM_OP_REGISTER)) {
- crm_err("Invalid registration message: %s", msg_type);
- crm_log_xml_err(reply, "Bad reply");
+ crm_debug("Couldn't register with the fencer: invalid reply type '%s'",
+ (msg_type? msg_type : "(missing)"));
+ crm_log_xml_debug(reply, "Invalid fencer reply");
rc = -EPROTO;

} else if (tmp_ticket == NULL) {
- crm_err("No registration token provided");
- crm_log_xml_err(reply, "Bad reply");
+ crm_debug("Couldn't register with the fencer: no token in reply");
+ crm_log_xml_debug(reply, "Invalid fencer reply");
rc = -EPROTO;

} else {
- crm_trace("Obtained registration token: %s", tmp_ticket);
native->token = strdup(tmp_ticket);
+#if HAVE_MSGFROMIPC_TIMEOUT
+ stonith->call_timeout = MAX_IPC_DELAY;
+#endif
+ crm_debug("Connection to fencer by %s succeeded (registration token: %s)",
+ display_name, native->token);
rc = pcmk_ok;
}
}
@@ -1490,16 +1501,11 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
free_xml(hello);
}

- if (rc == pcmk_ok) {
-#if HAVE_MSGFROMIPC_TIMEOUT
- stonith->call_timeout = MAX_IPC_DELAY;
-#endif
- crm_debug("Connection to fencer successful");
- return pcmk_ok;
+ if (rc != pcmk_ok) {
+ crm_debug("Connection attempt to fencer by %s failed: %s "
+ CRM_XS " rc=%d", display_name, pcmk_strerror(rc), rc);
+ stonith->cmds->disconnect(stonith);
}
-
- crm_debug("Connection to fencer failed: %s", pcmk_strerror(rc));
- stonith->cmds->disconnect(stonith);
return rc;
}

@@ -2071,6 +2077,36 @@ stonith_api_new(void)
return new_stonith;
}

+/*!
+ * \brief Make a blocking connection attempt to the fencer
+ *
+ * \param[in,out] st Fencer API object
+ * \param[in] name Client name to use with fencer
+ * \param[in] max_attempts Return error if this many attempts fail
+ *
+ * \return pcmk_ok on success, result of last attempt otherwise
+ */
+int
+stonith_api_connect_retry(stonith_t *st, const char *name, int max_attempts)
+{
+ int rc = -EINVAL; // if max_attempts is not positive
+
+ for (int attempt = 1; attempt <= max_attempts; attempt++) {
+ rc = st->cmds->connect(st, name, NULL);
+ if (rc == pcmk_ok) {
+ return pcmk_ok;
+ } else if (attempt < max_attempts) {
+ crm_notice("Fencer connection attempt %d of %d failed (retrying in 2s): %s "
+ CRM_XS " rc=%d",
+ attempt, max_attempts, pcmk_strerror(rc), rc);
+ sleep(2);
+ }
+ }
+ crm_notice("Could not connect to fencer: %s " CRM_XS " rc=%d",
+ pcmk_strerror(rc), rc);
+ return rc;
+}
+
stonith_key_value_t *
stonith_key_value_add(stonith_key_value_t * head, const char *key, const char *value)
{
@@ -2122,85 +2158,78 @@ stonith_key_value_freeall(stonith_key_value_t * head, int keys, int values)
int
stonith_api_kick(uint32_t nodeid, const char *uname, int timeout, bool off)
{
- char *name = NULL;
- const char *action = "reboot";
-
- int rc = -EPROTO;
- stonith_t *st = NULL;
- enum stonith_call_options opts = st_opt_sync_call | st_opt_allow_suicide;
+ int rc = pcmk_ok;
+ stonith_t *st = stonith_api_new();
+ const char *action = off? "off" : "reboot";

api_log_open();
- st = stonith_api_new();
- if (st) {
- rc = st->cmds->connect(st, "stonith-api", NULL);
- if(rc != pcmk_ok) {
- api_log(LOG_ERR, "Connection failed, could not kick (%s) node %u/%s : %s (%d)", action, nodeid, uname, pcmk_strerror(rc), rc);
- }
+ if (st == NULL) {
+ api_log(LOG_ERR, "API initialization failed, could not kick (%s) node %u/%s",
+ action, nodeid, uname);
+ return -EPROTO;
}

- if (uname != NULL) {
- name = strdup(uname);
-
- } else if (nodeid > 0) {
- opts |= st_opt_cs_nodeid;
- name = crm_itoa(nodeid);
- }
-
- if (off) {
- action = "off";
- }
-
- if (rc == pcmk_ok) {
+ rc = st->cmds->connect(st, "stonith-api", NULL);
+ if (rc != pcmk_ok) {
+ api_log(LOG_ERR, "Connection failed, could not kick (%s) node %u/%s : %s (%d)",
+ action, nodeid, uname, pcmk_strerror(rc), rc);
+ } else {
+ char *name = NULL;
+ enum stonith_call_options opts = st_opt_sync_call | st_opt_allow_suicide;
+
+ if (uname != NULL) {
+ name = strdup(uname);
+ } else if (nodeid > 0) {
+ opts |= st_opt_cs_nodeid;
+ name = crm_itoa(nodeid);
+ }
rc = st->cmds->fence(st, opts, name, action, timeout, 0);
- if(rc != pcmk_ok) {
- api_log(LOG_ERR, "Could not kick (%s) node %u/%s : %s (%d)", action, nodeid, uname, pcmk_strerror(rc), rc);
+ free(name);
+
+ if (rc != pcmk_ok) {
+ api_log(LOG_ERR, "Could not kick (%s) node %u/%s : %s (%d)",
+ action, nodeid, uname, pcmk_strerror(rc), rc);
} else {
- api_log(LOG_NOTICE, "Node %u/%s kicked: %s ", nodeid, uname, action);
+ api_log(LOG_NOTICE, "Node %u/%s kicked: %s", nodeid, uname, action);
}
}

- if (st) {
- st->cmds->disconnect(st);
- stonith_api_delete(st);
- }
-
- free(name);
+ stonith_api_delete(st);
return rc;
}

time_t
stonith_api_time(uint32_t nodeid, const char *uname, bool in_progress)
{
- int rc = 0;
- char *name = NULL;
-
+ int rc = pcmk_ok;
time_t when = 0;
- stonith_t *st = NULL;
+ stonith_t *st = stonith_api_new();
stonith_history_t *history = NULL, *hp = NULL;
- enum stonith_call_options opts = st_opt_sync_call;
-
- st = stonith_api_new();
- if (st) {
- rc = st->cmds->connect(st, "stonith-api", NULL);
- if(rc != pcmk_ok) {
- api_log(LOG_NOTICE, "Connection failed: %s (%d)", pcmk_strerror(rc), rc);
- }
- }
-
- if (uname != NULL) {
- name = strdup(uname);

- } else if (nodeid > 0) {
- opts |= st_opt_cs_nodeid;
- name = crm_itoa(nodeid);
+ if (st == NULL) {
+ api_log(LOG_ERR, "Could not retrieve fence history for %u/%s: "
+ "API initialization failed", nodeid, uname);
+ return when;
}

- if (st && rc == pcmk_ok) {
+ rc = st->cmds->connect(st, "stonith-api", NULL);
+ if (rc != pcmk_ok) {
+ api_log(LOG_NOTICE, "Connection failed: %s (%d)", pcmk_strerror(rc), rc);
+ } else {
int entries = 0;
int progress = 0;
int completed = 0;
-
+ char *name = NULL;
+ enum stonith_call_options opts = st_opt_sync_call;
+
+ if (uname != NULL) {
+ name = strdup(uname);
+ } else if (nodeid > 0) {
+ opts |= st_opt_cs_nodeid;
+ name = crm_itoa(nodeid);
+ }
rc = st->cmds->history(st, opts, name, &history, 120);
+ free(name);

for (hp = history; hp; hp = hp->next) {
entries++;
@@ -2227,15 +2256,11 @@ stonith_api_time(uint32_t nodeid, const char *uname, bool in_progress)
}
}

- if (st) {
- st->cmds->disconnect(st);
- stonith_api_delete(st);
- }
+ stonith_api_delete(st);

if(when) {
api_log(LOG_INFO, "Node %u/%s last kicked at: %ld", nodeid, uname, (long int)when);
}
- free(name);
return when;
}

diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index e101b62..bed0796 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -298,7 +298,6 @@ cib_connect(gboolean full)
}

if ((fence_connect) && (st->state == stonith_disconnected)) {
- crm_trace("Connecting to stonith");
rc = st->cmds->connect(st, crm_system_name, NULL);
if (rc == pcmk_ok) {
crm_trace("Setting up stonith callbacks");
diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c
index d960fb1..6be66c6 100644
--- a/tools/stonith_admin.c
+++ b/tools/stonith_admin.c
@@ -198,31 +198,6 @@ struct {
int rc;
} async_fence_data;

-static int
-try_mainloop_connect(void)
-{
- stonith_t *st = async_fence_data.st;
- int tries = 10;
- int i = 0;
- int rc = 0;
-
- for (i = 0; i < tries; i++) {
- crm_debug("Connecting as %s", async_fence_data.name);
- rc = st->cmds->connect(st, async_fence_data.name, NULL);
-
- if (!rc) {
- crm_debug("stonith client connection established");
- return 0;
- } else {
- crm_debug("stonith client connection failed");
- }
- sleep(1);
- }
-
- crm_err("Could not connect to the fencer");
- return -1;
-}
-
static void
notify_callback(stonith_t * st, stonith_event_t * e)
{
@@ -251,8 +226,10 @@ async_fence_helper(gpointer user_data)
{
stonith_t *st = async_fence_data.st;
int call_id = 0;
+ int rc = stonith_api_connect_retry(st, async_fence_data.name, 10);

- if (try_mainloop_connect()) {
+ if (rc != pcmk_ok) {
+ fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc));
g_main_loop_quit(mainloop);
return TRUE;
}
--
1.8.3.1
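
The convenience function this patch introduces, stonith_api_connect_retry(), follows a common retry template: attempt up to a cap, sleep between failures, and return the result of the last attempt so the caller can log at whatever severity fits. A self-contained sketch of that template (illustrative only: try_connect() is a stand-in for st->cmds->connect(), and the error numbers are assumptions, not the libstonithd API):

```
#include <stdio.h>
#include <unistd.h>

#define PCMK_OK 0                  /* stand-in for pcmk_ok */

static int attempts_needed = 3;    /* pretend the 3rd attempt succeeds */

static int
try_connect(const char *name)
{
    fprintf(stderr, "connecting as %s\n", name);
    return (--attempts_needed > 0)? -107 /* ENOTCONN on Linux; illustrative */
                                  : PCMK_OK;
}

static int
connect_retry(const char *name, int max_attempts)
{
    int rc = -22;   /* -EINVAL if max_attempts is not positive */

    for (int attempt = 1; attempt <= max_attempts; attempt++) {
        rc = try_connect(name);
        if (rc == PCMK_OK) {
            return PCMK_OK;
        } else if (attempt < max_attempts) {
            fprintf(stderr, "attempt %d of %d failed (rc=%d), retrying in 2s\n",
                    attempt, max_attempts, rc);
            sleep(2);
        }
    }
    return rc;      /* result of the last attempt */
}

int
main(void)
{
    printf("final rc: %d\n", connect_retry("stonith-api", 10));  /* 0 */
    return 0;
}
```

Factoring this loop out is what lets the controller, executor, CTS helper, and stonith_admin all drop their hand-rolled (and differently worded) retry loops in the diff above.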
SOURCES/004-concurrent-fencing.patch (new file, 49 lines)

From 463eb8e36e2d2bf10a0e37938e0924ea6699f041 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 30 May 2019 08:37:52 -0500
Subject: [PATCH] Low: libpe_status: offer compile-time option to change
 concurrent-fencing default

We most likely want to make concurrent-fencing default to true at some point.
For now, offer that possibility via a compile-time constant, for experimenting.
---
 lib/pengine/common.c | 8 +++++++-
 lib/pengine/status.c | 3 +++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/lib/pengine/common.c b/lib/pengine/common.c
index 9513633..3a283b4 100644
--- a/lib/pengine/common.c
+++ b/lib/pengine/common.c
@@ -95,7 +95,13 @@ static pe_cluster_option pe_opts[] = {
"How long to wait for the STONITH action (reboot,on,off) to complete", NULL },
{ XML_ATTR_HAVE_WATCHDOG, NULL, "boolean", NULL, "false", &check_boolean,
"Enable watchdog integration", "Set automatically by the cluster if SBD is detected. User configured values are ignored." },
- { "concurrent-fencing", NULL, "boolean", NULL, "false", &check_boolean,
+ { "concurrent-fencing", NULL, "boolean", NULL,
+#ifdef DEFAULT_CONCURRENT_FENCING_TRUE
+ "true",
+#else
+ "false",
+#endif
+ &check_boolean,
"Allow performing fencing operations in parallel", NULL },
{ "startup-fencing", NULL, "boolean", NULL, "true", &check_boolean,
"STONITH unseen nodes", "Advanced Use Only! Not using the default is very unsafe!" },
diff --git a/lib/pengine/status.c b/lib/pengine/status.c
index 3ccfac4..a8b0947 100644
--- a/lib/pengine/status.c
+++ b/lib/pengine/status.c
@@ -354,6 +354,9 @@ set_working_set_defaults(pe_working_set_t * data_set)
set_bit(data_set->flags, pe_flag_stop_rsc_orphans);
set_bit(data_set->flags, pe_flag_symmetric_cluster);
set_bit(data_set->flags, pe_flag_stop_action_orphans);
+#ifdef DEFAULT_CONCURRENT_FENCING_TRUE
+ set_bit(data_set->flags, pe_flag_concurrent_fencing);
+#endif
}

resource_t *
--
1.8.3.1
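
The compile-time toggle pattern used by this patch is easy to demonstrate in isolation. The macro name below matches the patch; everything else is a hypothetical sketch, not Pacemaker code:

```
/* demo.c - build both ways:
 *   gcc demo.c && ./a.out                                    -> false
 *   gcc -DDEFAULT_CONCURRENT_FENCING_TRUE demo.c && ./a.out  -> true
 */
#include <stdio.h>

#ifdef DEFAULT_CONCURRENT_FENCING_TRUE
#define CONCURRENT_FENCING_DEFAULT "true"
#else
#define CONCURRENT_FENCING_DEFAULT "false"
#endif

int
main(void)
{
    /* the default is baked in at compile time, not read at run time */
    printf("concurrent-fencing defaults to %s\n", CONCURRENT_FENCING_DEFAULT);
    return 0;
}
```

Flipping the define in CPPFLAGS at build time is how a packager would experiment with the new default without exposing another run-time option, which is the design choice the commit message describes.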
211
SOURCES/005-glib-priorities.patch
Normal file
211
SOURCES/005-glib-priorities.patch
Normal file
@ -0,0 +1,211 @@
|
||||
From 65170ffd5fa10cbda176b3f88e817d534b6331d6 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Jan=20Pokorn=C3=BD?= <jpokorny@redhat.com>
|
||||
Date: Wed, 29 Aug 2018 15:49:58 +0200
|
||||
Subject: [PATCH 1/2] Low: mainloop: make it possible to specify server's
|
||||
priority in mainloop
|
||||
|
||||
---
|
||||
include/crm/common/mainloop.h | 24 +++++++++++++
|
||||
lib/common/mainloop.c | 82 +++++++++++++++++++++++++++++++++++++++++--
|
||||
2 files changed, 103 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/include/crm/common/mainloop.h b/include/crm/common/mainloop.h
|
||||
index 85da1cd..2cfb63e 100644
|
||||
--- a/include/crm/common/mainloop.h
|
||||
+++ b/include/crm/common/mainloop.h
|
||||
@@ -79,6 +79,30 @@ struct ipc_client_callbacks {
|
||||
qb_ipcs_service_t *mainloop_add_ipc_server(const char *name, enum qb_ipc_type type,
|
||||
struct qb_ipcs_service_handlers *callbacks);
|
||||
|
||||
+/*!
|
||||
+ * \brief Start server-side API end-point, hooked into the internal event loop
|
||||
+ *
|
||||
+ * \param[in] name name of the IPC end-point ("address" for the client)
|
||||
+ * \param[in] type selects libqb's IPC back-end (or use #QB_IPC_NATIVE)
|
||||
+ * \param[in] callbacks defines libqb's IPC service-level handlers
|
||||
+ * \param[in] priority priority relative to other events handled in the
|
||||
+ * abstract handling loop, use #QB_LOOP_MED when unsure
|
||||
+ *
|
||||
+ * \return libqb's opaque handle to the created service abstraction
|
||||
+ *
|
||||
+ * \note For portability concerns, do not use this function if you keep
|
||||
+ * \p priority as #QB_LOOP_MED, stick with #mainloop_add_ipc_server
|
||||
+ * (with exactly such semantics) instead (once you link with this new
|
||||
+ * symbol employed, you can't downgrade the library freely anymore).
|
||||
+ *
|
||||
+ * \note The intended effect will only get fully reflected when run-time
|
||||
+ * linked to patched libqb: https://github.com/ClusterLabs/libqb/pull/352
|
||||
+ */
|
||||
+qb_ipcs_service_t *mainloop_add_ipc_server_with_prio(const char *name,
|
||||
+ enum qb_ipc_type type,
|
||||
+ struct qb_ipcs_service_handlers *callbacks,
|
||||
+ enum qb_loop_priority prio);
|
||||
+
|
||||
void mainloop_del_ipc_server(qb_ipcs_service_t * server);
|
||||
|
||||
mainloop_io_t *mainloop_add_ipc_client(const char *name, int priority, size_t max_size,
|
||||
diff --git a/lib/common/mainloop.c b/lib/common/mainloop.c
|
||||
index 18f7014..17e69f0 100644
|
||||
--- a/lib/common/mainloop.c
|
||||
+++ b/lib/common/mainloop.c
|
||||
@@ -509,6 +509,65 @@ gio_poll_destroy(gpointer data)
|
||||
}
|
||||
}
|
||||
|
||||
+/*!
|
||||
+ * \internal
|
||||
+ * \brief Convert libqb's poll priority into GLib's one
|
||||
+ *
|
||||
+ * \param[in] prio libqb's poll priority (#QB_LOOP_MED assumed as fallback)
|
||||
+ *
|
||||
+ * \return best matching GLib's priority
|
||||
+ */
|
||||
+static gint
|
||||
+conv_prio_libqb2glib(enum qb_loop_priority prio)
|
||||
+{
|
||||
+ gint ret = G_PRIORITY_DEFAULT;
|
||||
+ switch (prio) {
|
||||
+ case QB_LOOP_LOW:
|
||||
+ ret = G_PRIORITY_LOW;
|
||||
+ break;
|
||||
+ case QB_LOOP_HIGH:
|
||||
+ ret = G_PRIORITY_HIGH;
|
||||
+ break;
|
||||
+ default:
|
||||
+ crm_trace("Invalid libqb's loop priority %d, assuming QB_LOOP_MED",
|
||||
+ prio);
|
||||
+ /* fall-through */
|
||||
+ case QB_LOOP_MED:
|
||||
+ break;
|
||||
+ }
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+/*!
|
||||
+ * \internal
|
||||
+ * \brief Convert libqb's poll priority to rate limiting spec
|
||||
+ *
|
||||
+ * \param[in] prio libqb's poll priority (#QB_LOOP_MED assumed as fallback)
|
||||
+ *
|
||||
+ * \return best matching rate limiting spec
|
||||
+ */
|
||||
+static enum qb_ipcs_rate_limit
|
||||
+conv_libqb_prio2ratelimit(enum qb_loop_priority prio)
|
||||
+{
|
||||
+ /* this is an inversion of what libqb's qb_ipcs_request_rate_limit does */
|
||||
+ enum qb_ipcs_rate_limit ret = QB_IPCS_RATE_NORMAL;
|
||||
+ switch (prio) {
|
||||
+ case QB_LOOP_LOW:
|
||||
+ ret = QB_IPCS_RATE_SLOW;
|
||||
+ break;
|
||||
+ case QB_LOOP_HIGH:
|
||||
+ ret = QB_IPCS_RATE_FAST;
|
||||
+ break;
|
||||
+ default:
|
||||
+ crm_trace("Invalid libqb's loop priority %d, assuming QB_LOOP_MED",
|
||||
+ prio);
|
||||
+ /* fall-through */
|
||||
+ case QB_LOOP_MED:
|
||||
+ break;
|
||||
+ }
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
static int32_t
|
||||
gio_poll_dispatch_update(enum qb_loop_priority p, int32_t fd, int32_t evts,
|
||||
void *data, qb_ipcs_dispatch_fn_t fn, int32_t add)
|
||||
@@ -555,8 +614,8 @@ gio_poll_dispatch_update(enum qb_loop_priority p, int32_t fd, int32_t evts,
|
||||
adaptor->p = p;
|
||||
adaptor->is_used++;
|
||||
adaptor->source =
|
||||
- g_io_add_watch_full(channel, G_PRIORITY_DEFAULT, evts, gio_read_socket, adaptor,
|
||||
- gio_poll_destroy);
|
||||
+ g_io_add_watch_full(channel, conv_prio_libqb2glib(p), evts,
|
||||
+ gio_read_socket, adaptor, gio_poll_destroy);
|
||||
|
||||
/* Now that mainloop now holds a reference to channel,
|
||||
* thanks to g_io_add_watch_full(), drop ours from g_io_channel_unix_new().
|
||||
@@ -640,7 +699,15 @@ pick_ipc_type(enum qb_ipc_type requested)
|
||||
|
||||
qb_ipcs_service_t *
|
||||
mainloop_add_ipc_server(const char *name, enum qb_ipc_type type,
|
||||
- struct qb_ipcs_service_handlers * callbacks)
|
||||
+ struct qb_ipcs_service_handlers *callbacks)
|
||||
+{
|
||||
+ return mainloop_add_ipc_server_with_prio(name, type, callbacks, QB_LOOP_MED);
|
||||
+}
|
||||
+
|
||||
+qb_ipcs_service_t *
|
||||
+mainloop_add_ipc_server_with_prio(const char *name, enum qb_ipc_type type,
|
||||
+ struct qb_ipcs_service_handlers *callbacks,
|
||||
+ enum qb_loop_priority prio)
|
||||
{
|
||||
int rc = 0;
|
||||
qb_ipcs_service_t *server = NULL;
|
||||
@@ -652,6 +719,15 @@ mainloop_add_ipc_server(const char *name, enum qb_ipc_type type,
|
||||
crm_client_init();
|
||||
server = qb_ipcs_create(name, 0, pick_ipc_type(type), callbacks);
|
||||
|
||||
+ if (server == NULL) {
|
||||
+ crm_err("Could not create %s IPC server: %s (%d)", name, pcmk_strerror(rc), rc);
|
||||
+ return NULL;
|
||||
+ }
|
||||
+
|
||||
+ if (prio != QB_LOOP_MED) {
|
||||
+ qb_ipcs_request_rate_limit(server, conv_libqb_prio2ratelimit(prio));
|
||||
+ }
|
||||
+
|
||||
#ifdef HAVE_IPCS_GET_BUFFER_SIZE
|
||||
/* All clients should use at least ipc_buffer_max as their buffer size */
|
||||
qb_ipcs_enforce_buffer_size(server, crm_ipc_default_buffer_size());
|
||||
--
|
||||
1.8.3.1
|
||||
|
||||
|
||||
From 3401f25994e8cc059898550082f9b75f2d07f103 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Pokorn=C3=BD?= <jpokorny@redhat.com>
Date: Wed, 29 Aug 2018 15:50:57 +0200
Subject: [PATCH 2/2] High: stonith-ng's function cannot be blocked with CIB
 updates forever

In high-load (or high-rate-config-change) scenarios, pacemaker-fenced
would be unable to provide service when effectively DoS'd with CIB
update notifications. Try to reconcile that by elevating the priority
of the server's own listening interface in the mainloop; at worst, it
will fence with a slightly outdated configuration, which appears less
bad than not carrying out the fencing at all. Other daemons might be
considered as well.

Prerequisites:
- https://github.com/ClusterLabs/libqb/pull/352
  (libqb used to contain a bug due to which one particular step in the
  initial-client-connection-accepting-at-the-server procedure would be
  carried out with a hard-coded (and hence possibly lower than competing
  events') priority, which backfires exactly in this case once the
  pacemaker part is fixed by elevating the priority of fenced's API
  end-point so that it won't get consistently overridden by a
  non-socket-based event source/trigger)

How to verify:
- mocked/based -N (see commit adding that module to mocked based daemon)
---
 lib/common/utils.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/common/utils.c b/lib/common/utils.c
index 758eb1b..d1c3e26 100644
--- a/lib/common/utils.c
+++ b/lib/common/utils.c
@@ -1031,7 +1031,8 @@ attrd_ipc_server_init(qb_ipcs_service_t **ipcs, struct qb_ipcs_service_handlers
 void
 stonith_ipc_server_init(qb_ipcs_service_t **ipcs, struct qb_ipcs_service_handlers *cb)
 {
-    *ipcs = mainloop_add_ipc_server("stonith-ng", QB_IPC_NATIVE, cb);
+    *ipcs = mainloop_add_ipc_server_with_prio("stonith-ng", QB_IPC_NATIVE, cb,
+                                              QB_LOOP_HIGH);
 
     if (*ipcs == NULL) {
         crm_err("Failed to create fencer: exiting and inhibiting respawn.");
--
1.8.3.1

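The message leaves the door open for other daemons. As a purely illustrative sketch (not part of this patch), the analogous change for the attribute manager, whose attrd_ipc_server_init() shows up in the hunk context above, might look like the following; the "attrd" endpoint name and the failure handling are assumptions:

```
void
attrd_ipc_server_init(qb_ipcs_service_t **ipcs, struct qb_ipcs_service_handlers *cb)
{
    /* Hypothetical: give attrd's IPC endpoint the same elevated priority */
    *ipcs = mainloop_add_ipc_server_with_prio("attrd", QB_IPC_NATIVE, cb,
                                              QB_LOOP_HIGH);

    if (*ipcs == NULL) {
        crm_err("Failed to create attrd server: exiting and inhibiting respawn.");
        crm_exit(CRM_EX_FATAL);
    }
}
```
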
233
SOURCES/006-bundle-fixes.patch
Normal file
@@ -0,0 +1,233 @@
From 169d424cf88594f15e7e66baa705df6b727aa807 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 4 Jun 2019 16:24:16 -0500
Subject: [PATCH 1/4] Log: pacemaker-remoted: use different default log if pid
 1

When pacemaker-remoted runs as pid 1 inside a container, there may not be a
/var/log/pacemaker directory. To get around this, use a default log of
/var/log/pcmk-init.log when running as pid 1.

This was chosen over alternatives (creating the /var/log/pacemaker directory,
or passing the log location as an environment variable when creating the
implicit container resource) because it both avoids forcing a restart of
active bundles due to configuration change (as well as preserving regression
test output) and allows users to configure an explicit log location via the
container image or the bundle's extra arguments.
---
 daemons/execd/pacemaker-execd.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c
index e2fdfca..cfa5500 100644
--- a/daemons/execd/pacemaker-execd.c
+++ b/daemons/execd/pacemaker-execd.c
@@ -429,6 +429,14 @@ static void spawn_pidone(int argc, char **argv, char **envp)
         return;
     }
 
+    /* Containers can be expected to have /var/log, but they may not have
+     * /var/log/pacemaker, so use a different default if no value has been
+     * explicitly configured in the container's environment.
+     */
+    if (daemon_option("logfile") == NULL) {
+        set_daemon_option("logfile", "/var/log/pcmk-init.log");
+    }
+
     sigfillset(&set);
     sigprocmask(SIG_BLOCK, &set, 0);
 
--
1.8.3.1

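The fallback fires only when no log location was configured, so a container image can still pin an explicit one via Pacemaker's PCMK_logfile environment variable, which is presumably what daemon_option() consults (an assumption about its internals; the variable itself is standard). A self-contained sketch of the lookup-with-fallback behaviour:

```
#include <stdlib.h>

/* Sketch: emulate daemon_option("logfile") falling back to the pid-1
 * default when the container environment sets nothing explicit. */
static const char *
effective_logfile(void)
{
    const char *configured = getenv("PCMK_logfile");

    return (configured != NULL) ? configured : "/var/log/pcmk-init.log";
}
```
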
From 7e362387a092b5617b36a69961115f7703e4d801 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 17 May 2019 12:39:43 -0500
Subject: [PATCH 2/4] Refactor: libpe_status: add enum for bundle mount flags

More readable than 0 or 1
---
 lib/pengine/bundle.c  | 17 +++++++++--------
 lib/pengine/variant.h |  9 ++++++++-
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/lib/pengine/bundle.c b/lib/pengine/bundle.c
index 3b32f04..b223f03 100644
--- a/lib/pengine/bundle.c
+++ b/lib/pengine/bundle.c
@@ -228,7 +228,7 @@ create_docker_resource(pe_resource_t *parent, pe__bundle_variant_data_t *data,
     for(GListPtr pIter = data->mounts; pIter != NULL; pIter = pIter->next) {
         pe__bundle_mount_t *mount = pIter->data;
 
-        if(mount->flags) {
+        if (is_set(mount->flags, pe__bundle_mount_subdir)) {
             char *source = crm_strdup_printf(
                 "%s/%s-%d", mount->source, data->prefix, replica->offset);
 
@@ -396,7 +396,7 @@ create_podman_resource(pe_resource_t *parent, pe__bundle_variant_data_t *data,
     for(GListPtr pIter = data->mounts; pIter != NULL; pIter = pIter->next) {
         pe__bundle_mount_t *mount = pIter->data;
 
-        if(mount->flags) {
+        if (is_set(mount->flags, pe__bundle_mount_subdir)) {
             char *source = crm_strdup_printf(
                 "%s/%s-%d", mount->source, data->prefix, replica->offset);
 
@@ -562,7 +562,7 @@ create_rkt_resource(pe_resource_t *parent, pe__bundle_variant_data_t *data,
     for(GListPtr pIter = data->mounts; pIter != NULL; pIter = pIter->next) {
         pe__bundle_mount_t *mount = pIter->data;
 
-        if(mount->flags) {
+        if (is_set(mount->flags, pe__bundle_mount_subdir)) {
             char *source = crm_strdup_printf(
                 "%s/%s-%d", mount->source, data->prefix, replica->offset);
 
@@ -894,7 +894,7 @@ create_container(pe_resource_t *parent, pe__bundle_variant_data_t *data,
 
 static void
 mount_add(pe__bundle_variant_data_t *bundle_data, const char *source,
-          const char *target, const char *options, int flags)
+          const char *target, const char *options, uint32_t flags)
 {
     pe__bundle_mount_t *mount = calloc(1, sizeof(pe__bundle_mount_t));
 
@@ -1142,11 +1142,11 @@ pe__unpack_bundle(pe_resource_t *rsc, pe_working_set_t *data_set)
             const char *source = crm_element_value(xml_child, "source-dir");
             const char *target = crm_element_value(xml_child, "target-dir");
             const char *options = crm_element_value(xml_child, "options");
-            int flags = 0;
+            int flags = pe__bundle_mount_none;
 
             if (source == NULL) {
                 source = crm_element_value(xml_child, "source-dir-root");
-                flags = 1;
+                set_bit(flags, pe__bundle_mount_subdir);
             }
 
             if (source && target) {
@@ -1251,9 +1251,10 @@ pe__unpack_bundle(pe_resource_t *rsc, pe_working_set_t *data_set)
      * reasonable.
      */
     mount_add(bundle_data, DEFAULT_REMOTE_KEY_LOCATION,
-              DEFAULT_REMOTE_KEY_LOCATION, NULL, 0);
+              DEFAULT_REMOTE_KEY_LOCATION, NULL, pe__bundle_mount_none);
 
-    mount_add(bundle_data, CRM_BUNDLE_DIR, "/var/log", NULL, 1);
+    mount_add(bundle_data, CRM_BUNDLE_DIR, "/var/log", NULL,
+              pe__bundle_mount_subdir);
 
     port = calloc(1, sizeof(pe__bundle_port_t));
     if(bundle_data->control_port) {
diff --git a/lib/pengine/variant.h b/lib/pengine/variant.h
index f46aa11..7f77eef 100644
--- a/lib/pengine/variant.h
+++ b/lib/pengine/variant.h
@@ -51,11 +51,18 @@ typedef struct {
     pe_resource_t *remote;
 } pe__bundle_replica_t;
 
+enum pe__bundle_mount_flags {
+    pe__bundle_mount_none   = 0x00,
+
+    // mount instance-specific subdirectory rather than source directly
+    pe__bundle_mount_subdir = 0x01
+};
+
 typedef struct {
     char *source;
     char *target;
     char *options;
-    int flags;
+    uint32_t flags; // bitmask of pe__bundle_mount_flags
 } pe__bundle_mount_t;
 
 typedef struct {
--
1.8.3.1

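The is_set()/set_bit() helpers used above are Pacemaker-internal bitmask macros; simplified stand-ins behave as below (an assumption: the real macros add tracing plumbing around the same semantics):

```
#include <stdint.h>

/* Simplified stand-ins for Pacemaker's bitmask helpers */
#define is_set(word, bit)   (((word) & (bit)) == (bit))
#define set_bit(word, bit)  ((word) |= (bit))

/* Usage mirroring the patch (demo_ names are illustrative) */
enum demo_mount_flags {
    demo_mount_none   = 0x00,
    demo_mount_subdir = 0x01
};

static int
wants_subdir(uint32_t flags)
{
    return is_set(flags, demo_mount_subdir);
}
```
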
From 87eac95868930ffda4d964c2b6bd9960b6893cc9 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 17 May 2019 14:13:54 -0500
Subject: [PATCH 3/4] Fix: controller: don't check join status after remote
 node appears

Only cluster nodes have join state
---
 daemons/controld/controld_callbacks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c
index 06ffb9d..3ce7470 100644
--- a/daemons/controld/controld_callbacks.c
+++ b/daemons/controld/controld_callbacks.c
@@ -228,7 +228,7 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
         crm_trace("Alive=%d, appeared=%d, down=%d",
                   alive, appeared, (down? down->id : -1));
 
-        if (appeared && (alive > 0)) {
+        if (appeared && (alive > 0) && !is_remote) {
             register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL);
         }
 
--
1.8.3.1

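For context, is_remote distinguishes Pacemaker Remote nodes from full cluster members inside peer_update_callback(), and only the latter take part in the controller's join protocol. A self-contained sketch of the gating logic, using illustrative stand-in types rather than Pacemaker's actual definitions:

```
#include <stdbool.h>
#include <stdint.h>

#define NODE_FLAG_REMOTE 0x01   /* stand-in for the remote-node flag */

struct peer {
    uint32_t flags;
    int alive;
};

/* Trigger join processing only for full cluster members that came up */
static bool
should_trigger_join(const struct peer *node, bool appeared)
{
    bool is_remote = (node->flags & NODE_FLAG_REMOTE) != 0;

    return appeared && (node->alive > 0) && !is_remote;
}
```
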
From 5755b63850a17cd91bca28e83c39119378fe1887 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Sat, 18 May 2019 21:59:00 -0500
Subject: [PATCH 4/4] Doc: Pacemaker Explained: document effect of SELinux on
 bundle storage

---
 doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
index e431626..4a181df 100644
--- a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
+++ b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt
@@ -999,11 +999,11 @@ association with Docker, Inc. is implied.]
       <storage-mapping id="httpd-root"
                        source-dir="/srv/html"
                        target-dir="/var/www/html"
-                       options="rw"/>
+                       options="rw,Z"/>
       <storage-mapping id="httpd-logs"
                        source-dir-root="/var/log/pacemaker/bundles"
                        target-dir="/etc/httpd/logs"
-                       options="rw"/>
+                       options="rw,Z"/>
    </storage>
    <primitive class="ocf" id="httpd" provider="heartbeat" type="apache"/>
 </bundle>
@@ -1293,7 +1293,8 @@ indexterm:[bundle,storage,storage-mapping]
 
 |options
 |
-|File system mount options to use when mapping the storage
+|A comma-separated list of file system mount options to use when mapping the
+ storage
 indexterm:[options,storage-mapping]
 indexterm:[storage-mapping,Property,options]
 
@@ -1322,6 +1323,14 @@ The +PCMK_authkey_location+ environment variable must not be set to anything
 other than the default of `/etc/pacemaker/authkey` on any node in the cluster.
 ====
 
+[IMPORTANT]
+====
+If SELinux is used in enforcing mode on the host, you must ensure the container
+is allowed to use any storage you mount into it. For Docker and podman bundles,
+adding "Z" to the mount options will create a container-specific label for the
+mount that allows the container access.
+====
+
 === Bundle Primitive ===
 
 A bundle may optionally contain one +<primitive>+ resource
--
1.8.3.1

131
SOURCES/100-concurrent-fencing-tests.patch
Normal file
@@ -0,0 +1,131 @@
From d6e2db2702aa533bca7208bbdc18cb4254cc89d2 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 6 Jun 2019 14:18:37 -0500
Subject: [PATCH] Test: scheduler: explicitly set concurrent-fencing in
 relevant regression tests

... since concurrent-fencing's default is likely to eventually change,
which would otherwise affect the results of these tests
---
 cts/scheduler/rec-node-14.xml                     | 1 +
 cts/scheduler/remote-connection-unrecoverable.xml | 1 +
 cts/scheduler/remote-recover-all.xml              | 1 +
 cts/scheduler/remote-recover-no-resources.xml     | 1 +
 cts/scheduler/remote-recover-unknown.xml          | 1 +
 cts/scheduler/stonith-4.xml                       | 1 +
 cts/scheduler/suicide-needed-inquorate.xml        | 1 +
 cts/scheduler/ticket-clone-21.xml                 | 1 +
 cts/scheduler/ticket-clone-9.xml                  | 1 +
 9 files changed, 9 insertions(+)

diff --git a/cts/scheduler/rec-node-14.xml b/cts/scheduler/rec-node-14.xml
index 60307ba..aefa410 100644
--- a/cts/scheduler/rec-node-14.xml
+++ b/cts/scheduler/rec-node-14.xml
@@ -4,6 +4,7 @@
     <cluster_property_set id="cib-bootstrap-options">
       <nvpair id="nvpair.id21835" name="stonith-enabled" value="true"/>
       <nvpair id="nvpair.id21844" name="no-quorum-policy" value="ignore"/>
+      <nvpair id="options-concurrent-fencing" name="concurrent-fencing" value="false"/>
     </cluster_property_set>
   </crm_config>
   <nodes>
diff --git a/cts/scheduler/remote-connection-unrecoverable.xml b/cts/scheduler/remote-connection-unrecoverable.xml
index df9fee2..efec646 100644
--- a/cts/scheduler/remote-connection-unrecoverable.xml
+++ b/cts/scheduler/remote-connection-unrecoverable.xml
@@ -7,6 +7,7 @@
       <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="corosync"/>
       <nvpair id="cib-bootstrap-options-cluster-name" name="cluster-name" value="mycluster"/>
       <nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1459735110"/>
+      <nvpair id="options-concurrent-fencing" name="concurrent-fencing" value="false"/>
     </cluster_property_set>
   </crm_config>
   <nodes>
diff --git a/cts/scheduler/remote-recover-all.xml b/cts/scheduler/remote-recover-all.xml
index 0ade7cd..1680166 100644
--- a/cts/scheduler/remote-recover-all.xml
+++ b/cts/scheduler/remote-recover-all.xml
@@ -10,6 +10,7 @@
       <nvpair id="cib-bootstrap-options-cluster-recheck-interval" name="cluster-recheck-interval" value="60s"/>
       <nvpair id="cib-bootstrap-options-maintenance-mode" name="maintenance-mode" value="false"/>
       <nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1493817755"/>
+      <nvpair id="options-concurrent-fencing" name="concurrent-fencing" value="false"/>
     </cluster_property_set>
     <cluster_property_set id="redis_replication">
       <nvpair id="redis_replication-redis_REPL_INFO" name="redis_REPL_INFO" value="controller-0"/>
diff --git a/cts/scheduler/remote-recover-no-resources.xml b/cts/scheduler/remote-recover-no-resources.xml
index 37708bb..602ed2b 100644
--- a/cts/scheduler/remote-recover-no-resources.xml
+++ b/cts/scheduler/remote-recover-no-resources.xml
@@ -10,6 +10,7 @@
       <nvpair id="cib-bootstrap-options-cluster-recheck-interval" name="cluster-recheck-interval" value="60s"/>
       <nvpair id="cib-bootstrap-options-maintenance-mode" name="maintenance-mode" value="false"/>
       <nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1493817755"/>
+      <nvpair id="options-concurrent-fencing" name="concurrent-fencing" value="false"/>
     </cluster_property_set>
     <cluster_property_set id="redis_replication">
       <nvpair id="redis_replication-redis_REPL_INFO" name="redis_REPL_INFO" value="controller-0"/>
diff --git a/cts/scheduler/remote-recover-unknown.xml b/cts/scheduler/remote-recover-unknown.xml
index f070f11..f47a841 100644
--- a/cts/scheduler/remote-recover-unknown.xml
+++ b/cts/scheduler/remote-recover-unknown.xml
@@ -10,6 +10,7 @@
       <nvpair id="cib-bootstrap-options-cluster-recheck-interval" name="cluster-recheck-interval" value="60s"/>
       <nvpair id="cib-bootstrap-options-maintenance-mode" name="maintenance-mode" value="false"/>
       <nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1493817755"/>
+      <nvpair id="options-concurrent-fencing" name="concurrent-fencing" value="false"/>
     </cluster_property_set>
     <cluster_property_set id="redis_replication">
       <nvpair id="redis_replication-redis_REPL_INFO" name="redis_REPL_INFO" value="controller-0"/>
diff --git a/cts/scheduler/stonith-4.xml b/cts/scheduler/stonith-4.xml
index 7979462..dd7af8d 100644
--- a/cts/scheduler/stonith-4.xml
+++ b/cts/scheduler/stonith-4.xml
@@ -4,6 +4,7 @@
     <cluster_property_set id="cib-bootstrap-options">
       <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.8-0.772.26fe3e5.git.fc17-26fe3e52d259e4726699300d27991fc1a80c556b"/>
       <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="corosync"/>
+      <nvpair id="options-concurrent-fencing" name="concurrent-fencing" value="false"/>
     </cluster_property_set>
   </crm_config>
   <nodes>
diff --git a/cts/scheduler/suicide-needed-inquorate.xml b/cts/scheduler/suicide-needed-inquorate.xml
index e626ea6..f87422b 100644
--- a/cts/scheduler/suicide-needed-inquorate.xml
+++ b/cts/scheduler/suicide-needed-inquorate.xml
@@ -6,6 +6,7 @@
       <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.17-1"/>
       <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="corosync"/>
       <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="suicide"/>
+      <nvpair id="options-concurrent-fencing" name="concurrent-fencing" value="false"/>
     </cluster_property_set>
   </crm_config>
   <nodes>
diff --git a/cts/scheduler/ticket-clone-21.xml b/cts/scheduler/ticket-clone-21.xml
index bb1f044..efd5294 100644
--- a/cts/scheduler/ticket-clone-21.xml
+++ b/cts/scheduler/ticket-clone-21.xml
@@ -4,6 +4,7 @@
     <cluster_property_set id="cib-bootstrap-options">
       <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="true"/>
       <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
+      <nvpair id="options-concurrent-fencing" name="concurrent-fencing" value="false"/>
     </cluster_property_set>
   </crm_config>
   <nodes>
diff --git a/cts/scheduler/ticket-clone-9.xml b/cts/scheduler/ticket-clone-9.xml
index e77210d..c6d5809 100644
--- a/cts/scheduler/ticket-clone-9.xml
+++ b/cts/scheduler/ticket-clone-9.xml
@@ -4,6 +4,7 @@
     <cluster_property_set id="cib-bootstrap-options">
       <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="true"/>
       <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
+      <nvpair id="options-concurrent-fencing" name="concurrent-fencing" value="false"/>
     </cluster_property_set>
   </crm_config>
   <nodes>
--
1.8.3.1

1180
SPECS/pacemaker.spec
Normal file
File diff suppressed because it is too large