pacemaker/SOURCES/002-failed-monitors.patch

From 5470f1d9c776dbf753e015fa96153b6a63c17b83 Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Thu, 9 May 2019 13:24:35 +0200
Subject: [PATCH] Fix: controller: confirm cancel of failed monitors

Usually after a monitor has been cancelled from executor, contoller
erases the corresponding lrm_rsc_op from the cib, and DC will confirm
the cancel action by process_op_deletion() according to the cib diff.

But if a monitor has failed, the lrm_rsc_op will be recorded as
"last_failure". When cancelling it, the lrm_rsc_op won't get erased from
the cib given the logic on purpose in erase_lrm_history_by_op(). So that
the cancel action won't have a chance to get confirmed by DC with
process_op_deletion().

Previously cluster transition would get stuck waiting for the remaining
action timer to time out.

This commit fixes the issue by directly acknowledging the cancel action
in this case and enabling DC to be able to confirm it.

This also moves get_node_id() function into controld_utils.c for common
use.

Producer:
```
 # Insert a 10s sleep in the monitor action of RA
 # /usr/lib/ocf/resource.d/pacemaker/Stateful:

  stateful_monitor() {
 +    sleep 10
      stateful_check_state "master"

 # Add a promotable clone resource:

 crm configure primitive stateful ocf:pacemaker:Stateful \
         op monitor interval=5 role=Master \
         op monitor interval=10 role=Slave
 crm configure clone p-clone stateful \
         meta promotable=true

 # Wait for the resource instance to be started, promoted to be master,
 # and monitor for master role to complete.

 # Set is-managed=false for the promotable clone:
 crm_resource --meta -p is-managed -v false -r p-clone

 # Change the status of the master instance to be slave and immediately
 # enforce refresh of it:
 echo slave > /var/run/Stateful-stateful.state; crm_resource --refresh -r stateful --force

 # Wait for probe to complete, and then monitor for slave role to be
 # issued:
 sleep 15

 # While the monitor for slave role is still in progress, change the
 # status to be master again:
 echo master > /var/run/Stateful-stateful.state

 # The monitor for slave role returns error. Cluster issues monitor for
 # master role instead and tries to cancel the failed one for slave role.
 # But cluster transition gets stuck. Depending on the monitor timeout
 # configured for the slave role plus cluster-delay, only after that
 # controller eventually says:

 pacemaker-controld[21205] error: Node opensuse150 did not send cancel result (via controller) within 20000ms (action timeout plus cluster-delay)
 pacemaker-controld[21205] error: [Action    1]: In-flight rsc op stateful_monitor_10000            on opensuse150 (priority: 0, waiting: none)
 pacemaker-controld[21205] notice: Transition 6 aborted: Action lost

```
---
 daemons/controld/controld_execd.c        | 38 ++++++++++++++++++++++++++++++++
 daemons/controld/controld_te_callbacks.c | 21 ++----------------
 daemons/controld/controld_te_events.c    | 32 +++++++++++++++++++++++++++
 daemons/controld/controld_transition.h   |  1 +
 daemons/controld/controld_utils.c        | 13 +++++++++++
 daemons/controld/controld_utils.h        |  2 ++
 6 files changed, 88 insertions(+), 19 deletions(-)

diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 976fed1..8282fed 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -2476,6 +2476,30 @@ unescape_newlines(const char *string)
     return ret;
 }

+static bool
+did_lrm_rsc_op_fail(lrm_state_t *lrm_state, const char * rsc_id,
+                    const char * op_type, guint interval_ms)
+{
+    rsc_history_t *entry = NULL;
+
+    CRM_CHECK(lrm_state != NULL, return FALSE);
+    CRM_CHECK(rsc_id != NULL, return FALSE);
+    CRM_CHECK(op_type != NULL, return FALSE);
+
+    entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
+    if (entry == NULL || entry->failed == NULL) {
+        return FALSE;
+    }
+
+    if (crm_str_eq(entry->failed->rsc_id, rsc_id, TRUE)
+        && safe_str_eq(entry->failed->op_type, op_type)
+        && entry->failed->interval_ms == interval_ms) {
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
 void
 process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
                   struct recurring_op_s *pending, xmlNode *action_xml)
@@ -2605,6 +2629,20 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
             erase_lrm_history_by_op(lrm_state, op);
         }

+        /* If the recurring operation had failed, the lrm_rsc_op is recorded as
+         * "last_failure" which won't get erased from the cib given the logic on
+         * purpose in erase_lrm_history_by_op(). So that the cancel action won't
+         * have a chance to get confirmed by DC with process_op_deletion().
+         * Cluster transition would get stuck waiting for the remaining action
+         * timer to time out.
+         *
+         * Directly acknowledge the cancel operation in this case.
+         */
+        if (did_lrm_rsc_op_fail(lrm_state, pending->rsc_id,
+                                pending->op_type, pending->interval_ms)) {
+            need_direct_ack = TRUE;
+        }
+
     } else if (op->rsc_deleted) {
         /* This recurring operation was cancelled (but not by us, and the
          * executor does not have resource information, likely due to resource
diff --git a/daemons/controld/controld_te_callbacks.c b/daemons/controld/controld_te_callbacks.c
index 51d908e..22b5f4b 100644
--- a/daemons/controld/controld_te_callbacks.c
+++ b/daemons/controld/controld_te_callbacks.c
@@ -32,19 +32,6 @@ static unsigned long int stonith_max_attempts = 10;
 /* #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */
 #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']"

-static const char *
-get_node_id(xmlNode * rsc_op)
-{
-    xmlNode *node = rsc_op;
-
-    while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) {
-        node = node->parent;
-    }
-
-    CRM_CHECK(node != NULL, return NULL);
-    return ID(node);
-}
-
 void
 update_stonith_max_attempts(const char* value)
 {
@@ -374,12 +361,8 @@ process_op_deletion(const char *xpath, xmlNode *change)
     node_uuid = extract_node_uuid(xpath);
     cancel = get_cancel_action(key, node_uuid);
     if (cancel) {
-        crm_info("Cancellation of %s on %s confirmed (%d)",
-                 key, node_uuid, cancel->id);
-        stop_te_timer(cancel->timer);
-        te_action_confirmed(cancel);
-        update_graph(transition_graph, cancel);
-        trigger_graph();
+        confirm_cancel_action(cancel);
+
     } else {
         abort_transition(INFINITY, tg_restart, "Resource operation removal",
                          change);
diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c
index c0d096f..b7b48a4 100644
--- a/daemons/controld/controld_te_events.c
+++ b/daemons/controld/controld_te_events.c
@@ -355,6 +355,27 @@ get_cancel_action(const char *id, const char *node)
     return NULL;
 }

+void
+confirm_cancel_action(crm_action_t *cancel)
+{
+    const char *op_key = NULL;
+    const char *node_name = NULL;
+
+    CRM_ASSERT(cancel != NULL);
+
+    op_key = crm_element_value(cancel->xml, XML_LRM_ATTR_TASK_KEY);
+    node_name = crm_element_value(cancel->xml, XML_LRM_ATTR_TARGET);
+
+    stop_te_timer(cancel->timer);
+    te_action_confirmed(cancel);
+    update_graph(transition_graph, cancel);
+
+    crm_info("Cancellation of %s on %s confirmed (action %d)",
+             op_key, node_name, cancel->id);
+
+    trigger_graph();
+}
+
 /* downed nodes are listed like: <downed> <node id="UUID1" /> ... </downed> */
 #define XPATH_DOWNED "//" XML_GRAPH_TAG_DOWNED \
                      "/" XML_CIB_TAG_NODE "[@" XML_ATTR_UUID "='%s']"
@@ -471,6 +492,17 @@ process_graph_event(xmlNode *event, const char *event_node)
             /* Recurring actions have the transition number they were first
              * scheduled in.
              */
+
+            if (status == PCMK_LRM_OP_CANCELLED) {
+                const char *node_id = get_node_id(event);
+
+                action = get_cancel_action(id, node_id);
+                if (action) {
+                    confirm_cancel_action(action);
+                }
+                goto bail;
+            }
+
             desc = "arrived after initial scheduling";
             abort_transition(INFINITY, tg_restart, "Change in recurring result",
                              event);
diff --git a/daemons/controld/controld_transition.h b/daemons/controld/controld_transition.h
index 0a33599..a162f99 100644
--- a/daemons/controld/controld_transition.h
+++ b/daemons/controld/controld_transition.h
@@ -25,6 +25,7 @@ void execute_stonith_cleanup(void);
 /* tengine */
 extern crm_action_t *match_down_event(const char *target);
 extern crm_action_t *get_cancel_action(const char *id, const char *node);
+void confirm_cancel_action(crm_action_t *cancel);

 void controld_record_action_timeout(crm_action_t *action);
 extern gboolean fail_incompletable_actions(crm_graph_t * graph, const char *down_node);
diff --git a/daemons/controld/controld_utils.c b/daemons/controld/controld_utils.c
index ca7e15d..35922f0 100644
--- a/daemons/controld/controld_utils.c
+++ b/daemons/controld/controld_utils.c
@@ -1073,3 +1073,16 @@ feature_set_compatible(const char *dc_version, const char *join_version)
     // DC's minor version must be the same or older
     return dc_v <= join_v;
 }
+
+const char *
+get_node_id(xmlNode *lrm_rsc_op)
+{
+    xmlNode *node = lrm_rsc_op;
+
+    while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) {
+        node = node->parent;
+    }
+
+    CRM_CHECK(node != NULL, return NULL);
+    return ID(node);
+}
diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h
index 2a92db5..68992f5 100644
--- a/daemons/controld/controld_utils.h
+++ b/daemons/controld/controld_utils.h
@@ -95,6 +95,8 @@ unsigned int cib_op_timeout(void);
 bool feature_set_compatible(const char *dc_version, const char *join_version);
 bool controld_action_is_recordable(const char *action);

+const char *get_node_id(xmlNode *lrm_rsc_op);
+
 /* Convenience macro for registering a CIB callback
  * (assumes that data can be freed with free())
  */
--
1.8.3.1