Compare commits

...

No commits in common. "c8" and "a9" have entirely different histories.
c8 ... a9

21 changed files with 6384 additions and 19621 deletions

4
.gitignore vendored
View File

@ -1,2 +1,2 @@
SOURCES/nagios-agents-metadata-105ab8a.tar.gz
SOURCES/pacemaker-6fdc9deea.tar.gz
SOURCES/nagios-agents-metadata-105ab8a7b2c16b9a29cf1c1596b80136eeef332b.tar.gz
SOURCES/pacemaker-a3f44794f.tar.gz

View File

@ -1,2 +1,2 @@
ea6c0a27fd0ae8ce02f84a11f08a0d79377041c3 SOURCES/nagios-agents-metadata-105ab8a.tar.gz
fbf71fb3fb42c76f9f1e98497505eb8521cab55e SOURCES/pacemaker-6fdc9deea.tar.gz
2cbec94ad67dfbeba75e38d2c3c5c44961b3cd16 SOURCES/nagios-agents-metadata-105ab8a7b2c16b9a29cf1c1596b80136eeef332b.tar.gz
b16198db5f86857ba8bc0ebd04fd386da360478a SOURCES/pacemaker-a3f44794f.tar.gz

View File

@ -1,402 +0,0 @@
From cf53f523e691295879cd75cff1a86bc15664fa51 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 09:59:13 -0400
Subject: [PATCH 1/7] Feature: daemons: Add start state to LRMD handshake XML
This gets read out of /etc/sysconfig/pacemaker and set into the
environment. The remote node executor will then add that to the XML
that it sends to the controller upon startup.
Ref T183
---
daemons/execd/execd_commands.c | 5 +++++
include/crm_internal.h | 1 +
2 files changed, 6 insertions(+)
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
index fa2761e..9a783a5 100644
--- a/daemons/execd/execd_commands.c
+++ b/daemons/execd/execd_commands.c
@@ -1474,6 +1474,7 @@ process_lrmd_signon(pcmk__client_t *client, xmlNode *request, int call_id,
int rc = pcmk_ok;
time_t now = time(NULL);
const char *protocol_version = crm_element_value(request, F_LRMD_PROTOCOL_VERSION);
+ const char *start_state = pcmk__env_option(PCMK__ENV_NODE_START_STATE);
if (compare_version(protocol_version, LRMD_MIN_PROTOCOL_VERSION) < 0) {
crm_err("Cluster API version must be greater than or equal to %s, not %s",
@@ -1503,6 +1504,10 @@ process_lrmd_signon(pcmk__client_t *client, xmlNode *request, int call_id,
crm_xml_add(*reply, F_LRMD_PROTOCOL_VERSION, LRMD_PROTOCOL_VERSION);
crm_xml_add_ll(*reply, PCMK__XA_UPTIME, now - start_time);
+ if (start_state) {
+ crm_xml_add(*reply, PCMK__XA_NODE_START_STATE, start_state);
+ }
+
return rc;
}
diff --git a/include/crm_internal.h b/include/crm_internal.h
index 5f6531f..771bd26 100644
--- a/include/crm_internal.h
+++ b/include/crm_internal.h
@@ -84,6 +84,7 @@
#define PCMK__XA_GRAPH_ERRORS "graph-errors"
#define PCMK__XA_GRAPH_WARNINGS "graph-warnings"
#define PCMK__XA_MODE "mode"
+#define PCMK__XA_NODE_START_STATE "node_start_state"
#define PCMK__XA_TASK "task"
#define PCMK__XA_UPTIME "uptime"
#define PCMK__XA_CONN_HOST "connection_host"
--
2.31.1
From c950291742711b5c4c8986adc8e938fe6fef861c Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 10:04:32 -0400
Subject: [PATCH 2/7] Feature: liblrmd: Save a remote node's requested start
state
Ref T183
---
include/crm/common/ipc_internal.h | 1 +
lib/lrmd/lrmd_client.c | 7 +++++++
2 files changed, 8 insertions(+)
diff --git a/include/crm/common/ipc_internal.h b/include/crm/common/ipc_internal.h
index 5099dda..d203924 100644
--- a/include/crm/common/ipc_internal.h
+++ b/include/crm/common/ipc_internal.h
@@ -112,6 +112,7 @@ struct pcmk__remote_s {
int tcp_socket;
mainloop_io_t *source;
time_t uptime;
+ char *start_state;
/* CIB-only */
char *token;
diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index c565728..4239105 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -588,7 +588,9 @@ lrmd_tls_connection_destroy(gpointer userdata)
}
free(native->remote->buffer);
+ free(native->remote->start_state);
native->remote->buffer = NULL;
+ native->remote->start_state = NULL;
native->source = 0;
native->sock = 0;
native->psk_cred_c = NULL;
@@ -980,6 +982,7 @@ lrmd_handshake(lrmd_t * lrmd, const char *name)
const char *version = crm_element_value(reply, F_LRMD_PROTOCOL_VERSION);
const char *msg_type = crm_element_value(reply, F_LRMD_OPERATION);
const char *tmp_ticket = crm_element_value(reply, F_LRMD_CLIENTID);
+ const char *start_state = crm_element_value(reply, PCMK__XA_NODE_START_STATE);
long long uptime = -1;
crm_element_value_int(reply, F_LRMD_RC, &rc);
@@ -992,6 +995,10 @@ lrmd_handshake(lrmd_t * lrmd, const char *name)
crm_element_value_ll(reply, PCMK__XA_UPTIME, &uptime);
native->remote->uptime = uptime;
+ if (start_state) {
+ native->remote->start_state = strdup(start_state);
+ }
+
if (rc == -EPROTO) {
crm_err("Executor protocol version mismatch between client (%s) and server (%s)",
LRMD_PROTOCOL_VERSION, version);
--
2.31.1
From 7302014c7b7296be31b1f542b3f107d55b1fb2a0 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 10:05:13 -0400
Subject: [PATCH 3/7] Feature: liblrmd: Add lrmd__node_start_state.
This function is used to get the start state out of an lrmd_private_t
structure.
Ref T183
---
include/crm/lrmd_internal.h | 1 +
lib/lrmd/lrmd_client.c | 12 ++++++++++++
2 files changed, 13 insertions(+)
diff --git a/include/crm/lrmd_internal.h b/include/crm/lrmd_internal.h
index 5810554..d1cd25d 100644
--- a/include/crm/lrmd_internal.h
+++ b/include/crm/lrmd_internal.h
@@ -47,6 +47,7 @@ void lrmd__set_result(lrmd_event_data_t *event, enum ocf_exitcode rc,
void lrmd__reset_result(lrmd_event_data_t *event);
time_t lrmd__uptime(lrmd_t *lrmd);
+const char *lrmd__node_start_state(lrmd_t *lrmd);
/* Shared functions for IPC proxy back end */
diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index 4239105..82434b9 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -2538,3 +2538,15 @@ lrmd__uptime(lrmd_t *lrmd)
return native->remote->uptime;
}
}
+
+const char *
+lrmd__node_start_state(lrmd_t *lrmd)
+{
+ lrmd_private_t *native = lrmd->lrmd_private;
+
+ if (native->remote == NULL) {
+ return NULL;
+ } else {
+ return native->remote->start_state;
+ }
+}
--
2.31.1
From e5e4d43f847da0930bae12f63c7e9d9c44c07cdf Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 10:07:58 -0400
Subject: [PATCH 4/7] Refactor: controller: Make set_join_state a public
function.
This already does all the work of setting a node's start state. It just
needs to be made public and given arguments for what node to set instead
of reading globals.
Ref T183
---
daemons/controld/controld_join_client.c | 20 ++++++++++----------
daemons/controld/pacemaker-controld.h | 3 +++
2 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/daemons/controld/controld_join_client.c b/daemons/controld/controld_join_client.c
index da6a9d6..07e2a27 100644
--- a/daemons/controld/controld_join_client.c
+++ b/daemons/controld/controld_join_client.c
@@ -195,32 +195,31 @@ join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *
free_xml(generation);
}
-static void
-set_join_state(const char * start_state)
+void
+set_join_state(const char *start_state, const char *node_name, const char *node_uuid)
{
if (pcmk__str_eq(start_state, "standby", pcmk__str_casei)) {
crm_notice("Forcing node %s to join in %s state per configured "
- "environment", controld_globals.our_nodename, start_state);
+ "environment", node_name, start_state);
cib__update_node_attr(controld_globals.logger_out,
controld_globals.cib_conn, cib_sync_call,
- XML_CIB_TAG_NODES, controld_globals.our_uuid,
+ XML_CIB_TAG_NODES, node_uuid,
NULL, NULL, NULL, "standby", "on", NULL, NULL);
} else if (pcmk__str_eq(start_state, "online", pcmk__str_casei)) {
crm_notice("Forcing node %s to join in %s state per configured "
- "environment", controld_globals.our_nodename, start_state);
+ "environment", node_name, start_state);
cib__update_node_attr(controld_globals.logger_out,
controld_globals.cib_conn, cib_sync_call,
- XML_CIB_TAG_NODES, controld_globals.our_uuid,
+ XML_CIB_TAG_NODES, node_uuid,
NULL, NULL, NULL, "standby", "off", NULL, NULL);
} else if (pcmk__str_eq(start_state, "default", pcmk__str_casei)) {
- crm_debug("Not forcing a starting state on node %s",
- controld_globals.our_nodename);
+ crm_debug("Not forcing a starting state on node %s", node_name);
} else {
crm_warn("Unrecognized start state '%s', using 'default' (%s)",
- start_state, controld_globals.our_nodename);
+ start_state, node_name);
}
}
@@ -335,7 +334,8 @@ do_cl_join_finalize_respond(long long action,
first_join = FALSE;
if (start_state) {
- set_join_state(start_state);
+ set_join_state(start_state, controld_globals.our_nodename,
+ controld_globals.our_uuid);
}
}
diff --git a/daemons/controld/pacemaker-controld.h b/daemons/controld/pacemaker-controld.h
index 1484a00..d8c2ddd 100644
--- a/daemons/controld/pacemaker-controld.h
+++ b/daemons/controld/pacemaker-controld.h
@@ -36,4 +36,7 @@ void controld_remove_voter(const char *uname);
void controld_election_fini(void);
void controld_stop_current_election_timeout(void);
+void set_join_state(const char *start_state, const char *node_name,
+ const char *node_uuid);
+
#endif
--
2.31.1
From 63d069adb344bba2c982013226f87dfd95afaff3 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 13:38:03 -0400
Subject: [PATCH 5/7] Refactor: controller: set_join_state needs to take a
remote parameter.
Without this parameter, we won't know what to pass as node_type to
cib__update_node_attr. And without that, that function will not know to
update a remote node - it'll try to update a regular node by the same
name, which either doesn't exist or is not what we were hoping would
happen.
Ref T138
---
daemons/controld/controld_join_client.c | 11 +++++++----
daemons/controld/pacemaker-controld.h | 2 +-
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/daemons/controld/controld_join_client.c b/daemons/controld/controld_join_client.c
index 07e2a27..799d1b4 100644
--- a/daemons/controld/controld_join_client.c
+++ b/daemons/controld/controld_join_client.c
@@ -196,7 +196,8 @@ join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *
}
void
-set_join_state(const char *start_state, const char *node_name, const char *node_uuid)
+set_join_state(const char *start_state, const char *node_name, const char *node_uuid,
+ bool remote)
{
if (pcmk__str_eq(start_state, "standby", pcmk__str_casei)) {
crm_notice("Forcing node %s to join in %s state per configured "
@@ -204,7 +205,8 @@ set_join_state(const char *start_state, const char *node_name, const char *node_
cib__update_node_attr(controld_globals.logger_out,
controld_globals.cib_conn, cib_sync_call,
XML_CIB_TAG_NODES, node_uuid,
- NULL, NULL, NULL, "standby", "on", NULL, NULL);
+ NULL, NULL, NULL, "standby", "on", NULL,
+ remote ? "remote" : NULL);
} else if (pcmk__str_eq(start_state, "online", pcmk__str_casei)) {
crm_notice("Forcing node %s to join in %s state per configured "
@@ -212,7 +214,8 @@ set_join_state(const char *start_state, const char *node_name, const char *node_
cib__update_node_attr(controld_globals.logger_out,
controld_globals.cib_conn, cib_sync_call,
XML_CIB_TAG_NODES, node_uuid,
- NULL, NULL, NULL, "standby", "off", NULL, NULL);
+ NULL, NULL, NULL, "standby", "off", NULL,
+ remote ? "remote" : NULL);
} else if (pcmk__str_eq(start_state, "default", pcmk__str_casei)) {
crm_debug("Not forcing a starting state on node %s", node_name);
@@ -335,7 +338,7 @@ do_cl_join_finalize_respond(long long action,
first_join = FALSE;
if (start_state) {
set_join_state(start_state, controld_globals.our_nodename,
- controld_globals.our_uuid);
+ controld_globals.our_uuid, false);
}
}
diff --git a/daemons/controld/pacemaker-controld.h b/daemons/controld/pacemaker-controld.h
index d8c2ddd..2334cce 100644
--- a/daemons/controld/pacemaker-controld.h
+++ b/daemons/controld/pacemaker-controld.h
@@ -37,6 +37,6 @@ void controld_election_fini(void);
void controld_stop_current_election_timeout(void);
void set_join_state(const char *start_state, const char *node_name,
- const char *node_uuid);
+ const char *node_uuid, bool remote);
#endif
--
2.31.1
From 67274787898355065315f8c06d62458e2c2b0afe Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 10:09:02 -0400
Subject: [PATCH 6/7] Feature: controller: When a remote node starts, apply any
start state.
If we were given a start state in the handshake XML, that is now stored
in the remote node cache's private data. Extract it and set the state
on the node with set_join_state.
Fixes T183
---
daemons/controld/controld_remote_ra.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c
index f24b755..8ab1e46 100644
--- a/daemons/controld/controld_remote_ra.c
+++ b/daemons/controld/controld_remote_ra.c
@@ -280,6 +280,7 @@ remote_node_up(const char *node_name)
int call_opt;
xmlNode *update, *state;
crm_node_t *node;
+ lrm_state_t *connection_rsc = NULL;
CRM_CHECK(node_name != NULL, return);
crm_info("Announcing Pacemaker Remote node %s", node_name);
@@ -301,6 +302,20 @@ remote_node_up(const char *node_name)
purge_remote_node_attrs(call_opt, node);
pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
+ /* Apply any start state that we were given from the environment on the
+ * remote node.
+ */
+ connection_rsc = lrm_state_find(node->uname);
+
+ if (connection_rsc != NULL) {
+ lrmd_t *lrm = connection_rsc->conn;
+ const char *start_state = lrmd__node_start_state(lrm);
+
+ if (start_state) {
+ set_join_state(start_state, node->uname, node->uuid, true);
+ }
+ }
+
/* pacemaker_remote nodes don't participate in the membership layer,
* so cluster nodes don't automatically get notified when they come and go.
* We send a cluster message to the DC, and update the CIB node state entry,
--
2.31.1
From 91cdda7056c9b9254a0d7e7a016b30f788e3e3ff Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 10:16:30 -0400
Subject: [PATCH 7/7] Doc: sysconfig: Remote nodes now respect start state.
Ref T183
---
etc/sysconfig/pacemaker.in | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/etc/sysconfig/pacemaker.in b/etc/sysconfig/pacemaker.in
index 3b03ad6..041da71 100644
--- a/etc/sysconfig/pacemaker.in
+++ b/etc/sysconfig/pacemaker.in
@@ -144,8 +144,7 @@
# By default, the local host will join the cluster in an online or standby
# state when Pacemaker first starts depending on whether it was previously put
# into standby mode. If this variable is set to "standby" or "online", it will
-# force the local host to join in the specified state. This has no effect on
-# Pacemaker Remote nodes.
+# force the local host to join in the specified state.
#
# Default: PCMK_node_start_state="default"
--
2.31.1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,98 @@
From d8e08729ad5e3dc62f774172f992210902fc0ed4 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 23 Jan 2023 14:25:56 -0600
Subject: [PATCH] High: executor: fix regression in remote node shutdown
This reverts the essential part of d61494347, which was based on misdiagnosing
a remote node shutdown issue. Initially, it was thought that a "TLS server
session ended" log just after a remote node requested shutdown indicated that
the proxy connection coincidentally dropped at that moment. It actually is the
routine stopping of accepting new proxy connections, and exiting when that
happens makes the remote node exit immediately without waiting for the
all-clear from the cluster.
Fixes T361
---
daemons/execd/pacemaker-execd.c | 19 +------------------
daemons/execd/pacemaker-execd.h | 3 +--
daemons/execd/remoted_tls.c | 6 +-----
3 files changed, 3 insertions(+), 25 deletions(-)
diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c
index db12674f13..491808974a 100644
--- a/daemons/execd/pacemaker-execd.c
+++ b/daemons/execd/pacemaker-execd.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2012-2022 the Pacemaker project contributors
+ * Copyright 2012-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -305,23 +305,6 @@ lrmd_exit(gpointer data)
return FALSE;
}
-/*!
- * \internal
- * \brief Clean up and exit if shutdown has started
- *
- * \return Doesn't return
- */
-void
-execd_exit_if_shutting_down(void)
-{
-#ifdef PCMK__COMPILE_REMOTE
- if (shutting_down) {
- crm_warn("exit because TLS connection was closed and 'shutting_down' set");
- lrmd_exit(NULL);
- }
-#endif
-}
-
/*!
* \internal
* \brief Request cluster shutdown if appropriate, otherwise exit immediately
diff --git a/daemons/execd/pacemaker-execd.h b/daemons/execd/pacemaker-execd.h
index 6646ae29e3..f78e8dcdde 100644
--- a/daemons/execd/pacemaker-execd.h
+++ b/daemons/execd/pacemaker-execd.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2012-2022 the Pacemaker project contributors
+ * Copyright 2012-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -105,6 +105,5 @@ void remoted_spawn_pidone(int argc, char **argv, char **envp);
int process_lrmd_alert_exec(pcmk__client_t *client, uint32_t id,
xmlNode *request);
void lrmd_drain_alerts(GMainLoop *mloop);
-void execd_exit_if_shutting_down(void);
#endif // PACEMAKER_EXECD__H
diff --git a/daemons/execd/remoted_tls.c b/daemons/execd/remoted_tls.c
index 6f4b2d0062..c65e3f394d 100644
--- a/daemons/execd/remoted_tls.c
+++ b/daemons/execd/remoted_tls.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2012-2022 the Pacemaker project contributors
+ * Copyright 2012-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -250,10 +250,6 @@ static void
tls_server_dropped(gpointer user_data)
{
crm_notice("TLS server session ended");
- /* If we are in the process of shutting down, then we should actually exit.
- * bz#1804259
- */
- execd_exit_if_shutting_down();
return;
}
--
2.31.1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,35 +0,0 @@
From 770d417e28dc9527fec8b8a00caaba8825995454 Mon Sep 17 00:00:00 2001
From: Grace Chin <gchin@redhat.com>
Date: Wed, 19 Jul 2023 10:25:55 -0400
Subject: [PATCH] Fix: tools: Fix a bug in clone resource description display
Previously, descriptions of resources running on multiple
nodes were displayed despite --full not being used (with pcs
status) or --show-detail not being used (with crm_mon).
For example, clone resources running on multiple nodes were
affected.
Now, --full and --show-detail must be used in order for resource
descriptions to be displayed, regardless of the number of nodes
the resource is run on.
see bz: 2106642
---
lib/pengine/pe_output.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/lib/pengine/pe_output.c b/lib/pengine/pe_output.c
index e0b43d997a..d1c9f6e226 100644
--- a/lib/pengine/pe_output.c
+++ b/lib/pengine/pe_output.c
@@ -20,8 +20,7 @@ pe__resource_description(const pe_resource_t *rsc, uint32_t show_opts)
{
const char * desc = NULL;
// User-supplied description
- if (pcmk_any_flags_set(show_opts, pcmk_show_rsc_only|pcmk_show_description)
- || pcmk__list_of_multiple(rsc->running_on)) {
+ if (pcmk_any_flags_set(show_opts, pcmk_show_rsc_only|pcmk_show_description)) {
desc = crm_element_value(rsc->xml, XML_ATTR_DESC);
}
return desc;

View File

@ -0,0 +1,107 @@
From 45617b727e280cac384a28ae3d96145e066e6197 Mon Sep 17 00:00:00 2001
From: Reid Wahl <nrwahl@protonmail.com>
Date: Fri, 3 Feb 2023 12:08:57 -0800
Subject: [PATCH 01/02] Fix: fencer: Prevent double g_source_remove of op_timer_one
QE observed a rarely reproducible core dump in the fencer during
Pacemaker shutdown, in which we try to g_source_remove() an op timer
that's already been removed.
free_stonith_remote_op_list()
-> g_hash_table_destroy()
-> g_hash_table_remove_all_nodes()
-> clear_remote_op_timers()
-> g_source_remove()
-> crm_glib_handler()
-> "Source ID 190 was not found when attempting to remove it"
The likely cause is that request_peer_fencing() doesn't set
op->op_timer_one to 0 after calling g_source_remove() on it, so if that
op is still in the stonith_remote_op_list at shutdown with the same
timer, clear_remote_op_timers() tries to remove the source for
op_timer_one again.
There are only five locations that call g_source_remove() on a
remote_fencing_op_t timer.
* Three of them are in clear_remote_op_timers(), which first 0-checks
the timer and then sets it to 0 after g_source_remove().
* One is in remote_op_query_timeout(), which does the same.
* The last is the one we fix here in request_peer_fencing().
I don't know all the conditions of QE's test scenario at this point.
What I do know:
* have-watchdog=true
* stonith-watchdog-timeout=10
* no explicit topology
* fence agent script is missing for the configured fence device
* requested fencing of one node
* cluster shutdown
Fixes RHBZ2166967
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
---
daemons/fenced/fenced_remote.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index d61b5bd..b7426ff 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -1825,6 +1825,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
op->state = st_exec;
if (op->op_timer_one) {
g_source_remove(op->op_timer_one);
+ op->op_timer_one = 0;
}
if (!((stonith_watchdog_timeout_ms > 0)
--
2.31.1
From 0291db4750322ec7f01ae6a4a2a30abca9d8e19e Mon Sep 17 00:00:00 2001
From: Reid Wahl <nrwahl@protonmail.com>
Date: Wed, 15 Feb 2023 22:30:27 -0800
Subject: [PATCH 02/02] Fix: fencer: Avoid double source remove of op_timer_total
remote_op_timeout() returns G_SOURCE_REMOVE, which tells GLib to remove
the source from the main loop after returning. Currently this function
is used as the callback only when creating op->op_timer_total.
If we don't set op->op_timer_total to 0 before returning from
remote_op_timeout(), then we can get an assertion and core dump from
GLib when the op's timers are being cleared (either during op
finalization or during fencer shutdown). This is because
clear_remote_op_timers() sees that op->op_timer_total != 0 and tries to
remove the source, but the source has already been removed.
Note that we're already (correctly) zeroing op->op_timer_one and
op->query_timeout as appropriate in their respective callback functions.
Fortunately, GLib doesn't care whether the source has already been
removed before we return G_SOURCE_REMOVE from a callback. So it's safe
to call finalize_op() (which removes all the op's timer sources) from
within a callback.
Fixes RHBZ#2166967
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
---
daemons/fenced/fenced_remote.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index b7426ff88..adea3d7d8 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -718,6 +718,8 @@ remote_op_timeout(gpointer userdata)
{
remote_fencing_op_t *op = userdata;
+ op->op_timer_total = 0;
+
if (op->state == st_done) {
crm_debug("Action '%s' targeting %s for client %s already completed "
CRM_XS " id=%.8s",
--
2.39.0

View File

@ -0,0 +1,151 @@
From 0d15568a538349ac41028db6b506d13dd23e8732 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 14 Feb 2023 14:00:37 -0500
Subject: [PATCH] High: libcrmcommon: Fix handling node=NULL in
pcmk__attrd_api_query.
According to the header file, if node is NULL, pcmk__attrd_api_query
should query the value of the given attribute on all cluster nodes.
This is also what the server expects and how attrd_updater is supposed
to work.
However, pcmk__attrd_api_query has no way of letting callers decide
whether they want to query all nodes or whether they want to use the
local node. We were passing NULL for the node name, which it took to
mean it should look up the local node name. This calls
pcmk__node_attr_target, which probes the local cluster name and returns
that to pcmk__attrd_api_query. If it returns non-NULL, that value will
then be put into the XML IPC call which means the server will only
return the value for that node.
In testing this was usually fine. However, in practice, the methods
pcmk__node_attr_target uses to figure out the local cluster node name
involve checking the OCF_RESKEY_CRM_meta_on_node environment variable
among others.
This variable was never set in testing, but can be set in the real
world. This leads to circumstances where the user did "attrd_updater -QA"
expecting to get the values on all nodes, but instead only got the value
on the local cluster node.
In pacemaker-2.1.4 and prior, pcmk__node_attr_target was simply never
called if the node was NULL but was called otherwise.
The fix is to modify pcmk__attrd_api_query to take an option for
querying all nodes. If that's present, we'll query all nodes. If it's
not present, we'll look at the given node name - NULL means look it up,
anything else means just that node.
Regression in 2.1.5 introduced by eb20a65577
---
include/crm/common/attrd_internal.h | 6 +++++-
include/crm/common/ipc_attrd_internal.h | 7 +++++--
lib/common/ipc_attrd.c | 12 ++++++++----
tools/attrd_updater.c | 5 +++--
4 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/include/crm/common/attrd_internal.h b/include/crm/common/attrd_internal.h
index 389be48..7337c38 100644
--- a/include/crm/common/attrd_internal.h
+++ b/include/crm/common/attrd_internal.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2022 the Pacemaker project contributors
+ * Copyright 2004-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -25,6 +25,10 @@ enum pcmk__node_attr_opts {
pcmk__node_attr_perm = (1 << 5),
pcmk__node_attr_sync_local = (1 << 6),
pcmk__node_attr_sync_cluster = (1 << 7),
+ // pcmk__node_attr_utilization is 8, but that has not been backported.
+ // I'm leaving the gap here in case we backport that in the future and
+ // also to avoid problems on mixed-version clusters.
+ pcmk__node_attr_query_all = (1 << 9),
};
#define pcmk__set_node_attr_flags(node_attr_flags, flags_to_set) do { \
diff --git a/include/crm/common/ipc_attrd_internal.h b/include/crm/common/ipc_attrd_internal.h
index 2c6713f..b1b7584 100644
--- a/include/crm/common/ipc_attrd_internal.h
+++ b/include/crm/common/ipc_attrd_internal.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2022 the Pacemaker project contributors
+ * Copyright 2022-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -110,10 +110,13 @@ int pcmk__attrd_api_purge(pcmk_ipc_api_t *api, const char *node);
*
* \param[in,out] api Connection to pacemaker-attrd
* \param[in] node Look up the attribute for this node
- * (or NULL for all nodes)
+ * (or NULL for the local node)
* \param[in] name Attribute name
* \param[in] options Bitmask of pcmk__node_attr_opts
*
+ * \note Passing pcmk__node_attr_query_all will cause the function to query
+ * the value of \p name on all nodes, regardless of the value of \p node.
+ *
* \return Standard Pacemaker return code
*/
int pcmk__attrd_api_query(pcmk_ipc_api_t *api, const char *node, const char *name,
diff --git a/lib/common/ipc_attrd.c b/lib/common/ipc_attrd.c
index 4606509..dece49b 100644
--- a/lib/common/ipc_attrd.c
+++ b/lib/common/ipc_attrd.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2022 the Pacemaker project contributors
+ * Copyright 2011-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -332,10 +332,14 @@ pcmk__attrd_api_query(pcmk_ipc_api_t *api, const char *node, const char *name,
return EINVAL;
}
- target = pcmk__node_attr_target(node);
+ if (pcmk_is_set(options, pcmk__node_attr_query_all)) {
+ node = NULL;
+ } else {
+ target = pcmk__node_attr_target(node);
- if (target != NULL) {
- node = target;
+ if (target != NULL) {
+ node = target;
+ }
}
request = create_attrd_op(NULL);
diff --git a/tools/attrd_updater.c b/tools/attrd_updater.c
index 3cd766d..cbd341d 100644
--- a/tools/attrd_updater.c
+++ b/tools/attrd_updater.c
@@ -376,6 +376,7 @@ attrd_event_cb(pcmk_ipc_api_t *attrd_api, enum pcmk_ipc_event event_type,
static int
send_attrd_query(pcmk__output_t *out, const char *attr_name, const char *attr_node, gboolean query_all)
{
+ uint32_t options = pcmk__node_attr_none;
pcmk_ipc_api_t *attrd_api = NULL;
int rc = pcmk_rc_ok;
@@ -400,10 +401,10 @@ send_attrd_query(pcmk__output_t *out, const char *attr_name, const char *attr_no
/* Decide which node(s) to query */
if (query_all == TRUE) {
- attr_node = NULL;
+ options |= pcmk__node_attr_query_all;
}
- rc = pcmk__attrd_api_query(attrd_api, attr_node, attr_name, 0);
+ rc = pcmk__attrd_api_query(attrd_api, attr_node, attr_name, options);
if (rc != pcmk_rc_ok) {
g_set_error(&error, PCMK__RC_ERROR, rc, "Could not query value of %s: %s (%d)",
--
2.31.1

View File

@ -0,0 +1,142 @@
From 17cc49e1564b0ae55cc8212d14c5c055f88040da Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Tue, 14 Feb 2023 15:35:37 +0100
Subject: [PATCH] Fix: watchdog-fencing: terminate dangling timer before
watchdog-waiting
---
daemons/fenced/fenced_remote.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index 5c3fe25e3..aab185adb 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2009-2022 the Pacemaker project contributors
+ * Copyright 2009-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -1702,6 +1702,10 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
"client %s " CRM_XS " id=%.8s",
(stonith_watchdog_timeout_ms / 1000),
op->target, op->action, op->client_name, op->id);
+
+ if (op->op_timer_one) {
+ g_source_remove(op->op_timer_one);
+ }
op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms,
remote_op_watchdog_done, op);
return TRUE;
--
2.39.0
From f2cc2a4277124230903a18713e50604a8f1842cd Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Wed, 1 Mar 2023 15:00:15 +0100
Subject: [PATCH] Refactor: watchdog-fencing: convenience function
pcmk__is_fencing_action
for consistency and add comment making clear why this block exits
with new timer set in any case
---
daemons/fenced/fenced_remote.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index aab185adb..e0f8de057 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -1834,7 +1834,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
if (!((stonith_watchdog_timeout_ms > 0)
&& (pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none)
|| (pcmk__str_eq(peer->host, op->target, pcmk__str_casei)
- && !pcmk__str_eq(op->action, "on", pcmk__str_none)))
+ && pcmk__is_fencing_action(op->action)))
&& check_watchdog_fencing_and_wait(op))) {
/* Some thoughts about self-fencing cases reaching this point:
@@ -1854,6 +1854,9 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
Otherwise the selection of stonith-watchdog-timeout at
least is questionable.
*/
+
+ /* coming here we're not waiting for watchdog timeout -
+ thus engage timer with timeout evaluated before */
op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
}
--
2.39.0
From c4eb45a986f8865fc5e69350fd5b9f4b056d9d69 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Tue, 14 Feb 2023 11:57:17 +0100
Subject: [PATCH] Fix: watchdog-fencing: correctly derive timeout with topology
up to now the timeout for watchdog-fencing was just added to
the overall timeout if the node to be fenced was visible and
reported back to the query.
---
daemons/fenced/fenced_remote.c | 28 +++++++++++++++++++++++++---
1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index e0f8de057..3b7ab05e9 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -969,8 +969,9 @@ advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
return pcmk_rc_ok;
}
- crm_info("All fencing options targeting %s for client %s@%s failed "
+ crm_info("All %sfencing options targeting %s for client %s@%s failed "
CRM_XS " id=%.8s",
+ (stonith_watchdog_timeout_ms > 0)?"non-watchdog ":"",
op->target, op->client_name, op->originator, op->id);
return ENODEV;
}
@@ -1434,8 +1435,17 @@ stonith_choose_peer(remote_fencing_op_t * op)
&& pcmk_is_set(op->call_options, st_opt_topology)
&& (advance_topology_level(op, false) == pcmk_rc_ok));
- crm_notice("Couldn't find anyone to fence (%s) %s using %s",
- op->action, op->target, (device? device : "any device"));
+ if ((stonith_watchdog_timeout_ms > 0)
+ && pcmk__is_fencing_action(op->action)
+ && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none)
+ && node_does_watchdog_fencing(op->target)) {
+ crm_info("Couldn't contact watchdog-fencing target-node (%s)",
+ op->target);
+ /* check_watchdog_fencing_and_wait will log additional info */
+ } else {
+ crm_notice("Couldn't find anyone to fence (%s) %s using %s",
+ op->action, op->target, (device? device : "any device"));
+ }
return NULL;
}
@@ -1531,6 +1541,18 @@ get_op_total_timeout(const remote_fencing_op_t *op,
continue;
}
for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
+ /* in case of watchdog-device we add the timeout to the budget
+ regardless of if we got a reply or not
+ */
+ if ((stonith_watchdog_timeout_ms > 0)
+ && pcmk__is_fencing_action(op->action)
+ && pcmk__str_eq(device_list->data, STONITH_WATCHDOG_ID,
+ pcmk__str_none)
+ && node_does_watchdog_fencing(op->target)) {
+ total_timeout += stonith_watchdog_timeout_ms / 1000;
+ continue;
+ }
+
for (iter = op->query_results; iter != NULL; iter = iter->next) {
const peer_device_info_t *peer = iter->data;
--
2.39.0

View File

@ -12,7 +12,7 @@ diff --git a/tools/attrd_updater.c b/tools/attrd_updater.c
index b615a3575..4688b9ff6 100644
--- a/tools/attrd_updater.c
+++ b/tools/attrd_updater.c
@@ -501,7 +501,7 @@ send_attrd_update(char command, const char *attr_node, const char *attr_name,
@@ -445,7 +445,7 @@ send_attrd_update(char command, const char *attr_node, const char *attr_name,
case 'U':
rc = pcmk__attrd_api_update(NULL, attr_node, attr_name, attr_value,

View File

@ -1,163 +0,0 @@
From 63f4bd4d5a324e6eb279340a42c7c36c8902ada7 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 2 Aug 2023 15:55:26 -0500
Subject: [PATCH 1/4] Fix: controller: don't try to execute agent action at
shutdown
Normally, agent execution is not possible at shutdown. However, when metadata
is needed for some action, the agent can be called asynchronously, and when the
metadata action returns, the original action is performed. If the metadata is
initiated before shutdown, but completes after shutdown has begun, do not try
to attempt the original action, so we avoid unnecessary error logs.
---
daemons/controld/controld_execd.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 530e4346c8..a90e8d833e 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1400,7 +1400,9 @@ metadata_complete(int pid, const pcmk__action_result_t *result, void *user_data)
md = controld_cache_metadata(lrm_state->metadata_cache, data->rsc,
result->action_stdout);
}
- do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+ if (!pcmk_is_set(controld_globals.fsa_input_register, R_HA_DISCONNECTED)) {
+ do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+ }
free_metadata_cb_data(data);
}
From 247d9534f36f690c1474e36cedaadb3934022a05 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 2 Aug 2023 16:16:31 -0500
Subject: [PATCH 2/4] Refactor: controller: de-functionize lrm_state_destroy()
It was a one-liner called once
---
daemons/controld/controld_execd_state.c | 8 +-------
daemons/controld/controld_lrm.h | 5 -----
2 files changed, 1 insertion(+), 12 deletions(-)
diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c
index 8c68bfca08..4a87a9b332 100644
--- a/daemons/controld/controld_execd_state.c
+++ b/daemons/controld/controld_execd_state.c
@@ -132,12 +132,6 @@ lrm_state_create(const char *node_name)
return state;
}
-void
-lrm_state_destroy(const char *node_name)
-{
- g_hash_table_remove(lrm_state_table, node_name);
-}
-
static gboolean
remote_proxy_remove_by_node(gpointer key, gpointer value, gpointer user_data)
{
@@ -799,7 +793,7 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state,
}
if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) {
- lrm_state_destroy(rsc_id);
+ g_hash_table_remove(lrm_state_table, rsc_id);
return pcmk_ok;
}
diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h
index 25f3db3316..c3113e49c3 100644
--- a/daemons/controld/controld_lrm.h
+++ b/daemons/controld/controld_lrm.h
@@ -108,11 +108,6 @@ gboolean lrm_state_init_local(void);
*/
void lrm_state_destroy_all(void);
-/*!
- * \brief Destroy executor connection by node name
- */
-void lrm_state_destroy(const char *node_name);
-
/*!
* \brief Find lrm_state data by node name
*/
From 1b915f1ce38756431f7faa142565e3e07aade194 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 2 Aug 2023 15:58:09 -0500
Subject: [PATCH 3/4] Low: controller: guard lrm_state_table usage with NULL
check
It is NULL while draining the mainloop during the shutdown sequence.
---
daemons/controld/controld_execd_state.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c
index 4a87a9b332..b90cc5e635 100644
--- a/daemons/controld/controld_execd_state.c
+++ b/daemons/controld/controld_execd_state.c
@@ -301,7 +301,7 @@ lrm_state_destroy_all(void)
lrm_state_t *
lrm_state_find(const char *node_name)
{
- if (!node_name) {
+ if ((node_name == NULL) || (lrm_state_table == NULL)) {
return NULL;
}
return g_hash_table_lookup(lrm_state_table, node_name);
@@ -312,6 +312,8 @@ lrm_state_find_or_create(const char *node_name)
{
lrm_state_t *lrm_state;
+ CRM_CHECK(lrm_state_table != NULL, return NULL);
+
lrm_state = g_hash_table_lookup(lrm_state_table, node_name);
if (!lrm_state) {
lrm_state = lrm_state_create(node_name);
@@ -323,6 +325,9 @@ lrm_state_find_or_create(const char *node_name)
GList *
lrm_state_get_list(void)
{
+ if (lrm_state_table == NULL) {
+ return NULL;
+ }
return g_hash_table_get_values(lrm_state_table);
}
From 78581213ed3bf4183b0ec1f391b720d5d91f3f68 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 2 Aug 2023 15:48:36 -0500
Subject: [PATCH 4/4] Log: controller: improve messages for resource history
updates
---
daemons/controld/controld_cib.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c
index 22ac42486f..c9dde0b748 100644
--- a/daemons/controld/controld_cib.c
+++ b/daemons/controld/controld_cib.c
@@ -861,10 +861,17 @@ cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *use
case pcmk_ok:
case -pcmk_err_diff_failed:
case -pcmk_err_diff_resync:
- crm_trace("Resource update %d complete: rc=%d", call_id, rc);
+ crm_trace("Resource history update completed (call=%d rc=%d)",
+ call_id, rc);
break;
default:
- crm_warn("Resource update %d failed: (rc=%d) %s", call_id, rc, pcmk_strerror(rc));
+ if (call_id > 0) {
+ crm_warn("Resource history update %d failed: %s "
+ CRM_XS " rc=%d", call_id, pcmk_strerror(rc), rc);
+ } else {
+ crm_warn("Resource history update failed: %s " CRM_XS " rc=%d",
+ pcmk_strerror(rc), rc);
+ }
}
if (call_id == pending_rsc_update) {

View File

@ -26,7 +26,7 @@ diff --git a/lib/common/ipc_controld.c b/lib/common/ipc_controld.c
index 3c3a98964..405fd0518 100644
--- a/lib/common/ipc_controld.c
+++ b/lib/common/ipc_controld.c
@@ -177,18 +177,16 @@ set_nodes_data(pcmk_controld_api_reply_t *data, xmlNode *msg_data)
@@ -143,18 +143,16 @@ set_nodes_data(pcmk_controld_api_reply_t *data, xmlNode *msg_data)
static bool
reply_expected(pcmk_ipc_api_t *api, xmlNode *request)
{
@ -55,7 +55,7 @@ index 3c3a98964..405fd0518 100644
}
static bool
@@ -202,22 +200,12 @@ dispatch(pcmk_ipc_api_t *api, xmlNode *reply)
@@ -168,22 +166,12 @@ dispatch(pcmk_ipc_api_t *api, xmlNode *reply)
pcmk_controld_reply_unknown, NULL, NULL,
};
@ -83,7 +83,7 @@ index 3c3a98964..405fd0518 100644
}
if (private->replies_expected > 0) {
@@ -344,18 +332,15 @@ static int
@@ -310,18 +298,15 @@ static int
send_controller_request(pcmk_ipc_api_t *api, xmlNode *request,
bool reply_is_expected)
{

View File

@ -0,0 +1,127 @@
From 2587f9fabea3a7ef01eb7752d4e2ef082823934e Mon Sep 17 00:00:00 2001
From: eabdullin <ed.abdullin.1@gmail.com>
Date: Wed, 13 Sep 2023 14:15:46 +0300
Subject: [PATCH] - Fix: controller: don't try to execute agent action at
shutdown Normally, agent execution is not possible at shutdown. However, when
metadata is needed for some action, the agent can be called asynchronously,
and when the metadata action returns, the original action is performed. If
the metadata is initiated before shutdown, but completes after shutdown has
begun, do not try to attempt the original action, so we avoid unnecessary
error logs. - Refactor: controller: de-functionize lrm_state_destroy() It was
a one-liner called once - Log: controller: improve messages for resource
history updates - Low: controller: guard lrm_state_table usage with NULL check
It is NULL while draining the mainloop during the shutdown sequence.
---
daemons/controld/controld_execd.c | 15 ++++++++++++---
daemons/controld/controld_execd_state.c | 15 +++++++--------
daemons/controld/controld_lrm.h | 5 -----
3 files changed, 19 insertions(+), 16 deletions(-)
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index afead92..e7a91ab 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1728,7 +1728,9 @@ metadata_complete(int pid, const pcmk__action_result_t *result, void *user_data)
md = controld_cache_metadata(lrm_state->metadata_cache, data->rsc,
result->action_stdout);
}
- do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+ if (!pcmk_is_set(fsa_input_register, R_HA_DISCONNECTED)) {
+ do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+ }
free_metadata_cb_data(data);
}
@@ -2406,10 +2408,17 @@ cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *use
case pcmk_ok:
case -pcmk_err_diff_failed:
case -pcmk_err_diff_resync:
- crm_trace("Resource update %d complete: rc=%d", call_id, rc);
+ crm_trace("Resource history update completed (call=%d rc=%d)",
+ call_id, rc);
break;
default:
- crm_warn("Resource update %d failed: (rc=%d) %s", call_id, rc, pcmk_strerror(rc));
+ if (call_id > 0) {
+ crm_warn("Resource history update %d failed: %s "
+ CRM_XS " rc=%d", call_id, pcmk_strerror(rc), rc);
+ } else {
+ crm_warn("Resource history update failed: %s " CRM_XS " rc=%d",
+ pcmk_strerror(rc), rc);
+ }
}
if (call_id == last_resource_update) {
diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c
index adba2e5..3994b6d 100644
--- a/daemons/controld/controld_execd_state.c
+++ b/daemons/controld/controld_execd_state.c
@@ -131,12 +131,6 @@ lrm_state_create(const char *node_name)
return state;
}
-void
-lrm_state_destroy(const char *node_name)
-{
- g_hash_table_remove(lrm_state_table, node_name);
-}
-
static gboolean
remote_proxy_remove_by_node(gpointer key, gpointer value, gpointer user_data)
{
@@ -252,7 +246,7 @@ lrm_state_destroy_all(void)
lrm_state_t *
lrm_state_find(const char *node_name)
{
- if (!node_name) {
+ if ((node_name == NULL) || (lrm_state_table == NULL)) {
return NULL;
}
return g_hash_table_lookup(lrm_state_table, node_name);
@@ -263,6 +257,8 @@ lrm_state_find_or_create(const char *node_name)
{
lrm_state_t *lrm_state;
+ CRM_CHECK(lrm_state_table != NULL, return NULL);
+
lrm_state = g_hash_table_lookup(lrm_state_table, node_name);
if (!lrm_state) {
lrm_state = lrm_state_create(node_name);
@@ -274,6 +270,9 @@ lrm_state_find_or_create(const char *node_name)
GList *
lrm_state_get_list(void)
{
+ if (lrm_state_table == NULL) {
+ return NULL;
+ }
return g_hash_table_get_values(lrm_state_table);
}
@@ -764,7 +763,7 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state,
}
if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) {
- lrm_state_destroy(rsc_id);
+ g_hash_table_remove(lrm_state_table, rsc_id);
return pcmk_ok;
}
diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h
index 983c288..11ff1bc 100644
--- a/daemons/controld/controld_lrm.h
+++ b/daemons/controld/controld_lrm.h
@@ -113,11 +113,6 @@ void lrm_state_destroy_all(void);
*/
lrm_state_t *lrm_state_create(const char *node_name);
-/*!
- * \brief Destroy executor connection by node name
- */
-void lrm_state_destroy(const char *node_name);
-
/*!
* \brief Find lrm_state data by node name
*/
--

View File

@ -1,62 +0,0 @@
From 2e81e0db9a716c486805e0760f78be65ca79eeae Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 17 Oct 2023 15:28:27 -0500
Subject: [PATCH] Fix: attrd: avoid regression by reverting 58400e27
Fixes T714
---
daemons/attrd/attrd_cib.c | 5 -----
daemons/attrd/attrd_elections.c | 10 +---------
daemons/attrd/pacemaker-attrd.h | 1 -
3 files changed, 1 insertion(+), 15 deletions(-)
diff --git a/daemons/attrd/attrd_cib.c b/daemons/attrd/attrd_cib.c
index 2de37a7cb6..9ce2872715 100644
--- a/daemons/attrd/attrd_cib.c
+++ b/daemons/attrd/attrd_cib.c
@@ -641,11 +641,6 @@ attrd_write_attributes(uint32_t options)
pcmk_is_set(options, attrd_write_all)? "all" : "changed");
g_hash_table_iter_init(&iter, attributes);
while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & a)) {
- if (pcmk_is_set(options, attrd_write_skip_shutdown)
- && pcmk__str_eq(a->id, XML_CIB_ATTR_SHUTDOWN, pcmk__str_none)) {
- continue;
- }
-
if (!pcmk_is_set(options, attrd_write_all) && a->unknown_peer_uuids) {
// Try writing this attribute again, in case peer ID was learned
a->changed = true;
diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
index a95cd44cbd..62310ed1d8 100644
--- a/daemons/attrd/attrd_elections.c
+++ b/daemons/attrd/attrd_elections.c
@@ -35,16 +35,8 @@ attrd_election_cb(gpointer user_data)
/* After winning an election, update the CIB with the values of all
* attributes as the winner knows them.
- *
- * However, do not write out any "shutdown" attributes. A node that is
- * shutting down will have all its transient attributes removed from the CIB
- * when its controller exits, and from the attribute manager's memory (on
- * remaining nodes) when its attribute manager exits; if an election is won
- * between when those two things happen, we don't want to write the shutdown
- * attribute back out, which would cause the node to immediately shut down
- * the next time it rejoins.
*/
- attrd_write_attributes(attrd_write_all|attrd_write_skip_shutdown);
+ attrd_write_attributes(attrd_write_all);
return G_SOURCE_REMOVE;
}
diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h
index e3c369b5bc..a95bb54367 100644
--- a/daemons/attrd/pacemaker-attrd.h
+++ b/daemons/attrd/pacemaker-attrd.h
@@ -181,7 +181,6 @@ enum attrd_write_options {
attrd_write_changed = 0,
attrd_write_all = (1 << 0),
attrd_write_no_delay = (1 << 1),
- attrd_write_skip_shutdown = (1 << 2),
};
void attrd_write_attribute(attribute_t *a, bool ignore_delay);

View File

@ -1,34 +0,0 @@
From 14b87a38786ae5b4dc12fc1581e5d39a274fced2 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 30 Oct 2023 12:21:24 -0500
Subject: [PATCH] Fix: attrd: revert faulty T138 fix
f5263c9401 created a timing issue where a node could get a shutdown attribute,
the original writer leaves the cluster before writing it out, then the
shutting-down node wins the writer election. In that case, it would skip the
write-out and the scheduler would never shut it down.
Reopens T138
---
daemons/attrd/attrd_elections.c | 8 --------
1 file changed, 8 deletions(-)
diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
index 62310ed1d8..82fbe8affc 100644
--- a/daemons/attrd/attrd_elections.c
+++ b/daemons/attrd/attrd_elections.c
@@ -22,14 +22,6 @@
{
attrd_declare_winner();
- if (attrd_requesting_shutdown() || attrd_shutting_down()) {
- /* This node is shutting down or about to, meaning its attributes will
- * be removed (and may have already been removed from the CIB by a
- * controller). Don't sync or write its attributes in this case.
- */
- return G_SOURCE_REMOVE;
- }
-
/* Update the peers after an election */
attrd_peer_sync(NULL, NULL);

File diff suppressed because it is too large Load Diff