Compare commits

...

No commits in common. "c8" and "c8s" have entirely different histories.
c8 ... c8s

19 changed files with 5648 additions and 19511 deletions

2
.gitignore vendored
View File

@ -1,2 +1,2 @@
SOURCES/nagios-agents-metadata-105ab8a.tar.gz
SOURCES/pacemaker-6fdc9deea.tar.gz
SOURCES/pacemaker-a3f4479.tar.gz

View File

@ -1,2 +1,2 @@
ea6c0a27fd0ae8ce02f84a11f08a0d79377041c3 SOURCES/nagios-agents-metadata-105ab8a.tar.gz
fbf71fb3fb42c76f9f1e98497505eb8521cab55e SOURCES/pacemaker-6fdc9deea.tar.gz
883efa27f94c6a07942f51cf7c8959c5fbb624fe SOURCES/pacemaker-a3f4479.tar.gz

View File

@ -1,402 +0,0 @@
From cf53f523e691295879cd75cff1a86bc15664fa51 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 09:59:13 -0400
Subject: [PATCH 1/7] Feature: daemons: Add start state to LRMD handshake XML
This gets read out of /etc/sysconfig/pacemaker and set into the
environment. The remote node executor will then add that to the XML
that it sends to the controller upon startup.
Ref T183
---
daemons/execd/execd_commands.c | 5 +++++
include/crm_internal.h | 1 +
2 files changed, 6 insertions(+)
diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c
index fa2761e..9a783a5 100644
--- a/daemons/execd/execd_commands.c
+++ b/daemons/execd/execd_commands.c
@@ -1474,6 +1474,7 @@ process_lrmd_signon(pcmk__client_t *client, xmlNode *request, int call_id,
int rc = pcmk_ok;
time_t now = time(NULL);
const char *protocol_version = crm_element_value(request, F_LRMD_PROTOCOL_VERSION);
+ const char *start_state = pcmk__env_option(PCMK__ENV_NODE_START_STATE);
if (compare_version(protocol_version, LRMD_MIN_PROTOCOL_VERSION) < 0) {
crm_err("Cluster API version must be greater than or equal to %s, not %s",
@@ -1503,6 +1504,10 @@ process_lrmd_signon(pcmk__client_t *client, xmlNode *request, int call_id,
crm_xml_add(*reply, F_LRMD_PROTOCOL_VERSION, LRMD_PROTOCOL_VERSION);
crm_xml_add_ll(*reply, PCMK__XA_UPTIME, now - start_time);
+ if (start_state) {
+ crm_xml_add(*reply, PCMK__XA_NODE_START_STATE, start_state);
+ }
+
return rc;
}
diff --git a/include/crm_internal.h b/include/crm_internal.h
index 5f6531f..771bd26 100644
--- a/include/crm_internal.h
+++ b/include/crm_internal.h
@@ -84,6 +84,7 @@
#define PCMK__XA_GRAPH_ERRORS "graph-errors"
#define PCMK__XA_GRAPH_WARNINGS "graph-warnings"
#define PCMK__XA_MODE "mode"
+#define PCMK__XA_NODE_START_STATE "node_start_state"
#define PCMK__XA_TASK "task"
#define PCMK__XA_UPTIME "uptime"
#define PCMK__XA_CONN_HOST "connection_host"
--
2.31.1
From c950291742711b5c4c8986adc8e938fe6fef861c Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 10:04:32 -0400
Subject: [PATCH 2/7] Feature: liblrmd: Save a remote node's requested start
state
Ref T183
---
include/crm/common/ipc_internal.h | 1 +
lib/lrmd/lrmd_client.c | 7 +++++++
2 files changed, 8 insertions(+)
diff --git a/include/crm/common/ipc_internal.h b/include/crm/common/ipc_internal.h
index 5099dda..d203924 100644
--- a/include/crm/common/ipc_internal.h
+++ b/include/crm/common/ipc_internal.h
@@ -112,6 +112,7 @@ struct pcmk__remote_s {
int tcp_socket;
mainloop_io_t *source;
time_t uptime;
+ char *start_state;
/* CIB-only */
char *token;
diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index c565728..4239105 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -588,7 +588,9 @@ lrmd_tls_connection_destroy(gpointer userdata)
}
free(native->remote->buffer);
+ free(native->remote->start_state);
native->remote->buffer = NULL;
+ native->remote->start_state = NULL;
native->source = 0;
native->sock = 0;
native->psk_cred_c = NULL;
@@ -980,6 +982,7 @@ lrmd_handshake(lrmd_t * lrmd, const char *name)
const char *version = crm_element_value(reply, F_LRMD_PROTOCOL_VERSION);
const char *msg_type = crm_element_value(reply, F_LRMD_OPERATION);
const char *tmp_ticket = crm_element_value(reply, F_LRMD_CLIENTID);
+ const char *start_state = crm_element_value(reply, PCMK__XA_NODE_START_STATE);
long long uptime = -1;
crm_element_value_int(reply, F_LRMD_RC, &rc);
@@ -992,6 +995,10 @@ lrmd_handshake(lrmd_t * lrmd, const char *name)
crm_element_value_ll(reply, PCMK__XA_UPTIME, &uptime);
native->remote->uptime = uptime;
+ if (start_state) {
+ native->remote->start_state = strdup(start_state);
+ }
+
if (rc == -EPROTO) {
crm_err("Executor protocol version mismatch between client (%s) and server (%s)",
LRMD_PROTOCOL_VERSION, version);
--
2.31.1
From 7302014c7b7296be31b1f542b3f107d55b1fb2a0 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 10:05:13 -0400
Subject: [PATCH 3/7] Feature: liblrmd: Add lrmd__node_start_state.
This function is used to get the start state out of an lrmd_private_t
structure.
Ref T183
---
include/crm/lrmd_internal.h | 1 +
lib/lrmd/lrmd_client.c | 12 ++++++++++++
2 files changed, 13 insertions(+)
diff --git a/include/crm/lrmd_internal.h b/include/crm/lrmd_internal.h
index 5810554..d1cd25d 100644
--- a/include/crm/lrmd_internal.h
+++ b/include/crm/lrmd_internal.h
@@ -47,6 +47,7 @@ void lrmd__set_result(lrmd_event_data_t *event, enum ocf_exitcode rc,
void lrmd__reset_result(lrmd_event_data_t *event);
time_t lrmd__uptime(lrmd_t *lrmd);
+const char *lrmd__node_start_state(lrmd_t *lrmd);
/* Shared functions for IPC proxy back end */
diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index 4239105..82434b9 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -2538,3 +2538,15 @@ lrmd__uptime(lrmd_t *lrmd)
return native->remote->uptime;
}
}
+
+const char *
+lrmd__node_start_state(lrmd_t *lrmd)
+{
+ lrmd_private_t *native = lrmd->lrmd_private;
+
+ if (native->remote == NULL) {
+ return NULL;
+ } else {
+ return native->remote->start_state;
+ }
+}
--
2.31.1
From e5e4d43f847da0930bae12f63c7e9d9c44c07cdf Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 10:07:58 -0400
Subject: [PATCH 4/7] Refactor: controller: Make set_join_state a public
function.
This already does all the work of setting a node's start state. It just
needs to be made public and given arguments for what node to set instead
of reading globals.
Ref T183
---
daemons/controld/controld_join_client.c | 20 ++++++++++----------
daemons/controld/pacemaker-controld.h | 3 +++
2 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/daemons/controld/controld_join_client.c b/daemons/controld/controld_join_client.c
index da6a9d6..07e2a27 100644
--- a/daemons/controld/controld_join_client.c
+++ b/daemons/controld/controld_join_client.c
@@ -195,32 +195,31 @@ join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *
free_xml(generation);
}
-static void
-set_join_state(const char * start_state)
+void
+set_join_state(const char *start_state, const char *node_name, const char *node_uuid)
{
if (pcmk__str_eq(start_state, "standby", pcmk__str_casei)) {
crm_notice("Forcing node %s to join in %s state per configured "
- "environment", controld_globals.our_nodename, start_state);
+ "environment", node_name, start_state);
cib__update_node_attr(controld_globals.logger_out,
controld_globals.cib_conn, cib_sync_call,
- XML_CIB_TAG_NODES, controld_globals.our_uuid,
+ XML_CIB_TAG_NODES, node_uuid,
NULL, NULL, NULL, "standby", "on", NULL, NULL);
} else if (pcmk__str_eq(start_state, "online", pcmk__str_casei)) {
crm_notice("Forcing node %s to join in %s state per configured "
- "environment", controld_globals.our_nodename, start_state);
+ "environment", node_name, start_state);
cib__update_node_attr(controld_globals.logger_out,
controld_globals.cib_conn, cib_sync_call,
- XML_CIB_TAG_NODES, controld_globals.our_uuid,
+ XML_CIB_TAG_NODES, node_uuid,
NULL, NULL, NULL, "standby", "off", NULL, NULL);
} else if (pcmk__str_eq(start_state, "default", pcmk__str_casei)) {
- crm_debug("Not forcing a starting state on node %s",
- controld_globals.our_nodename);
+ crm_debug("Not forcing a starting state on node %s", node_name);
} else {
crm_warn("Unrecognized start state '%s', using 'default' (%s)",
- start_state, controld_globals.our_nodename);
+ start_state, node_name);
}
}
@@ -335,7 +334,8 @@ do_cl_join_finalize_respond(long long action,
first_join = FALSE;
if (start_state) {
- set_join_state(start_state);
+ set_join_state(start_state, controld_globals.our_nodename,
+ controld_globals.our_uuid);
}
}
diff --git a/daemons/controld/pacemaker-controld.h b/daemons/controld/pacemaker-controld.h
index 1484a00..d8c2ddd 100644
--- a/daemons/controld/pacemaker-controld.h
+++ b/daemons/controld/pacemaker-controld.h
@@ -36,4 +36,7 @@ void controld_remove_voter(const char *uname);
void controld_election_fini(void);
void controld_stop_current_election_timeout(void);
+void set_join_state(const char *start_state, const char *node_name,
+ const char *node_uuid);
+
#endif
--
2.31.1
From 63d069adb344bba2c982013226f87dfd95afaff3 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 13:38:03 -0400
Subject: [PATCH 5/7] Refactor: controller: set_join_state needs to take a
remote parameter.
Without this parameter, we won't know what to pass to as node_type to
cib__update_node_attr. And without that, that function will not know to
update a remote node - it'll try to update a regular node by the same
name, which either doesn't exist or is not what we were hoping would
happen.
Ref T138
---
daemons/controld/controld_join_client.c | 11 +++++++----
daemons/controld/pacemaker-controld.h | 2 +-
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/daemons/controld/controld_join_client.c b/daemons/controld/controld_join_client.c
index 07e2a27..799d1b4 100644
--- a/daemons/controld/controld_join_client.c
+++ b/daemons/controld/controld_join_client.c
@@ -196,7 +196,8 @@ join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *
}
void
-set_join_state(const char *start_state, const char *node_name, const char *node_uuid)
+set_join_state(const char *start_state, const char *node_name, const char *node_uuid,
+ bool remote)
{
if (pcmk__str_eq(start_state, "standby", pcmk__str_casei)) {
crm_notice("Forcing node %s to join in %s state per configured "
@@ -204,7 +205,8 @@ set_join_state(const char *start_state, const char *node_name, const char *node_
cib__update_node_attr(controld_globals.logger_out,
controld_globals.cib_conn, cib_sync_call,
XML_CIB_TAG_NODES, node_uuid,
- NULL, NULL, NULL, "standby", "on", NULL, NULL);
+ NULL, NULL, NULL, "standby", "on", NULL,
+ remote ? "remote" : NULL);
} else if (pcmk__str_eq(start_state, "online", pcmk__str_casei)) {
crm_notice("Forcing node %s to join in %s state per configured "
@@ -212,7 +214,8 @@ set_join_state(const char *start_state, const char *node_name, const char *node_
cib__update_node_attr(controld_globals.logger_out,
controld_globals.cib_conn, cib_sync_call,
XML_CIB_TAG_NODES, node_uuid,
- NULL, NULL, NULL, "standby", "off", NULL, NULL);
+ NULL, NULL, NULL, "standby", "off", NULL,
+ remote ? "remote" : NULL);
} else if (pcmk__str_eq(start_state, "default", pcmk__str_casei)) {
crm_debug("Not forcing a starting state on node %s", node_name);
@@ -335,7 +338,7 @@ do_cl_join_finalize_respond(long long action,
first_join = FALSE;
if (start_state) {
set_join_state(start_state, controld_globals.our_nodename,
- controld_globals.our_uuid);
+ controld_globals.our_uuid, false);
}
}
diff --git a/daemons/controld/pacemaker-controld.h b/daemons/controld/pacemaker-controld.h
index d8c2ddd..2334cce 100644
--- a/daemons/controld/pacemaker-controld.h
+++ b/daemons/controld/pacemaker-controld.h
@@ -37,6 +37,6 @@ void controld_election_fini(void);
void controld_stop_current_election_timeout(void);
void set_join_state(const char *start_state, const char *node_name,
- const char *node_uuid);
+ const char *node_uuid, bool remote);
#endif
--
2.31.1
From 67274787898355065315f8c06d62458e2c2b0afe Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 10:09:02 -0400
Subject: [PATCH 6/7] Feature: controller: When a remote node starts, apply any
start state.
If we were given a start state in the handshake XML, that is now stored
in the remote node cache's private data. Extract it and set the state
on the node with set_node_state.
Fixes T183
---
daemons/controld/controld_remote_ra.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c
index f24b755..8ab1e46 100644
--- a/daemons/controld/controld_remote_ra.c
+++ b/daemons/controld/controld_remote_ra.c
@@ -280,6 +280,7 @@ remote_node_up(const char *node_name)
int call_opt;
xmlNode *update, *state;
crm_node_t *node;
+ lrm_state_t *connection_rsc = NULL;
CRM_CHECK(node_name != NULL, return);
crm_info("Announcing Pacemaker Remote node %s", node_name);
@@ -301,6 +302,20 @@ remote_node_up(const char *node_name)
purge_remote_node_attrs(call_opt, node);
pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
+ /* Apply any start state that we were given from the environment on the
+ * remote node.
+ */
+ connection_rsc = lrm_state_find(node->uname);
+
+ if (connection_rsc != NULL) {
+ lrmd_t *lrm = connection_rsc->conn;
+ const char *start_state = lrmd__node_start_state(lrm);
+
+ if (start_state) {
+ set_join_state(start_state, node->uname, node->uuid, true);
+ }
+ }
+
/* pacemaker_remote nodes don't participate in the membership layer,
* so cluster nodes don't automatically get notified when they come and go.
* We send a cluster message to the DC, and update the CIB node state entry,
--
2.31.1
From 91cdda7056c9b9254a0d7e7a016b30f788e3e3ff Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 2 May 2023 10:16:30 -0400
Subject: [PATCH 7/7] Doc: sysconfig: Remote nodes now respect start state.
Ref T183
---
etc/sysconfig/pacemaker.in | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/etc/sysconfig/pacemaker.in b/etc/sysconfig/pacemaker.in
index 3b03ad6..041da71 100644
--- a/etc/sysconfig/pacemaker.in
+++ b/etc/sysconfig/pacemaker.in
@@ -144,8 +144,7 @@
# By default, the local host will join the cluster in an online or standby
# state when Pacemaker first starts depending on whether it was previously put
# into standby mode. If this variable is set to "standby" or "online", it will
-# force the local host to join in the specified state. This has no effect on
-# Pacemaker Remote nodes.
+# force the local host to join in the specified state.
#
# Default: PCMK_node_start_state="default"
--
2.31.1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,98 @@
From d8e08729ad5e3dc62f774172f992210902fc0ed4 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 23 Jan 2023 14:25:56 -0600
Subject: [PATCH] High: executor: fix regression in remote node shutdown
This reverts the essential part of d61494347, which was based on misdiagnosing
a remote node shutdown issue. Initially, it was thought that a "TLS server
session ended" log just after a remote node requested shutdown indicated that
the proxy connection coincidentally dropped at that moment. It actually is the
routine stopping of accepting new proxy connections, and existing when that
happens makes the remote node exit immediately without waiting for the
all-clear from the cluster.
Fixes T361
---
daemons/execd/pacemaker-execd.c | 19 +------------------
daemons/execd/pacemaker-execd.h | 3 +--
daemons/execd/remoted_tls.c | 6 +-----
3 files changed, 3 insertions(+), 25 deletions(-)
diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c
index db12674f13..491808974a 100644
--- a/daemons/execd/pacemaker-execd.c
+++ b/daemons/execd/pacemaker-execd.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2012-2022 the Pacemaker project contributors
+ * Copyright 2012-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -305,23 +305,6 @@ lrmd_exit(gpointer data)
return FALSE;
}
-/*!
- * \internal
- * \brief Clean up and exit if shutdown has started
- *
- * \return Doesn't return
- */
-void
-execd_exit_if_shutting_down(void)
-{
-#ifdef PCMK__COMPILE_REMOTE
- if (shutting_down) {
- crm_warn("exit because TLS connection was closed and 'shutting_down' set");
- lrmd_exit(NULL);
- }
-#endif
-}
-
/*!
* \internal
* \brief Request cluster shutdown if appropriate, otherwise exit immediately
diff --git a/daemons/execd/pacemaker-execd.h b/daemons/execd/pacemaker-execd.h
index 6646ae29e3..f78e8dcdde 100644
--- a/daemons/execd/pacemaker-execd.h
+++ b/daemons/execd/pacemaker-execd.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2012-2022 the Pacemaker project contributors
+ * Copyright 2012-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -105,6 +105,5 @@ void remoted_spawn_pidone(int argc, char **argv, char **envp);
int process_lrmd_alert_exec(pcmk__client_t *client, uint32_t id,
xmlNode *request);
void lrmd_drain_alerts(GMainLoop *mloop);
-void execd_exit_if_shutting_down(void);
#endif // PACEMAKER_EXECD__H
diff --git a/daemons/execd/remoted_tls.c b/daemons/execd/remoted_tls.c
index 6f4b2d0062..c65e3f394d 100644
--- a/daemons/execd/remoted_tls.c
+++ b/daemons/execd/remoted_tls.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2012-2022 the Pacemaker project contributors
+ * Copyright 2012-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -250,10 +250,6 @@ static void
tls_server_dropped(gpointer user_data)
{
crm_notice("TLS server session ended");
- /* If we are in the process of shutting down, then we should actually exit.
- * bz#1804259
- */
- execd_exit_if_shutting_down();
return;
}
--
2.31.1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,35 +0,0 @@
From 770d417e28dc9527fec8b8a00caaba8825995454 Mon Sep 17 00:00:00 2001
From: Grace Chin <gchin@redhat.com>
Date: Wed, 19 Jul 2023 10:25:55 -0400
Subject: [PATCH] Fix: tools: Fix a bug in clone resource description display
Previously, descriptions of resources running on multiple
nodes were displayed despite --full not being used (with pcs
status) or --show-detail not being used (with crm_mon).
For example, clone resources running on multiple nodes were
affected.
Now, --full and --show-detail must be used in order for resource
descriptions to be displayed, regardless of the number of nodes
the resource is run on.
see bz: 2106642
---
lib/pengine/pe_output.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/lib/pengine/pe_output.c b/lib/pengine/pe_output.c
index e0b43d997a..d1c9f6e226 100644
--- a/lib/pengine/pe_output.c
+++ b/lib/pengine/pe_output.c
@@ -20,8 +20,7 @@ pe__resource_description(const pe_resource_t *rsc, uint32_t show_opts)
{
const char * desc = NULL;
// User-supplied description
- if (pcmk_any_flags_set(show_opts, pcmk_show_rsc_only|pcmk_show_description)
- || pcmk__list_of_multiple(rsc->running_on)) {
+ if (pcmk_any_flags_set(show_opts, pcmk_show_rsc_only|pcmk_show_description)) {
desc = crm_element_value(rsc->xml, XML_ATTR_DESC);
}
return desc;

View File

@ -0,0 +1,107 @@
From 45617b727e280cac384a28ae3d96145e066e6197 Mon Sep 17 00:00:00 2001
From: Reid Wahl <nrwahl@protonmail.com>
Date: Fri, 3 Feb 2023 12:08:57 -0800
Subject: [PATCH 01/02] Fix: fencer: Prevent double g_source_remove of op_timer_one
QE observed a rarely reproducible core dump in the fencer during
Pacemaker shutdown, in which we try to g_source_remove() an op timer
that's already been removed.
free_stonith_remote_op_list()
-> g_hash_table_destroy()
-> g_hash_table_remove_all_nodes()
-> clear_remote_op_timers()
-> g_source_remove()
-> crm_glib_handler()
-> "Source ID 190 was not found when attempting to remove it"
The likely cause is that request_peer_fencing() doesn't set
op->op_timer_one to 0 after calling g_source_remove() on it, so if that
op is still in the stonith_remote_op_list at shutdown with the same
timer, clear_remote_op_timers() tries to remove the source for
op_timer_one again.
There are only five locations that call g_source_remove() on a
remote_fencing_op_t timer.
* Three of them are in clear_remote_op_timers(), which first 0-checks
the timer and then sets it to 0 after g_source_remove().
* One is in remote_op_query_timeout(), which does the same.
* The last is the one we fix here in request_peer_fencing().
I don't know all the conditions of QE's test scenario at this point.
What I do know:
* have-watchdog=true
* stonith-watchdog-timeout=10
* no explicit topology
* fence agent script is missing for the configured fence device
* requested fencing of one node
* cluster shutdown
Fixes RHBZ2166967
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
---
daemons/fenced/fenced_remote.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index d61b5bd..b7426ff 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -1825,6 +1825,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
op->state = st_exec;
if (op->op_timer_one) {
g_source_remove(op->op_timer_one);
+ op->op_timer_one = 0;
}
if (!((stonith_watchdog_timeout_ms > 0)
--
2.31.1
From 0291db4750322ec7f01ae6a4a2a30abca9d8e19e Mon Sep 17 00:00:00 2001
From: Reid Wahl <nrwahl@protonmail.com>
Date: Wed, 15 Feb 2023 22:30:27 -0800
Subject: [PATCH 02/02] Fix: fencer: Avoid double source remove of op_timer_total
remote_op_timeout() returns G_SOURCE_REMOVE, which tells GLib to remove
the source from the main loop after returning. Currently this function
is used as the callback only when creating op->op_timer_total.
If we don't set op->op_timer_total to 0 before returning from
remote_op_timeout(), then we can get an assertion and core dump from
GLib when the op's timers are being cleared (either during op
finalization or during fencer shutdown). This is because
clear_remote_op_timers() sees that op->op_timer_total != 0 and tries to
remove the source, but the source has already been removed.
Note that we're already (correctly) zeroing op->op_timer_one and
op->query_timeout as appropriate in their respective callback functions.
Fortunately, GLib doesn't care whether the source has already been
removed before we return G_SOURCE_REMOVE from a callback. So it's safe
to call finalize_op() (which removes all the op's timer sources) from
within a callback.
Fixes RHBZ#2166967
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
---
daemons/fenced/fenced_remote.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index b7426ff88..adea3d7d8 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -718,6 +718,8 @@ remote_op_timeout(gpointer userdata)
{
remote_fencing_op_t *op = userdata;
+ op->op_timer_total = 0;
+
if (op->state == st_done) {
crm_debug("Action '%s' targeting %s for client %s already completed "
CRM_XS " id=%.8s",
--
2.39.0

View File

@ -1,26 +0,0 @@
From ebac530c815a62f7c3a1c24f64e9a530d9753dbe Mon Sep 17 00:00:00 2001
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
Date: Wed, 19 Jul 2023 18:21:07 +0900
Subject: [PATCH] High: tools: The dampen parameter is disabled when setting
values with attrd_updater.
---
tools/attrd_updater.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/attrd_updater.c b/tools/attrd_updater.c
index b615a3575..4688b9ff6 100644
--- a/tools/attrd_updater.c
+++ b/tools/attrd_updater.c
@@ -501,7 +501,7 @@ send_attrd_update(char command, const char *attr_node, const char *attr_name,
case 'U':
rc = pcmk__attrd_api_update(NULL, attr_node, attr_name, attr_value,
- NULL, attr_set, NULL,
+ attr_dampen, attr_set, NULL,
attr_options | pcmk__node_attr_value);
break;
--
2.41.0

View File

@ -0,0 +1,151 @@
From 0d15568a538349ac41028db6b506d13dd23e8732 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 14 Feb 2023 14:00:37 -0500
Subject: [PATCH] High: libcrmcommon: Fix handling node=NULL in
pcmk__attrd_api_query.
According to the header file, if node is NULL, pcmk__attrd_api_query
should query the value of the given attribute on all cluster nodes.
This is also what the server expects and how attrd_updater is supposed
to work.
However, pcmk__attrd_api_query has no way of letting callers decide
whether they want to query all nodes or whether they want to use the
local node. We were passing NULL for the node name, which it took to
mean it should look up the local node name. This calls
pcmk__node_attr_target, which probes the local cluster name and returns
that to pcmk__attrd_api_query. If it returns non-NULL, that value will
then be put into the XML IPC call which means the server will only
return the value for that node.
In testing this was usually fine. However, in pratice, the methods
pcmk__node_attr_target uses to figure out the local cluster node name
involves checking the OCF_RESKEY_CRM_meta_on_node environment variable
among others.
This variable was never set in testing, but can be set in the real
world. This leads to circumstances where the user did "attrd_updater -QA"
expecting to get the values on all nodes, but instead only got the value
on the local cluster node.
In pacemaker-2.1.4 and prior, pcmk__node_attr_target was simply never
called if the node was NULL but was called otherwise.
The fix is to modify pcmk__attrd_api_query to take an option for
querying all nodes. If that's present, we'll query all nodes. If it's
not present, we'll look at the given node name - NULL means look it up,
anything else means just that node.
Regression in 2.1.5 introduced by eb20a65577
---
include/crm/common/attrd_internal.h | 6 +++++-
include/crm/common/ipc_attrd_internal.h | 7 +++++--
lib/common/ipc_attrd.c | 12 ++++++++----
tools/attrd_updater.c | 5 +++--
4 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/include/crm/common/attrd_internal.h b/include/crm/common/attrd_internal.h
index 389be48..7337c38 100644
--- a/include/crm/common/attrd_internal.h
+++ b/include/crm/common/attrd_internal.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2022 the Pacemaker project contributors
+ * Copyright 2004-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -25,6 +25,10 @@ enum pcmk__node_attr_opts {
pcmk__node_attr_perm = (1 << 5),
pcmk__node_attr_sync_local = (1 << 6),
pcmk__node_attr_sync_cluster = (1 << 7),
+ // pcmk__node_attr_utilization is 8, but that has not been backported.
+ // I'm leaving the gap here in case we backport that in the future and
+ // also to avoid problems on mixed-version clusters.
+ pcmk__node_attr_query_all = (1 << 9),
};
#define pcmk__set_node_attr_flags(node_attr_flags, flags_to_set) do { \
diff --git a/include/crm/common/ipc_attrd_internal.h b/include/crm/common/ipc_attrd_internal.h
index 2c6713f..b1b7584 100644
--- a/include/crm/common/ipc_attrd_internal.h
+++ b/include/crm/common/ipc_attrd_internal.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2022 the Pacemaker project contributors
+ * Copyright 2022-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -110,10 +110,13 @@ int pcmk__attrd_api_purge(pcmk_ipc_api_t *api, const char *node);
*
* \param[in,out] api Connection to pacemaker-attrd
* \param[in] node Look up the attribute for this node
- * (or NULL for all nodes)
+ * (or NULL for the local node)
* \param[in] name Attribute name
* \param[in] options Bitmask of pcmk__node_attr_opts
*
+ * \note Passing pcmk__node_attr_query_all will cause the function to query
+ * the value of \p name on all nodes, regardless of the value of \p node.
+ *
* \return Standard Pacemaker return code
*/
int pcmk__attrd_api_query(pcmk_ipc_api_t *api, const char *node, const char *name,
diff --git a/lib/common/ipc_attrd.c b/lib/common/ipc_attrd.c
index 4606509..dece49b 100644
--- a/lib/common/ipc_attrd.c
+++ b/lib/common/ipc_attrd.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2022 the Pacemaker project contributors
+ * Copyright 2011-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -332,10 +332,14 @@ pcmk__attrd_api_query(pcmk_ipc_api_t *api, const char *node, const char *name,
return EINVAL;
}
- target = pcmk__node_attr_target(node);
+ if (pcmk_is_set(options, pcmk__node_attr_query_all)) {
+ node = NULL;
+ } else {
+ target = pcmk__node_attr_target(node);
- if (target != NULL) {
- node = target;
+ if (target != NULL) {
+ node = target;
+ }
}
request = create_attrd_op(NULL);
diff --git a/tools/attrd_updater.c b/tools/attrd_updater.c
index 3cd766d..cbd341d 100644
--- a/tools/attrd_updater.c
+++ b/tools/attrd_updater.c
@@ -376,6 +376,7 @@ attrd_event_cb(pcmk_ipc_api_t *attrd_api, enum pcmk_ipc_event event_type,
static int
send_attrd_query(pcmk__output_t *out, const char *attr_name, const char *attr_node, gboolean query_all)
{
+ uint32_t options = pcmk__node_attr_none;
pcmk_ipc_api_t *attrd_api = NULL;
int rc = pcmk_rc_ok;
@@ -400,10 +401,10 @@ send_attrd_query(pcmk__output_t *out, const char *attr_name, const char *attr_no
/* Decide which node(s) to query */
if (query_all == TRUE) {
- attr_node = NULL;
+ options |= pcmk__node_attr_query_all;
}
- rc = pcmk__attrd_api_query(attrd_api, attr_node, attr_name, 0);
+ rc = pcmk__attrd_api_query(attrd_api, attr_node, attr_name, options);
if (rc != pcmk_rc_ok) {
g_set_error(&error, PCMK__RC_ERROR, rc, "Could not query value of %s: %s (%d)",
--
2.31.1

View File

@ -1,109 +0,0 @@
From 3e31da0016795397bfeacb2f3d76ecfe35cc1f67 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 17 Jul 2023 14:52:42 -0500
Subject: [PATCH] Fix: libcrmcommon: wait for reply from appropriate controller
commands
ipc_controld.c:reply_expected() wrongly omitted PCMK__CONTROLD_CMD_NODES (which
hasn't been a problem because crm_node uses a mainloop instead of sync dispatch
for that) and CRM_OP_RM_NODE_CACHE (which can be sent via
ipc_client.c:pcmk_ipc_purge_node()).
Because CRM_OP_RM_NODE_CACHE gets only an ack and no further replies, we now
have to be careful not to return true from the controller's dispatch()
function, otherwise crm_node -R would wait forever for more data. That means
we have to check for whether any replies are expected, which means we have to
increment expected replies *before* sending a request (in case it's sync).
Regression introduced in 2.0.5 by ae14fa4a
Fixes T681
---
lib/common/ipc_controld.c | 49 ++++++++++++++-------------------------
1 file changed, 17 insertions(+), 32 deletions(-)
diff --git a/lib/common/ipc_controld.c b/lib/common/ipc_controld.c
index 3c3a98964..405fd0518 100644
--- a/lib/common/ipc_controld.c
+++ b/lib/common/ipc_controld.c
@@ -177,18 +177,16 @@ set_nodes_data(pcmk_controld_api_reply_t *data, xmlNode *msg_data)
static bool
reply_expected(pcmk_ipc_api_t *api, xmlNode *request)
{
- const char *command = crm_element_value(request, F_CRM_TASK);
-
- if (command == NULL) {
- return false;
- }
-
- // We only need to handle commands that functions in this file can send
- return !strcmp(command, CRM_OP_REPROBE)
- || !strcmp(command, CRM_OP_NODE_INFO)
- || !strcmp(command, CRM_OP_PING)
- || !strcmp(command, CRM_OP_LRM_FAIL)
- || !strcmp(command, CRM_OP_LRM_DELETE);
+ // We only need to handle commands that API functions can send
+ return pcmk__str_any_of(crm_element_value(request, F_CRM_TASK),
+ PCMK__CONTROLD_CMD_NODES,
+ CRM_OP_LRM_DELETE,
+ CRM_OP_LRM_FAIL,
+ CRM_OP_NODE_INFO,
+ CRM_OP_PING,
+ CRM_OP_REPROBE,
+ CRM_OP_RM_NODE_CACHE,
+ NULL);
}
static bool
@@ -202,22 +200,12 @@ dispatch(pcmk_ipc_api_t *api, xmlNode *reply)
pcmk_controld_reply_unknown, NULL, NULL,
};
- /* If we got an ACK, return true so the caller knows to expect more responses
- * from the IPC server. We do this before decrementing replies_expected because
- * ACKs are not going to be included in that value.
- *
- * Note that we cannot do the same kind of status checking here that we do in
- * ipc_pacemakerd.c. The ACK message we receive does not necessarily contain
- * a status attribute. That is, we may receive this:
- *
- * <ack function="crmd_remote_proxy_cb" line="556"/>
- *
- * Instead of this:
- *
- * <ack function="dispatch_controller_ipc" line="391" status="112"/>
- */
if (pcmk__str_eq(crm_element_name(reply), "ack", pcmk__str_none)) {
- return true; // More replies needed
+ /* ACKs are trivial responses that do not count toward expected replies,
+ * and do not have all the fields that validation requires, so skip that
+ * processing.
+ */
+ return private->replies_expected > 0;
}
if (private->replies_expected > 0) {
@@ -344,18 +332,15 @@ static int
send_controller_request(pcmk_ipc_api_t *api, xmlNode *request,
bool reply_is_expected)
{
- int rc;
-
if (crm_element_value(request, XML_ATTR_REFERENCE) == NULL) {
return EINVAL;
}
- rc = pcmk__send_ipc_request(api, request);
- if ((rc == pcmk_rc_ok) && reply_is_expected) {
+ if (reply_is_expected) {
struct controld_api_private_s *private = api->api_data;
private->replies_expected++;
}
- return rc;
+ return pcmk__send_ipc_request(api, request);
}
static xmlNode *
--
2.41.0

View File

@ -1,163 +0,0 @@
From 63f4bd4d5a324e6eb279340a42c7c36c8902ada7 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 2 Aug 2023 15:55:26 -0500
Subject: [PATCH 1/4] Fix: controller: don't try to execute agent action at
shutdown
Normally, agent execution is not possible at shutdown. However, when metadata
is needed for some action, the agent can be called asynchronously, and when the
metadata action returns, the original action is performed. If the metadata is
initiated before shutdown, but completes after shutdown has begun, do not try
to attempt the original action, so we avoid unnecessary error logs.
---
daemons/controld/controld_execd.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 530e4346c8..a90e8d833e 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1400,7 +1400,9 @@ metadata_complete(int pid, const pcmk__action_result_t *result, void *user_data)
md = controld_cache_metadata(lrm_state->metadata_cache, data->rsc,
result->action_stdout);
}
- do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+ if (!pcmk_is_set(controld_globals.fsa_input_register, R_HA_DISCONNECTED)) {
+ do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+ }
free_metadata_cb_data(data);
}
From 247d9534f36f690c1474e36cedaadb3934022a05 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 2 Aug 2023 16:16:31 -0500
Subject: [PATCH 2/4] Refactor: controller: de-functionize lrm_state_destroy()
It was a one-liner called once
---
daemons/controld/controld_execd_state.c | 8 +-------
daemons/controld/controld_lrm.h | 5 -----
2 files changed, 1 insertion(+), 12 deletions(-)
diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c
index 8c68bfca08..4a87a9b332 100644
--- a/daemons/controld/controld_execd_state.c
+++ b/daemons/controld/controld_execd_state.c
@@ -132,12 +132,6 @@ lrm_state_create(const char *node_name)
return state;
}
-void
-lrm_state_destroy(const char *node_name)
-{
- g_hash_table_remove(lrm_state_table, node_name);
-}
-
static gboolean
remote_proxy_remove_by_node(gpointer key, gpointer value, gpointer user_data)
{
@@ -799,7 +793,7 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state,
}
if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) {
- lrm_state_destroy(rsc_id);
+ g_hash_table_remove(lrm_state_table, rsc_id);
return pcmk_ok;
}
diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h
index 25f3db3316..c3113e49c3 100644
--- a/daemons/controld/controld_lrm.h
+++ b/daemons/controld/controld_lrm.h
@@ -108,11 +108,6 @@ gboolean lrm_state_init_local(void);
*/
void lrm_state_destroy_all(void);
-/*!
- * \brief Destroy executor connection by node name
- */
-void lrm_state_destroy(const char *node_name);
-
/*!
* \brief Find lrm_state data by node name
*/
From 1b915f1ce38756431f7faa142565e3e07aade194 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 2 Aug 2023 15:58:09 -0500
Subject: [PATCH 3/4] Low: controller: guard lrm_state_table usage with NULL
check
It is NULL while draining the mainloop during the shutdown sequence.
---
daemons/controld/controld_execd_state.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c
index 4a87a9b332..b90cc5e635 100644
--- a/daemons/controld/controld_execd_state.c
+++ b/daemons/controld/controld_execd_state.c
@@ -301,7 +301,7 @@ lrm_state_destroy_all(void)
lrm_state_t *
lrm_state_find(const char *node_name)
{
- if (!node_name) {
+ if ((node_name == NULL) || (lrm_state_table == NULL)) {
return NULL;
}
return g_hash_table_lookup(lrm_state_table, node_name);
@@ -312,6 +312,8 @@ lrm_state_find_or_create(const char *node_name)
{
lrm_state_t *lrm_state;
+ CRM_CHECK(lrm_state_table != NULL, return NULL);
+
lrm_state = g_hash_table_lookup(lrm_state_table, node_name);
if (!lrm_state) {
lrm_state = lrm_state_create(node_name);
@@ -323,6 +325,9 @@ lrm_state_find_or_create(const char *node_name)
GList *
lrm_state_get_list(void)
{
+ if (lrm_state_table == NULL) {
+ return NULL;
+ }
return g_hash_table_get_values(lrm_state_table);
}
From 78581213ed3bf4183b0ec1f391b720d5d91f3f68 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 2 Aug 2023 15:48:36 -0500
Subject: [PATCH 4/4] Log: controller: improve messages for resource history
updates
---
daemons/controld/controld_cib.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c
index 22ac42486f..c9dde0b748 100644
--- a/daemons/controld/controld_cib.c
+++ b/daemons/controld/controld_cib.c
@@ -861,10 +861,17 @@ cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *use
case pcmk_ok:
case -pcmk_err_diff_failed:
case -pcmk_err_diff_resync:
- crm_trace("Resource update %d complete: rc=%d", call_id, rc);
+ crm_trace("Resource history update completed (call=%d rc=%d)",
+ call_id, rc);
break;
default:
- crm_warn("Resource update %d failed: (rc=%d) %s", call_id, rc, pcmk_strerror(rc));
+ if (call_id > 0) {
+ crm_warn("Resource history update %d failed: %s "
+ CRM_XS " rc=%d", call_id, pcmk_strerror(rc), rc);
+ } else {
+ crm_warn("Resource history update failed: %s " CRM_XS " rc=%d",
+ pcmk_strerror(rc), rc);
+ }
}
if (call_id == pending_rsc_update) {

View File

@ -1,45 +0,0 @@
From f5263c9401c9c38d4e039149deddcc0da0c184ba Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 3 Aug 2023 12:17:08 -0500
Subject: [PATCH] Fix: attrd: avoid race condition when shutting down
This addresses a race condition that can occur when the DC and the attribute
writer are different nodes, and shutting down at the same time. When the DC
controller leaves its Corosync process group, the remaining nodes erase its
transient node attributes (including "shutdown") from the CIB. However if the
(former) DC's attrd is still up, it can win the attribute writer election
called after the original writer leaves. As the election winner, it writes out
all its attributes to the CIB, including "shutdown". The next time it rejoins
the cluster, it will be immediately shut down.
Fixes T138
---
daemons/attrd/attrd_elections.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
index 3b6b55a0f59..6f4916888a9 100644
--- a/daemons/attrd/attrd_elections.c
+++ b/daemons/attrd/attrd_elections.c
@@ -22,12 +22,20 @@ attrd_election_cb(gpointer user_data)
{
attrd_declare_winner();
+ if (attrd_requesting_shutdown() || attrd_shutting_down()) {
+ /* This node is shutting down or about to, meaning its attributes will
+ * be removed (and may have already been removed from the CIB by a
+ * controller). Don't sync or write its attributes in this case.
+ */
+ return G_SOURCE_REMOVE;
+ }
+
/* Update the peers after an election */
attrd_peer_sync(NULL, NULL);
/* Update the CIB after an election */
attrd_write_attributes(true, false);
- return FALSE;
+ return G_SOURCE_REMOVE;
}
void

View File

@ -1,210 +0,0 @@
From 83e547cc64f2586031a007ab58e91fc22cd1a68a Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 24 Aug 2023 12:18:23 -0500
Subject: [PATCH] Refactor: attrd: use enum instead of bools for
attrd_write_attributes()
---
daemons/attrd/attrd_cib.c | 24 ++++++++++++++++++------
daemons/attrd/attrd_corosync.c | 2 +-
daemons/attrd/attrd_elections.c | 2 +-
daemons/attrd/attrd_ipc.c | 2 +-
daemons/attrd/attrd_utils.c | 2 +-
daemons/attrd/pacemaker-attrd.h | 8 +++++++-
6 files changed, 29 insertions(+), 11 deletions(-)
diff --git a/daemons/attrd/attrd_cib.c b/daemons/attrd/attrd_cib.c
index 928c0133745..9c787fe1024 100644
--- a/daemons/attrd/attrd_cib.c
+++ b/daemons/attrd/attrd_cib.c
@@ -343,16 +343,23 @@ attrd_write_attribute(attribute_t *a, bool ignore_delay)
free_xml(xml_top);
}
+/*!
+ * \internal
+ * \brief Write out attributes
+ *
+ * \param[in] options Group of enum attrd_write_options
+ */
void
-attrd_write_attributes(bool all, bool ignore_delay)
+attrd_write_attributes(uint32_t options)
{
GHashTableIter iter;
attribute_t *a = NULL;
- crm_debug("Writing out %s attributes", all? "all" : "changed");
+ crm_debug("Writing out %s attributes",
+ pcmk_is_set(options, attrd_write_all)? "all" : "changed");
g_hash_table_iter_init(&iter, attributes);
while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & a)) {
- if (!all && a->unknown_peer_uuids) {
+ if (!pcmk_is_set(options, attrd_write_all) && a->unknown_peer_uuids) {
// Try writing this attribute again, in case peer ID was learned
a->changed = true;
} else if (a->force_write) {
@@ -360,9 +367,14 @@ attrd_write_attributes(bool all, bool ignore_delay)
a->changed = true;
}
- if(all || a->changed) {
- /* When forced write flag is set, ignore delay. */
- attrd_write_attribute(a, (a->force_write ? true : ignore_delay));
+ if (pcmk_is_set(options, attrd_write_all) || a->changed) {
+ bool ignore_delay = pcmk_is_set(options, attrd_write_no_delay);
+
+ if (a->force_write) {
+ // Always ignore delay when forced write flag is set
+ ignore_delay = true;
+ }
+ attrd_write_attribute(a, ignore_delay);
} else {
crm_trace("Skipping unchanged attribute %s", a->id);
}
diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c
index 1aec35a054e..49631df6e44 100644
--- a/daemons/attrd/attrd_corosync.c
+++ b/daemons/attrd/attrd_corosync.c
@@ -285,7 +285,7 @@ record_peer_nodeid(attribute_value_t *v, const char *host)
crm_trace("Learned %s has node id %s", known_peer->uname, known_peer->uuid);
if (attrd_election_won()) {
- attrd_write_attributes(false, false);
+ attrd_write_attributes(attrd_write_changed);
}
}
diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
index c25a41a4492..01341db18e4 100644
--- a/daemons/attrd/attrd_elections.c
+++ b/daemons/attrd/attrd_elections.c
@@ -34,7 +34,7 @@ attrd_election_cb(gpointer user_data)
attrd_peer_sync(NULL, NULL);
/* Update the CIB after an election */
- attrd_write_attributes(true, false);
+ attrd_write_attributes(attrd_write_all);
return G_SOURCE_REMOVE;
}
diff --git a/daemons/attrd/attrd_ipc.c b/daemons/attrd/attrd_ipc.c
index 4be789de7f9..05c4a696a19 100644
--- a/daemons/attrd/attrd_ipc.c
+++ b/daemons/attrd/attrd_ipc.c
@@ -232,7 +232,7 @@ attrd_client_refresh(pcmk__request_t *request)
crm_info("Updating all attributes");
attrd_send_ack(request->ipc_client, request->ipc_id, request->ipc_flags);
- attrd_write_attributes(true, true);
+ attrd_write_attributes(attrd_write_all|attrd_write_no_delay);
pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
return NULL;
diff --git a/daemons/attrd/attrd_utils.c b/daemons/attrd/attrd_utils.c
index c43eac1695a..bfd51368890 100644
--- a/daemons/attrd/attrd_utils.c
+++ b/daemons/attrd/attrd_utils.c
@@ -156,7 +156,7 @@ attrd_cib_replaced_cb(const char *event, xmlNode * msg)
if (attrd_election_won()) {
if (change_section & (cib_change_section_nodes | cib_change_section_status)) {
crm_notice("Updating all attributes after %s event", event);
- attrd_write_attributes(true, false);
+ attrd_write_attributes(attrd_write_all);
}
}
diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h
index 41f31d97b3b..2d781d11394 100644
--- a/daemons/attrd/pacemaker-attrd.h
+++ b/daemons/attrd/pacemaker-attrd.h
@@ -176,8 +176,14 @@ void attrd_free_attribute(gpointer data);
void attrd_free_attribute_value(gpointer data);
attribute_t *attrd_populate_attribute(xmlNode *xml, const char *attr);
+enum attrd_write_options {
+ attrd_write_changed = 0,
+ attrd_write_all = (1 << 0),
+ attrd_write_no_delay = (1 << 1),
+};
+
void attrd_write_attribute(attribute_t *a, bool ignore_delay);
-void attrd_write_attributes(bool all, bool ignore_delay);
+void attrd_write_attributes(uint32_t options);
void attrd_write_or_elect_attribute(attribute_t *a);
extern int minimum_protocol_version;
From 58400e272cfc51f02eec69cdd0ed0d27a30e78a3 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 24 Aug 2023 12:27:53 -0500
Subject: [PATCH] Fix: attrd: avoid race condition at writer election
f5263c94 was not a complete fix. The issue may also occur if a remaining node
(not the original DC or writer) wins the attribute writer election after the
original DC's controller has exited but before its attribute manger has exited.
The long-term solution will be to have the attribute manager (instead of the
controller) be in control of erasing transient attributes from the CIB when a
node leaves. This short-term workaround simply has new attribute writers skip
shutdown attributes when writing out all attributes.
Fixes T138
---
daemons/attrd/attrd_cib.c | 5 +++++
daemons/attrd/attrd_elections.c | 14 ++++++++++++--
daemons/attrd/pacemaker-attrd.h | 1 +
3 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/daemons/attrd/attrd_cib.c b/daemons/attrd/attrd_cib.c
index 9c787fe102..2c910b4c64 100644
--- a/daemons/attrd/attrd_cib.c
+++ b/daemons/attrd/attrd_cib.c
@@ -359,6 +359,11 @@ attrd_write_attributes(uint32_t options)
pcmk_is_set(options, attrd_write_all)? "all" : "changed");
g_hash_table_iter_init(&iter, attributes);
while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & a)) {
+ if (pcmk_is_set(options, attrd_write_skip_shutdown)
+ && pcmk__str_eq(a->id, XML_CIB_ATTR_SHUTDOWN, pcmk__str_none)) {
+ continue;
+ }
+
if (!pcmk_is_set(options, attrd_write_all) && a->unknown_peer_uuids) {
// Try writing this attribute again, in case peer ID was learned
a->changed = true;
diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
index 01341db18e..a95cd44cbd 100644
--- a/daemons/attrd/attrd_elections.c
+++ b/daemons/attrd/attrd_elections.c
@@ -33,8 +33,18 @@ attrd_election_cb(gpointer user_data)
/* Update the peers after an election */
attrd_peer_sync(NULL, NULL);
- /* Update the CIB after an election */
- attrd_write_attributes(attrd_write_all);
+ /* After winning an election, update the CIB with the values of all
+ * attributes as the winner knows them.
+ *
+ * However, do not write out any "shutdown" attributes. A node that is
+ * shutting down will have all its transient attributes removed from the CIB
+ * when its controller exits, and from the attribute manager's memory (on
+ * remaining nodes) when its attribute manager exits; if an election is won
+ * between when those two things happen, we don't want to write the shutdown
+ * attribute back out, which would cause the node to immediately shut down
+ * the next time it rejoins.
+ */
+ attrd_write_attributes(attrd_write_all|attrd_write_skip_shutdown);
return G_SOURCE_REMOVE;
}
diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h
index 2d781d1139..2e35bd7ec5 100644
--- a/daemons/attrd/pacemaker-attrd.h
+++ b/daemons/attrd/pacemaker-attrd.h
@@ -180,6 +180,7 @@ enum attrd_write_options {
attrd_write_changed = 0,
attrd_write_all = (1 << 0),
attrd_write_no_delay = (1 << 1),
+ attrd_write_skip_shutdown = (1 << 2),
};
void attrd_write_attribute(attribute_t *a, bool ignore_delay);

View File

@ -1,62 +0,0 @@
From 2e81e0db9a716c486805e0760f78be65ca79eeae Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 17 Oct 2023 15:28:27 -0500
Subject: [PATCH] Fix: attrd: avoid regression by reverting 58400e27
Fixes T714
---
daemons/attrd/attrd_cib.c | 5 -----
daemons/attrd/attrd_elections.c | 10 +---------
daemons/attrd/pacemaker-attrd.h | 1 -
3 files changed, 1 insertion(+), 15 deletions(-)
diff --git a/daemons/attrd/attrd_cib.c b/daemons/attrd/attrd_cib.c
index 2de37a7cb6..9ce2872715 100644
--- a/daemons/attrd/attrd_cib.c
+++ b/daemons/attrd/attrd_cib.c
@@ -641,11 +641,6 @@ attrd_write_attributes(uint32_t options)
pcmk_is_set(options, attrd_write_all)? "all" : "changed");
g_hash_table_iter_init(&iter, attributes);
while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & a)) {
- if (pcmk_is_set(options, attrd_write_skip_shutdown)
- && pcmk__str_eq(a->id, XML_CIB_ATTR_SHUTDOWN, pcmk__str_none)) {
- continue;
- }
-
if (!pcmk_is_set(options, attrd_write_all) && a->unknown_peer_uuids) {
// Try writing this attribute again, in case peer ID was learned
a->changed = true;
diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
index a95cd44cbd..62310ed1d8 100644
--- a/daemons/attrd/attrd_elections.c
+++ b/daemons/attrd/attrd_elections.c
@@ -35,16 +35,8 @@ attrd_election_cb(gpointer user_data)
/* After winning an election, update the CIB with the values of all
* attributes as the winner knows them.
- *
- * However, do not write out any "shutdown" attributes. A node that is
- * shutting down will have all its transient attributes removed from the CIB
- * when its controller exits, and from the attribute manager's memory (on
- * remaining nodes) when its attribute manager exits; if an election is won
- * between when those two things happen, we don't want to write the shutdown
- * attribute back out, which would cause the node to immediately shut down
- * the next time it rejoins.
*/
- attrd_write_attributes(attrd_write_all|attrd_write_skip_shutdown);
+ attrd_write_attributes(attrd_write_all);
return G_SOURCE_REMOVE;
}
diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h
index e3c369b5bc..a95bb54367 100644
--- a/daemons/attrd/pacemaker-attrd.h
+++ b/daemons/attrd/pacemaker-attrd.h
@@ -181,7 +181,6 @@ enum attrd_write_options {
attrd_write_changed = 0,
attrd_write_all = (1 << 0),
attrd_write_no_delay = (1 << 1),
- attrd_write_skip_shutdown = (1 << 2),
};
void attrd_write_attribute(attribute_t *a, bool ignore_delay);

View File

@ -1,34 +0,0 @@
From 14b87a38786ae5b4dc12fc1581e5d39a274fced2 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 30 Oct 2023 12:21:24 -0500
Subject: [PATCH] Fix: attrd: revert faulty T138 fix
f5263c9401 created a timing issue where a node could get a shutdown attribute,
the original writer leaves the cluster before writing it out, then the
shutting-down node wins the writer election. In that case, it would skip the
write-out and the scheduler would never shut it down.
Reopens T138
---
daemons/attrd/attrd_elections.c | 8 --------
1 file changed, 8 deletions(-)
diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
index 62310ed1d8..82fbe8affc 100644
--- a/daemons/attrd/attrd_elections.c
+++ b/daemons/attrd/attrd_elections.c
@@ -22,14 +22,6 @@
{
attrd_declare_winner();
- if (attrd_requesting_shutdown() || attrd_shutting_down()) {
- /* This node is shutting down or about to, meaning its attributes will
- * be removed (and may have already been removed from the CIB by a
- * controller). Don't sync or write its attributes in this case.
- */
- return G_SOURCE_REMOVE;
- }
-
/* Update the peers after an election */
attrd_peer_sync(NULL, NULL);

View File

@ -35,13 +35,19 @@
## Upstream pacemaker version, and its package version (specversion
## can be incremented to build packages reliably considered "newer"
## than previously built packages with the same pcmkversion)
%global pcmkversion 2.1.6
%global specversion 9
%global pcmkversion 2.1.5
%global specversion 8
## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build
%global commit 6fdc9deea294bbad629b003c6ae036aaed8e3ee0
%global commit a3f44794f94e1571c6ba0042915ade369b4ce4b1
## Since git v2.11, the extent of abbreviation is autoscaled by default
## (used to be constant of 7), so we need to convey it for non-tags, too.
%if (0%{?fedora} >= 26) || (0%{?rhel} >= 9)
%global commit_abbrev 9
%else
%global commit_abbrev 7
%endif
## Nagios source control identifiers
%global nagios_name nagios-agents-metadata
@ -242,12 +248,12 @@
Name: pacemaker
Summary: Scalable High-Availability cluster resource manager
Version: %{pcmkversion}
Release: %{pcmk_release}.1%{?dist}
Release: %{pcmk_release}%{?dist}
%if %{defined _unitdir}
License: GPL-2.0-or-later AND LGPL-2.1-or-later
License: GPLv2+ and LGPLv2+
%else
# initscript is Revised BSD
License: GPL-2.0-or-later AND LGPL-2.1-or-later AND BSD-3-Clause
License: GPLv2+ and LGPLv2+ and BSD
%endif
Url: https://www.clusterlabs.org/
@ -263,17 +269,11 @@ Source0: https://codeload.github.com/%{github_owner}/%{name}/tar.gz/%{arch
Source1: nagios-agents-metadata-%{nagios_hash}.tar.gz
# upstream commits
Patch001: 001-remote-start-state.patch
Patch002: 002-group-colocation-constraint.patch
Patch003: 003-clone-shuffle.patch
Patch004: 004-clone-rsc-display.patch
Patch005: 005-attrd-dampen.patch
Patch006: 006-controller-reply.patch
Patch007: 007-glib-assertions.patch
Patch008: 008-attrd-shutdown.patch
Patch009: 009-attrd-shutdown-2.patch
Patch010: 010-revert-58400e27.patch
Patch011: 011-revert-f5263c94.patch
Patch001: 001-sync-points.patch
Patch002: 002-remote-regression.patch
Patch003: 003-history-cleanup.patch
Patch004: 004-g_source_remove.patch
Patch005: 005-query-null.patch
# downstream-only commits
#Patch1xx: 1xx-xxxx.patch
@ -298,7 +298,6 @@ ExclusiveArch: aarch64 i686 ppc64le s390x x86_64
Requires: %{python_path}
BuildRequires: %{python_name}-devel
BuildRequires: %{python_name}-setuptools
# Pacemaker requires a minimum libqb functionality
Requires: libqb >= 0.17.0
@ -389,7 +388,7 @@ Available rpmbuild rebuild options:
stonithd
%package cli
License: GPL-2.0-or-later AND LGPL-2.1-or-later
License: GPLv2+ and LGPLv2+
Summary: Command line tools for controlling Pacemaker clusters
Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
# For crm_report
@ -409,7 +408,7 @@ to query and control the cluster from machines that may, or may not,
be part of the cluster.
%package -n %{pkgname_pcmk_libs}
License: GPL-2.0-or-later AND LGPL-2.1-or-later
License: GPLv2+ and LGPLv2+
Summary: Core Pacemaker libraries
Requires(pre): %{pkgname_shadow_utils}
Requires: %{name}-schemas = %{version}-%{release}
@ -426,7 +425,7 @@ The %{pkgname_pcmk_libs} package contains shared libraries needed for cluster
nodes and those just running the CLI tools.
%package cluster-libs
License: GPL-2.0-or-later AND LGPL-2.1-or-later
License: GPLv2+ and LGPLv2+
Summary: Cluster Libraries used by Pacemaker
Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
@ -437,26 +436,12 @@ manager.
The %{name}-cluster-libs package contains cluster-aware shared
libraries needed for nodes that will form part of the cluster nodes.
%package -n %{python_name}-%{name}
License: LGPL-2.1-or-later
Summary: Python libraries for Pacemaker
Requires: %{python_path}
Requires: %{pkgname_pcmk_libs} = %{version}-%{release}
BuildArch: noarch
%description -n %{python_name}-%{name}
Pacemaker is an advanced, scalable High-Availability cluster resource
manager.
The %{python_name}-%{name} package contains a Python library that can be used
to interface with Pacemaker.
%package remote
%if %{defined _unitdir}
License: GPL-2.0-or-later AND LGPL-2.1-or-later
License: GPLv2+ and LGPLv2+
%else
# initscript is Revised BSD
License: GPL-2.0-or-later AND LGPL-2.1-or-later AND BSD-3-Clause
License: GPLv2+ and LGPLv2+ and BSD
%endif
Summary: Pacemaker remote executor daemon for non-cluster nodes
Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
@ -479,7 +464,7 @@ which is capable of extending pacemaker functionality to remote
nodes not running the full corosync/cluster stack.
%package -n %{pkgname_pcmk_libs}-devel
License: GPL-2.0-or-later AND LGPL-2.1-or-later
License: GPLv2+ and LGPLv2+
Summary: Pacemaker development package
Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
Requires: %{name}-cluster-libs%{?_isa} = %{version}-%{release}
@ -502,12 +487,11 @@ The %{pkgname_pcmk_libs}-devel package contains headers and shared libraries
for developing tools for Pacemaker.
%package cts
License: GPL-2.0-or-later AND LGPL-2.1-or-later
License: GPLv2+ and LGPLv2+
Summary: Test framework for cluster-related technologies like Pacemaker
Requires: %{python_path}
Requires: %{pkgname_pcmk_libs} = %{version}-%{release}
Requires: %{name}-cli = %{version}-%{release}
Requires: %{python_name}-%{name} = %{version}-%{release}
Requires: %{pkgname_procps}
Requires: psmisc
Requires: %{python_name}-psutil
@ -535,7 +519,7 @@ Pacemaker is an advanced, scalable High-Availability cluster resource
manager.
%package schemas
License: GPL-2.0-or-later
License: GPLv2+
Summary: Schemas and upgrade stylesheets for Pacemaker
BuildArch: noarch
@ -617,10 +601,6 @@ sed -i 's|^runpath_var=LD_RUN_PATH|runpath_var=DIE_RPATH_DIE|g' libtool
make %{_smp_mflags} V=1
pushd python
%py3_build
popd
%check
make %{_smp_mflags} check
{ cts/cts-scheduler --run load-stopped-loop \
@ -639,10 +619,6 @@ make install \
DESTDIR=%{buildroot} V=1 docdir=%{pcmk_docdir} \
%{?_python_bytecompile_extra:%{?py_byte_compile:am__py_compile=true}}
pushd python
%py3_install
popd
%if %{with upstart_job}
mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/init
install -m 644 pacemakerd/pacemaker.upstart ${RPM_BUILD_ROOT}%{_sysconfdir}/init/pacemaker.conf
@ -893,21 +869,17 @@ exit 0
%dir %{ocf_root}/resource.d
%{ocf_root}/resource.d/pacemaker
%doc %{_mandir}/man7/*pacemaker*
%doc %{_mandir}/man7/*
%exclude %{_mandir}/man7/pacemaker-controld.*
%exclude %{_mandir}/man7/pacemaker-schedulerd.*
%exclude %{_mandir}/man7/pacemaker-fenced.*
%exclude %{_mandir}/man7/ocf_pacemaker_controld.*
%exclude %{_mandir}/man7/ocf_pacemaker_remote.*
%doc %{_mandir}/man8/crm*.8.gz
%doc %{_mandir}/man8/*
%exclude %{_mandir}/man8/crm_master.*
%doc %{_mandir}/man8/attrd_updater.*
%doc %{_mandir}/man8/cibadmin.*
%if %{with cibsecrets}
%doc %{_mandir}/man8/cibsecret.*
%endif
%doc %{_mandir}/man8/iso8601.*
%doc %{_mandir}/man8/stonith_admin.*
%exclude %{_mandir}/man8/fence_watchdog.*
%exclude %{_mandir}/man8/pacemakerd.*
%exclude %{_mandir}/man8/pacemaker-remoted.*
%license licenses/GPLv2
%doc COPYING
@ -938,14 +910,6 @@ exit 0
%doc COPYING
%doc ChangeLog
%files -n %{python_name}-%{name}
%{python3_sitelib}/pacemaker/
%{python3_sitelib}/pacemaker-*.egg-info
%exclude %{python3_sitelib}/pacemaker/_cts/
%license licenses/LGPLv2.1
%doc COPYING
%doc ChangeLog
%files remote
%config(noreplace) %{_sysconfdir}/sysconfig/pacemaker
%if %{defined _unitdir}
@ -971,7 +935,6 @@ exit 0
%files cts
%{python_site}/cts
%{python3_sitelib}/pacemaker/_cts/
%{_datadir}/pacemaker/tests
%{_libexecdir}/pacemaker/cts-log-watcher
@ -983,16 +946,8 @@ exit 0
%files -n %{pkgname_pcmk_libs}-devel
%{_includedir}/pacemaker
%{_libdir}/libcib.so
%{_libdir}/liblrmd.so
%{_libdir}/libcrmservice.so
%{_libdir}/libcrmcommon.so
%{_libdir}/libpe_status.so
%{_libdir}/libpe_rules.so
%{_libdir}/libpacemaker.so
%{_libdir}/libstonithd.so
%{_libdir}/libcrmcluster.so
%{_libdir}/pkgconfig/*pacemaker*.pc
%{_libdir}/*.so
%{_libdir}/pkgconfig/*.pc
%license licenses/LGPLv2.1
%doc COPYING
%doc ChangeLog
@ -1012,61 +967,6 @@ exit 0
%license %{nagios_name}-%{nagios_hash}/COPYING
%changelog
* Tue Oct 31 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-9.1
- Revert the rest of the attrd shutdown race condition fix
- Related: RHEL-14052
* Mon Oct 23 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-9
- Avoid an error if the elected attrd is on a node that is shutting down
- Resolves: RHEL-14052
* Tue Aug 29 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-8
- Fix an additional shutdown race between attrd and the controller
- Related: rhbz2228955
* Mon Aug 7 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-7
- Fix attrd race condition when shutting down
- Resolves: rhbz2228955
* Thu Jul 27 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-6
- Wait for a reply from various controller commands
- Resolves: rhbz2225631
- Related: rhbz2189300
* Tue Jul 25 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-5
- Apply dampening when creating attributes with attrd_updater -U
- Resolves: rhbz2224046
- Related: rhbz2189300
* Wed Jul 19 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-4
- Clone instances should not shuffle unnecessarily
- Fix a bug in clone resource description display
- Resolves: rhbz1931023
- Resolves: rhbz1688149
- Related: rhbz2106642
- Related: rhbz2189300
* Mon Jul 10 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-3
- Fix moving groups when there's a constraint for a single group member
- Resolves: rhbz2218232
- Resolves: rhbz2189300
* Wed Jun 21 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-2
- Support start state for Pacemaker Remote nodes
- Resolves: rhbz1502795
* Thu May 25 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-1
- Rebase pacemaker on upstream 2.1.6 final release
- Resolves: rhbz1578820
- Resolves: rhbz1632951
- Resolves: rhbz1876173
- Resolves: rhbz2010084
- Resolves: rhbz2030869
- Resolves: rhbz2078611
- Resolves: rhbz2106642
- Resolves: rhbz2160206
- Resolves: rhbz2168633
* Wed Feb 22 2023 Chris Lumens <clumens@redhat.com> - 2.1.5-8
- Rebuild with new release due to build system problems
- Related: rhbz2168249