Fix attrd race condition when shutting down
- Resolves: rhbz2228955
This commit is contained in:
parent
fa92bb6ea7
commit
bd3a707a67
163
007-glib-assertions.patch
Normal file
163
007-glib-assertions.patch
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
From 63f4bd4d5a324e6eb279340a42c7c36c8902ada7 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Ken Gaillot <kgaillot@redhat.com>
|
||||||
|
Date: Wed, 2 Aug 2023 15:55:26 -0500
|
||||||
|
Subject: [PATCH 1/4] Fix: controller: don't try to execute agent action at
|
||||||
|
shutdown
|
||||||
|
|
||||||
|
Normally, agent execution is not possible at shutdown. However, when metadata
|
||||||
|
is needed for some action, the agent can be called asynchronously, and when the
|
||||||
|
metadata action returns, the original action is performed. If the metadata is
|
||||||
|
initiated before shutdown, but completes after shutdown has begun, do not try
|
||||||
|
to attempt the original action, so we avoid unnecessary error logs.
|
||||||
|
---
|
||||||
|
daemons/controld/controld_execd.c | 4 +++-
|
||||||
|
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
|
||||||
|
index 530e4346c8..a90e8d833e 100644
|
||||||
|
--- a/daemons/controld/controld_execd.c
|
||||||
|
+++ b/daemons/controld/controld_execd.c
|
||||||
|
@@ -1400,7 +1400,9 @@ metadata_complete(int pid, const pcmk__action_result_t *result, void *user_data)
|
||||||
|
md = controld_cache_metadata(lrm_state->metadata_cache, data->rsc,
|
||||||
|
result->action_stdout);
|
||||||
|
}
|
||||||
|
- do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
|
||||||
|
+ if (!pcmk_is_set(controld_globals.fsa_input_register, R_HA_DISCONNECTED)) {
|
||||||
|
+ do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
|
||||||
|
+ }
|
||||||
|
free_metadata_cb_data(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
From 247d9534f36f690c1474e36cedaadb3934022a05 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Ken Gaillot <kgaillot@redhat.com>
|
||||||
|
Date: Wed, 2 Aug 2023 16:16:31 -0500
|
||||||
|
Subject: [PATCH 2/4] Refactor: controller: de-functionize lrm_state_destroy()
|
||||||
|
|
||||||
|
It was a one-liner called once
|
||||||
|
---
|
||||||
|
daemons/controld/controld_execd_state.c | 8 +-------
|
||||||
|
daemons/controld/controld_lrm.h | 5 -----
|
||||||
|
2 files changed, 1 insertion(+), 12 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c
|
||||||
|
index 8c68bfca08..4a87a9b332 100644
|
||||||
|
--- a/daemons/controld/controld_execd_state.c
|
||||||
|
+++ b/daemons/controld/controld_execd_state.c
|
||||||
|
@@ -132,12 +132,6 @@ lrm_state_create(const char *node_name)
|
||||||
|
return state;
|
||||||
|
}
|
||||||
|
|
||||||
|
-void
|
||||||
|
-lrm_state_destroy(const char *node_name)
|
||||||
|
-{
|
||||||
|
- g_hash_table_remove(lrm_state_table, node_name);
|
||||||
|
-}
|
||||||
|
-
|
||||||
|
static gboolean
|
||||||
|
remote_proxy_remove_by_node(gpointer key, gpointer value, gpointer user_data)
|
||||||
|
{
|
||||||
|
@@ -799,7 +793,7 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state,
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) {
|
||||||
|
- lrm_state_destroy(rsc_id);
|
||||||
|
+ g_hash_table_remove(lrm_state_table, rsc_id);
|
||||||
|
return pcmk_ok;
|
||||||
|
}
|
||||||
|
|
||||||
|
diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h
|
||||||
|
index 25f3db3316..c3113e49c3 100644
|
||||||
|
--- a/daemons/controld/controld_lrm.h
|
||||||
|
+++ b/daemons/controld/controld_lrm.h
|
||||||
|
@@ -108,11 +108,6 @@ gboolean lrm_state_init_local(void);
|
||||||
|
*/
|
||||||
|
void lrm_state_destroy_all(void);
|
||||||
|
|
||||||
|
-/*!
|
||||||
|
- * \brief Destroy executor connection by node name
|
||||||
|
- */
|
||||||
|
-void lrm_state_destroy(const char *node_name);
|
||||||
|
-
|
||||||
|
/*!
|
||||||
|
* \brief Find lrm_state data by node name
|
||||||
|
*/
|
||||||
|
|
||||||
|
From 1b915f1ce38756431f7faa142565e3e07aade194 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Ken Gaillot <kgaillot@redhat.com>
|
||||||
|
Date: Wed, 2 Aug 2023 15:58:09 -0500
|
||||||
|
Subject: [PATCH 3/4] Low: controller: guard lrm_state_table usage with NULL
|
||||||
|
check
|
||||||
|
|
||||||
|
It is NULL while draining the mainloop during the shutdown sequence.
|
||||||
|
---
|
||||||
|
daemons/controld/controld_execd_state.c | 7 ++++++-
|
||||||
|
1 file changed, 6 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c
|
||||||
|
index 4a87a9b332..b90cc5e635 100644
|
||||||
|
--- a/daemons/controld/controld_execd_state.c
|
||||||
|
+++ b/daemons/controld/controld_execd_state.c
|
||||||
|
@@ -301,7 +301,7 @@ lrm_state_destroy_all(void)
|
||||||
|
lrm_state_t *
|
||||||
|
lrm_state_find(const char *node_name)
|
||||||
|
{
|
||||||
|
- if (!node_name) {
|
||||||
|
+ if ((node_name == NULL) || (lrm_state_table == NULL)) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return g_hash_table_lookup(lrm_state_table, node_name);
|
||||||
|
@@ -312,6 +312,8 @@ lrm_state_find_or_create(const char *node_name)
|
||||||
|
{
|
||||||
|
lrm_state_t *lrm_state;
|
||||||
|
|
||||||
|
+ CRM_CHECK(lrm_state_table != NULL, return NULL);
|
||||||
|
+
|
||||||
|
lrm_state = g_hash_table_lookup(lrm_state_table, node_name);
|
||||||
|
if (!lrm_state) {
|
||||||
|
lrm_state = lrm_state_create(node_name);
|
||||||
|
@@ -323,6 +325,9 @@ lrm_state_find_or_create(const char *node_name)
|
||||||
|
GList *
|
||||||
|
lrm_state_get_list(void)
|
||||||
|
{
|
||||||
|
+ if (lrm_state_table == NULL) {
|
||||||
|
+ return NULL;
|
||||||
|
+ }
|
||||||
|
return g_hash_table_get_values(lrm_state_table);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
From 78581213ed3bf4183b0ec1f391b720d5d91f3f68 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Ken Gaillot <kgaillot@redhat.com>
|
||||||
|
Date: Wed, 2 Aug 2023 15:48:36 -0500
|
||||||
|
Subject: [PATCH 4/4] Log: controller: improve messages for resource history
|
||||||
|
updates
|
||||||
|
|
||||||
|
---
|
||||||
|
daemons/controld/controld_cib.c | 11 +++++++++--
|
||||||
|
1 file changed, 9 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c
|
||||||
|
index 22ac42486f..c9dde0b748 100644
|
||||||
|
--- a/daemons/controld/controld_cib.c
|
||||||
|
+++ b/daemons/controld/controld_cib.c
|
||||||
|
@@ -861,10 +861,17 @@ cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *use
|
||||||
|
case pcmk_ok:
|
||||||
|
case -pcmk_err_diff_failed:
|
||||||
|
case -pcmk_err_diff_resync:
|
||||||
|
- crm_trace("Resource update %d complete: rc=%d", call_id, rc);
|
||||||
|
+ crm_trace("Resource history update completed (call=%d rc=%d)",
|
||||||
|
+ call_id, rc);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
- crm_warn("Resource update %d failed: (rc=%d) %s", call_id, rc, pcmk_strerror(rc));
|
||||||
|
+ if (call_id > 0) {
|
||||||
|
+ crm_warn("Resource history update %d failed: %s "
|
||||||
|
+ CRM_XS " rc=%d", call_id, pcmk_strerror(rc), rc);
|
||||||
|
+ } else {
|
||||||
|
+ crm_warn("Resource history update failed: %s " CRM_XS " rc=%d",
|
||||||
|
+ pcmk_strerror(rc), rc);
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
|
||||||
|
if (call_id == pending_rsc_update) {
|
45
008-attrd-shutdown.patch
Normal file
45
008-attrd-shutdown.patch
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
From f5263c9401c9c38d4e039149deddcc0da0c184ba Mon Sep 17 00:00:00 2001
|
||||||
|
From: Ken Gaillot <kgaillot@redhat.com>
|
||||||
|
Date: Thu, 3 Aug 2023 12:17:08 -0500
|
||||||
|
Subject: [PATCH] Fix: attrd: avoid race condition when shutting down
|
||||||
|
|
||||||
|
This addresses a race condition that can occur when the DC and the attribute
|
||||||
|
writer are different nodes, and shutting down at the same time. When the DC
|
||||||
|
controller leaves its Corosync process group, the remaining nodes erase its
|
||||||
|
transient node attributes (including "shutdown") from the CIB. However if the
|
||||||
|
(former) DC's attrd is still up, it can win the attribute writer election
|
||||||
|
called after the original writer leaves. As the election winner, it writes out
|
||||||
|
all its attributes to the CIB, including "shutdown". The next time it rejoins
|
||||||
|
the cluster, it will be immediately shut down.
|
||||||
|
|
||||||
|
Fixes T138
|
||||||
|
---
|
||||||
|
daemons/attrd/attrd_elections.c | 10 +++++++++-
|
||||||
|
1 file changed, 9 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
|
||||||
|
index 3b6b55a0f59..6f4916888a9 100644
|
||||||
|
--- a/daemons/attrd/attrd_elections.c
|
||||||
|
+++ b/daemons/attrd/attrd_elections.c
|
||||||
|
@@ -22,12 +22,20 @@ attrd_election_cb(gpointer user_data)
|
||||||
|
{
|
||||||
|
attrd_declare_winner();
|
||||||
|
|
||||||
|
+ if (attrd_requesting_shutdown() || attrd_shutting_down()) {
|
||||||
|
+ /* This node is shutting down or about to, meaning its attributes will
|
||||||
|
+ * be removed (and may have already been removed from the CIB by a
|
||||||
|
+ * controller). Don't sync or write its attributes in this case.
|
||||||
|
+ */
|
||||||
|
+ return G_SOURCE_REMOVE;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
/* Update the peers after an election */
|
||||||
|
attrd_peer_sync(NULL, NULL);
|
||||||
|
|
||||||
|
/* Update the CIB after an election */
|
||||||
|
attrd_write_attributes(true, false);
|
||||||
|
- return FALSE;
|
||||||
|
+ return G_SOURCE_REMOVE;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
@ -36,7 +36,7 @@
|
|||||||
## can be incremented to build packages reliably considered "newer"
|
## can be incremented to build packages reliably considered "newer"
|
||||||
## than previously built packages with the same pcmkversion)
|
## than previously built packages with the same pcmkversion)
|
||||||
%global pcmkversion 2.1.6
|
%global pcmkversion 2.1.6
|
||||||
%global specversion 6
|
%global specversion 7
|
||||||
|
|
||||||
## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build
|
## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build
|
||||||
%global commit 6fdc9deea294bbad629b003c6ae036aaed8e3ee0
|
%global commit 6fdc9deea294bbad629b003c6ae036aaed8e3ee0
|
||||||
@ -269,6 +269,8 @@ Patch003: 003-clone-shuffle.patch
|
|||||||
Patch004: 004-clone-rsc-display.patch
|
Patch004: 004-clone-rsc-display.patch
|
||||||
Patch005: 005-attrd-dampen.patch
|
Patch005: 005-attrd-dampen.patch
|
||||||
Patch006: 006-controller-reply.patch
|
Patch006: 006-controller-reply.patch
|
||||||
|
Patch007: 007-glib-assertions.patch
|
||||||
|
Patch008: 008-attrd-shutdown.patch
|
||||||
|
|
||||||
# downstream-only commits
|
# downstream-only commits
|
||||||
#Patch1xx: 1xx-xxxx.patch
|
#Patch1xx: 1xx-xxxx.patch
|
||||||
@ -1007,6 +1009,10 @@ exit 0
|
|||||||
%license %{nagios_name}-%{nagios_hash}/COPYING
|
%license %{nagios_name}-%{nagios_hash}/COPYING
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Mon Aug 7 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-7
|
||||||
|
- Fix attrd race condition when shutting down
|
||||||
|
- Resolves: rhbz2228955
|
||||||
|
|
||||||
* Thu Jul 27 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-6
|
* Thu Jul 27 2023 Chris Lumens <clumens@redhat.com> - 2.1.6-6
|
||||||
- Wait for a reply from various controller commands
|
- Wait for a reply from various controller commands
|
||||||
- Resolves: rhbz2225631
|
- Resolves: rhbz2225631
|
||||||
|
Loading…
Reference in New Issue
Block a user