Backport fixes from main.
- Handle large timeouts correctly in crm_resource --wait - Do not try to connect to subdaemons before they're respawned - Don't evict IPC clients as long as they're still processing messages - Don't overwhelm the FSA queue with repeated CIB queries - Resolves: RHEL-45869 - Resolves: RHEL-87484 - Resolves: RHEL-114894
This commit is contained in:
parent
04570bb85c
commit
2982dbace6
131
001-crm_resource_wait.patch
Normal file
131
001-crm_resource_wait.patch
Normal file
@@ -0,0 +1,131 @@
|
||||
From 17a2d7acffc86e06b2fa8b49c96c33d72466e7f7 Mon Sep 17 00:00:00 2001
|
||||
From: Reid Wahl <nrwahl@protonmail.com>
|
||||
Date: Fri, 5 Sep 2025 20:35:31 -0700
|
||||
Subject: [PATCH] Fix: tools: Handle large timeouts correctly in crm_resource
|
||||
--wait
|
||||
|
||||
Previously, if the --timeout value parsed to a value greater than
|
||||
(UINT_MAX - 999), the wait timeout would overflow. The effective timeout
|
||||
would be either 0 seconds or 1 second. This is because 999 was added to
|
||||
the guint value before passing it to pcmk__timeout_ms2s().
|
||||
|
||||
Now, we simply pass the timeout in milliseconds to
|
||||
pcmk__timeout_ms2s(), without adding 999.
|
||||
|
||||
This implies a slight behavior change. Previously, timeouts were always
|
||||
rounded up to the next greatest second. Now, they're rounded to the
|
||||
nearest second. For example, previously:
|
||||
* timeout values between 1ms and 500ms => wait timeout of 1 second
|
||||
* timeout values between 501ms and 1500ms => wait timeout of 2 seconds
|
||||
* timeout values between 1501ms and 2500ms => wait timeout of 3 seconds
|
||||
* and so on
|
||||
|
||||
Now:
|
||||
* timeout values between 1ms and 1499ms => wait timeout of 1 second
|
||||
* timeout values between 1500ms and 2499ms => wait timeout of 2 seconds
|
||||
* timeout values between 2500ms and 3499ms => wait timeout of 3 seconds
|
||||
* and so on
|
||||
|
||||
The previous rounding behavior has existed since crm_resource --wait was
|
||||
added by 424afcdf.
|
||||
|
||||
Update the help text to note the granularity and rounding behavior. The
|
||||
exact behavior of the restart command is confusing, and its logic should
|
||||
be cleaned up in the future.
|
||||
|
||||
Fixes RHEL-45869
|
||||
Fixes RHEL-86148
|
||||
Closes T841
|
||||
|
||||
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
|
||||
---
|
||||
include/crm/common/internal.h | 1 +
|
||||
lib/common/utils.c | 30 ++++++++++++++++++++++++++++++
|
||||
tools/crm_resource.c | 4 +++-
|
||||
tools/crm_resource_runtime.c | 2 +-
|
||||
4 files changed, 35 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h
|
||||
index 7d1e95f..ee26321 100644
|
||||
--- a/include/crm/common/internal.h
|
||||
+++ b/include/crm/common/internal.h
|
||||
@@ -249,6 +249,7 @@ void pcmk__daemonize(const char *name, const char *pidfile);
|
||||
void pcmk__panic(const char *origin);
|
||||
pid_t pcmk__locate_sbd(void);
|
||||
void pcmk__sleep_ms(unsigned int ms);
|
||||
+guint pcmk__timeout_ms2s(guint timeout_ms);
|
||||
|
||||
extern int pcmk__score_red;
|
||||
extern int pcmk__score_green;
|
||||
diff --git a/lib/common/utils.c b/lib/common/utils.c
|
||||
index 9e44c22..0a38811 100644
|
||||
--- a/lib/common/utils.c
|
||||
+++ b/lib/common/utils.c
|
||||
@@ -413,6 +413,36 @@ pcmk__sleep_ms(unsigned int ms)
|
||||
#endif
|
||||
}
|
||||
|
||||
+/*!
|
||||
+ * \internal
|
||||
+ * \brief Convert milliseconds to seconds
|
||||
+ *
|
||||
+ * \param[in] timeout_ms The interval, in ms
|
||||
+ *
|
||||
+ * \return If \p timeout_ms is 0, return 0. Otherwise, return the number of
|
||||
+ * seconds, rounded to the nearest integer, with a minimum of 1.
|
||||
+ */
|
||||
+guint
|
||||
+pcmk__timeout_ms2s(guint timeout_ms)
|
||||
+{
|
||||
+ guint quot, rem;
|
||||
+
|
||||
+ if (timeout_ms == 0) {
|
||||
+ return 0;
|
||||
+ } else if (timeout_ms < 1000) {
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ quot = timeout_ms / 1000;
|
||||
+ rem = timeout_ms % 1000;
|
||||
+
|
||||
+ if (rem >= 500) {
|
||||
+ quot += 1;
|
||||
+ }
|
||||
+
|
||||
+ return quot;
|
||||
+}
|
||||
+
|
||||
// Deprecated functions kept only for backward API compatibility
|
||||
// LCOV_EXCL_START
|
||||
|
||||
diff --git a/tools/crm_resource.c b/tools/crm_resource.c
|
||||
index 898c21e..ac50c5e 100644
|
||||
--- a/tools/crm_resource.c
|
||||
+++ b/tools/crm_resource.c
|
||||
@@ -766,7 +766,9 @@ static GOptionEntry addl_entries[] = {
|
||||
"ID" },
|
||||
{ "timeout", 'T', G_OPTION_FLAG_NONE, G_OPTION_ARG_CALLBACK, timeout_cb,
|
||||
"(Advanced) Abort if command does not finish in this time (with\n"
|
||||
- INDENT "--restart, --wait, --force-*)",
|
||||
+ INDENT "--restart, --wait, --force-*). The --restart command uses a\n"
|
||||
+ INDENT "two-second granularity and the --wait command uses a one-second\n"
|
||||
+ INDENT "granularity, with rounding.",
|
||||
"N" },
|
||||
{ "all", 0, G_OPTION_FLAG_NONE, G_OPTION_ARG_NONE, &options.all,
|
||||
"List all options, including advanced and deprecated (with\n"
|
||||
diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
|
||||
index 7350f07..286c10c 100644
|
||||
--- a/tools/crm_resource_runtime.c
|
||||
+++ b/tools/crm_resource_runtime.c
|
||||
@@ -1977,7 +1977,7 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib)
|
||||
if (timeout_ms == 0) {
|
||||
expire_time += WAIT_DEFAULT_TIMEOUT_S;
|
||||
} else {
|
||||
- expire_time += (timeout_ms + 999) / 1000;
|
||||
+ expire_time += pcmk__timeout_ms2s(timeout_ms);
|
||||
}
|
||||
|
||||
scheduler = pe_new_working_set();
|
||||
--
|
||||
2.47.1
|
||||
|
||||
178
002-ipc_connect_retry.patch
Normal file
178
002-ipc_connect_retry.patch
Normal file
@@ -0,0 +1,178 @@
|
||||
From 8ff58d786907b64c32350e72e341cdd0f5026813 Mon Sep 17 00:00:00 2001
|
||||
From: Thomas Jones <thomas.jones@ibm.com>
|
||||
Date: Fri, 30 May 2025 16:40:13 -0400
|
||||
Subject: [PATCH 1/2] Fix: libcrmcommon: Add retries on connect to avoid fatal
|
||||
errors when sub-daemons communicate Add pcmk__connect_ipc_retry_conrefused()
|
||||
and use it where it makes sense Add retry loop to
|
||||
connect_and_send_attrd_request() that retries connect and send.
|
||||
|
||||
---
|
||||
daemons/controld/controld_schedulerd.c | 2 +-
|
||||
include/crm/common/ipc_internal.h | 4 ++++
|
||||
lib/common/ipc_attrd.c | 19 ++++++++++++----
|
||||
lib/common/ipc_client.c | 30 ++++++++++++++++++++++++++
|
||||
lib/pacemaker/pcmk_cluster_queries.c | 2 +-
|
||||
5 files changed, 51 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/daemons/controld/controld_schedulerd.c b/daemons/controld/controld_schedulerd.c
|
||||
index 22b37b8..444bdef 100644
|
||||
--- a/daemons/controld/controld_schedulerd.c
|
||||
+++ b/daemons/controld/controld_schedulerd.c
|
||||
@@ -197,7 +197,7 @@ new_schedulerd_ipc_connection(void)
|
||||
|
||||
pcmk_register_ipc_callback(schedulerd_api, scheduler_event_callback, NULL);
|
||||
|
||||
- rc = pcmk__connect_ipc(schedulerd_api, pcmk_ipc_dispatch_main, 3);
|
||||
+ rc = pcmk__connect_ipc_retry_conrefused(schedulerd_api, pcmk_ipc_dispatch_main, 3);
|
||||
if (rc != pcmk_rc_ok) {
|
||||
crm_err("Error connecting to %s: %s",
|
||||
pcmk_ipc_name(schedulerd_api, true), pcmk_rc_str(rc));
|
||||
diff --git a/include/crm/common/ipc_internal.h b/include/crm/common/ipc_internal.h
|
||||
index 27b8bfc..09145ba 100644
|
||||
--- a/include/crm/common/ipc_internal.h
|
||||
+++ b/include/crm/common/ipc_internal.h
|
||||
@@ -101,6 +101,10 @@ int pcmk__ipc_fd(crm_ipc_t *ipc, int *fd);
|
||||
int pcmk__connect_ipc(pcmk_ipc_api_t *api, enum pcmk_ipc_dispatch dispatch_type,
|
||||
int attempts);
|
||||
|
||||
+int pcmk__connect_ipc_retry_conrefused(pcmk_ipc_api_t *api,
|
||||
+ enum pcmk_ipc_dispatch dispatch_type,
|
||||
+ int attempts);
|
||||
+
|
||||
/*
|
||||
* Server-related
|
||||
*/
|
||||
diff --git a/lib/common/ipc_attrd.c b/lib/common/ipc_attrd.c
|
||||
index 5ab0f2d..60a92a0 100644
|
||||
--- a/lib/common/ipc_attrd.c
|
||||
+++ b/lib/common/ipc_attrd.c
|
||||
@@ -152,6 +152,8 @@ create_attrd_op(const char *user_name)
|
||||
static int
|
||||
connect_and_send_attrd_request(pcmk_ipc_api_t *api, const xmlNode *request)
|
||||
{
|
||||
+ static const int max_retries = 5;
|
||||
+ int remaining_attempts = max_retries;
|
||||
int rc = pcmk_rc_ok;
|
||||
bool created_api = false;
|
||||
|
||||
@@ -163,10 +165,19 @@ connect_and_send_attrd_request(pcmk_ipc_api_t *api, const xmlNode *request)
|
||||
created_api = true;
|
||||
}
|
||||
|
||||
- rc = pcmk__connect_ipc(api, pcmk_ipc_dispatch_sync, 5);
|
||||
- if (rc == pcmk_rc_ok) {
|
||||
- rc = pcmk__send_ipc_request(api, request);
|
||||
- }
|
||||
+ // If attrd is killed and is being restarted we will temporarily get
|
||||
+ // ECONNREFUSED on connect if it is already dead or ENOTCONN if it died
|
||||
+ // after we connected to it. We should wait a bit and retry in those cases.
|
||||
+ do {
|
||||
+ if (rc == ENOTCONN || rc == ECONNREFUSED) {
|
||||
+ sleep(max_retries - remaining_attempts);
|
||||
+ }
|
||||
+ rc = pcmk__connect_ipc(api, pcmk_ipc_dispatch_sync, remaining_attempts);
|
||||
+ if (rc == pcmk_rc_ok) {
|
||||
+ rc = pcmk__send_ipc_request(api, request);
|
||||
+ }
|
||||
+ remaining_attempts--;
|
||||
+ } while ((rc == ENOTCONN || rc == ECONNREFUSED) && remaining_attempts >= 0);
|
||||
|
||||
if (created_api) {
|
||||
pcmk_free_ipc_api(api);
|
||||
diff --git a/lib/common/ipc_client.c b/lib/common/ipc_client.c
|
||||
index 24d7745..4ac7810 100644
|
||||
--- a/lib/common/ipc_client.c
|
||||
+++ b/lib/common/ipc_client.c
|
||||
@@ -488,6 +488,36 @@ connect_without_main_loop(pcmk_ipc_api_t *api)
|
||||
return rc;
|
||||
}
|
||||
|
||||
+/*!
|
||||
+ * \internal
|
||||
+ * \brief Connect to a Pacemaker daemon via IPC (retrying after soft errors
|
||||
+ * and ECONNREFUSED)
|
||||
+ *
|
||||
+ * \param[in,out] api IPC API instance
|
||||
+ * \param[in] dispatch_type How IPC replies should be dispatched
|
||||
+ * \param[in] attempts How many times to try (in case of soft error)
|
||||
+ *
|
||||
+ * \return Standard Pacemaker return code
|
||||
+*/
|
||||
+int
|
||||
+pcmk__connect_ipc_retry_conrefused(pcmk_ipc_api_t *api,
|
||||
+ enum pcmk_ipc_dispatch dispatch_type,
|
||||
+ int attempts)
|
||||
+{
|
||||
+ int remaining = attempts;
|
||||
+ int rc = pcmk_rc_ok;
|
||||
+
|
||||
+ do {
|
||||
+ if (rc == ECONNREFUSED) {
|
||||
+ pcmk__sleep_ms((attempts - remaining) * 500);
|
||||
+ }
|
||||
+ rc = pcmk__connect_ipc(api, dispatch_type, remaining);
|
||||
+ remaining--;
|
||||
+ } while (rc == ECONNREFUSED && remaining >= 0);
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
+
|
||||
/*!
|
||||
* \internal
|
||||
* \brief Connect to a Pacemaker daemon via IPC (retrying after soft errors)
|
||||
diff --git a/lib/pacemaker/pcmk_cluster_queries.c b/lib/pacemaker/pcmk_cluster_queries.c
|
||||
index bbefcda..c1fcf73 100644
|
||||
--- a/lib/pacemaker/pcmk_cluster_queries.c
|
||||
+++ b/lib/pacemaker/pcmk_cluster_queries.c
|
||||
@@ -360,7 +360,7 @@ ipc_connect(data_t *data, enum pcmk_ipc_server server, pcmk_ipc_callback_t cb,
|
||||
pcmk_register_ipc_callback(api, cb, data);
|
||||
}
|
||||
|
||||
- rc = pcmk__connect_ipc(api, dispatch_type, 5);
|
||||
+ rc = pcmk__connect_ipc_retry_conrefused(api, dispatch_type, 5);
|
||||
if (rc != pcmk_rc_ok) {
|
||||
if (rc == EREMOTEIO) {
|
||||
data->pcmkd_state = pcmk_pacemakerd_state_remote;
|
||||
--
|
||||
2.47.1
|
||||
|
||||
From 2e46b914fb08d346d7b022ba75302f9290034507 Mon Sep 17 00:00:00 2001
|
||||
From: Chris Lumens <clumens@redhat.com>
|
||||
Date: Mon, 4 Aug 2025 10:38:00 -0400
|
||||
Subject: [PATCH 2/2] Med: libpacemaker: Do not retry on ECONNREFUSED in tools.
|
||||
|
||||
This is a regression introduced by e438946787. In that patch, what
|
||||
we're trying to do is retry IPC connections between daemons. If a
|
||||
daemon gets ECONNREFUSED when it initiates an IPC connection, the most
|
||||
likely reason is that another daemon has been killed and is restarting
|
||||
but is not yet ready to accept connections. Waiting and retrying
|
||||
repeatedly is an acceptable way to deal with this.
|
||||
|
||||
However, if a command line tool gets ECONNREFUSED, it's more likely that
|
||||
the problem is the cluster isn't running at all. In this case, waiting
|
||||
and retrying just introduces a delay for a situation that will never be
|
||||
resolved. Reverting just the part in pcmk_cluster_queries.c should fix
|
||||
this problem without affecting any of the daemons - they don't call this
|
||||
code.
|
||||
|
||||
Fixes RHEL-106594
|
||||
---
|
||||
lib/pacemaker/pcmk_cluster_queries.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/lib/pacemaker/pcmk_cluster_queries.c b/lib/pacemaker/pcmk_cluster_queries.c
|
||||
index c1fcf73..bbefcda 100644
|
||||
--- a/lib/pacemaker/pcmk_cluster_queries.c
|
||||
+++ b/lib/pacemaker/pcmk_cluster_queries.c
|
||||
@@ -360,7 +360,7 @@ ipc_connect(data_t *data, enum pcmk_ipc_server server, pcmk_ipc_callback_t cb,
|
||||
pcmk_register_ipc_callback(api, cb, data);
|
||||
}
|
||||
|
||||
- rc = pcmk__connect_ipc_retry_conrefused(api, dispatch_type, 5);
|
||||
+ rc = pcmk__connect_ipc(api, dispatch_type, 5);
|
||||
if (rc != pcmk_rc_ok) {
|
||||
if (rc == EREMOTEIO) {
|
||||
data->pcmkd_state = pcmk_pacemakerd_state_remote;
|
||||
--
|
||||
2.47.1
|
||||
|
||||
330
003-ipc_evict.patch
Normal file
330
003-ipc_evict.patch
Normal file
@@ -0,0 +1,330 @@
|
||||
From fd592986fb50c7b07197df077d744c3e4d4aecd1 Mon Sep 17 00:00:00 2001
|
||||
From: Chris Lumens <clumens@redhat.com>
|
||||
Date: Wed, 27 Aug 2025 10:41:13 -0400
|
||||
Subject: [PATCH 1/5] Refactor: libcrmcommon: Rearrange the queue_len check.
|
||||
|
||||
Check if the queue length is 0 first and return, which allows everything
|
||||
else to be un-indented one level.
|
||||
---
|
||||
lib/common/ipc_server.c | 47 ++++++++++++++++++++---------------------
|
||||
1 file changed, 23 insertions(+), 24 deletions(-)
|
||||
|
||||
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
|
||||
index 96f3cdb..dddd96f 100644
|
||||
--- a/lib/common/ipc_server.c
|
||||
+++ b/lib/common/ipc_server.c
|
||||
@@ -547,34 +547,33 @@ crm_ipcs_flush_events(pcmk__client_t *c)
|
||||
pcmk_rc_str(rc), (long long) qb_rc);
|
||||
}
|
||||
|
||||
- if (queue_len) {
|
||||
-
|
||||
- /* Allow clients to briefly fall behind on processing incoming messages,
|
||||
- * but drop completely unresponsive clients so the connection doesn't
|
||||
- * consume resources indefinitely.
|
||||
- */
|
||||
- if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
|
||||
- if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
|
||||
- /* Don't evict for a new or shrinking backlog */
|
||||
- crm_warn("Client with process ID %u has a backlog of %u messages "
|
||||
- CRM_XS " %p", c->pid, queue_len, c->ipcs);
|
||||
- } else {
|
||||
- crm_err("Evicting client with process ID %u due to backlog of %u messages "
|
||||
- CRM_XS " %p", c->pid, queue_len, c->ipcs);
|
||||
- c->queue_backlog = 0;
|
||||
- qb_ipcs_disconnect(c->ipcs);
|
||||
- return rc;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- c->queue_backlog = queue_len;
|
||||
- delay_next_flush(c, queue_len);
|
||||
-
|
||||
- } else {
|
||||
+ if (queue_len == 0) {
|
||||
/* Event queue is empty, there is no backlog */
|
||||
c->queue_backlog = 0;
|
||||
+ return rc;
|
||||
}
|
||||
|
||||
+ /* Allow clients to briefly fall behind on processing incoming messages,
|
||||
+ * but drop completely unresponsive clients so the connection doesn't
|
||||
+ * consume resources indefinitely.
|
||||
+ */
|
||||
+ if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
|
||||
+ if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
|
||||
+ /* Don't evict for a new or shrinking backlog */
|
||||
+ crm_warn("Client with process ID %u has a backlog of %u messages "
|
||||
+ CRM_XS " %p", c->pid, queue_len, c->ipcs);
|
||||
+ } else {
|
||||
+ crm_err("Evicting client with process ID %u due to backlog of %u messages "
|
||||
+ CRM_XS " %p", c->pid, queue_len, c->ipcs);
|
||||
+ c->queue_backlog = 0;
|
||||
+ qb_ipcs_disconnect(c->ipcs);
|
||||
+ return rc;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ c->queue_backlog = queue_len;
|
||||
+ delay_next_flush(c, queue_len);
|
||||
+
|
||||
return rc;
|
||||
}
|
||||
|
||||
--
|
||||
2.47.1
|
||||
|
||||
From 652e78628d9edc39fc0ec1a6ba6729b2adcaeb22 Mon Sep 17 00:00:00 2001
|
||||
From: Chris Lumens <clumens@redhat.com>
|
||||
Date: Wed, 27 Aug 2025 11:31:37 -0400
|
||||
Subject: [PATCH 2/5] Refactor: libcrmcommon: Simplify an empty event queue
|
||||
check.
|
||||
|
||||
I find this just a little bit more straightforward to follow.
|
||||
---
|
||||
lib/common/ipc_server.c | 9 ++++++---
|
||||
1 file changed, 6 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
|
||||
index dddd96f..249b0d1 100644
|
||||
--- a/lib/common/ipc_server.c
|
||||
+++ b/lib/common/ipc_server.c
|
||||
@@ -512,10 +512,13 @@ crm_ipcs_flush_events(pcmk__client_t *c)
|
||||
pcmk__ipc_header_t *header = NULL;
|
||||
struct iovec *event = NULL;
|
||||
|
||||
- if (c->event_queue) {
|
||||
- // We don't pop unless send is successful
|
||||
- event = g_queue_peek_head(c->event_queue);
|
||||
+ if ((c->event_queue == NULL) || g_queue_is_empty(c->event_queue)) {
|
||||
+ break;
|
||||
}
|
||||
+
|
||||
+ // We don't pop unless send is successful
|
||||
+ event = g_queue_peek_head(c->event_queue);
|
||||
+
|
||||
if (event == NULL) { // Queue is empty
|
||||
break;
|
||||
}
|
||||
--
|
||||
2.47.1
|
||||
|
||||
From e5aa53eda1675a2930527f8af93455e9a6fbecb5 Mon Sep 17 00:00:00 2001
|
||||
From: Chris Lumens <clumens@redhat.com>
|
||||
Date: Wed, 27 Aug 2025 11:35:38 -0400
|
||||
Subject: [PATCH 3/5] Refactor: libcrmcommon: Rearrange a few tests in
|
||||
crm_ipcs_flush_events.
|
||||
|
||||
Again, no important code changes here. I just find these a little
|
||||
easier to follow.
|
||||
---
|
||||
lib/common/ipc_server.c | 6 ++++--
|
||||
1 file changed, 4 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
|
||||
index 249b0d1..e47f461 100644
|
||||
--- a/lib/common/ipc_server.c
|
||||
+++ b/lib/common/ipc_server.c
|
||||
@@ -498,16 +498,18 @@ crm_ipcs_flush_events(pcmk__client_t *c)
|
||||
|
||||
if (c == NULL) {
|
||||
return rc;
|
||||
+ }
|
||||
|
||||
- } else if (c->event_timer) {
|
||||
+ if (c->event_timer != 0) {
|
||||
/* There is already a timer, wait until it goes off */
|
||||
crm_trace("Timer active for %p - %d", c->ipcs, c->event_timer);
|
||||
return rc;
|
||||
}
|
||||
|
||||
- if (c->event_queue) {
|
||||
+ if (c->event_queue != NULL) {
|
||||
queue_len = g_queue_get_length(c->event_queue);
|
||||
}
|
||||
+
|
||||
while (sent < 100) {
|
||||
pcmk__ipc_header_t *header = NULL;
|
||||
struct iovec *event = NULL;
|
||||
--
|
||||
2.47.1
|
||||
|
||||
From d61b6184dbfcd7b534804d95ad3d38707aab2880 Mon Sep 17 00:00:00 2001
|
||||
From: Chris Lumens <clumens@redhat.com>
|
||||
Date: Wed, 27 Aug 2025 13:14:54 -0400
|
||||
Subject: [PATCH 4/5] Feature: libcrmcommon: Be more lenient in evicting IPC
|
||||
clients.
|
||||
|
||||
Each IPC connection has a message queue. If the client is unable to
|
||||
process messages faster than the server is sending them, that queue
|
||||
starts to back up. pacemaker enforces a cap on the queue size, and
|
||||
that's adjustable with the cluster-ipc-limit parameter. Once the queue
|
||||
grows beyond that size, the client is assumed to be dead and is evicted
|
||||
so it can be restarted and the queue resources freed.
|
||||
|
||||
However, it's possible that the client is not dead. On clusters with
|
||||
very large numbers of resources (I've tried with 300, but fewer might
|
||||
also cause problems), certain actions can happen that cause a spike in
|
||||
IPC messages. In RHEL-76276, the action that causes this is moving
|
||||
nodes in and out of standby. This spike in messages causes the server
|
||||
to overwhelm the client, which is then evicted.
|
||||
|
||||
My multi-part IPC patches made this even worse, as now if the CIB is so
|
||||
large that it needs to split an IPC message up, there will be more
|
||||
messages than before.
|
||||
|
||||
What this fix does is get rid of the cap on the queue size for pacemaker
|
||||
daemons. As long as the server has been able to send messages to the
|
||||
client, the client is still doing work and shouldn't be evicted. It may
|
||||
just be processing messages slower than the server is sending them.
|
||||
Note that this could lead the queue to grow without bound, eventually
|
||||
crashing the server. For this reason, we're only allowing pacemaker
|
||||
daemons to ignore the queue size limit.
|
||||
|
||||
Potential problems with this approach:
|
||||
|
||||
* If the client is so busy that it can't receive even a single message
|
||||
that crm_ipcs_flush_events tries to send, it will still be evicted.
|
||||
However, the flush operation does retry with a delay several times
|
||||
giving the client time to finish up what it's doing.
|
||||
|
||||
* We have timers all over the place with daemons waiting on replies.
|
||||
It's possible that because we are no longer just evicting the clients,
|
||||
we will now see those timers expire which will just lead to different
|
||||
problems. If so, these fixes would probably need to take place in the
|
||||
client code.
|
||||
|
||||
Fixes T38
|
||||
---
|
||||
lib/common/ipc_server.c | 14 ++++++++++++--
|
||||
1 file changed, 12 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
|
||||
index e47f461..83c501b 100644
|
||||
--- a/lib/common/ipc_server.c
|
||||
+++ b/lib/common/ipc_server.c
|
||||
@@ -563,10 +563,20 @@ crm_ipcs_flush_events(pcmk__client_t *c)
|
||||
* consume resources indefinitely.
|
||||
*/
|
||||
if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
|
||||
- if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
|
||||
- /* Don't evict for a new or shrinking backlog */
|
||||
+ /* Don't evict:
|
||||
+ * - Clients with a new backlog.
|
||||
+ * - Clients with a shrinking backlog (the client is processing
|
||||
+ * messages faster than the server is sending them).
|
||||
+ * - Clients that are pacemaker daemons and have had any messages sent
|
||||
+ * to them in this flush call (the server is sending messages faster
|
||||
+ * than the client is processing them, but the client is not dead).
|
||||
+ */
|
||||
+ if ((c->queue_backlog <= 1)
|
||||
+ || (queue_len < c->queue_backlog)
|
||||
+ || ((sent > 0) && crm_is_daemon_name(c->name))) {
|
||||
crm_warn("Client with process ID %u has a backlog of %u messages "
|
||||
CRM_XS " %p", c->pid, queue_len, c->ipcs);
|
||||
+
|
||||
} else {
|
||||
crm_err("Evicting client with process ID %u due to backlog of %u messages "
|
||||
CRM_XS " %p", c->pid, queue_len, c->ipcs);
|
||||
--
|
||||
2.47.1
|
||||
|
||||
From 0336f2c961470ad5afbfe18e2a90a0f2c0614e68 Mon Sep 17 00:00:00 2001
|
||||
From: Chris Lumens <clumens@redhat.com>
|
||||
Date: Tue, 30 Sep 2025 13:50:53 -0400
|
||||
Subject: [PATCH 5/5] Feature: libcrmcommon: Update documentation for
|
||||
cluster-ipc-limit.
|
||||
|
||||
Clarify that this no longer applies to pacemaker daemons.
|
||||
---
|
||||
cts/cli/regression.daemons.exp | 4 ++--
|
||||
cts/cli/regression.tools.exp | 16 ++++++++--------
|
||||
lib/common/options.c | 6 +++---
|
||||
3 files changed, 13 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/cts/cli/regression.daemons.exp b/cts/cli/regression.daemons.exp
|
||||
index fe53044..a16056f 100644
|
||||
--- a/cts/cli/regression.daemons.exp
|
||||
+++ b/cts/cli/regression.daemons.exp
|
||||
@@ -21,10 +21,10 @@
|
||||
</parameter>
|
||||
<parameter name="cluster-ipc-limit">
|
||||
<longdesc lang="en">
|
||||
- Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
||||
+ Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
||||
</longdesc>
|
||||
<shortdesc lang="en">
|
||||
- Maximum IPC message backlog before disconnecting a cluster daemon
|
||||
+ Maximum IPC message backlog before disconnecting a client
|
||||
</shortdesc>
|
||||
<content type="integer" default=""/>
|
||||
</parameter>
|
||||
diff --git a/cts/cli/regression.tools.exp b/cts/cli/regression.tools.exp
|
||||
index 5448ae3..08ddd44 100644
|
||||
--- a/cts/cli/regression.tools.exp
|
||||
+++ b/cts/cli/regression.tools.exp
|
||||
@@ -139,8 +139,8 @@ Also known as properties, these are options that affect behavior across the enti
|
||||
* migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit)
|
||||
* Possible values: integer (default: )
|
||||
|
||||
- * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon
|
||||
- * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
||||
+ * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client
|
||||
+ * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
||||
* Possible values: nonnegative_integer (default: )
|
||||
|
||||
* stop-all-resources: Whether the cluster should stop all active resources
|
||||
@@ -385,8 +385,8 @@ Also known as properties, these are options that affect behavior across the enti
|
||||
<content type="integer" default=""/>
|
||||
</parameter>
|
||||
<parameter name="cluster-ipc-limit" advanced="0" generated="0">
|
||||
- <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
|
||||
- <shortdesc lang="en">Maximum IPC message backlog before disconnecting a cluster daemon</shortdesc>
|
||||
+ <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
|
||||
+ <shortdesc lang="en">Maximum IPC message backlog before disconnecting a client</shortdesc>
|
||||
<content type="nonnegative_integer" default=""/>
|
||||
</parameter>
|
||||
<parameter name="stop-all-resources" advanced="0" generated="0">
|
||||
@@ -574,8 +574,8 @@ Also known as properties, these are options that affect behavior across the enti
|
||||
* migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit)
|
||||
* Possible values: integer (default: )
|
||||
|
||||
- * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon
|
||||
- * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
||||
+ * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client
|
||||
+ * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
||||
* Possible values: nonnegative_integer (default: )
|
||||
|
||||
* stop-all-resources: Whether the cluster should stop all active resources
|
||||
@@ -856,8 +856,8 @@ Also known as properties, these are options that affect behavior across the enti
|
||||
<content type="integer" default=""/>
|
||||
</parameter>
|
||||
<parameter name="cluster-ipc-limit" advanced="0" generated="0">
|
||||
- <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
|
||||
- <shortdesc lang="en">Maximum IPC message backlog before disconnecting a cluster daemon</shortdesc>
|
||||
+ <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
|
||||
+ <shortdesc lang="en">Maximum IPC message backlog before disconnecting a client</shortdesc>
|
||||
<content type="nonnegative_integer" default=""/>
|
||||
</parameter>
|
||||
<parameter name="stop-all-resources" advanced="0" generated="0">
|
||||
diff --git a/lib/common/options.c b/lib/common/options.c
|
||||
index c47a402..e7a76c5 100644
|
||||
--- a/lib/common/options.c
|
||||
+++ b/lib/common/options.c
|
||||
@@ -440,10 +440,10 @@ static const pcmk__cluster_option_t cluster_options[] = {
|
||||
PCMK_OPT_CLUSTER_IPC_LIMIT, NULL, PCMK_VALUE_NONNEGATIVE_INTEGER, NULL,
|
||||
"500", pcmk__valid_positive_int,
|
||||
pcmk__opt_based,
|
||||
- N_("Maximum IPC message backlog before disconnecting a cluster daemon"),
|
||||
+ N_("Maximum IPC message backlog before disconnecting a client"),
|
||||
N_("Raise this if log has \"Evicting client\" messages for cluster "
|
||||
- "daemon PIDs (a good value is the number of resources in the "
|
||||
- "cluster multiplied by the number of nodes)."),
|
||||
+ "PIDs (a good value is the number of resources in the cluster "
|
||||
+ "multiplied by the number of nodes)."),
|
||||
},
|
||||
|
||||
// Orphans and stopping
|
||||
--
|
||||
2.47.1
|
||||
|
||||
88
004-fewer_messages.patch
Normal file
88
004-fewer_messages.patch
Normal file
@@ -0,0 +1,88 @@
|
||||
From 468ea9851958d26c25121f201f3631dfdd709cb4 Mon Sep 17 00:00:00 2001
|
||||
From: Chris Lumens <clumens@redhat.com>
|
||||
Date: Tue, 11 Nov 2025 15:11:58 -0500
|
||||
Subject: [PATCH] Med: daemons: Don't add repeated I_PE_CALC messages to the
|
||||
fsa queue.
|
||||
|
||||
Let's say you have a two node cluster, node1 and node2. For purposes of
|
||||
testing, it's easiest if you use fence_dummy instead of a real fencing
|
||||
agent as this will fake fencing happening but without rebooting the node
|
||||
so you can see all the log files.
|
||||
|
||||
Assume the DC is node1. Now do the following on node2:
|
||||
|
||||
- pcs node standby node1
|
||||
- pcs resource defaults update resource-stickiness=1
|
||||
- for i in $(seq 1 300); do echo $i; pcs resource create dummy$i ocf:heartbeat:Dummy --group dummy-group; done
|
||||
- pcs node unstandby node1
|
||||
|
||||
It will take a long time to create that many resources. After node1
|
||||
comes out of standby, it'll take a minute or two but eventually you'll
|
||||
see that node1 was fenced. On node1, you'll see a lot of transition
|
||||
abort messages happen. Each of these transition aborts causes an
|
||||
I_PE_CALC message to be generated and added to the fsa queue. In my
|
||||
testing, I've seen the queue grow to ~ 600 messages, all of which are
|
||||
exactly the same thing.
|
||||
|
||||
The FSA is triggered at G_PRIORITY_HIGH, and once it is triggered, it
|
||||
will run until its queue is empty. With so many messages being added so
|
||||
quickly, we've basically ensured it won't be empty any time soon. While
|
||||
controld is processing the FSA messages, it will be unable to read
|
||||
anything out of the IPC backlog.
|
||||
|
||||
based continues to attempt to send IPC events to controld but is unable
|
||||
to do so, so the backlog continues to grow. Eventually, the backlog
|
||||
reaches that 500 message threshold without anything having been read by
|
||||
controld, which triggers the eviction process.
|
||||
|
||||
There doesn't seem to be any reason for all these I_PE_CALC messages to
|
||||
be generated. They're all exactly the same, they don't appear to be
|
||||
tagged with any unique data tying them to a specific query, and their
|
||||
presence just slows everything down.
|
||||
|
||||
Thus, the fix here is very simple: if the latest message in the queue is
|
||||
an I_PE_CALC message, just don't add another one. We could also make
|
||||
sure there's only ever one I_PE_CALC message in the queue, but there
|
||||
could potentially be valid reasons for there to be multiple interleaved
|
||||
with other message types. I am erring on the side of caution with this
|
||||
minimal fix.
|
||||
|
||||
Resolves: RHEL-114894
|
||||
---
|
||||
daemons/controld/controld_messages.c | 20 ++++++++++++++++++++
|
||||
1 file changed, 20 insertions(+)
|
||||
|
||||
diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c
|
||||
index 88c032f..ae54743 100644
|
||||
--- a/daemons/controld/controld_messages.c
|
||||
+++ b/daemons/controld/controld_messages.c
|
||||
@@ -71,6 +71,26 @@ register_fsa_input_adv(enum crmd_fsa_cause cause, enum crmd_fsa_input input,
|
||||
return;
|
||||
}
|
||||
|
||||
+ if (input == I_PE_CALC) {
|
||||
+ GList *ele = NULL;
|
||||
+
|
||||
+ if (prepend) {
|
||||
+ ele = g_list_first(controld_globals.fsa_message_queue);
|
||||
+ } else {
|
||||
+ ele = g_list_last(controld_globals.fsa_message_queue);
|
||||
+ }
|
||||
+
|
||||
+ if (ele != NULL) {
|
||||
+ fsa_data_t *message = (fsa_data_t *) ele->data;
|
||||
+
|
||||
+ if (message->fsa_input == I_PE_CALC) {
|
||||
+ crm_debug("%s item in fsa queue is I_PE_CALC, not adding another",
|
||||
+ (prepend ? "First" : "Last"));
|
||||
+ return;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
if (input == I_WAIT_FOR_EVENT) {
|
||||
controld_set_global_flags(controld_fsa_is_stalled);
|
||||
crm_debug("Stalling the FSA pending further input: source=%s cause=%s data=%p queue=%d",
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@@ -36,7 +36,7 @@
|
||||
## can be incremented to build packages reliably considered "newer"
|
||||
## than previously built packages with the same pcmkversion)
|
||||
%global pcmkversion 2.1.10
|
||||
%global specversion 1
|
||||
%global specversion 2
|
||||
|
||||
## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build
|
||||
%global commit 5693eaeeef06faa1622515963082b5a1731d9fc0
|
||||
@@ -247,7 +247,10 @@ Source1: https://codeload.github.com/%{github_owner}/%{nagios_name}/tar.gz
|
||||
Source2: pacemaker.sysusers
|
||||
|
||||
# upstream commits
|
||||
#Patch001: 001-xxxx.patch
|
||||
Patch001: 001-crm_resource_wait.patch
|
||||
Patch002: 002-ipc_connect_retry.patch
|
||||
Patch003: 003-ipc_evict.patch
|
||||
Patch004: 004-fewer_messages.patch
|
||||
|
||||
Requires: resource-agents
|
||||
Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
|
||||
@@ -914,6 +917,15 @@ exit 0
|
||||
%license %{nagios_name}-%{nagios_hash}/COPYING
|
||||
|
||||
%changelog
|
||||
* Wed Nov 12 2025 Chris Lumens <clumens@redhat.com> - 2.1.10-2
|
||||
- Handle large timeouts correctly in crm_resource --wait
|
||||
- Do not try to connect to subdaemons before they're respawned
|
||||
- Don't evict IPC clients as long as they're still processing messages
|
||||
- Don't overwhelm the FSA queue with repeated CIB queries
|
||||
- Resolves: RHEL-45869
|
||||
- Resolves: RHEL-87484
|
||||
- Resolves: RHEL-114894
|
||||
|
||||
* Mon Jun 23 2025 Chris Lumens <clumens@redhat.com> - 2.1.10-1
|
||||
- Rebase on upstream 2.1.10-rc1 release
|
||||
- Use dbus to detect completion of systemd resource start/stop actions
|
||||
|
||||
Loading…
Reference in New Issue
Block a user