- Fix promoting instances of a cloned resource - Handle large timeouts correctly in crm_resource --wait - Don't evict IPC clients as long as they're still processing messages - Don't overwhelm the FSA queue with repeated CIB queries - Resolves: RHEL-120932 - Resolves: RHEL-86148 - Resolves: RHEL-114895
401 lines
18 KiB
Diff
401 lines
18 KiB
Diff
From 79f5a67e8242b3e72aa9dcf0dbd286b3fb719baa Mon Sep 17 00:00:00 2001
|
|
From: Chris Lumens <clumens@redhat.com>
|
|
Date: Wed, 27 Aug 2025 10:41:13 -0400
|
|
Subject: [PATCH 1/6] Refactor: libcrmcommon: Rearrange the queue_len check.
|
|
|
|
Check if the queue length is 0 first and return, which allows everything
|
|
else to be un-indented one level.
|
|
---
|
|
lib/common/ipc_server.c | 47 ++++++++++++++++++++---------------------
|
|
1 file changed, 23 insertions(+), 24 deletions(-)
|
|
|
|
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
|
|
index 25c788b..4b33c64 100644
|
|
--- a/lib/common/ipc_server.c
|
|
+++ b/lib/common/ipc_server.c
|
|
@@ -541,34 +541,33 @@ no_more_retries:
|
|
sent, queue_len, c->ipcs, c->pid, pcmk_rc_str(rc), qb_rc);
|
|
}
|
|
|
|
- if (queue_len) {
|
|
-
|
|
- /* Allow clients to briefly fall behind on processing incoming messages,
|
|
- * but drop completely unresponsive clients so the connection doesn't
|
|
- * consume resources indefinitely.
|
|
- */
|
|
- if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
|
|
- if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
|
|
- /* Don't evict for a new or shrinking backlog */
|
|
- crm_warn("Client with process ID %u has a backlog of %u messages "
|
|
- QB_XS " %p", c->pid, queue_len, c->ipcs);
|
|
- } else {
|
|
- crm_err("Evicting client with process ID %u due to backlog of %u messages "
|
|
- QB_XS " %p", c->pid, queue_len, c->ipcs);
|
|
- c->queue_backlog = 0;
|
|
- qb_ipcs_disconnect(c->ipcs);
|
|
- return rc;
|
|
- }
|
|
- }
|
|
-
|
|
- c->queue_backlog = queue_len;
|
|
- delay_next_flush(c, queue_len);
|
|
-
|
|
- } else {
|
|
+ if (queue_len == 0) {
|
|
/* Event queue is empty, there is no backlog */
|
|
c->queue_backlog = 0;
|
|
+ return rc;
|
|
}
|
|
|
|
+ /* Allow clients to briefly fall behind on processing incoming messages,
|
|
+ * but drop completely unresponsive clients so the connection doesn't
|
|
+ * consume resources indefinitely.
|
|
+ */
|
|
+ if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
|
|
+ if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
|
|
+ /* Don't evict for a new or shrinking backlog */
|
|
+ crm_warn("Client with process ID %u has a backlog of %u messages "
|
|
+ QB_XS " %p", c->pid, queue_len, c->ipcs);
|
|
+ } else {
|
|
+ crm_err("Evicting client with process ID %u due to backlog of %u messages "
|
|
+ QB_XS " %p", c->pid, queue_len, c->ipcs);
|
|
+ c->queue_backlog = 0;
|
|
+ qb_ipcs_disconnect(c->ipcs);
|
|
+ return rc;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ c->queue_backlog = queue_len;
|
|
+ delay_next_flush(c, queue_len);
|
|
+
|
|
return rc;
|
|
}
|
|
|
|
--
|
|
2.47.1
|
|
|
|
From 014699003c6506bba8638ed57efea49da403d0e1 Mon Sep 17 00:00:00 2001
|
|
From: Chris Lumens <clumens@redhat.com>
|
|
Date: Wed, 27 Aug 2025 11:31:37 -0400
|
|
Subject: [PATCH 2/6] Refactor: libcrmcommon: Simplify an empty event queue
|
|
check.
|
|
|
|
I find this just a little bit more straightforward to follow.
|
|
---
|
|
lib/common/ipc_server.c | 9 ++++-----
|
|
1 file changed, 4 insertions(+), 5 deletions(-)
|
|
|
|
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
|
|
index 4b33c64..dbd885a 100644
|
|
--- a/lib/common/ipc_server.c
|
|
+++ b/lib/common/ipc_server.c
|
|
@@ -491,14 +491,13 @@ crm_ipcs_flush_events(pcmk__client_t *c)
|
|
pcmk__ipc_header_t *header = NULL;
|
|
struct iovec *event = NULL;
|
|
|
|
- if (c->event_queue) {
|
|
- // We don't pop unless send is successful
|
|
- event = g_queue_peek_head(c->event_queue);
|
|
- }
|
|
- if (event == NULL) { // Queue is empty
|
|
+ if ((c->event_queue == NULL) || g_queue_is_empty(c->event_queue)) {
|
|
break;
|
|
}
|
|
|
|
+ // We don't pop unless send is successful
|
|
+ event = g_queue_peek_head(c->event_queue);
|
|
+
|
|
/* Retry sending the event up to five times. If we get -EAGAIN, sleep
|
|
* a very short amount of time (too long here is bad) and try again.
|
|
* If we simply exit the while loop on -EAGAIN, we'll have to wait until
|
|
--
|
|
2.47.1
|
|
|
|
From f999ac3d86d8107dee5288497f5f7fff07956d18 Mon Sep 17 00:00:00 2001
|
|
From: Chris Lumens <clumens@redhat.com>
|
|
Date: Wed, 27 Aug 2025 11:35:38 -0400
|
|
Subject: [PATCH 3/6] Refactor: libcrmcommon: Rearrange a few tests in
|
|
crm_ipcs_flush_events.
|
|
|
|
Again, no important code changes here. I just find these a little
|
|
easier to follow.
|
|
---
|
|
lib/common/ipc_server.c | 6 ++++--
|
|
1 file changed, 4 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
|
|
index dbd885a..b76847b 100644
|
|
--- a/lib/common/ipc_server.c
|
|
+++ b/lib/common/ipc_server.c
|
|
@@ -477,16 +477,18 @@ crm_ipcs_flush_events(pcmk__client_t *c)
|
|
|
|
if (c == NULL) {
|
|
return rc;
|
|
+ }
|
|
|
|
- } else if (c->event_timer) {
|
|
+ if (c->event_timer != 0) {
|
|
/* There is already a timer, wait until it goes off */
|
|
crm_trace("Timer active for %p - %d", c->ipcs, c->event_timer);
|
|
return rc;
|
|
}
|
|
|
|
- if (c->event_queue) {
|
|
+ if (c->event_queue != NULL) {
|
|
queue_len = g_queue_get_length(c->event_queue);
|
|
}
|
|
+
|
|
while (sent < 100) {
|
|
pcmk__ipc_header_t *header = NULL;
|
|
struct iovec *event = NULL;
|
|
--
|
|
2.47.1
|
|
|
|
From 9e76007bb0bc1d4cb5a88dcfaaf96aa8853f42dc Mon Sep 17 00:00:00 2001
|
|
From: Chris Lumens <clumens@redhat.com>
|
|
Date: Wed, 27 Aug 2025 11:48:48 -0400
|
|
Subject: [PATCH 4/6] Refactor: libcrmcommon: Unindent retry code in
|
|
crm_ipcs_flush_events.
|
|
|
|
If we're breaking or jumping to a label, there's no need to have all
|
|
these nested else blocks.
|
|
---
|
|
lib/common/ipc_server.c | 16 ++++++++--------
|
|
1 file changed, 8 insertions(+), 8 deletions(-)
|
|
|
|
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
|
|
index b76847b..73cc58f 100644
|
|
--- a/lib/common/ipc_server.c
|
|
+++ b/lib/common/ipc_server.c
|
|
@@ -513,16 +513,16 @@ crm_ipcs_flush_events(pcmk__client_t *c)
|
|
for (unsigned int retries = 5; retries > 0; retries--) {
|
|
qb_rc = qb_ipcs_event_sendv(c->ipcs, event, 2);
|
|
|
|
- if (qb_rc < 0) {
|
|
- if (retries == 1 || qb_rc != -EAGAIN) {
|
|
- rc = (int) -qb_rc;
|
|
- goto no_more_retries;
|
|
- } else {
|
|
- pcmk__sleep_ms(5);
|
|
- }
|
|
- } else {
|
|
+ if (qb_rc >= 0) {
|
|
break;
|
|
}
|
|
+
|
|
+ if (retries == 1 || qb_rc != -EAGAIN) {
|
|
+ rc = (int) -qb_rc;
|
|
+ goto no_more_retries;
|
|
+ }
|
|
+
|
|
+ pcmk__sleep_ms(5);
|
|
}
|
|
|
|
event = g_queue_pop_head(c->event_queue);
|
|
--
|
|
2.47.1
|
|
|
|
From b73be21a454f795bc747aad1dbeea82f67d8b232 Mon Sep 17 00:00:00 2001
|
|
From: Chris Lumens <clumens@redhat.com>
|
|
Date: Wed, 27 Aug 2025 13:14:54 -0400
|
|
Subject: [PATCH 5/6] Feature: libcrmcommon: Be more lenient in evicting IPC
|
|
clients.
|
|
|
|
Each IPC connection has a message queue. If the client is unable to
|
|
process messages faster than the server is sending them, that queue
|
|
start to back up. pacemaker enforces a cap on the queue size, and
|
|
that's adjustable with the cluster-ipc-limit parameter. Once the queue
|
|
grows beyond that size, the client is assumed to be dead and is evicted
|
|
so it can be restarted and the queue resources freed.
|
|
|
|
However, it's possible that the client is not dead. On clusters with
|
|
very large numbers of resources (I've tried with 300, but fewer might
|
|
also cause problems), certain actions can happen that cause a spike in
|
|
IPC messages. In RHEL-76276, the action that causes this is moving
|
|
nodes in and out of standby. This spike in messages causes the server
|
|
to overwhelm the client, which is then evicted.
|
|
|
|
My multi-part IPC patches made this even worse, as now if the CIB is so
|
|
large that it needs to split an IPC message up, there will be more
|
|
messages than before.
|
|
|
|
What this fix does is get rid of the cap on the queue size for pacemaker
|
|
daemons. As long as the server has been able to send messages to the
|
|
client, the client is still doing work and shouldn't be evicted. It may
|
|
just be processing messages slower than the server is sending them.
|
|
Note that this could lead the queue to grow without bound, eventually
|
|
crashing the server. For this reason, we're only allowing pacemaker
|
|
daemons to ignore the queue size limit.
|
|
|
|
Potential problems with this approach:
|
|
|
|
* If the client is so busy that it can't receive even a single message
|
|
that crm_ipcs_flush_events tries to send, it will still be evicted.
|
|
However, the flush operation does retry with a delay several times
|
|
giving the client time to finish up what it's doing.
|
|
|
|
* We have timers all over the place with daemons waiting on replies.
|
|
It's possible that because we are no longer just evicting the clients,
|
|
we will now see those timers expire which will just lead to different
|
|
problems. If so, these fixes would probably need to take place in the
|
|
client code.
|
|
|
|
Fixes T38
|
|
---
|
|
lib/common/ipc_server.c | 14 ++++++++++++--
|
|
1 file changed, 12 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
|
|
index 73cc58f..4420070 100644
|
|
--- a/lib/common/ipc_server.c
|
|
+++ b/lib/common/ipc_server.c
|
|
@@ -553,10 +553,20 @@ no_more_retries:
|
|
* consume resources indefinitely.
|
|
*/
|
|
if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
|
|
- if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
|
|
- /* Don't evict for a new or shrinking backlog */
|
|
+ /* Don't evict:
|
|
+ * - Clients with a new backlog.
|
|
+ * - Clients with a shrinking backlog (the client is processing
|
|
+ * messages faster than the server is sending them).
|
|
+ * - Clients that are pacemaker daemons and have had any messages sent
|
|
+ * to them in this flush call (the server is sending messages faster
|
|
+ * than the client is processing them, but the client is not dead).
|
|
+ */
|
|
+ if ((c->queue_backlog <= 1)
|
|
+ || (queue_len < c->queue_backlog)
|
|
+ || ((sent > 0) && (pcmk__parse_server(c->name) != pcmk_ipc_unknown))) {
|
|
crm_warn("Client with process ID %u has a backlog of %u messages "
|
|
QB_XS " %p", c->pid, queue_len, c->ipcs);
|
|
+
|
|
} else {
|
|
crm_err("Evicting client with process ID %u due to backlog of %u messages "
|
|
QB_XS " %p", c->pid, queue_len, c->ipcs);
|
|
--
|
|
2.47.1
|
|
|
|
From 4682953c567e16409d8e7972d9d5891348d4c360 Mon Sep 17 00:00:00 2001
|
|
From: Chris Lumens <clumens@redhat.com>
|
|
Date: Wed, 27 Aug 2025 15:56:27 -0400
|
|
Subject: [PATCH 6/6] Feature: libcrmcommon: Update documentation for
|
|
cluster-ipc-limit.
|
|
|
|
Clarify that this no longer applies to pacemaker daemons.
|
|
---
|
|
cts/cli/regression.crm_attribute.exp | 16 ++++++++--------
|
|
cts/cli/regression.daemons.exp | 4 ++--
|
|
.../Pacemaker_Explained/cluster-options.rst | 12 +++++++-----
|
|
lib/common/options.c | 6 +++---
|
|
4 files changed, 20 insertions(+), 18 deletions(-)
|
|
|
|
diff --git a/cts/cli/regression.crm_attribute.exp b/cts/cli/regression.crm_attribute.exp
|
|
index e161f49..36cba76 100644
|
|
--- a/cts/cli/regression.crm_attribute.exp
|
|
+++ b/cts/cli/regression.crm_attribute.exp
|
|
@@ -111,8 +111,8 @@ Also known as properties, these are options that affect behavior across the enti
|
|
* migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit)
|
|
* Possible values: integer (default: )
|
|
|
|
- * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon
|
|
- * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
|
+ * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client
|
|
+ * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
|
* Possible values: nonnegative_integer (default: )
|
|
|
|
* stop-all-resources: Whether the cluster should stop all active resources
|
|
@@ -357,8 +357,8 @@ Also known as properties, these are options that affect behavior across the enti
|
|
<content type="integer" default=""/>
|
|
</parameter>
|
|
<parameter name="cluster-ipc-limit" advanced="0" generated="0">
|
|
- <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
|
|
- <shortdesc lang="en">Maximum IPC message backlog before disconnecting a cluster daemon</shortdesc>
|
|
+ <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
|
|
+ <shortdesc lang="en">Maximum IPC message backlog before disconnecting a client</shortdesc>
|
|
<content type="nonnegative_integer" default=""/>
|
|
</parameter>
|
|
<parameter name="stop-all-resources" advanced="0" generated="0">
|
|
@@ -537,8 +537,8 @@ Also known as properties, these are options that affect behavior across the enti
|
|
* migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit)
|
|
* Possible values: integer (default: )
|
|
|
|
- * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon
|
|
- * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
|
+ * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client
|
|
+ * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
|
* Possible values: nonnegative_integer (default: )
|
|
|
|
* stop-all-resources: Whether the cluster should stop all active resources
|
|
@@ -824,8 +824,8 @@ Also known as properties, these are options that affect behavior across the enti
|
|
<content type="integer" default=""/>
|
|
</parameter>
|
|
<parameter name="cluster-ipc-limit" advanced="0" generated="0">
|
|
- <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
|
|
- <shortdesc lang="en">Maximum IPC message backlog before disconnecting a cluster daemon</shortdesc>
|
|
+ <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
|
|
+ <shortdesc lang="en">Maximum IPC message backlog before disconnecting a client</shortdesc>
|
|
<content type="nonnegative_integer" default=""/>
|
|
</parameter>
|
|
<parameter name="stop-all-resources" advanced="0" generated="0">
|
|
diff --git a/cts/cli/regression.daemons.exp b/cts/cli/regression.daemons.exp
|
|
index fc8535a..6274eeb 100644
|
|
--- a/cts/cli/regression.daemons.exp
|
|
+++ b/cts/cli/regression.daemons.exp
|
|
@@ -21,10 +21,10 @@
|
|
</parameter>
|
|
<parameter name="cluster-ipc-limit">
|
|
<longdesc lang="en">
|
|
- Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
|
+ Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
|
|
</longdesc>
|
|
<shortdesc lang="en">
|
|
- Maximum IPC message backlog before disconnecting a cluster daemon
|
|
+ Maximum IPC message backlog before disconnecting a client
|
|
</shortdesc>
|
|
<content type="integer" default=""/>
|
|
</parameter>
|
|
diff --git a/doc/sphinx/Pacemaker_Explained/cluster-options.rst b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
|
|
index 6ebe5f3..22e1a50 100644
|
|
--- a/doc/sphinx/Pacemaker_Explained/cluster-options.rst
|
|
+++ b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
|
|
@@ -693,11 +693,13 @@ values, by running the ``man pacemaker-schedulerd`` and
|
|
cluster-ipc-limit
|
|
- :ref:`nonnegative integer <nonnegative_integer>`
|
|
- 500
|
|
- - The maximum IPC message backlog before one cluster daemon will
|
|
- disconnect another. This is of use in large clusters, for which a good
|
|
- value is the number of resources in the cluster multiplied by the number
|
|
- of nodes. The default of 500 is also the minimum. Raise this if you see
|
|
- "Evicting client" log messages for cluster daemon process IDs.
|
|
+ - The maximum IPC message backlog before a cluster daemon will disconnect
|
|
+ a client. Other cluster daemons are not subject to this limit as long as
|
|
+ they are still processing messages. This is of use in large clusters,
|
|
+ for which a good value is the number of resources in the cluster
|
|
+ multiplied by the number of nodes. The default of 500 is also the
|
|
+ minimum. Raise this if you see "Evicting client" log messages for
|
|
+ cluster process IDs.
|
|
* - .. _pe_error_series_max:
|
|
|
|
.. index::
|
|
diff --git a/lib/common/options.c b/lib/common/options.c
|
|
index b8f4943..af1b073 100644
|
|
--- a/lib/common/options.c
|
|
+++ b/lib/common/options.c
|
|
@@ -432,10 +432,10 @@ static const pcmk__cluster_option_t cluster_options[] = {
|
|
PCMK_OPT_CLUSTER_IPC_LIMIT, NULL, PCMK_VALUE_NONNEGATIVE_INTEGER, NULL,
|
|
"500", pcmk__valid_positive_int,
|
|
pcmk__opt_based,
|
|
- N_("Maximum IPC message backlog before disconnecting a cluster daemon"),
|
|
+ N_("Maximum IPC message backlog before disconnecting a client"),
|
|
N_("Raise this if log has \"Evicting client\" messages for cluster "
|
|
- "daemon PIDs (a good value is the number of resources in the "
|
|
- "cluster multiplied by the number of nodes)."),
|
|
+ "PIDs (a good value is the number of resources in the cluster "
|
|
+ "multiplied by the number of nodes)."),
|
|
},
|
|
|
|
// Orphans and stopping
|
|
--
|
|
2.47.1
|
|
|