Compare commits

...

1 Commit

Author SHA1 Message Date
9093efb7e6 Import from CS git 2025-12-19 06:46:42 +00:00
3 changed files with 402 additions and 1 deletion

015-ipc-disconnect.patch

@@ -0,0 +1,303 @@
From 581ef435f7b6b0fde76663069ec63b3b4fb4b067 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 10:41:13 -0400
Subject: [PATCH 1/5] Refactor: libcrmcommon: Rearrange the queue_len check.

Check if the queue length is 0 first and return, which allows everything
else to be un-indented one level.
---
lib/common/ipc_server.c | 47 ++++++++++++++++++++---------------------
1 file changed, 23 insertions(+), 24 deletions(-)
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index 5cd7e70..ee8ae9e 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -535,34 +535,33 @@ crm_ipcs_flush_events(pcmk__client_t *c)
pcmk_rc_str(rc), (long long) qb_rc);
}
- if (queue_len) {
-
- /* Allow clients to briefly fall behind on processing incoming messages,
- * but drop completely unresponsive clients so the connection doesn't
- * consume resources indefinitely.
- */
- if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
- if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
- /* Don't evict for a new or shrinking backlog */
- crm_warn("Client with process ID %u has a backlog of %u messages "
- CRM_XS " %p", c->pid, queue_len, c->ipcs);
- } else {
- crm_err("Evicting client with process ID %u due to backlog of %u messages "
- CRM_XS " %p", c->pid, queue_len, c->ipcs);
- c->queue_backlog = 0;
- qb_ipcs_disconnect(c->ipcs);
- return rc;
- }
- }
-
- c->queue_backlog = queue_len;
- delay_next_flush(c, queue_len);
-
- } else {
+ if (queue_len == 0) {
/* Event queue is empty, there is no backlog */
c->queue_backlog = 0;
+ return rc;
+ }
+
+ /* Allow clients to briefly fall behind on processing incoming messages,
+ * but drop completely unresponsive clients so the connection doesn't
+ * consume resources indefinitely.
+ */
+ if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
+ if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
+ /* Don't evict for a new or shrinking backlog */
+ crm_warn("Client with process ID %u has a backlog of %u messages "
+ CRM_XS " %p", c->pid, queue_len, c->ipcs);
+ } else {
+ crm_err("Evicting client with process ID %u due to backlog of %u messages "
+ CRM_XS " %p", c->pid, queue_len, c->ipcs);
+ c->queue_backlog = 0;
+ qb_ipcs_disconnect(c->ipcs);
+ return rc;
+ }
}
+ c->queue_backlog = queue_len;
+ delay_next_flush(c, queue_len);
+
return rc;
}
--
2.43.0
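
The refactor above is the guard-clause pattern: handle the trivial case
first and return, so the main logic loses one level of indentation. A
minimal standalone sketch of the pattern (flush_queue is a hypothetical
function for illustration, not the Pacemaker API):

    #include <stdio.h>

    /* Guard clause: bail out on the trivial case first, so the main
     * logic is not nested inside an "if (len != 0)" block. */
    static int
    flush_queue(unsigned int len)
    {
        if (len == 0) {
            return 0;   /* nothing queued, nothing to do */
        }

        /* Main logic now sits at the top indentation level. */
        printf("flushing %u messages\n", len);
        return 0;
    }

    int
    main(void)
    {
        flush_queue(0);
        flush_queue(42);
        return 0;
    }
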
From 54fbc6bea137d0642308d49506f13bd84cd2084e Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 11:31:37 -0400
Subject: [PATCH 2/5] Refactor: libcrmcommon: Simplify an empty event queue
 check.

I find this just a little bit more straightforward to follow.
---
lib/common/ipc_server.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index ee8ae9e..d24db59 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -500,10 +500,13 @@ crm_ipcs_flush_events(pcmk__client_t *c)
pcmk__ipc_header_t *header = NULL;
struct iovec *event = NULL;
- if (c->event_queue) {
- // We don't pop unless send is successful
- event = g_queue_peek_head(c->event_queue);
+ if ((c->event_queue == NULL) || g_queue_is_empty(c->event_queue)) {
+ break;
}
+
+ // We don't pop unless send is successful
+ event = g_queue_peek_head(c->event_queue);
+
if (event == NULL) { // Queue is empty
break;
}
--
2.43.0
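
For readers less familiar with GLib: g_queue_peek_head() returns NULL for
an empty queue, so the old and new checks are equivalent; the new form just
states the intent directly. A small self-contained demo of those GQueue
semantics:

    /* Build: gcc demo.c $(pkg-config --cflags --libs glib-2.0) */
    #include <glib.h>
    #include <stdio.h>

    int
    main(void)
    {
        GQueue *q = g_queue_new();

        /* Empty queue: g_queue_is_empty() is TRUE, peek returns NULL. */
        printf("empty=%d peek=%p\n",
               g_queue_is_empty(q), g_queue_peek_head(q));

        g_queue_push_tail(q, "event");

        /* Non-empty: peek returns the head without removing it. */
        printf("empty=%d peek=%s\n",
               g_queue_is_empty(q), (char *) g_queue_peek_head(q));

        g_queue_free(q);
        return 0;
    }
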
From 6446aa1d917be090989860c3a5cc00ea6a311d67 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 11:35:38 -0400
Subject: [PATCH 3/5] Refactor: libcrmcommon: Rearrange a few tests in
 crm_ipcs_flush_events.

Again, no important code changes here. I just find these a little
easier to follow.
---
lib/common/ipc_server.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index d24db59..c305dfc 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -486,16 +486,18 @@ crm_ipcs_flush_events(pcmk__client_t *c)
if (c == NULL) {
return rc;
+ }
- } else if (c->event_timer) {
+ if (c->event_timer != 0) {
/* There is already a timer, wait until it goes off */
crm_trace("Timer active for %p - %d", c->ipcs, c->event_timer);
return rc;
}
- if (c->event_queue) {
+ if (c->event_queue != NULL) {
queue_len = g_queue_get_length(c->event_queue);
}
+
while (sent < 100) {
pcmk__ipc_header_t *header = NULL;
struct iovec *event = NULL;
--
2.43.0
From d7576ecb3f51050a21057d86257bf8b8c273e4db Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 13:14:54 -0400
Subject: [PATCH 4/5] Feature: libcrmcommon: Be more lenient in evicting IPC
 clients.

Each IPC connection has a message queue. If the client cannot process
messages as fast as the server sends them, that queue starts to back up.
pacemaker enforces a cap on the queue size, adjustable with the
cluster-ipc-limit parameter. Once the queue grows beyond that size, the
client is assumed to be dead and is evicted so it can be restarted and the
queue's resources freed.

However, it's possible that the client is not dead. On clusters with very
large numbers of resources (I've tried with 300, but fewer might also
cause problems), certain actions can cause a spike in IPC messages. In
RHEL-76276, the action that causes this is moving nodes in and out of
standby. This spike in messages causes the server to overwhelm the client,
which is then evicted.

My multi-part IPC patches made this even worse: if the CIB is so large
that an IPC message needs to be split up, there will now be more messages
than before.

This fix removes the cap on the queue size for pacemaker daemons. As long
as the server has been able to send messages to the client, the client is
still doing work and shouldn't be evicted. It may just be processing
messages more slowly than the server is sending them.

Note that this could allow the queue to grow without bound, eventually
crashing the server. For this reason, only pacemaker daemons are allowed
to ignore the queue size limit.

Potential problems with this approach:

* If the client is so busy that it can't receive even a single message
  that crm_ipcs_flush_events tries to send, it will still be evicted.
  However, the flush operation does retry with a delay several times,
  giving the client time to finish up what it's doing.

* We have timers all over the place with daemons waiting on replies.
  Because we are no longer evicting these clients, we may now see those
  timers expire, which will just lead to different problems. If so, those
  fixes would probably need to happen in the client code.

Fixes T38
---
lib/common/ipc_server.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index c305dfc..16a2986 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -551,10 +551,20 @@ crm_ipcs_flush_events(pcmk__client_t *c)
* consume resources indefinitely.
*/
if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
- if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
- /* Don't evict for a new or shrinking backlog */
+ /* Don't evict:
+ * - Clients with a new backlog.
+ * - Clients with a shrinking backlog (the client is processing
+ * messages faster than the server is sending them).
+ * - Clients that are pacemaker daemons and have had any messages sent
+ * to them in this flush call (the server is sending messages faster
+ * than the client is processing them, but the client is not dead).
+ */
+ if ((c->queue_backlog <= 1)
+ || (queue_len < c->queue_backlog)
+ || ((sent > 0) && crm_is_daemon_name(c->name))) {
crm_warn("Client with process ID %u has a backlog of %u messages "
CRM_XS " %p", c->pid, queue_len, c->ipcs);
+
} else {
crm_err("Evicting client with process ID %u due to backlog of %u messages "
CRM_XS " %p", c->pid, queue_len, c->ipcs);
--
2.43.0
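
Read as a whole, the eviction decision after this patch reduces to a
single predicate. A simplified restatement (should_evict and its
parameters are illustrative only; in the real code the limit is
QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX) and the daemon test is
crm_is_daemon_name(c->name)):

    #include <stdbool.h>

    /* Sketch of the post-patch logic in crm_ipcs_flush_events(). */
    static bool
    should_evict(unsigned int queue_len, unsigned int limit,
                 unsigned int prev_backlog, unsigned int sent,
                 bool is_daemon)
    {
        if (queue_len <= limit) {
            return false;   /* under the limit: never evict */
        }
        if (prev_backlog <= 1) {
            return false;   /* new backlog: give the client a chance */
        }
        if (queue_len < prev_backlog) {
            return false;   /* shrinking backlog: client is catching up */
        }
        if ((sent > 0) && is_daemon) {
            return false;   /* daemon still accepted messages: not dead */
        }
        return true;        /* over the limit and unresponsive: evict */
    }
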
From 6b5e50b272c26c95e9fae1c3270c77a8d72446e8 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 30 Sep 2025 13:50:53 -0400
Subject: [PATCH 5/5] Feature: libcrmcommon: Update documentation for
 cluster-ipc-limit.

Clarify that this no longer applies to pacemaker daemons.
---
cts/cli/regression.daemons.exp | 4 ++--
doc/sphinx/Pacemaker_Explained/cluster-options.rst | 12 +++++++-----
lib/common/options.c | 6 +++---
3 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/cts/cli/regression.daemons.exp b/cts/cli/regression.daemons.exp
index 678cb62..dffbe6a 100644
--- a/cts/cli/regression.daemons.exp
+++ b/cts/cli/regression.daemons.exp
@@ -11,8 +11,8 @@
<content type="boolean" default=""/>
</parameter>
<parameter name="cluster-ipc-limit">
- <longdesc lang="en">Raise this if log has &quot;Evicting client&quot; messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
- <shortdesc lang="en">Maximum IPC message backlog before disconnecting a cluster daemon</shortdesc>
+ <longdesc lang="en">Raise this if log has &quot;Evicting client&quot; messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
+ <shortdesc lang="en">Maximum IPC message backlog before disconnecting a client</shortdesc>
<content type="integer" default=""/>
</parameter>
</parameters>
diff --git a/doc/sphinx/Pacemaker_Explained/cluster-options.rst b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
index 77bd7e6..fe2d4f1 100644
--- a/doc/sphinx/Pacemaker_Explained/cluster-options.rst
+++ b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
@@ -675,11 +675,13 @@ values, by running the ``man pacemaker-schedulerd`` and
cluster-ipc-limit
- :ref:`nonnegative integer <nonnegative_integer>`
- 500
- - The maximum IPC message backlog before one cluster daemon will
- disconnect another. This is of use in large clusters, for which a good
- value is the number of resources in the cluster multiplied by the number
- of nodes. The default of 500 is also the minimum. Raise this if you see
- "Evicting client" log messages for cluster daemon process IDs.
+ - The maximum IPC message backlog before a cluster daemon will disconnect
+ a client. Other cluster daemons are not subject to this limit as long as
+ they are still processing messages. This is of use in large clusters,
+ for which a good value is the number of resources in the cluster
+ multiplied by the number of nodes. The default of 500 is also the
+ minimum. Raise this if you see "Evicting client" log messages for
+ cluster process IDs.
* - .. _pe_error_series_max:
.. index::
diff --git a/lib/common/options.c b/lib/common/options.c
index 96f059c..d3fc684 100644
--- a/lib/common/options.c
+++ b/lib/common/options.c
@@ -422,10 +422,10 @@ static pcmk__cluster_option_t cluster_options[] = {
"cluster-ipc-limit", NULL, "integer", NULL,
"500", pcmk__valid_positive_number,
pcmk__opt_context_based,
- N_("Maximum IPC message backlog before disconnecting a cluster daemon"),
+ N_("Maximum IPC message backlog before disconnecting a client"),
N_("Raise this if log has \"Evicting client\" messages for cluster "
- "daemon PIDs (a good value is the number of resources in the "
- "cluster multiplied by the number of nodes)."),
+ "PIDs (a good value is the number of resources in the cluster "
+ "multiplied by the number of nodes)."),
},
// Orphans and stopping
--
2.43.0
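
As a practical note, cluster-ipc-limit remains an ordinary cluster
property, so non-daemon clients that still trigger "Evicting client"
messages can be accommodated by raising it. Following the documented
sizing guidance (resources multiplied by nodes), a 3-node cluster with 300
resources would use roughly 900; either of these commands should work,
though exact pcs syntax varies by version:

    pcs property set cluster-ipc-limit=900
    crm_attribute --type crm_config --name cluster-ipc-limit --update 900
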

016-fewer-messages.patch

@@ -0,0 +1,88 @@
From 7be9cdca98217d002497714aaafcfc292c02555b Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Fri, 31 Oct 2025 11:24:14 -0400
Subject: [PATCH] Med: daemons: Don't add repeated I_PE_CALC messages to the
 fsa queue.

Let's say you have a two-node cluster, node1 and node2. For purposes of
testing, it's easiest to use fence_dummy instead of a real fencing agent,
as it fakes the fencing without rebooting the node, so you can see all the
log files.

Assume the DC is node1. Now do the following:

- pcs node standby node1
- pcs resource defaults update resource-stickiness=1
- for i in $(seq 1 300); do echo $i; pcs resource create dummy$i ocf:heartbeat:Dummy --group dummy-group; done
- pcs node unstandby node1

It will take a long time to create that many resources. After node1 comes
out of standby, it'll take a minute or two, but eventually you'll see that
node1 was fenced. On node1, you'll see a lot of transition abort messages.
Each of these transition aborts causes an I_PE_CALC message to be
generated and added to the fsa queue. In my testing, I've seen the queue
grow to ~600 messages, all of which are exactly the same.

The FSA is triggered at G_PRIORITY_HIGH, and once it is triggered, it will
run until its queue is empty. With so many messages being added so
quickly, we've basically ensured it won't be empty any time soon. While
controld is processing the FSA messages, it is unable to read anything out
of the IPC backlog. Meanwhile, the CIB daemon (based) continues attempting
to send IPC events to controld but cannot, so the backlog continues to
grow. Eventually, the backlog reaches the 500-message threshold without
anything having been read by controld, which triggers the eviction
process.

There doesn't seem to be any reason for all these I_PE_CALC messages to be
generated. They're all exactly the same, they don't appear to be tagged
with any unique data tying them to a specific query, and their presence
just slows everything down.

Thus, the fix here is very simple: if the latest message in the queue is
an I_PE_CALC message, just don't add another one. We could instead ensure
there's only ever one I_PE_CALC message in the queue, but there could be
valid reasons for multiple to be interleaved with other message types. I
am erring on the side of caution with this minimal fix.

Related: RHEL-76276
---
daemons/controld/controld_messages.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c
index 0b0f25b..30af707 100644
--- a/daemons/controld/controld_messages.c
+++ b/daemons/controld/controld_messages.c
@@ -73,6 +73,26 @@ register_fsa_input_adv(enum crmd_fsa_cause cause, enum crmd_fsa_input input,
return;
}
+ if (input == I_PE_CALC) {
+ GList *ele = NULL;
+
+ if (prepend) {
+ ele = g_list_first(controld_globals.fsa_message_queue);
+ } else {
+ ele = g_list_last(controld_globals.fsa_message_queue);
+ }
+
+ if (ele != NULL) {
+ fsa_data_t *message = (fsa_data_t *) ele->data;
+
+ if (message->fsa_input == I_PE_CALC) {
+ crm_debug("%s item in fsa queue is I_PE_CALC, not adding another",
+ (prepend ? "First" : "Last"));
+ return;
+ }
+ }
+ }
+
if (input == I_WAIT_FOR_EVENT) {
controld_set_global_flags(controld_fsa_is_stalled);
crm_debug("Stalling the FSA pending further input: source=%s cause=%s data=%p queue=%d",
--
2.43.0
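
The dedup in this patch generalizes to a common queue technique: before
enqueueing, look at the entry the new item would sit next to, and drop
exact repeats. A self-contained GLib sketch of the same idea (message_t
and MSG_PE_CALC are hypothetical stand-ins for fsa_data_t and I_PE_CALC;
the real fsa_message_queue is a GList, while this demo uses a GQueue):

    /* Build: gcc demo.c $(pkg-config --cflags --libs glib-2.0) */
    #include <glib.h>
    #include <stdio.h>

    typedef struct { int input; } message_t;
    #define MSG_PE_CALC 42

    /* Append msg unless the tail already holds the same input type,
     * mirroring the patch's g_list_last() check before adding. */
    static void
    enqueue_deduped(GQueue *q, message_t *msg)
    {
        message_t *tail = g_queue_peek_tail(q);

        if ((msg->input == MSG_PE_CALC)
            && (tail != NULL) && (tail->input == MSG_PE_CALC)) {
            printf("last item is already MSG_PE_CALC, not adding another\n");
            return;
        }
        g_queue_push_tail(q, msg);
    }

    int
    main(void)
    {
        GQueue *q = g_queue_new();
        message_t a = { MSG_PE_CALC };
        message_t b = { MSG_PE_CALC };

        enqueue_deduped(q, &a);
        enqueue_deduped(q, &b);     /* suppressed as a repeat */
        printf("queue length: %u\n", g_queue_get_length(q));

        g_queue_free(q);
        return 0;
    }
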

pacemaker.spec

@@ -244,7 +244,7 @@
 Name: pacemaker
 Summary: Scalable High-Availability cluster resource manager
 Version: %{pcmkversion}
-Release: %{pcmk_release}.3%{?dist}
+Release: %{pcmk_release}.5%{?dist}
 %if %{defined _unitdir}
 License: GPL-2.0-or-later AND LGPL-2.1-or-later
 %else
@@ -279,6 +279,8 @@ Patch011: 011-attrd-memory-leak.patch
 Patch012: 012-dont-set-as-xml-id.patch
 Patch013: 013-crm_node-i-initialize.patch
 Patch014: 014-remote-fencing.patch
+Patch015: 015-ipc-disconnect.patch
+Patch016: 016-fewer-messages.patch

 Requires: resource-agents
 Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
@@ -1030,6 +1032,14 @@ exit 0
 %license %{nagios_name}-%{nagios_hash}/COPYING

 %changelog
+* Mon Nov 17 2025 Chris Lumens <clumens@redhat.com> - 2.1.7-5.5
+- Don't overwhelm the FSA queue with repeated CIB queries
+- Related: RHEL-76276
+
+* Tue Sep 30 2025 Chris Lumens <clumens@redhat.com> - 2.1.7-5.4
+- Be more lenient in evicting IPC clients
+- Resolves: RHEL-76276
+
 * Thu Jul 10 2025 Chris Lumens <clumens@redhat.com> - 2.1.7-5.3
 - Add option for controlling remote node fencing behavior
 - Resolves: RHEL-93220