diff --git a/SOURCES/015-ipc-disconnect.patch b/SOURCES/015-ipc-disconnect.patch
new file mode 100644
index 0000000..4a70032
--- /dev/null
+++ b/SOURCES/015-ipc-disconnect.patch
@@ -0,0 +1,303 @@
+From 581ef435f7b6b0fde76663069ec63b3b4fb4b067 Mon Sep 17 00:00:00 2001
+From: Chris Lumens
+Date: Wed, 27 Aug 2025 10:41:13 -0400
+Subject: [PATCH 1/5] Refactor: libcrmcommon: Rearrange the queue_len check.
+
+Check if the queue length is 0 first and return, which allows everything
+else to be un-indented one level.
+---
+ lib/common/ipc_server.c | 47 ++++++++++++++++++++---------------------
+ 1 file changed, 23 insertions(+), 24 deletions(-)
+
+diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
+index 5cd7e70..ee8ae9e 100644
+--- a/lib/common/ipc_server.c
++++ b/lib/common/ipc_server.c
+@@ -535,34 +535,33 @@ crm_ipcs_flush_events(pcmk__client_t *c)
+                   pcmk_rc_str(rc), (long long) qb_rc);
+     }
+
+-    if (queue_len) {
+-
+-        /* Allow clients to briefly fall behind on processing incoming messages,
+-         * but drop completely unresponsive clients so the connection doesn't
+-         * consume resources indefinitely.
+-         */
+-        if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
+-            if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
+-                /* Don't evict for a new or shrinking backlog */
+-                crm_warn("Client with process ID %u has a backlog of %u messages "
+-                         CRM_XS " %p", c->pid, queue_len, c->ipcs);
+-            } else {
+-                crm_err("Evicting client with process ID %u due to backlog of %u messages "
+-                        CRM_XS " %p", c->pid, queue_len, c->ipcs);
+-                c->queue_backlog = 0;
+-                qb_ipcs_disconnect(c->ipcs);
+-                return rc;
+-            }
+-        }
+-
+-        c->queue_backlog = queue_len;
+-        delay_next_flush(c, queue_len);
+-
+-    } else {
++    if (queue_len == 0) {
+         /* Event queue is empty, there is no backlog */
+         c->queue_backlog = 0;
++        return rc;
++    }
++
++    /* Allow clients to briefly fall behind on processing incoming messages,
++     * but drop completely unresponsive clients so the connection doesn't
++     * consume resources indefinitely.
++     */
++    if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
++        if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
++            /* Don't evict for a new or shrinking backlog */
++            crm_warn("Client with process ID %u has a backlog of %u messages "
++                     CRM_XS " %p", c->pid, queue_len, c->ipcs);
++        } else {
++            crm_err("Evicting client with process ID %u due to backlog of %u messages "
++                    CRM_XS " %p", c->pid, queue_len, c->ipcs);
++            c->queue_backlog = 0;
++            qb_ipcs_disconnect(c->ipcs);
++            return rc;
++        }
+     }
+
++    c->queue_backlog = queue_len;
++    delay_next_flush(c, queue_len);
++
+     return rc;
+ }
+
+--
+2.43.0
+
+From 54fbc6bea137d0642308d49506f13bd84cd2084e Mon Sep 17 00:00:00 2001
+From: Chris Lumens
+Date: Wed, 27 Aug 2025 11:31:37 -0400
+Subject: [PATCH 2/5] Refactor: libcrmcommon: Simplify an empty event queue
+ check.
+
+I find this just a little bit more straightforward to follow.
+---
+ lib/common/ipc_server.c | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
+index ee8ae9e..d24db59 100644
+--- a/lib/common/ipc_server.c
++++ b/lib/common/ipc_server.c
+@@ -500,10 +500,13 @@ crm_ipcs_flush_events(pcmk__client_t *c)
+         pcmk__ipc_header_t *header = NULL;
+         struct iovec *event = NULL;
+
+-        if (c->event_queue) {
+-            // We don't pop unless send is successful
+-            event = g_queue_peek_head(c->event_queue);
++        if ((c->event_queue == NULL) || g_queue_is_empty(c->event_queue)) {
++            break;
+         }
++
++        // We don't pop unless send is successful
++        event = g_queue_peek_head(c->event_queue);
++
+         if (event == NULL) {    // Queue is empty
+             break;
+         }
+--
+2.43.0
+
+From 6446aa1d917be090989860c3a5cc00ea6a311d67 Mon Sep 17 00:00:00 2001
+From: Chris Lumens
+Date: Wed, 27 Aug 2025 11:35:38 -0400
+Subject: [PATCH 3/5] Refactor: libcrmcommon: Rearrange a few tests in
+ crm_ipcs_flush_events.
+
+Again, no important code changes here. I just find these a little
+easier to follow.
+---
+ lib/common/ipc_server.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
+index d24db59..c305dfc 100644
+--- a/lib/common/ipc_server.c
++++ b/lib/common/ipc_server.c
+@@ -486,16 +486,18 @@ crm_ipcs_flush_events(pcmk__client_t *c)
+
+     if (c == NULL) {
+         return rc;
++    }
+
+-    } else if (c->event_timer) {
++    if (c->event_timer != 0) {
+         /* There is already a timer, wait until it goes off */
+         crm_trace("Timer active for %p - %d", c->ipcs, c->event_timer);
+         return rc;
+     }
+
+-    if (c->event_queue) {
++    if (c->event_queue != NULL) {
+         queue_len = g_queue_get_length(c->event_queue);
+     }
++
+     while (sent < 100) {
+         pcmk__ipc_header_t *header = NULL;
+         struct iovec *event = NULL;
+--
+2.43.0
+
+From d7576ecb3f51050a21057d86257bf8b8c273e4db Mon Sep 17 00:00:00 2001
+From: Chris Lumens
+Date: Wed, 27 Aug 2025 13:14:54 -0400
+Subject: [PATCH 4/5] Feature: libcrmcommon: Be more lenient in evicting IPC
+ clients.
+
+Each IPC connection has a message queue. If the client is unable to
+process messages faster than the server is sending them, that queue
+starts to back up. Pacemaker enforces a cap on the queue size, and
+that's adjustable with the cluster-ipc-limit parameter. Once the queue
+grows beyond that size, the client is assumed to be dead and is evicted
+so it can be restarted and the queue resources freed.
+
+However, it's possible that the client is not dead. On clusters with
+very large numbers of resources (I've tried with 300, but fewer might
+also cause problems), certain actions can happen that cause a spike in
+IPC messages. In RHEL-76276, the action that causes this is moving
+nodes in and out of standby. This spike in messages causes the server
+to overwhelm the client, which is then evicted.
+
+My multi-part IPC patches made this even worse, as now if the CIB is so
+large that it needs to split an IPC message up, there will be more
+messages than before.
+
+What this fix does is get rid of the cap on the queue size for pacemaker
+daemons. As long as the server has been able to send messages to the
+client, the client is still doing work and shouldn't be evicted. It may
+just be processing messages slower than the server is sending them.
+Note that this could lead the queue to grow without bound, eventually
+crashing the server. For this reason, we're only allowing pacemaker
+daemons to ignore the queue size limit.
+
+Potential problems with this approach:
+
+* If the client is so busy that it can't receive even a single message
+  that crm_ipcs_flush_events tries to send, it will still be evicted.
+  However, the flush operation does retry with a delay several times,
+  giving the client time to finish up what it's doing.
+
+* We have timers all over the place with daemons waiting on replies.
+  It's possible that because we are no longer just evicting the clients,
+  we will now see those timers expire, which will just lead to different
+  problems. If so, these fixes would probably need to take place in the
+  client code.
+
+Fixes T38
+---
+ lib/common/ipc_server.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
+index c305dfc..16a2986 100644
+--- a/lib/common/ipc_server.c
++++ b/lib/common/ipc_server.c
+@@ -551,10 +551,20 @@ crm_ipcs_flush_events(pcmk__client_t *c)
+      * consume resources indefinitely.
+      */
+     if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
+-        if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
+-            /* Don't evict for a new or shrinking backlog */
++        /* Don't evict:
++         * - Clients with a new backlog.
++         * - Clients with a shrinking backlog (the client is processing
++         *   messages faster than the server is sending them).
++         * - Clients that are pacemaker daemons and have had any messages sent
++         *   to them in this flush call (the server is sending messages faster
++         *   than the client is processing them, but the client is not dead).
++         */
++        if ((c->queue_backlog <= 1)
++            || (queue_len < c->queue_backlog)
++            || ((sent > 0) && crm_is_daemon_name(c->name))) {
+             crm_warn("Client with process ID %u has a backlog of %u messages "
+                      CRM_XS " %p", c->pid, queue_len, c->ipcs);
++
+         } else {
+             crm_err("Evicting client with process ID %u due to backlog of %u messages "
+                     CRM_XS " %p", c->pid, queue_len, c->ipcs);
+--
+2.43.0
+
+From 6b5e50b272c26c95e9fae1c3270c77a8d72446e8 Mon Sep 17 00:00:00 2001
+From: Chris Lumens
+Date: Tue, 30 Sep 2025 13:50:53 -0400
+Subject: [PATCH 5/5] Feature: libcrmcommon: Update documentation for
+ cluster-ipc-limit.
+
+Clarify that this no longer applies to pacemaker daemons.
+---
+ cts/cli/regression.daemons.exp                     |  4 ++--
+ doc/sphinx/Pacemaker_Explained/cluster-options.rst | 12 +++++++-----
+ lib/common/options.c                               |  6 +++---
+ 3 files changed, 12 insertions(+), 10 deletions(-)
+
+diff --git a/cts/cli/regression.daemons.exp b/cts/cli/regression.daemons.exp
+index 678cb62..dffbe6a 100644
+--- a/cts/cli/regression.daemons.exp
++++ b/cts/cli/regression.daemons.exp
+@@ -11,8 +11,8 @@
+
+
+
+-      Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
+-      Maximum IPC message backlog before disconnecting a cluster daemon
++      Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
++      Maximum IPC message backlog before disconnecting a client
+
+
+
+diff --git a/doc/sphinx/Pacemaker_Explained/cluster-options.rst b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
+index 77bd7e6..fe2d4f1 100644
+--- a/doc/sphinx/Pacemaker_Explained/cluster-options.rst
++++ b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
+@@ -675,11 +675,13 @@ values, by running the ``man pacemaker-schedulerd`` and
+        cluster-ipc-limit
+      - :ref:`nonnegative integer `
+      - 500
+-     - The maximum IPC message backlog before one cluster daemon will
+-       disconnect another. This is of use in large clusters, for which a good
+-       value is the number of resources in the cluster multiplied by the number
+-       of nodes. The default of 500 is also the minimum. Raise this if you see
+-       "Evicting client" log messages for cluster daemon process IDs.
++     - The maximum IPC message backlog before a cluster daemon will disconnect
++       a client. Other cluster daemons are not subject to this limit as long as
++       they are still processing messages. This is of use in large clusters,
++       for which a good value is the number of resources in the cluster
++       multiplied by the number of nodes. The default of 500 is also the
++       minimum. Raise this if you see "Evicting client" log messages for
++       cluster process IDs.
+    * - .. _pe_error_series_max:
+
+      .. index::
+diff --git a/lib/common/options.c b/lib/common/options.c
+index 96f059c..d3fc684 100644
+--- a/lib/common/options.c
++++ b/lib/common/options.c
+@@ -422,10 +422,10 @@ static pcmk__cluster_option_t cluster_options[] = {
+         "cluster-ipc-limit", NULL, "integer", NULL,
+         "500", pcmk__valid_positive_number,
+         pcmk__opt_context_based,
+-        N_("Maximum IPC message backlog before disconnecting a cluster daemon"),
++        N_("Maximum IPC message backlog before disconnecting a client"),
+         N_("Raise this if log has \"Evicting client\" messages for cluster "
+-           "daemon PIDs (a good value is the number of resources in the "
+-           "cluster multiplied by the number of nodes)."),
++           "PIDs (a good value is the number of resources in the cluster "
++           "multiplied by the number of nodes)."),
+     },
+
+     // Orphans and stopping
+--
+2.43.0
+
diff --git a/SOURCES/016-fewer-messages.patch b/SOURCES/016-fewer-messages.patch
new file mode 100644
index 0000000..2d37039
--- /dev/null
+++ b/SOURCES/016-fewer-messages.patch
@@ -0,0 +1,88 @@
+From 7be9cdca98217d002497714aaafcfc292c02555b Mon Sep 17 00:00:00 2001
+From: Chris Lumens
+Date: Fri, 31 Oct 2025 11:24:14 -0400
+Subject: [PATCH] Med: daemons: Don't add repeated I_PE_CALC messages to the
+ fsa queue.
+
+Let's say you have a two-node cluster, node1 and node2. For purposes of
+testing, it's easiest if you use fence_dummy instead of a real fencing
+agent, as this fakes fencing without rebooting the node, so you can see
+all the log files.
+
+Assume the DC is node1. Now do the following:
+
+- pcs node standby node1
+- pcs resource defaults update resource-stickiness=1
+- for i in $(seq 1 300); do echo $i; pcs resource create dummy$i ocf:heartbeat:Dummy --group dummy-group; done
+- pcs node unstandby node1
+
+It will take a long time to create that many resources. After node1
+comes out of standby, it'll take a minute or two, but eventually you'll
+see that node1 was fenced. On node1, you'll see a lot of transition
+abort messages. Each of these transition aborts causes an I_PE_CALC
+message to be generated and added to the FSA queue. In my testing,
+I've seen the queue grow to ~600 messages, all of which are exactly
+the same thing.
+
+The FSA is triggered at G_PRIORITY_HIGH, and once it is triggered, it
+will run until its queue is empty. With so many messages being added so
+quickly, we've basically ensured it won't be empty any time soon. While
+controld is processing the FSA messages, it will be unable to read
+anything out of the IPC backlog.
+
+The based daemon continues to attempt to send IPC events to controld
+but is unable to, so the backlog continues to grow. Eventually, the
+backlog reaches that 500-message threshold without anything having been
+read by controld, which triggers the eviction process.
+
+There doesn't seem to be any reason for all these I_PE_CALC messages to
+be generated. They're all exactly the same, they don't appear to be
+tagged with any unique data tying them to a specific query, and their
+presence just slows everything down.
+
+Thus, the fix here is very simple: if the latest message in the queue is
+an I_PE_CALC message, just don't add another one. We could also make
+sure there's only ever one I_PE_CALC message in the queue, but there
+could potentially be valid reasons for there to be multiple interleaved
+with other message types. I am erring on the side of caution with this
+minimal fix.
+
+Related: RHEL-76276
+---
+ daemons/controld/controld_messages.c | 20 ++++++++++++++++++++
+ 1 file changed, 20 insertions(+)
+
+diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c
+index 0b0f25b..30af707 100644
+--- a/daemons/controld/controld_messages.c
++++ b/daemons/controld/controld_messages.c
+@@ -73,6 +73,26 @@ register_fsa_input_adv(enum crmd_fsa_cause cause, enum crmd_fsa_input input,
+         return;
+     }
+
++    if (input == I_PE_CALC) {
++        GList *ele = NULL;
++
++        if (prepend) {
++            ele = g_list_first(controld_globals.fsa_message_queue);
++        } else {
++            ele = g_list_last(controld_globals.fsa_message_queue);
++        }
++
++        if (ele != NULL) {
++            fsa_data_t *message = (fsa_data_t *) ele->data;
++
++            if (message->fsa_input == I_PE_CALC) {
++                crm_debug("%s item in fsa queue is I_PE_CALC, not adding another",
++                          (prepend ? "First" : "Last"));
++                return;
++            }
++        }
++    }
++
+     if (input == I_WAIT_FOR_EVENT) {
+         controld_set_global_flags(controld_fsa_is_stalled);
+         crm_debug("Stalling the FSA pending further input: source=%s cause=%s data=%p queue=%d",
+--
+2.43.0
+
diff --git a/SPECS/pacemaker.spec b/SPECS/pacemaker.spec
index 3a7d89b..7fbbb7e 100644
--- a/SPECS/pacemaker.spec
+++ b/SPECS/pacemaker.spec
@@ -244,7 +244,7 @@
 Name: pacemaker
 Summary: Scalable High-Availability cluster resource manager
 Version: %{pcmkversion}
-Release: %{pcmk_release}.3%{?dist}
+Release: %{pcmk_release}.5%{?dist}
 %if %{defined _unitdir}
 License: GPL-2.0-or-later AND LGPL-2.1-or-later
 %else
@@ -279,6 +279,8 @@ Patch011: 011-attrd-memory-leak.patch
 Patch012: 012-dont-set-as-xml-id.patch
 Patch013: 013-crm_node-i-initialize.patch
 Patch014: 014-remote-fencing.patch
+Patch015: 015-ipc-disconnect.patch
+Patch016: 016-fewer-messages.patch
 
 Requires: resource-agents
 Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
@@ -1030,6 +1032,14 @@ exit 0
 %license %{nagios_name}-%{nagios_hash}/COPYING
 
 %changelog
+* Mon Nov 17 2025 Chris Lumens - 2.1.7-5.5
+- Don't overwhelm the FSA queue with repeated CIB queries
+- Related: RHEL-76276
+
+* Tue Sep 30 2025 Chris Lumens - 2.1.7-5.4
+- Be more lenient in evicting IPC clients
+- Resolves: RHEL-76276
+
 * Thu Jul 10 2025 Chris Lumens - 2.1.7-5.3
 - Add option for controlling remote node fencing behavior
 - Resolves: RHEL-93220
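
For anyone who wants to try the 016-fewer-messages.patch idea in isolation, here is a
minimal standalone sketch of the same check-the-neighboring-entry pattern, built directly
on GLib rather than on Pacemaker's sources. The one-field fsa_data_t, enqueue_input(),
and the dedup.c filename are hypothetical stand-ins invented for this sketch; only the
g_list_* and g_new0/g_list_free_full calls are real GLib API.

    /* Standalone illustration (not Pacemaker source) of the dedup check
     * added to register_fsa_input_adv() above.  Compile with:
     *   gcc dedup.c $(pkg-config --cflags --libs glib-2.0)
     */
    #include <glib.h>
    #include <stdbool.h>

    enum input_type { I_OTHER, I_PE_CALC };

    typedef struct {
        enum input_type fsa_input;   /* reduced stand-in for fsa_data_t */
    } fsa_data_t;

    static GList *queue = NULL;

    /* Hypothetical stand-in for register_fsa_input_adv(): refuse to queue
     * an I_PE_CALC input when the entry at the insertion end is already
     * I_PE_CALC, since back-to-back duplicates add no information. */
    static bool
    enqueue_input(enum input_type input, bool prepend)
    {
        if (input == I_PE_CALC) {
            /* Look only at the end we would insert at */
            GList *ele = prepend ? g_list_first(queue) : g_list_last(queue);

            if ((ele != NULL)
                && (((fsa_data_t *) ele->data)->fsa_input == I_PE_CALC)) {
                return false;   /* drop the duplicate */
            }
        }

        fsa_data_t *msg = g_new0(fsa_data_t, 1);

        msg->fsa_input = input;
        queue = prepend ? g_list_prepend(queue, msg)
                        : g_list_append(queue, msg);
        return true;
    }

    int
    main(void)
    {
        enqueue_input(I_PE_CALC, false);   /* queued */
        enqueue_input(I_PE_CALC, false);   /* dropped: tail is already I_PE_CALC */
        enqueue_input(I_OTHER, false);     /* queued */
        enqueue_input(I_PE_CALC, false);   /* queued: tail is now I_OTHER */

        g_list_free_full(queue, g_free);
        return 0;
    }

Note that the sketch, like the patch, only compares against the single neighboring
entry, so I_PE_CALC inputs interleaved with other message types are still queued;
that mirrors the commit's deliberate choice to dedup consecutive duplicates only,
rather than enforce at most one I_PE_CALC in the whole queue.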