pacemaker/005-ipc_evict.patch

From 79f5a67e8242b3e72aa9dcf0dbd286b3fb719baa Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 10:41:13 -0400
Subject: [PATCH 1/6] Refactor: libcrmcommon: Rearrange the queue_len check.

Check if the queue length is 0 first and return, which allows everything
else to be un-indented one level.
---
 lib/common/ipc_server.c | 47 ++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index 25c788b..4b33c64 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -541,34 +541,33 @@ no_more_retries:
                   sent, queue_len, c->ipcs, c->pid, pcmk_rc_str(rc), qb_rc);
     }

-    if (queue_len) {
-
-        /* Allow clients to briefly fall behind on processing incoming messages,
-         * but drop completely unresponsive clients so the connection doesn't
-         * consume resources indefinitely.
-         */
-        if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
-            if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
-                /* Don't evict for a new or shrinking backlog */
-                crm_warn("Client with process ID %u has a backlog of %u messages "
-                         QB_XS " %p", c->pid, queue_len, c->ipcs);
-            } else {
-                crm_err("Evicting client with process ID %u due to backlog of %u messages "
-                         QB_XS " %p", c->pid, queue_len, c->ipcs);
-                c->queue_backlog = 0;
-                qb_ipcs_disconnect(c->ipcs);
-                return rc;
-            }
-        }
-
-        c->queue_backlog = queue_len;
-        delay_next_flush(c, queue_len);
-
-    } else {
+    if (queue_len == 0) {
         /* Event queue is empty, there is no backlog */
         c->queue_backlog = 0;
+        return rc;
     }

+    /* Allow clients to briefly fall behind on processing incoming messages,
+     * but drop completely unresponsive clients so the connection doesn't
+     * consume resources indefinitely.
+     */
+    if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
+        if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
+            /* Don't evict for a new or shrinking backlog */
+            crm_warn("Client with process ID %u has a backlog of %u messages "
+                     QB_XS " %p", c->pid, queue_len, c->ipcs);
+        } else {
+            crm_err("Evicting client with process ID %u due to backlog of %u messages "
+                     QB_XS " %p", c->pid, queue_len, c->ipcs);
+            c->queue_backlog = 0;
+            qb_ipcs_disconnect(c->ipcs);
+            return rc;
+        }
+    }
+
+    c->queue_backlog = queue_len;
+    delay_next_flush(c, queue_len);
+
     return rc;
 }

--
2.47.1

From 014699003c6506bba8638ed57efea49da403d0e1 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 11:31:37 -0400
Subject: [PATCH 2/6] Refactor: libcrmcommon: Simplify an empty event queue
 check.

I find this just a little bit more straightforward to follow.
---
 lib/common/ipc_server.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index 4b33c64..dbd885a 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -491,14 +491,13 @@ crm_ipcs_flush_events(pcmk__client_t *c)
         pcmk__ipc_header_t *header = NULL;
         struct iovec *event = NULL;

-        if (c->event_queue) {
-            // We don't pop unless send is successful
-            event = g_queue_peek_head(c->event_queue);
-        }
-        if (event == NULL) { // Queue is empty
+        if ((c->event_queue == NULL) || g_queue_is_empty(c->event_queue)) {
             break;
         }

+        // We don't pop unless send is successful
+        event = g_queue_peek_head(c->event_queue);
+
         /* Retry sending the event up to five times.  If we get -EAGAIN, sleep
          * a very short amount of time (too long here is bad) and try again.
          * If we simply exit the while loop on -EAGAIN, we'll have to wait until
--
2.47.1

From f999ac3d86d8107dee5288497f5f7fff07956d18 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 11:35:38 -0400
Subject: [PATCH 3/6] Refactor: libcrmcommon: Rearrange a few tests in
 crm_ipcs_flush_events.

Again, no important code changes here.  I just find these a little
easier to follow.
---
 lib/common/ipc_server.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index dbd885a..b76847b 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -477,16 +477,18 @@ crm_ipcs_flush_events(pcmk__client_t *c)

     if (c == NULL) {
         return rc;
+    }

-    } else if (c->event_timer) {
+    if (c->event_timer != 0) {
         /* There is already a timer, wait until it goes off */
         crm_trace("Timer active for %p - %d", c->ipcs, c->event_timer);
         return rc;
     }

-    if (c->event_queue) {
+    if (c->event_queue != NULL) {
         queue_len = g_queue_get_length(c->event_queue);
     }
+
     while (sent < 100) {
         pcmk__ipc_header_t *header = NULL;
         struct iovec *event = NULL;
--
2.47.1

From 9e76007bb0bc1d4cb5a88dcfaaf96aa8853f42dc Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 11:48:48 -0400
Subject: [PATCH 4/6] Refactor: libcrmcommon: Unindent retry code in
 crm_ipcs_flush_events.

If we're breaking or jumping to a label, there's no need to have all
these nested else blocks.
---
 lib/common/ipc_server.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index b76847b..73cc58f 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -513,16 +513,16 @@ crm_ipcs_flush_events(pcmk__client_t *c)
         for (unsigned int retries = 5; retries > 0; retries--) {
             qb_rc = qb_ipcs_event_sendv(c->ipcs, event, 2);

-            if (qb_rc < 0) {
-                if (retries == 1 || qb_rc != -EAGAIN) {
-                    rc = (int) -qb_rc;
-                    goto no_more_retries;
-                } else {
-                    pcmk__sleep_ms(5);
-                }
-            } else {
+            if (qb_rc >= 0) {
                 break;
             }
+
+            if (retries == 1 || qb_rc != -EAGAIN) {
+                rc = (int) -qb_rc;
+                goto no_more_retries;
+            }
+
+            pcmk__sleep_ms(5);
         }

         event = g_queue_pop_head(c->event_queue);
--
2.47.1

From b73be21a454f795bc747aad1dbeea82f67d8b232 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 13:14:54 -0400
Subject: [PATCH 5/6] Feature: libcrmcommon: Be more lenient in evicting IPC
 clients.

Each IPC connection has a message queue.  If the client is unable to
process messages faster than the server is sending them, that queue
starts to back up.  pacemaker enforces a cap on the queue size, and
that's adjustable with the cluster-ipc-limit parameter.  Once the queue
grows beyond that size, the client is assumed to be dead and is evicted
so it can be restarted and the queue resources freed.

However, it's possible that the client is not dead.  On clusters with
very large numbers of resources (I've tried with 300, but fewer might
also cause problems), certain actions can happen that cause a spike in
IPC messages.  In RHEL-76276, the action that causes this is moving
nodes in and out of standby.  This spike in messages causes the server
to overwhelm the client, which is then evicted.

My multi-part IPC patches made this even worse, as now if the CIB is so
large that it needs to split an IPC message up, there will be more
messages than before.

What this fix does is get rid of the cap on the queue size for pacemaker
daemons.  As long as the server has been able to send messages to the
client, the client is still doing work and shouldn't be evicted.  It may
just be processing messages slower than the server is sending them.
Note that this could lead the queue to grow without bound, eventually
crashing the server.  For this reason, we're only allowing pacemaker
daemons to ignore the queue size limit.

Potential problems with this approach:

* If the client is so busy that it can't receive even a single message
  that crm_ipcs_flush_events tries to send, it will still be evicted.
  However, the flush operation does retry several times with a short
  delay, giving the client time to finish up what it's doing.

* We have timers all over the place with daemons waiting on replies.
  It's possible that because we are no longer just evicting the clients,
  we will now see those timers expire, which will just lead to different
  problems.  If so, these fixes would probably need to take place in the
  client code.

Fixes T38
---
 lib/common/ipc_server.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index 73cc58f..4420070 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -553,10 +553,20 @@ no_more_retries:
      * consume resources indefinitely.
      */
     if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
-        if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
-            /* Don't evict for a new or shrinking backlog */
+        /* Don't evict:
+         * - Clients with a new backlog.
+         * - Clients with a shrinking backlog (the client is processing
+         *   messages faster than the server is sending them).
+         * - Clients that are pacemaker daemons and have had any messages sent
+         *   to them in this flush call (the server is sending messages faster
+         *   than the client is processing them, but the client is not dead).
+         */
+        if ((c->queue_backlog <= 1)
+            || (queue_len < c->queue_backlog)
+            || ((sent > 0) && (pcmk__parse_server(c->name) != pcmk_ipc_unknown))) {
             crm_warn("Client with process ID %u has a backlog of %u messages "
                      QB_XS " %p", c->pid, queue_len, c->ipcs);
+
         } else {
             crm_err("Evicting client with process ID %u due to backlog of %u messages "
                      QB_XS " %p", c->pid, queue_len, c->ipcs);
--
2.47.1

From 4682953c567e16409d8e7972d9d5891348d4c360 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 15:56:27 -0400
Subject: [PATCH 6/6] Feature: libcrmcommon: Update documentation for
 cluster-ipc-limit.

Clarify that this no longer applies to pacemaker daemons.
---
 cts/cli/regression.crm_attribute.exp             | 16 ++++++++--------
 cts/cli/regression.daemons.exp                   |  4 ++--
 .../Pacemaker_Explained/cluster-options.rst      | 12 +++++++-----
 lib/common/options.c                             |  6 +++---
 4 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/cts/cli/regression.crm_attribute.exp b/cts/cli/regression.crm_attribute.exp
index e161f49..36cba76 100644
--- a/cts/cli/regression.crm_attribute.exp
+++ b/cts/cli/regression.crm_attribute.exp
@@ -111,8 +111,8 @@ Also known as properties, these are options that affect behavior across the enti
   * migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit)
     * Possible values: integer (default: )

-  * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon
-    * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
+  * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client
+    * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
     * Possible values: nonnegative_integer (default: )

   * stop-all-resources: Whether the cluster should stop all active resources
@@ -357,8 +357,8 @@ Also known as properties, these are options that affect behavior across the enti
         <content type="integer" default=""/>
       </parameter>
       <parameter name="cluster-ipc-limit" advanced="0" generated="0">
-        <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
-        <shortdesc lang="en">Maximum IPC message backlog before disconnecting a cluster daemon</shortdesc>
+        <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
+        <shortdesc lang="en">Maximum IPC message backlog before disconnecting a client</shortdesc>
         <content type="nonnegative_integer" default=""/>
       </parameter>
       <parameter name="stop-all-resources" advanced="0" generated="0">
@@ -537,8 +537,8 @@ Also known as properties, these are options that affect behavior across the enti
   * migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit)
     * Possible values: integer (default: )

-  * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon
-    * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
+  * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client
+    * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
     * Possible values: nonnegative_integer (default: )

   * stop-all-resources: Whether the cluster should stop all active resources
@@ -824,8 +824,8 @@ Also known as properties, these are options that affect behavior across the enti
         <content type="integer" default=""/>
       </parameter>
       <parameter name="cluster-ipc-limit" advanced="0" generated="0">
-        <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
-        <shortdesc lang="en">Maximum IPC message backlog before disconnecting a cluster daemon</shortdesc>
+        <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
+        <shortdesc lang="en">Maximum IPC message backlog before disconnecting a client</shortdesc>
         <content type="nonnegative_integer" default=""/>
       </parameter>
       <parameter name="stop-all-resources" advanced="0" generated="0">
diff --git a/cts/cli/regression.daemons.exp b/cts/cli/regression.daemons.exp
index fc8535a..6274eeb 100644
--- a/cts/cli/regression.daemons.exp
+++ b/cts/cli/regression.daemons.exp
@@ -21,10 +21,10 @@
     </parameter>
     <parameter name="cluster-ipc-limit">
       <longdesc lang="en">
-        Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
+        Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
       </longdesc>
       <shortdesc lang="en">
-        Maximum IPC message backlog before disconnecting a cluster daemon
+        Maximum IPC message backlog before disconnecting a client
       </shortdesc>
       <content type="integer" default=""/>
     </parameter>
diff --git a/doc/sphinx/Pacemaker_Explained/cluster-options.rst b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
index 6ebe5f3..22e1a50 100644
--- a/doc/sphinx/Pacemaker_Explained/cluster-options.rst
+++ b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
@@ -693,11 +693,13 @@ values, by running the ``man pacemaker-schedulerd`` and
        cluster-ipc-limit
      - :ref:`nonnegative integer <nonnegative_integer>`
      - 500
-     - The maximum IPC message backlog before one cluster daemon will
-       disconnect another. This is of use in large clusters, for which a good
-       value is the number of resources in the cluster multiplied by the number
-       of nodes. The default of 500 is also the minimum. Raise this if you see
-       "Evicting client" log messages for cluster daemon process IDs.
+     - The maximum IPC message backlog before a cluster daemon will disconnect
+       a client.  Other cluster daemons are not subject to this limit as long as
+       they are still processing messages.  This is of use in large clusters,
+       for which a good value is the number of resources in the cluster
+       multiplied by the number of nodes. The default of 500 is also the
+       minimum. Raise this if you see "Evicting client" log messages for
+       cluster process IDs.
    * - .. _pe_error_series_max:

        .. index::
diff --git a/lib/common/options.c b/lib/common/options.c
index b8f4943..af1b073 100644
--- a/lib/common/options.c
+++ b/lib/common/options.c
@@ -432,10 +432,10 @@ static const pcmk__cluster_option_t cluster_options[] = {
         PCMK_OPT_CLUSTER_IPC_LIMIT, NULL, PCMK_VALUE_NONNEGATIVE_INTEGER, NULL,
         "500", pcmk__valid_positive_int,
         pcmk__opt_based,
-        N_("Maximum IPC message backlog before disconnecting a cluster daemon"),
+        N_("Maximum IPC message backlog before disconnecting a client"),
         N_("Raise this if log has \"Evicting client\" messages for cluster "
-            "daemon PIDs (a good value is the number of resources in the "
-            "cluster multiplied by the number of nodes)."),
+            "PIDs (a good value is the number of resources in the cluster "
+            "multiplied by the number of nodes)."),
     },

     // Orphans and stopping
--
2.47.1