From 0ce8a50f98d49144c895f602a1870dae6e16cc1f Mon Sep 17 00:00:00 2001
From: eabdullin <ed.abdullin.1@gmail.com>
Date: Tue, 4 Feb 2025 02:31:06 +0000
Subject: [PATCH] import UBI passt-0^20240806.gee36266-6.el9_5

---
 .gitignore                                    |  2 +-
 .passt.metadata                               |  2 +-
 ...-if-guest-attempts-to-connect-to-por.patch | 64 +++++++++++++
 ...keep-alive-segments-ignore-them-for-.patch | 69 ++++++++++++++
 ...-Set-again-TCP_NODELAY-on-both-sides.patch | 93 +++++++++++++++++++
 ...orrect-hash-probe-in-flowside_lookup.patch | 43 +++++++++
 ...-on-all-RST-segments-even-for-client.patch | 78 ++++++++++++++++
 SPECS/passt.spec                              | 40 +++++++-
 8 files changed, 385 insertions(+), 6 deletions(-)
 create mode 100644 SOURCES/0002-flow-Don-t-crash-if-guest-attempts-to-connect-to-por.patch
 create mode 100644 SOURCES/0003-tcp-Acknowledge-keep-alive-segments-ignore-them-for-.patch
 create mode 100644 SOURCES/0004-tcp_splice-Set-again-TCP_NODELAY-on-both-sides.patch
 create mode 100644 SOURCES/0005-flow-Fix-incorrect-hash-probe-in-flowside_lookup.patch
 create mode 100644 SOURCES/0006-tcp-Set-ACK-flag-on-all-RST-segments-even-for-client.patch

diff --git a/.gitignore b/.gitignore
index 4e7f596..3ac5abe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1 @@
-SOURCES/passt-b86afe3559c0bd3d24bc6fed7c60466cf141224c.tar.xz
+SOURCES/passt-ee36266a55478672ad2c5f4efbd6ca0bef3d37cd.tar.xz
diff --git a/.passt.metadata b/.passt.metadata
index cca99da..6ae71fd 100644
--- a/.passt.metadata
+++ b/.passt.metadata
@@ -1 +1 @@
-833dc4cee84bf49eb54354f5da5ae07748ce2969 SOURCES/passt-b86afe3559c0bd3d24bc6fed7c60466cf141224c.tar.xz
+421a821e193faa31035a158c913c70d7fc13bf68 SOURCES/passt-ee36266a55478672ad2c5f4efbd6ca0bef3d37cd.tar.xz
diff --git a/SOURCES/0002-flow-Don-t-crash-if-guest-attempts-to-connect-to-por.patch b/SOURCES/0002-flow-Don-t-crash-if-guest-attempts-to-connect-to-por.patch
new file mode 100644
index 0000000..00692fa
--- /dev/null
+++ b/SOURCES/0002-flow-Don-t-crash-if-guest-attempts-to-connect-to-por.patch
@@ -0,0 +1,64 @@
+From 002b2a23380d4df552bac7665d462ac4c7bced0b Mon Sep 17 00:00:00 2001
+From: David Gibson <david@gibson.dropbear.id.au>
+Date: Wed, 14 Aug 2024 20:03:33 +1000
+Subject: [PATCH] flow: Don't crash if guest attempts to connect to port 0
+
+Using a zero port on TCP or UDP is dubious, and we can't really deal with
+forwarding such a flow within the constraints of the socket API.  Hence
+we ASSERT()ed that we had non-zero ports in flow_hash().
+
+The intention was to make sure that the protocol code sanitizes such ports
+before completing a flow entry.  Unfortunately, flow_hash() is also called
+on new packets to see if they have an existing flow, so the unsanitized
+guest packet can crash passt with the assert.
+
+Correct this by moving the assert from flow_hash() to flow_sidx_hash()
+which is only used on entries already in the table, not on unsanitized
+data.
+
+Reported-by: Matt Hamilton <matt@thmail.io>
+Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
+Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
+---
+ flow.c | 18 ++++++++++--------
+ 1 file changed, 10 insertions(+), 8 deletions(-)
+
+diff --git a/flow.c b/flow.c
+index 687e9fd..93b687d 100644
+--- a/flow.c
++++ b/flow.c
+@@ -561,12 +561,6 @@ static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
+ {
+ 	struct siphash_state state = SIPHASH_INIT(c->hash_secret);
+ 
+-	/* For the hash table to work, we need complete endpoint information,
+-	 * and at least a forwarding port.
+-	 */
+-	ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
+-	       side->eport != 0 && side->fport != 0);
+-
+ 	inany_siphash_feed(&state, &side->faddr);
+ 	inany_siphash_feed(&state, &side->eaddr);
+ 
+@@ -586,8 +580,16 @@ static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
+ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
+ {
+ 	const struct flow_common *f = &flow_at_sidx(sidx)->f;
+-	return flow_hash(c, FLOW_PROTO(f),
+-			 f->pif[sidx.sidei], &f->side[sidx.sidei]);
++	const struct flowside *side = &f->side[sidx.sidei];
++	uint8_t pif = f->pif[sidx.sidei];
++
++	/* For the hash table to work, entries must have complete endpoint
++	 * information, and at least a forwarding port.
++	 */
++	ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
++	       side->eport != 0 && side->fport != 0);
++
++	return flow_hash(c, FLOW_PROTO(f), pif, side);
+ }
+ 
+ /**
+-- 
+2.43.0
+
diff --git a/SOURCES/0003-tcp-Acknowledge-keep-alive-segments-ignore-them-for-.patch b/SOURCES/0003-tcp-Acknowledge-keep-alive-segments-ignore-them-for-.patch
new file mode 100644
index 0000000..429bcbd
--- /dev/null
+++ b/SOURCES/0003-tcp-Acknowledge-keep-alive-segments-ignore-them-for-.patch
@@ -0,0 +1,69 @@
+From 238c69f9af458e41dea5ad8c988dbf65b05b5172 Mon Sep 17 00:00:00 2001
+From: Stefano Brivio <sbrivio@redhat.com>
+Date: Tue, 19 Nov 2024 20:53:44 +0100
+Subject: [PATCH] tcp: Acknowledge keep-alive segments, ignore them for the
+ rest
+
+RFC 9293, 3.8.4 says:
+
+   Implementers MAY include "keep-alives" in their TCP implementations
+   (MAY-5), although this practice is not universally accepted.  Some
+   TCP implementations, however, have included a keep-alive mechanism.
+   To confirm that an idle connection is still active, these
+   implementations send a probe segment designed to elicit a response
+   from the TCP peer.  Such a segment generally contains SEG.SEQ =
+   SND.NXT-1 and may or may not contain one garbage octet of data.  If
+   keep-alives are included, the application MUST be able to turn them
+   on or off for each TCP connection (MUST-24), and they MUST default to
+   off (MUST-25).
+
+but currently, tcp_data_from_tap() is not aware of this and will
+schedule a fast re-transmit on the second keep-alive (because it's
+also a duplicate ACK), ignoring the fact that the sequence number was
+rewinded to SND.NXT-1.
+
+ACK these keep-alive segments, reset the activity timeout, and ignore
+them for the rest.
+
+At some point, we could think of implementing an approximation of
+keep-alive segments on outbound sockets, for example by setting
+TCP_KEEPIDLE to 1, and a large TCP_KEEPINTVL, so that we send a single
+keep-alive segment at approximately the same time, and never reset the
+connection. That's beyond the scope of this fix, though.
+
+Reported-by: Tim Besard <tim.besard@gmail.com>
+Link: https://github.com/containers/podman/discussions/24572
+Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
+Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
+---
+ tcp.c | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+diff --git a/tcp.c b/tcp.c
+index f357920..1eb85bb 100644
+--- a/tcp.c
++++ b/tcp.c
+@@ -1763,6 +1763,20 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
+ 			continue;
+ 
+ 		seq = ntohl(th->seq);
++		if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) {
++			flow_trace(conn,
++				   "keep-alive sequence: %u, previous: %u",
++				   seq, conn->seq_from_tap);
++
++			tcp_send_flag(c, conn, ACK);
++			tcp_timer_ctl(c, conn);
++
++			if (p->count == 1)
++				return 1;
++
++			continue;
++		}
++
+ 		ack_seq = ntohl(th->ack_seq);
+ 
+ 		if (th->ack) {
+-- 
+2.43.5
+
diff --git a/SOURCES/0004-tcp_splice-Set-again-TCP_NODELAY-on-both-sides.patch b/SOURCES/0004-tcp_splice-Set-again-TCP_NODELAY-on-both-sides.patch
new file mode 100644
index 0000000..7eb354e
--- /dev/null
+++ b/SOURCES/0004-tcp_splice-Set-again-TCP_NODELAY-on-both-sides.patch
@@ -0,0 +1,93 @@
+From 725acd111ba340122f2bb0601e373534eb4b5ed8 Mon Sep 17 00:00:00 2001
+From: Stefano Brivio <sbrivio@redhat.com>
+Date: Mon, 6 Jan 2025 10:10:29 +0100
+Subject: [PATCH] tcp_splice: Set (again) TCP_NODELAY on both sides
+
+In commit 7ecf69329787 ("pasta, tcp: Don't set TCP_CORK on spliced
+sockets") I just assumed that we wouldn't benefit from disabling
+Nagle's algorithm once we drop TCP_CORK (and its 200ms fixed delay).
+
+It turns out that with some patterns, such as a PostgreSQL server
+in a container receiving parameterised, short queries, for which pasta
+sees several short inbound messages (Parse, Bind, Describe, Execute
+and Sync commands getting each one their own packet, 5 to 49 bytes TCP
+payload each), we'll read them usually in two batches, and send them
+in matching batches, for example:
+
+  9165.2467:          pasta: epoll event on connected spliced TCP socket 117 (events: 0x00000001)
+  9165.2468:          Flow 0 (TCP connection (spliced)): 76 from read-side call
+  9165.2468:          Flow 0 (TCP connection (spliced)): 76 from write-side call (passed 524288)
+  9165.2469:          pasta: epoll event on connected spliced TCP socket 117 (events: 0x00000001)
+  9165.2470:          Flow 0 (TCP connection (spliced)): 15 from read-side call
+  9165.2470:          Flow 0 (TCP connection (spliced)): 15 from write-side call (passed 524288)
+  9165.2944:          pasta: epoll event on connected spliced TCP socket 118 (events: 0x00000001)
+
+and the kernel delivers the first one, waits for acknowledgement from
+the receiver, then delivers the second one. This adds very substantial
+and unnecessary delay. It's usually a fixed ~40ms between the two
+batches, which is clearly unacceptable for loopback connections.
+
+In this example, the delay is shown by the timestamp of the response
+from socket 118. The peer (server) doesn't actually take that long
+(less than a millisecond), but it takes that long for the kernel to
+deliver our request.
+
+To avoid batching and delays, disable Nagle's algorithm by setting
+TCP_NODELAY on both internal and external sockets: this way, we get
+one inbound packet for each original message, we transfer them right
+away, and the kernel delivers them to the process in the container as
+they are, without delay.
+
+We can do this safely as we don't care much about network utilisation
+when there's in fact pretty much no network (loopback connections).
+
+This is unfortunately not visible in the TCP request-response tests
+from the test suite because, with smaller messages (we use one byte),
+Nagle's algorithm doesn't even kick in. It's probably not trivial to
+implement a universal test covering this case.
+
+Fixes: 7ecf69329787 ("pasta, tcp: Don't set TCP_CORK on spliced sockets")
+Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
+---
+ tcp_splice.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/tcp_splice.c b/tcp_splice.c
+index 3a0f868..3a000ff 100644
+--- a/tcp_splice.c
++++ b/tcp_splice.c
+@@ -348,6 +348,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
+ 	uint8_t tgtpif = conn->f.pif[TGTSIDE];
+ 	union sockaddr_inany sa;
+ 	socklen_t sl;
++	int one = 1;
+ 
+ 	if (tgtpif == PIF_HOST)
+ 		conn->s[1] = tcp_conn_sock(c, af);
+@@ -359,12 +360,21 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
+ 	if (conn->s[1] < 0)
+ 		return -1;
+ 
+-	if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK,
+-		       &((int){ 1 }), sizeof(int))) {
++	if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, &one, sizeof(one))) {
+ 		flow_trace(conn, "failed to set TCP_QUICKACK on socket %i",
+ 			   conn->s[1]);
+ 	}
+ 
++	if (setsockopt(conn->s[0], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
++		flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
++			   conn->s[0]);
++	}
++
++	if (setsockopt(conn->s[1], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
++		flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
++			   conn->s[1]);
++	}
++
+ 	pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport);
+ 
+ 	if (connect(conn->s[1], &sa.sa, sl)) {
+-- 
+2.47.1
+
diff --git a/SOURCES/0005-flow-Fix-incorrect-hash-probe-in-flowside_lookup.patch b/SOURCES/0005-flow-Fix-incorrect-hash-probe-in-flowside_lookup.patch
new file mode 100644
index 0000000..2f97956
--- /dev/null
+++ b/SOURCES/0005-flow-Fix-incorrect-hash-probe-in-flowside_lookup.patch
@@ -0,0 +1,43 @@
+From 7ad9f9bd2bbda8d705e0c6faf5acf2792fce063c Mon Sep 17 00:00:00 2001
+From: David Gibson <david@gibson.dropbear.id.au>
+Date: Fri, 6 Sep 2024 15:17:05 +1000
+Subject: [PATCH] flow: Fix incorrect hash probe in flowside_lookup()
+
+Our flow hash table uses linear probing in which we step backwards through
+clusters of adjacent hash entries when we have near collisions.  Usually
+that's implemented by flow_hash_probe().  However, due to some details we
+need a second implementation in flowside_lookup().  An embarrassing
+oversight in rebasing from earlier versions has mean that version is
+incorrect, trying to step forward through clusters rather than backward.
+
+In situations with the right sorts of has near-collisions this can lead to
+us not associating an ACK from the tap device with the right flow, leaving
+it in a not-quite-established state.  If the remote peer does a shutdown()
+at the right time, this can lead to a storm of EPOLLRDHUP events causing
+high CPU load.
+
+Fixes: acca4235c46f ("flow, tcp: Generalise TCP hash table to general flow hash table")
+Link: https://bugs.passt.top/show_bug.cgi?id=94
+Suggested-by: Stefano Brivio <sbrivio@redhat.com>
+Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
+Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
+---
+ flow.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/flow.c b/flow.c
+index 02631eb..a00e01d 100644
+--- a/flow.c
++++ b/flow.c
+@@ -697,7 +697,7 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
+ 	       !(FLOW_PROTO(&flow->f) == proto &&
+ 		 flow->f.pif[sidx.sidei] == pif &&
+ 		 flowside_eq(&flow->f.side[sidx.sidei], side)))
+-		b = (b + 1) % FLOW_HASH_SIZE;
++		b = mod_sub(b, 1, FLOW_HASH_SIZE);
+ 
+ 	return flow_hashtab[b];
+ }
+-- 
+2.47.1
+
diff --git a/SOURCES/0006-tcp-Set-ACK-flag-on-all-RST-segments-even-for-client.patch b/SOURCES/0006-tcp-Set-ACK-flag-on-all-RST-segments-even-for-client.patch
new file mode 100644
index 0000000..a446a34
--- /dev/null
+++ b/SOURCES/0006-tcp-Set-ACK-flag-on-all-RST-segments-even-for-client.patch
@@ -0,0 +1,78 @@
+From b10ddf22581ea470a57a0c2e4e8a5687bede0f53 Mon Sep 17 00:00:00 2001
+From: Stefano Brivio <sbrivio@redhat.com>
+Date: Mon, 20 Jan 2025 18:36:30 +0100
+Subject: [PATCH] tcp: Set ACK flag on *all* RST segments, even for client in
+ SYN-SENT state
+
+Somewhat curiously, RFC 9293, section 3.10.7.3, states:
+
+   If the state is SYN-SENT, then
+   [...]
+
+      Second, check the RST bit:
+      -  If the RST bit is set,
+      [...]
+
+         o  If the ACK was acceptable, then signal to the user "error:
+            connection reset", drop the segment, enter CLOSED state,
+            delete TCB, and return.  Otherwise (no ACK), drop the
+            segment and return.
+
+which matches verbatim RFC 793, pages 66-67, and is implemented as-is
+by tcp_rcv_synsent_state_process() in the Linux kernel, that is:
+
+	/* No ACK in the segment */
+
+	if (th->rst) {
+		/* rfc793:
+		 * "If the RST bit is set
+		 *
+		 *      Otherwise (no ACK) drop the segment and return."
+		 */
+
+		goto discard_and_undo;
+	}
+
+meaning that if a client is in SYN-SENT state, and we send a RST
+segment once we realise that we can't establish the outbound
+connection, the client will ignore our segment and will need to
+pointlessly wait until the connection times out instead of aborting
+it right away.
+
+The ACK flag on a RST, in this case, doesn't really seem to have any
+function, but we must set it nevertheless. The ACK sequence number is
+already correct because we always set it before calling
+tcp_prepare_flags(), whenever relevant.
+
+This leaves us with no cases where we should *not* set the ACK flag
+on non-SYN segments, so always set the ACK flag for RST segments.
+
+Note that non-SYN, non-RST segments were already covered by commit
+4988e2b40631 ("tcp: Unconditionally force ACK for all !SYN, !RST
+packets").
+
+Reported-by: Dirk Janssen <Dirk.Janssen@schiphol.nl>
+Reported-by: Roeland van de Pol <Roeland.van.de.Pol@schiphol.nl>
+Reported-by: Robert Floor <Robert.Floor@schiphol.nl>
+Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
+(cherry picked from commit db2c91ae86c7c0d1d068714db2342b9057506148)
+---
+ tcp.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tcp.c b/tcp.c
+index c0820ce..4e3148e 100644
+--- a/tcp.c
++++ b/tcp.c
+@@ -1199,7 +1199,7 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
+ 		*data++ = OPT_WS;
+ 		*data++ = OPT_WS_LEN;
+ 		*data++ = conn->ws_to_tap;
+-	} else if (!(flags & RST)) {
++	} else {
+ 		flags |= ACK;
+ 	}
+ 
+-- 
+2.47.1
+
diff --git a/SPECS/passt.spec b/SPECS/passt.spec
index b6b9eb8..b3147e2 100644
--- a/SPECS/passt.spec
+++ b/SPECS/passt.spec
@@ -7,19 +7,24 @@
 # Copyright (c) 2022 Red Hat GmbH
 # Author: Stefano Brivio <sbrivio@redhat.com>
 
-%global git_hash b86afe3559c0bd3d24bc6fed7c60466cf141224c
+%global git_hash ee36266a55478672ad2c5f4efbd6ca0bef3d37cd
 %global selinuxtype targeted
 
 Name:		passt
-Version:	0^20231204.gb86afe3
-Release:	1%{?dist}
+Version:	0^20240806.gee36266
+Release:	6%{?dist}
 Summary:	User-mode networking daemons for virtual machines and namespaces
-License:	GPLv2+ and BSD
+License:	GPL-2.0-or-later AND BSD-3-Clause
 Group:		System Environment/Daemons
 URL:		https://passt.top/
 Source:		https://passt.top/passt/snapshot/passt-%{git_hash}.tar.xz
 
 Patch1:		0001-selinux-Drop-user_namespace-create-allow-rules.patch
+Patch2:		0002-flow-Don-t-crash-if-guest-attempts-to-connect-to-por.patch
+Patch3:		0003-tcp-Acknowledge-keep-alive-segments-ignore-them-for-.patch
+Patch4:		0004-tcp_splice-Set-again-TCP_NODELAY-on-both-sides.patch
+Patch5:		0005-flow-Fix-incorrect-hash-probe-in-flowside_lookup.patch
+Patch6:		0006-tcp-Set-ACK-flag-on-all-RST-segments-even-for-client.patch
 
 BuildRequires:	gcc, make, git, checkpolicy, selinux-policy-devel
 Requires:	(%{name}-selinux = %{version}-%{release} if selinux-policy-%{selinuxtype})
@@ -126,6 +131,33 @@ fi
 %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
 
 %changelog
+* Tue Jan 21 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20240806-gee36266-6
+- Resolves: RHEL-75645
+
+* Thu Jan 16 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20240806-gee36266-5
+- Resolves: RHEL-74301
+
+* Fri Jan 10 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20240806-gee36266-4
+- Resolves: RHEL-73251
+
+* Tue Nov 26 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240806-gee36266-3
+- Resolves: RHEL-68948
+
+* Wed Aug 14 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240806-gee36266-2
+- Resolves: RHEL-54268
+
+* Wed Aug  7 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240806.gee36266-1
+- Resolves: RHEL-53189
+
+* Fri Aug  2 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240726.g57a21d2-1
+- Resolves: RHEL-52638
+
+* Mon Jun 24 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240624.g1ee2eca-1
+- Resolves: RHEL-44837
+
+* Wed May 22 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240510.g7288448-1
+- Resolves: RHEL-37647
+
 * Fri Dec 15 2023 Stefano Brivio <sbrivio@redhat.com> - 0^20231204.gb86afe3-1
 - Resolves: RHEL-19590