import UBI passt-0^20240806.gee36266-6.el9_5
This commit is contained in:
parent
70ab0c67fc
commit
0ce8a50f98
2
.gitignore
vendored
2
.gitignore
vendored
@ -1 +1 @@
|
||||
SOURCES/passt-b86afe3559c0bd3d24bc6fed7c60466cf141224c.tar.xz
|
||||
SOURCES/passt-ee36266a55478672ad2c5f4efbd6ca0bef3d37cd.tar.xz
|
||||
|
@ -1 +1 @@
|
||||
833dc4cee84bf49eb54354f5da5ae07748ce2969 SOURCES/passt-b86afe3559c0bd3d24bc6fed7c60466cf141224c.tar.xz
|
||||
421a821e193faa31035a158c913c70d7fc13bf68 SOURCES/passt-ee36266a55478672ad2c5f4efbd6ca0bef3d37cd.tar.xz
|
||||
|
@ -0,0 +1,64 @@
|
||||
From 002b2a23380d4df552bac7665d462ac4c7bced0b Mon Sep 17 00:00:00 2001
|
||||
From: David Gibson <david@gibson.dropbear.id.au>
|
||||
Date: Wed, 14 Aug 2024 20:03:33 +1000
|
||||
Subject: [PATCH] flow: Don't crash if guest attempts to connect to port 0
|
||||
|
||||
Using a zero port on TCP or UDP is dubious, and we can't really deal with
|
||||
forwarding such a flow within the constraints of the socket API. Hence
|
||||
we ASSERT()ed that we had non-zero ports in flow_hash().
|
||||
|
||||
The intention was to make sure that the protocol code sanitizes such ports
|
||||
before completing a flow entry. Unfortunately, flow_hash() is also called
|
||||
on new packets to see if they have an existing flow, so the unsanitized
|
||||
guest packet can crash passt with the assert.
|
||||
|
||||
Correct this by moving the assert from flow_hash() to flow_sidx_hash()
|
||||
which is only used on entries already in the table, not on unsanitized
|
||||
data.
|
||||
|
||||
Reported-by: Matt Hamilton <matt@thmail.io>
|
||||
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
---
|
||||
flow.c | 18 ++++++++++--------
|
||||
1 file changed, 10 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/flow.c b/flow.c
|
||||
index 687e9fd..93b687d 100644
|
||||
--- a/flow.c
|
||||
+++ b/flow.c
|
||||
@@ -561,12 +561,6 @@ static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||
{
|
||||
struct siphash_state state = SIPHASH_INIT(c->hash_secret);
|
||||
|
||||
- /* For the hash table to work, we need complete endpoint information,
|
||||
- * and at least a forwarding port.
|
||||
- */
|
||||
- ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
|
||||
- side->eport != 0 && side->fport != 0);
|
||||
-
|
||||
inany_siphash_feed(&state, &side->faddr);
|
||||
inany_siphash_feed(&state, &side->eaddr);
|
||||
|
||||
@@ -586,8 +580,16 @@ static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
|
||||
static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
|
||||
{
|
||||
const struct flow_common *f = &flow_at_sidx(sidx)->f;
|
||||
- return flow_hash(c, FLOW_PROTO(f),
|
||||
- f->pif[sidx.sidei], &f->side[sidx.sidei]);
|
||||
+ const struct flowside *side = &f->side[sidx.sidei];
|
||||
+ uint8_t pif = f->pif[sidx.sidei];
|
||||
+
|
||||
+ /* For the hash table to work, entries must have complete endpoint
|
||||
+ * information, and at least a forwarding port.
|
||||
+ */
|
||||
+ ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
|
||||
+ side->eport != 0 && side->fport != 0);
|
||||
+
|
||||
+ return flow_hash(c, FLOW_PROTO(f), pif, side);
|
||||
}
|
||||
|
||||
/**
|
||||
--
|
||||
2.43.0
|
||||
|
@ -0,0 +1,69 @@
|
||||
From 238c69f9af458e41dea5ad8c988dbf65b05b5172 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Tue, 19 Nov 2024 20:53:44 +0100
|
||||
Subject: [PATCH] tcp: Acknowledge keep-alive segments, ignore them for the
|
||||
rest
|
||||
|
||||
RFC 9293, 3.8.4 says:
|
||||
|
||||
Implementers MAY include "keep-alives" in their TCP implementations
|
||||
(MAY-5), although this practice is not universally accepted. Some
|
||||
TCP implementations, however, have included a keep-alive mechanism.
|
||||
To confirm that an idle connection is still active, these
|
||||
implementations send a probe segment designed to elicit a response
|
||||
from the TCP peer. Such a segment generally contains SEG.SEQ =
|
||||
SND.NXT-1 and may or may not contain one garbage octet of data. If
|
||||
keep-alives are included, the application MUST be able to turn them
|
||||
on or off for each TCP connection (MUST-24), and they MUST default to
|
||||
off (MUST-25).
|
||||
|
||||
but currently, tcp_data_from_tap() is not aware of this and will
|
||||
schedule a fast re-transmit on the second keep-alive (because it's
|
||||
also a duplicate ACK), ignoring the fact that the sequence number was
|
||||
rewinded to SND.NXT-1.
|
||||
|
||||
ACK these keep-alive segments, reset the activity timeout, and ignore
|
||||
them for the rest.
|
||||
|
||||
At some point, we could think of implementing an approximation of
|
||||
keep-alive segments on outbound sockets, for example by setting
|
||||
TCP_KEEPIDLE to 1, and a large TCP_KEEPINTVL, so that we send a single
|
||||
keep-alive segment at approximately the same time, and never reset the
|
||||
connection. That's beyond the scope of this fix, though.
|
||||
|
||||
Reported-by: Tim Besard <tim.besard@gmail.com>
|
||||
Link: https://github.com/containers/podman/discussions/24572
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
---
|
||||
tcp.c | 14 ++++++++++++++
|
||||
1 file changed, 14 insertions(+)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index f357920..1eb85bb 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1763,6 +1763,20 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
continue;
|
||||
|
||||
seq = ntohl(th->seq);
|
||||
+ if (SEQ_LT(seq, conn->seq_from_tap) && len <= 1) {
|
||||
+ flow_trace(conn,
|
||||
+ "keep-alive sequence: %u, previous: %u",
|
||||
+ seq, conn->seq_from_tap);
|
||||
+
|
||||
+ tcp_send_flag(c, conn, ACK);
|
||||
+ tcp_timer_ctl(c, conn);
|
||||
+
|
||||
+ if (p->count == 1)
|
||||
+ return 1;
|
||||
+
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
ack_seq = ntohl(th->ack_seq);
|
||||
|
||||
if (th->ack) {
|
||||
--
|
||||
2.43.5
|
||||
|
@ -0,0 +1,93 @@
|
||||
From 725acd111ba340122f2bb0601e373534eb4b5ed8 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Mon, 6 Jan 2025 10:10:29 +0100
|
||||
Subject: [PATCH] tcp_splice: Set (again) TCP_NODELAY on both sides
|
||||
|
||||
In commit 7ecf69329787 ("pasta, tcp: Don't set TCP_CORK on spliced
|
||||
sockets") I just assumed that we wouldn't benefit from disabling
|
||||
Nagle's algorithm once we drop TCP_CORK (and its 200ms fixed delay).
|
||||
|
||||
It turns out that with some patterns, such as a PostgreSQL server
|
||||
in a container receiving parameterised, short queries, for which pasta
|
||||
sees several short inbound messages (Parse, Bind, Describe, Execute
|
||||
and Sync commands getting each one their own packet, 5 to 49 bytes TCP
|
||||
payload each), we'll read them usually in two batches, and send them
|
||||
in matching batches, for example:
|
||||
|
||||
9165.2467: pasta: epoll event on connected spliced TCP socket 117 (events: 0x00000001)
|
||||
9165.2468: Flow 0 (TCP connection (spliced)): 76 from read-side call
|
||||
9165.2468: Flow 0 (TCP connection (spliced)): 76 from write-side call (passed 524288)
|
||||
9165.2469: pasta: epoll event on connected spliced TCP socket 117 (events: 0x00000001)
|
||||
9165.2470: Flow 0 (TCP connection (spliced)): 15 from read-side call
|
||||
9165.2470: Flow 0 (TCP connection (spliced)): 15 from write-side call (passed 524288)
|
||||
9165.2944: pasta: epoll event on connected spliced TCP socket 118 (events: 0x00000001)
|
||||
|
||||
and the kernel delivers the first one, waits for acknowledgement from
|
||||
the receiver, then delivers the second one. This adds very substantial
|
||||
and unnecessary delay. It's usually a fixed ~40ms between the two
|
||||
batches, which is clearly unacceptable for loopback connections.
|
||||
|
||||
In this example, the delay is shown by the timestamp of the response
|
||||
from socket 118. The peer (server) doesn't actually take that long
|
||||
(less than a millisecond), but it takes that long for the kernel to
|
||||
deliver our request.
|
||||
|
||||
To avoid batching and delays, disable Nagle's algorithm by setting
|
||||
TCP_NODELAY on both internal and external sockets: this way, we get
|
||||
one inbound packet for each original message, we transfer them right
|
||||
away, and the kernel delivers them to the process in the container as
|
||||
they are, without delay.
|
||||
|
||||
We can do this safely as we don't care much about network utilisation
|
||||
when there's in fact pretty much no network (loopback connections).
|
||||
|
||||
This is unfortunately not visible in the TCP request-response tests
|
||||
from the test suite because, with smaller messages (we use one byte),
|
||||
Nagle's algorithm doesn't even kick in. It's probably not trivial to
|
||||
implement a universal test covering this case.
|
||||
|
||||
Fixes: 7ecf69329787 ("pasta, tcp: Don't set TCP_CORK on spliced sockets")
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
---
|
||||
tcp_splice.c | 14 ++++++++++++--
|
||||
1 file changed, 12 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/tcp_splice.c b/tcp_splice.c
|
||||
index 3a0f868..3a000ff 100644
|
||||
--- a/tcp_splice.c
|
||||
+++ b/tcp_splice.c
|
||||
@@ -348,6 +348,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
|
||||
uint8_t tgtpif = conn->f.pif[TGTSIDE];
|
||||
union sockaddr_inany sa;
|
||||
socklen_t sl;
|
||||
+ int one = 1;
|
||||
|
||||
if (tgtpif == PIF_HOST)
|
||||
conn->s[1] = tcp_conn_sock(c, af);
|
||||
@@ -359,12 +360,21 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
|
||||
if (conn->s[1] < 0)
|
||||
return -1;
|
||||
|
||||
- if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK,
|
||||
- &((int){ 1 }), sizeof(int))) {
|
||||
+ if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, &one, sizeof(one))) {
|
||||
flow_trace(conn, "failed to set TCP_QUICKACK on socket %i",
|
||||
conn->s[1]);
|
||||
}
|
||||
|
||||
+ if (setsockopt(conn->s[0], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
|
||||
+ flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
|
||||
+ conn->s[0]);
|
||||
+ }
|
||||
+
|
||||
+ if (setsockopt(conn->s[1], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
|
||||
+ flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
|
||||
+ conn->s[1]);
|
||||
+ }
|
||||
+
|
||||
pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport);
|
||||
|
||||
if (connect(conn->s[1], &sa.sa, sl)) {
|
||||
--
|
||||
2.47.1
|
||||
|
@ -0,0 +1,43 @@
|
||||
From 7ad9f9bd2bbda8d705e0c6faf5acf2792fce063c Mon Sep 17 00:00:00 2001
|
||||
From: David Gibson <david@gibson.dropbear.id.au>
|
||||
Date: Fri, 6 Sep 2024 15:17:05 +1000
|
||||
Subject: [PATCH] flow: Fix incorrect hash probe in flowside_lookup()
|
||||
|
||||
Our flow hash table uses linear probing in which we step backwards through
|
||||
clusters of adjacent hash entries when we have near collisions. Usually
|
||||
that's implemented by flow_hash_probe(). However, due to some details we
|
||||
need a second implementation in flowside_lookup(). An embarrassing
|
||||
oversight in rebasing from earlier versions has mean that version is
|
||||
incorrect, trying to step forward through clusters rather than backward.
|
||||
|
||||
In situations with the right sorts of has near-collisions this can lead to
|
||||
us not associating an ACK from the tap device with the right flow, leaving
|
||||
it in a not-quite-established state. If the remote peer does a shutdown()
|
||||
at the right time, this can lead to a storm of EPOLLRDHUP events causing
|
||||
high CPU load.
|
||||
|
||||
Fixes: acca4235c46f ("flow, tcp: Generalise TCP hash table to general flow hash table")
|
||||
Link: https://bugs.passt.top/show_bug.cgi?id=94
|
||||
Suggested-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
---
|
||||
flow.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/flow.c b/flow.c
|
||||
index 02631eb..a00e01d 100644
|
||||
--- a/flow.c
|
||||
+++ b/flow.c
|
||||
@@ -697,7 +697,7 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
|
||||
!(FLOW_PROTO(&flow->f) == proto &&
|
||||
flow->f.pif[sidx.sidei] == pif &&
|
||||
flowside_eq(&flow->f.side[sidx.sidei], side)))
|
||||
- b = (b + 1) % FLOW_HASH_SIZE;
|
||||
+ b = mod_sub(b, 1, FLOW_HASH_SIZE);
|
||||
|
||||
return flow_hashtab[b];
|
||||
}
|
||||
--
|
||||
2.47.1
|
||||
|
@ -0,0 +1,78 @@
|
||||
From b10ddf22581ea470a57a0c2e4e8a5687bede0f53 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Mon, 20 Jan 2025 18:36:30 +0100
|
||||
Subject: [PATCH] tcp: Set ACK flag on *all* RST segments, even for client in
|
||||
SYN-SENT state
|
||||
|
||||
Somewhat curiously, RFC 9293, section 3.10.7.3, states:
|
||||
|
||||
If the state is SYN-SENT, then
|
||||
[...]
|
||||
|
||||
Second, check the RST bit:
|
||||
- If the RST bit is set,
|
||||
[...]
|
||||
|
||||
o If the ACK was acceptable, then signal to the user "error:
|
||||
connection reset", drop the segment, enter CLOSED state,
|
||||
delete TCB, and return. Otherwise (no ACK), drop the
|
||||
segment and return.
|
||||
|
||||
which matches verbatim RFC 793, pages 66-67, and is implemented as-is
|
||||
by tcp_rcv_synsent_state_process() in the Linux kernel, that is:
|
||||
|
||||
/* No ACK in the segment */
|
||||
|
||||
if (th->rst) {
|
||||
/* rfc793:
|
||||
* "If the RST bit is set
|
||||
*
|
||||
* Otherwise (no ACK) drop the segment and return."
|
||||
*/
|
||||
|
||||
goto discard_and_undo;
|
||||
}
|
||||
|
||||
meaning that if a client is in SYN-SENT state, and we send a RST
|
||||
segment once we realise that we can't establish the outbound
|
||||
connection, the client will ignore our segment and will need to
|
||||
pointlessly wait until the connection times out instead of aborting
|
||||
it right away.
|
||||
|
||||
The ACK flag on a RST, in this case, doesn't really seem to have any
|
||||
function, but we must set it nevertheless. The ACK sequence number is
|
||||
already correct because we always set it before calling
|
||||
tcp_prepare_flags(), whenever relevant.
|
||||
|
||||
This leaves us with no cases where we should *not* set the ACK flag
|
||||
on non-SYN segments, so always set the ACK flag for RST segments.
|
||||
|
||||
Note that non-SYN, non-RST segments were already covered by commit
|
||||
4988e2b40631 ("tcp: Unconditionally force ACK for all !SYN, !RST
|
||||
packets").
|
||||
|
||||
Reported-by: Dirk Janssen <Dirk.Janssen@schiphol.nl>
|
||||
Reported-by: Roeland van de Pol <Roeland.van.de.Pol@schiphol.nl>
|
||||
Reported-by: Robert Floor <Robert.Floor@schiphol.nl>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit db2c91ae86c7c0d1d068714db2342b9057506148)
|
||||
---
|
||||
tcp.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index c0820ce..4e3148e 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1199,7 +1199,7 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
|
||||
*data++ = OPT_WS;
|
||||
*data++ = OPT_WS_LEN;
|
||||
*data++ = conn->ws_to_tap;
|
||||
- } else if (!(flags & RST)) {
|
||||
+ } else {
|
||||
flags |= ACK;
|
||||
}
|
||||
|
||||
--
|
||||
2.47.1
|
||||
|
@ -7,19 +7,24 @@
|
||||
# Copyright (c) 2022 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
%global git_hash b86afe3559c0bd3d24bc6fed7c60466cf141224c
|
||||
%global git_hash ee36266a55478672ad2c5f4efbd6ca0bef3d37cd
|
||||
%global selinuxtype targeted
|
||||
|
||||
Name: passt
|
||||
Version: 0^20231204.gb86afe3
|
||||
Release: 1%{?dist}
|
||||
Version: 0^20240806.gee36266
|
||||
Release: 6%{?dist}
|
||||
Summary: User-mode networking daemons for virtual machines and namespaces
|
||||
License: GPLv2+ and BSD
|
||||
License: GPL-2.0-or-later AND BSD-3-Clause
|
||||
Group: System Environment/Daemons
|
||||
URL: https://passt.top/
|
||||
Source: https://passt.top/passt/snapshot/passt-%{git_hash}.tar.xz
|
||||
|
||||
Patch1: 0001-selinux-Drop-user_namespace-create-allow-rules.patch
|
||||
Patch2: 0002-flow-Don-t-crash-if-guest-attempts-to-connect-to-por.patch
|
||||
Patch3: 0003-tcp-Acknowledge-keep-alive-segments-ignore-them-for-.patch
|
||||
Patch4: 0004-tcp_splice-Set-again-TCP_NODELAY-on-both-sides.patch
|
||||
Patch5: 0005-flow-Fix-incorrect-hash-probe-in-flowside_lookup.patch
|
||||
Patch6: 0006-tcp-Set-ACK-flag-on-all-RST-segments-even-for-client.patch
|
||||
|
||||
BuildRequires: gcc, make, git, checkpolicy, selinux-policy-devel
|
||||
Requires: (%{name}-selinux = %{version}-%{release} if selinux-policy-%{selinuxtype})
|
||||
@ -126,6 +131,33 @@ fi
|
||||
%{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
|
||||
|
||||
%changelog
|
||||
* Tue Jan 21 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20240806-gee36266-6
|
||||
- Resolves: RHEL-75645
|
||||
|
||||
* Thu Jan 16 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20240806-gee36266-5
|
||||
- Resolves: RHEL-74301
|
||||
|
||||
* Fri Jan 10 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20240806-gee36266-4
|
||||
- Resolves: RHEL-73251
|
||||
|
||||
* Tue Nov 26 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240806-gee36266-3
|
||||
- Resolves: RHEL-68948
|
||||
|
||||
* Wed Aug 14 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240806-gee36266-2
|
||||
- Resolves: RHEL-54268
|
||||
|
||||
* Wed Aug 7 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240806.gee36266-1
|
||||
- Resolves: RHEL-53189
|
||||
|
||||
* Fri Aug 2 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240726.g57a21d2-1
|
||||
- Resolves: RHEL-52638
|
||||
|
||||
* Mon Jun 24 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240624.g1ee2eca-1
|
||||
- Resolves: RHEL-44837
|
||||
|
||||
* Wed May 22 2024 Stefano Brivio <sbrivio@redhat.com> - 0^20240510.g7288448-1
|
||||
- Resolves: RHEL-37647
|
||||
|
||||
* Fri Dec 15 2023 Stefano Brivio <sbrivio@redhat.com> - 0^20231204.gb86afe3-1
|
||||
- Resolves: RHEL-19590
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user