import UBI passt-0^20251210.gd04c480-4.el9_8

This commit is contained in:
AlmaLinux RelEng Bot 2026-05-19 20:35:01 -04:00
parent 5362eb6d20
commit b396701ee3
24 changed files with 2155 additions and 422 deletions

2
.gitignore vendored
View File

@ -1 +1 @@
SOURCES/passt-8ec134109eb136432a29bdf5a14f8b1fd4e46208.tar.xz
SOURCES/passt-d04c48032bcf724550d0b8f652fd00efcd2dfad0.tar.xz

View File

@ -1 +1 @@
7b91876dcd65569ddf775b2da567345500ec8862 SOURCES/passt-8ec134109eb136432a29bdf5a14f8b1fd4e46208.tar.xz
ec2fcde158b88b1ed9786565025380d03aa32d56 SOURCES/passt-d04c48032bcf724550d0b8f652fd00efcd2dfad0.tar.xz

View File

@ -1,6 +1,6 @@
From 6977619743bbc602a865f79562b59a80921d6063 Mon Sep 17 00:00:00 2001
From 7087adfbab35354f9def7edee87385b82416c722 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 21 Aug 2023 17:52:28 +0200
Date: Mon, 8 Dec 2025 22:32:50 -0500
Subject: [PATCH] selinux: Drop user_namespace create allow rules
Those are incompatible with current el9 kernels. I introduced them
@ -24,10 +24,10 @@ Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
2 files changed, 2 deletions(-)
diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index c6cea34..131fadc 100644
index 6995df8..76d23e8 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -92,7 +92,6 @@ allow syslogd_t self:cap_userns sys_ptrace;
@@ -105,7 +105,6 @@ allow syslogd_t self:cap_userns sys_ptrace;
allow passt_t self:process setcap;
allow passt_t self:capability { sys_tty_config setpcap net_bind_service setuid setgid};
allow passt_t self:cap_userns { setpcap sys_admin sys_ptrace };
@ -36,16 +36,17 @@ index c6cea34..131fadc 100644
auth_read_passwd(passt_t)
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
index 69be081..892edae 100644
index 95fe42a..7e1e821 100644
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@@ -110,7 +110,6 @@ init_daemon_domain(pasta_t, pasta_exec_t)
allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read_search net_admin sys_resource setuid setgid };
allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service };
@@ -126,7 +126,6 @@ allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_
# pasta only calls setuid and setgid with the current UID and GID, so this
# denial is harmless. See https://bugzilla.redhat.com/show_bug.cgi?id=2330512#c10
dontaudit pasta_t self:cap_userns { setgid setuid };
-allow pasta_t self:user_namespace create;
auth_read_passwd(pasta_t)
--
2.39.2
2.47.1

View File

@ -0,0 +1,41 @@
From 2244df26b2cb63acb51a20485e1ca7ad0649b152 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Mon, 22 Dec 2025 21:48:32 -0500
Subject: [PATCH] selinux: Use systemd_logind_exec_t instead of
systemd_user_runtimedir_exec_t
On CentOS Stream 9, selinux-policy doesn't contain commit
700b3622d575 ("Confine /usr/lib/systemd/systemd-user-runtime-dir"),
so the file context of /usr/lib/systemd/systemd-user-runtime-dir is
still systemd_logind_exec_t there.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
contrib/selinux/pasta.te | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
index 7e1e821..d29d6c4 100644
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@@ -98,7 +98,7 @@ require {
type container_runtime_t;
type container_var_run_t;
type container_t;
- type systemd_user_runtimedir_t;
+ type systemd_logind_exec_t;
}
type pasta_t;
@@ -250,7 +250,7 @@ type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootles
type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "rootless-netns";
allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write };
allow pasta_t ifconfig_var_run_t:file { create open write };
-allow systemd_user_runtimedir_t ifconfig_var_run_t:dir rmdir;
+allow systemd_logind_exec_t ifconfig_var_run_t:dir rmdir;
# Allow pasta to bind to any port
bool pasta_bind_all_ports true;
--
2.47.1

View File

@ -1,264 +0,0 @@
From b0b5ce0a76cf7fec0b00405732fd94e0b34e8d84 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 17 Jul 2025 10:38:17 +0200
Subject: [PATCH] treewide: By default, don't quit source after migration, keep
sockets open
We are hitting an issue in the KubeVirt integration where some data is
still sent to the source instance even after migration is complete. As
we exit, the kernel closes our sockets and resets connections. The
resulting RST segments are sent to peers, effectively terminating
connections that were meanwhile migrated.
At the moment, this is not done intentionally, but in the future
KubeVirt might enable OVN-Kubernetes features where source and
destination nodes are explicitly getting mirrored traffic for a while,
in order to decrease migration downtime.
By default, don't quit after migration is completed on the source: the
previous behaviour can be enabled with the new, but deprecated,
--migrate-exit option. After migration (as source), the -1 / --one-off
option has no effect.
Also, by default, keep migrated TCP sockets open (in repair mode) as
long as we're running, and ignore events on any epoll descriptor
representing data channels. The previous behaviour can be enabled with
the new, equally deprecated, --migrate-no-linger option.
By keeping sockets open, and not exiting, we prevent the kernel
running on the source node from sending out RST segments if further
data reaches us.
Reported-by: Nir Dothan <ndothan@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit a8782865c342eb2682cca292d5bf92b567344351)
---
conf.c | 22 ++++++++++++++++++++++
flow.c | 2 +-
passt.1 | 29 +++++++++++++++++++++++++++++
passt.h | 4 ++++
tcp.c | 9 +++++++--
tcp_conn.h | 3 ++-
test/lib/setup | 4 ++--
vhost_user.c | 9 +++++++--
8 files changed, 74 insertions(+), 8 deletions(-)
diff --git a/conf.c b/conf.c
index a6d7e22..1295d89 100644
--- a/conf.c
+++ b/conf.c
@@ -864,6 +864,14 @@ static void usage(const char *name, FILE *f, int status)
FPRINTF(f,
" --repair-path PATH path for passt-repair(1)\n"
" default: append '.repair' to UNIX domain path\n");
+ FPRINTF(f,
+ " --migrate-exit DEPRECATED:\n"
+ " source quits after migration\n"
+ " default: source keeps running after migration\n");
+ FPRINTF(f,
+ " --migrate-no-linger DEPRECATED:\n"
+ " close sockets on migration\n"
+ " default: keep sockets open, ignore events\n");
}
FPRINTF(f,
@@ -1468,6 +1476,8 @@ void conf(struct ctx *c, int argc, char **argv)
{"socket-path", required_argument, NULL, 's' },
{"fqdn", required_argument, NULL, 27 },
{"repair-path", required_argument, NULL, 28 },
+ {"migrate-exit", no_argument, NULL, 29 },
+ {"migrate-no-linger", no_argument, NULL, 30 },
{ 0 },
};
const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
@@ -1683,6 +1693,18 @@ void conf(struct ctx *c, int argc, char **argv)
optarg))
die("Invalid passt-repair path: %s", optarg);
+ break;
+ case 29:
+ if (c->mode != MODE_VU)
+ die("--migrate-exit is for vhost-user mode only");
+ c->migrate_exit = true;
+
+ break;
+ case 30:
+ if (c->mode != MODE_VU)
+ die("--migrate-no-linger is for vhost-user mode only");
+ c->migrate_no_linger = true;
+
break;
case 'd':
c->debug = 1;
diff --git a/flow.c b/flow.c
index 6a5c8aa..a4b65ea 100644
--- a/flow.c
+++ b/flow.c
@@ -1089,7 +1089,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
* as EIO).
*/
foreach_established_tcp_flow(flow) {
- rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
+ rc = tcp_flow_migrate_source_ext(c, fd, &flow->tcp);
if (rc) {
flow_err(flow, "Can't send extended data: %s",
strerror_(-rc));
diff --git a/passt.1 b/passt.1
index 60066c2..cef98b2 100644
--- a/passt.1
+++ b/passt.1
@@ -439,6 +439,30 @@ Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
chosen for the hypervisor UNIX domain socket. No socket is created if not in
\-\-vhost-user mode.
+.TP
+.BR \-\-migrate-exit (DEPRECATED)
+Exit after a completed migration as source. By default, \fBpasst\fR keeps
+running and the migrated guest can continue using its connection, or a new guest
+can connect.
+
+Note that this configuration option is \fBdeprecated\fR and will be removed in a
+future version. It is not expected to be of any use, and it simply reflects a
+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR
+below.
+
+.TP
+.BR \-\-migrate-no-linger (DEPRECATED)
+Close TCP sockets on the source instance once migration completes.
+
+By default, sockets are kept open, and events on data sockets are ignored, so
+that any further message reaching sockets after the source migrated is silently
+ignored, to avoid connection resets in case data is received after migration.
+
+Note that this configuration option is \fBdeprecated\fR and will be removed in a
+future version. It is not expected to be of any use, and it simply reflects a
+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR
+below.
+
.TP
.BR \-F ", " \-\-fd " " \fIFD
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
@@ -454,6 +478,11 @@ is closed.
Quit after handling a single client connection, that is, once the client closes
the socket, or once we get a socket error.
+\fBNote\fR: this option has no effect after \fBpasst\fR completes a migration as
+source, because, in that case, exiting would close sockets for active
+connections, which would in turn cause connection resets if any further data is
+received. See also the description of \fI\-\-migrate-no-linger\fR.
+
.TP
.BR \-t ", " \-\-tcp-ports " " \fIspec
Configure TCP port forwarding to guest. \fIspec\fR can be one of:
diff --git a/passt.h b/passt.h
index 8693794..4cfd6eb 100644
--- a/passt.h
+++ b/passt.h
@@ -241,6 +241,8 @@ struct ip6_ctx {
* @device_state_fd: Device state migration channel
* @device_state_result: Device state migration result
* @migrate_target: Are we the target, on the next migration request?
+ * @migrate_no_linger: Close sockets as we migrate them
+ * @migrate_exit: Exit (on source) once migration is complete
*/
struct ctx {
enum passt_modes mode;
@@ -318,6 +320,8 @@ struct ctx {
int device_state_fd;
int device_state_result;
bool migrate_target;
+ bool migrate_no_linger;
+ bool migrate_exit;
};
void proto_update_l2_buf(const unsigned char *eth_d,
diff --git a/tcp.c b/tcp.c
index 0ac298a..1b22f70 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3284,12 +3284,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
/**
* tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
+ * @c: Execution context
* @fd: Descriptor for state migration
* @conn: Pointer to the TCP connection structure
*
* Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
*/
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
+int tcp_flow_migrate_source_ext(const struct ctx *c,
+ int fd, const struct tcp_tap_conn *conn)
{
uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
@@ -3334,7 +3336,10 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
goto fail;
- close(s);
+ if (c->migrate_no_linger)
+ close(s);
+ else
+ epoll_del(c, s);
/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
* based on the end of the queues.
diff --git a/tcp_conn.h b/tcp_conn.h
index 35d813d..38b5c54 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -236,7 +236,8 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(const struct ctx *c, int fd,
+ const struct tcp_tap_conn *conn);
int tcp_flow_migrate_target(struct ctx *c, int fd);
int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
diff --git a/test/lib/setup b/test/lib/setup
index 575bc21..5994598 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -350,7 +350,7 @@ setup_migrate() {
sleep 1
- __opts="--vhost-user"
+ __opts="--vhost-user --migrate-exit --migrate-no-linger"
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
@@ -360,7 +360,7 @@ setup_migrate() {
context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
- __opts="--vhost-user"
+ __opts="--vhost-user --migrate-exit --migrate-no-linger"
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
diff --git a/vhost_user.c b/vhost_user.c
index 105f77a..c4d3a52 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -1208,7 +1208,12 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
vdev->context->device_state_result == 0 &&
!vdev->context->migrate_target) {
- info("Migration complete, exiting");
- _exit(EXIT_SUCCESS);
+ if (vdev->context->migrate_exit) {
+ info("Migration complete, exiting");
+ _exit(EXIT_SUCCESS);
+ }
+
+ info("Migration complete");
+ vdev->context->one_off = false;
}
}
--
2.47.1

View File

@ -1,48 +0,0 @@
From bd90a820852ff8966aeb83231c29e48849db3493 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 29 Aug 2025 22:11:31 +0200
Subject: [PATCH 3/4] tcp: Cast operands of sequence comparison macros to
uint32_t before using them
Otherwise, passing signed types causes automatic promotion of the
result of the subtractions as well, which is not what we want, as
these macros rely on unsigned 32-bit arithmetic.
The next patch introduces a ssize_t operand for SEQ_LE, illustrating
the issue.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Tested-by: Paul Holzinger <pholzing@redhat.com>
Reviewed-by: Jon Maloy <jmaloy@redhat.com>
(cherry picked from commit 660cd6907e14a41ad9bc77d317140c70ab416fce)
---
tcp_internal.h | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/tcp_internal.h b/tcp_internal.h
index 36c6533..c80ba40 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -18,10 +18,14 @@
sizeof(struct ipv6hdr), \
sizeof(uint32_t))
-#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
-#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
-#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
-#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
+#define SEQ_LE(a, b) \
+ ((uint32_t)(b) - (uint32_t)(a) < MAX_WINDOW)
+#define SEQ_LT(a, b) \
+ ((uint32_t)(b) - (uint32_t)(a) - 1 < MAX_WINDOW)
+#define SEQ_GE(a, b) \
+ ((uint32_t)(a) - (uint32_t)(b) < MAX_WINDOW)
+#define SEQ_GT(a, b) \
+ ((uint32_t)(a) - (uint32_t)(b) - 1 < MAX_WINDOW)
#define FIN (1 << 0)
#define SYN (1 << 1)
--
2.47.1

View File

@ -0,0 +1,110 @@
From b40f5cd8c8e16c6eceb1f26eb895527fda84068b Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Sat, 13 Dec 2025 14:19:13 +0100
Subject: [PATCH] tcp: Use less-than-MSS window on no queued data, or no data
sent recently
We limit the advertised window to guests and containers to the
available length of the sending buffer, and if it's less than the MSS,
since commit cf1925fb7b77 ("tcp: Don't limit window to less-than-MSS
values, use zero instead"), we approximate that limit to zero.
This way, we'll trigger a window update as soon as we realise that we
can advertise a larger value, just like we do in all other cases where
we advertise a zero-sized window.
By doing that, we don't wait for the peer to send us data before we
update the window. This matters because the guest or container might
be trying to aggregate more data and won't send us anything at all if
the advertised window is too small.
However, this might be problematic in two situations:
1. one, reported by Tyler, where the remote (receiving) peer
advertises a window that's smaller than what we usually get and
very close to the MSS, causing the kernel to give us a starting
size of the buffer that's less than the MSS we advertise to the
guest or container.
If this happens, we'll never advertise a non-zero window after
the handshake, and the container or guest will never send us any
data at all.
With a simple 'curl https://cloudflare.com/', we get, with default
TCP memory parameters, a 65535-byte window from the peer, and 46080
bytes of initial sending buffer from the kernel. But we advertised
a 65480-byte MSS, and we'll never actually receive the client
request.
This seems to be specific to Cloudflare for some reason, probably
deriving from a particular tuning of TCP parameters on their
servers.
2. another one, hypothesised by David, where the peer might only be
willing to process (and acknowledge) data in batches.
We might have queued outbound data which is, at the same time, not
enough to fill one of these batches and be acknowledged and removed
from the sending queue, but enough to make our available buffer
smaller than the MSS, and the connection will hang.
Take care of both cases by:
a. not approximating the sending buffer to zero if we have no outbound
queued data at all, because in that case we don't expect the
available buffer to increase if we don't send any data, so there's
no point in waiting for it to grow larger than the MSS.
This fixes problem 1. above.
b. also using the full sending buffer size if we haven't sent data to
the socket for a while (reported by tcpi_last_data_sent). This part
was already suggested by David in:
https://archives.passt.top/passt-dev/aTZzgtcKWLb28zrf@zatzit/
and I'm now picking ten times the RTT as a somewhat arbitrary
threshold.
This is meant to take care of potential problem 2. above, but it
also happens to fix 1.
Reported-by: Tyler Cloud <tcloud@redhat.com>
Link: https://bugs.passt.top/show_bug.cgi?id=183
Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
tcp.c | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/tcp.c b/tcp.c
index 81bc114..b179e39 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1211,8 +1211,21 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
* the MSS to zero, as we already have mechanisms in place to
* force updates after the window becomes zero. This matches the
* suggestion from RFC 813, Section 4.
+ *
+ * But don't do this if, either:
+ *
+ * - there's nothing in the outbound queue: the size of the
+ * sending buffer is limiting us, and it won't increase if we
+ * don't send data, so there's no point in waiting, or
+ *
+ * - we haven't sent data in a while (somewhat arbitrarily, ten
+ * times the RTT), as that might indicate that the receiver
+ * will only process data in batches that are large enough,
+ * but we won't send enough to fill one because we're stuck
+ * with pending data in the outbound queue
*/
- if (limit < MSS_GET(conn))
+ if (limit < MSS_GET(conn) && sendq &&
+ tinfo->tcpi_last_data_sent < tinfo->tcpi_rtt / 1000 * 10)
limit = 0;
new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit);
--
2.47.1

View File

@ -0,0 +1,90 @@
From 75dcbc300bf09c3649823b12d30c4f24de7271d4 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 23 Dec 2025 13:39:17 +0100
Subject: [PATCH] pasta: Warn, disable matching IP version if not supported, in
local mode
...instead of exiting, but only if local mode is enabled, that is, if
we couldn't find a template interface or if the user didn't specify
one.
With IPv4, we always try to set or copy an address, so check if that
fails.
With IPv6, in local mode, we rely on the link-local address that's
automatically generated inside the target namespace, and only fail
later, as we try to set up routes. Check if that fails, instead.
Otherwise, we'll fail to start if IPv6 support is not built in or
disabled by the kernel ("ipv6.disable=1" on the command line),
because, in that case, we'll try to enable local mode by default, and
then fail to set any address or route.
It would probably be more elegant to check for IP version support in
conf_ip4_local() and conf_ip6_local(), and not even try to enable
connectivity for unsupported versions, but it looks less robust than
trying and failing, as there might be other ways to disable a given
IP version.
Note that there's currently no way to disable IPv4 support on the
kernel command line, that is, there's no such thing as an
ipv4.disable boot parameter. But I guess that's due to be eventually
implemented, one day, so let's cover that case as well, also for
consistency.
Reported-by: Iyan <iyanmv@gmail.com>
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2424192
Fixes: 4ddd59bc6085 ("conf: Separate local mode for each IP version, don't enable disabled IP version")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
pasta.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/pasta.c b/pasta.c
index c307b8a..0ddd6b0 100644
--- a/pasta.c
+++ b/pasta.c
@@ -348,6 +348,12 @@ void pasta_ns_conf(struct ctx *c)
AF_INET);
}
+ if (c->ifi4 == -1 && rc == -ENOTSUP) {
+ warn("IPv4 not supported, disabling");
+ c->ifi4 = 0;
+ goto ipv4_done;
+ }
+
if (rc < 0) {
die("Couldn't set IPv4 address(es) in namespace: %s",
strerror_(-rc));
@@ -367,6 +373,7 @@ void pasta_ns_conf(struct ctx *c)
strerror_(-rc));
}
}
+ipv4_done:
if (c->ifi6) {
rc = nl_addr_get_ll(nl_sock_ns, c->pasta_ifi,
@@ -413,12 +420,19 @@ void pasta_ns_conf(struct ctx *c)
AF_INET6);
}
+ if (c->ifi6 == -1 && rc == -ENOTSUP) {
+ warn("IPv6 not supported, disabling");
+ c->ifi6 = 0;
+ goto ipv6_done;
+ }
+
if (rc < 0) {
die("Couldn't set IPv6 route(s) in guest: %s",
strerror_(-rc));
}
}
}
+ipv6_done:
proto_update_l2_buf(c->guest_mac);
}
--
2.47.1

View File

@ -1,76 +0,0 @@
From f9278aab878ef58cf8502ea8f904dbb40fbbb16a Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 2 Oct 2025 00:41:54 +0200
Subject: [PATCH 4/4] tcp: Don't consider FIN flags with mismatching sequence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
If a guest or container sends us a FIN segment but its sequence number
doesn't match the highest sequence of data we *accepted* (not
necessarily the highest sequence we received), that is,
conn->seq_from_tap, plus any data we're accepting in the current
batch, we should discard the flag (not necessarily the segment),
because there's still data we need to receive (again) before the end
of the stream.
If we consider those FIN flags as such, we'll end up in the
situation described below.
Here, 192.168.10.102 is a HTTP server in a Podman container, and
192.168.10.44 is a client fetching approximately 121 KB of data from
it:
82 2.026811 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [FIN, ACK] Seq=121441 Ack=143 Win=65536 Len=0
the server is done sending
83 2.026898 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [ACK] Seq=143 Ack=114394 Win=216192 Len=0
pasta (client) acknowledges a previous sequence, because of
a short sendmsg()
84 2.027324 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [FIN, ACK] Seq=143 Ack=114394 Win=216192 Len=0
pasta (client) sends FIN, ACK as the client has no more data to
send (a single GET request), while still acknowledging a previous
sequence, because the retransmission didn't happen yet
85 2.027349 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [ACK] Seq=121442 Ack=144 Win=65536 Len=0
the server acknowledges the FIN, ACK
86 2.224125 192.168.10.102 → 192.168.10.44 4150 TCP [TCP Retransmission] 55414 → 44992 [ACK] Seq=114394 Ack=144 Win=65536 Len=4096 [TCP segment of a reassembled PDU]
and finally a retransmission comes, but as we wrongly switched to
the CLOSE-WAIT state,
87 2.224202 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [RST] Seq=144 Win=0 Len=0
we consider frame #86 as an acknowledgement for the FIN segment we
sent, and close the connection, while we still had to re-receive
(and finally send) the missing data segment, instead.
Link: https://github.com/containers/podman/issues/27179
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit b145441913eef6f8885b6b84531e944ff593790c)
---
tcp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tcp.c b/tcp.c
index 0ac298a..4428305 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1696,7 +1696,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
}
}
- if (th->fin)
+ if (th->fin && seq == seq_from_tap)
fin = 1;
if (!len)
--
2.47.1

View File

@ -0,0 +1,58 @@
From d2c5133990a7758bfa567fc73216393498949e9b Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Tue, 23 Dec 2025 01:59:34 +0100
Subject: [PATCH] selinux: Enable read and watch permissions on netns directory
as well
With commit 7aeda16a7818 ("selinux: Transition to pasta_t in
containers"), we need to make sure that pasta can access the target
namespace directory passed by Podman, and, in a general case, we have
all the permissions we need.
But if we now start a container without the Podman changes referenced
by commit fd1bcc30af07 ("selinux: add container_var_run_t type
transition"), or with them, but with the container being created
before those and without a reboot in between, we'll additionally need
'read' and 'watch' permissions on user_tmp_t directory as well, as
user_tmp_t is still the (inconsistent) context of the namespace entry.
Otherwise, on a container start/restart, we'll get SELinux denials:
type=AVC msg=audit(1766451401.296:184): avc: denied { read } for pid=2159 comm="pasta.avx2" name="netns" dev="tmpfs" ino=60 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:object_r:user_tmp_t:s0 tclass=dir permissive=1
type=AVC msg=audit(1766451401.298:185): avc: denied { watch } for pid=2159 comm="pasta.avx2" path="/run/user/1001/netns" dev="tmpfs" ino=60 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:object_r:user_tmp_t:s0 tclass=dir permissive=1
This can be reproduced quite simply:
$ podman create -q --name hello hello
6c4eaf15a03edf799673a97d84d0331f3a3f34a11015b58c69318101a3232770
[upgrade passt's SELinux policy to a version including 7aeda16a7818]
$ podman start hello
Error: unable to start container "6c4eaf15a03edf799673a97d84d0331f3a3f34a11015b58c69318101a3232770": pasta failed with exit code 1:
netns dir open: Permission denied, exiting
Reported-by: Tuomo Soini <tis@foobar.fi>
Fixes: 7aeda16a7818 ("selinux: Transition to pasta_t in containers")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
contrib/selinux/pasta.te | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
index 95fe42a..3eb58f6 100644
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@@ -149,7 +149,7 @@ allow pasta_t root_t:dir mounton;
manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t)
files_pid_filetrans(pasta_t, pasta_pid_t, file)
-allow pasta_t user_tmp_t:dir { add_name remove_name search write };
+allow pasta_t user_tmp_t:dir { add_name read remove_name search watch write };
allow pasta_t user_tmp_t:fifo_file append;
allow pasta_t user_tmp_t:file { create open write };
allow pasta_t user_tmp_t:sock_file { create unlink };
--
2.47.1

View File

@ -0,0 +1,68 @@
From 6babaa8a88eb337e4b81aeff673fcebb28015f36 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 16 Jan 2026 16:48:46 +0100
Subject: [PATCH 6/7] selinux: Enable open permissions on netns directory,
operations on container_var_run_t
Tuomo reports two further SELinux denials after upgrading to a
passt-selinux version that includes the transition to pasta_t for
containers, one I could reproduce:
denied { open } for pid=3343050 comm="pasta.avx2" path="/run/user/1000/netns" dev="tmpfs" ino=51 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:object_r:user_tmp_t:s0 tclass=dir permissive=1
which I didn't take care of in the previous commit, d2c5133990a7
("selinux: Enable read and watch permissions on netns directory as
well"), as it didn't appear in my quick test. But I can make pasta use
"open" on the network namespace entry by simply using it to make
connections.
So, for that, add "open" to the existing rule for user_tmp_t:dir.
Then, another one I couldn't reproduce instead:
denied { write } for pid=3589324 comm="pasta.avx2" name="rootless-netns" dev="tmpfs" ino=36 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:object_r:container_var_run_t:s0 tclass=dir permissive=0
which, I think, comes from a specific combination of versions of
container-selinux, Podman, and passt-selinux packages, which
prevents the expected type transition on container_var_run_t unless
restorecon is invoked manually, or until a reboot.
Allowing the same permissions on container_var_run_t as we do on
ifconfig_var_run_t is harmless, so do that to prevent this further
denial.
Reported-by: Tuomo Soini <tis@foobar.fi>
Fixes: d2c5133990a7 ("selinux: Enable read and watch permissions on netns directory as well")
Fixes: 7aeda16a7818 ("selinux: Transition to pasta_t in containers")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit a6d92ca82c9ea0b395aa56c568ee6b6e6d4ac81e)
---
contrib/selinux/pasta.te | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
index 22daa77..abeafa4 100644
--- a/contrib/selinux/pasta.te
+++ b/contrib/selinux/pasta.te
@@ -148,7 +148,7 @@ allow pasta_t root_t:dir mounton;
manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t)
files_pid_filetrans(pasta_t, pasta_pid_t, file)
-allow pasta_t user_tmp_t:dir { add_name read remove_name search watch write };
+allow pasta_t user_tmp_t:dir { add_name open read remove_name search watch write };
allow pasta_t user_tmp_t:fifo_file append;
allow pasta_t user_tmp_t:file { create open write };
allow pasta_t user_tmp_t:sock_file { create unlink };
@@ -248,7 +248,9 @@ type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "netns";
type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "netns";
type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootless-netns";
type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "rootless-netns";
+allow pasta_t container_var_run_t:dir { add_name open rmdir write };
allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write };
+allow pasta_t container_var_run_t:file { create open write };
allow pasta_t ifconfig_var_run_t:file { create open write };
allow systemd_logind_exec_t ifconfig_var_run_t:dir rmdir;
--
2.47.1

View File

@ -0,0 +1,74 @@
From dbfbc33776290260b87bb29bb5572750f9709b35 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 9 Jan 2026 13:52:00 +0100
Subject: [PATCH 7/7] tcp: Fix rounding issue in check for approximating window
to zero
In general, we approximate the advertised window to zero if we would
otherwise advertise less than a MSS worth, and the reasoning behind
that is explained in cf1925fb7b77 ("tcp: Don't limit window to
less-than-MSS values, use zero instead").
Then, in commit b40f5cd8c8e1 ("tcp: Use less-than-MSS window on no
queued data, or no data sent recently"), I introduced some conditions
under which we won't do that, including a check on whether any data
was sent recently.
As an arbitrary but probably reasonable threshold, we consider data to
have recently been sent if that occurred less than ten times the
round-trip time (RTT) ago.
The time elapsed since the last data transmission is reported by the
kernel in milliseconds, in the tcpi_last_data_sent field of struct
tcp_info, and the RTT is reported in microseconds instead, in
tcpi_rtt.
To avoid the risk of overflow in a simple way, for the purpose of this
comparison, I converted tcpi_rtt to milliseconds first, but this means
that the check will always be false (and we'll never approximate the
window to zero) if the RTT is below one millisecond.
This, in turn, reintroduces nasty delay issues in transfers over
non-local connections that nevertheless have almost-local (low) latency.
Given that we want to use ten times the RTT as an arbitrary "long
enough" upper bound, round the RTT up while converting it to
milliseconds.
As an alternative, we could perform the comparison in microseconds,
but we would need a slightly more complicated implementation to
exclude overflows, and it's definitely not worth it given the nature
of this threshold.
Fixes: b40f5cd8c8e1 ("tcp: Use less-than-MSS window on no queued data, or no data sent recently")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
(cherry picked from commit 2be0e790804f99580b1c8a1781c49913440607f2)
---
tcp.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tcp.c b/tcp.c
index 23fcbc3..8f4f087 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1180,6 +1180,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
new_wnd_to_tap = tinfo->tcpi_snd_wnd;
} else {
+ unsigned rtt_ms_ceiling = DIV_ROUND_UP(tinfo->tcpi_rtt, 1000);
uint32_t sendq;
int limit;
@@ -1223,7 +1224,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
* with pending data in the outbound queue
*/
if (limit < MSS_GET(conn) && sendq &&
- tinfo->tcpi_last_data_sent < tinfo->tcpi_rtt / 1000 * 10)
+ tinfo->tcpi_last_data_sent < rtt_ms_ceiling * 10)
limit = 0;
new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit);
--
2.47.1

View File

@ -0,0 +1,48 @@
From 768e38c4ab9f7bb328897577368084faf9ee41df Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 9 Jan 2026 17:54:35 +0100
Subject: [PATCH 08/18] udp_flow: remove unneeded epoll_ref indirection
The fref union was used to convert flow_sidx_t to uint32_t for
assignment to ref.data. This is unnecessary since epoll_ref already
contains a flowside member of type flow_sidx_t, so we can assign
directly.
This aligns with how icmp.c and other callers assign flow_sidx_t to
epoll_ref.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit ab27852d0eebcd96d33c3699b44596a827b83bc6)
---
udp_flow.c | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/udp_flow.c b/udp_flow.c
index 8907f2f..0ba7880 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -74,10 +74,6 @@ static int udp_flow_sock(const struct ctx *c,
{
const struct flowside *side = &uflow->f.side[sidei];
uint8_t pif = uflow->f.pif[sidei];
- union {
- flow_sidx_t sidx;
- uint32_t data;
- } fref = { .sidx = FLOW_SIDX(uflow, sidei) };
union epoll_ref ref;
int rc;
int s;
@@ -89,7 +85,7 @@ static int udp_flow_sock(const struct ctx *c,
}
ref.type = EPOLL_TYPE_UDP;
- ref.data = fref.data;
+ ref.flowside = FLOW_SIDX(uflow, sidei);
ref.fd = s;
flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT);
--
2.47.1

View File

@ -0,0 +1,47 @@
From 059a31c28aa6e5053846ee931b97eb1344a9ce17 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 9 Jan 2026 17:54:36 +0100
Subject: [PATCH 09/18] udp_flow: Assign socket to flow inside udp_flow_sock()
Move the assignment of uflow->s[sidei] from the caller (udp_flow_new())
into udp_flow_sock() itself, placing it after the successful connect().
This is a pure refactoring with no functional change. The socket fd is
now assigned within udp_flow_sock() where the socket is created, rather
than requiring the caller to capture the return value. On error paths,
uflow->s[sidei] remains at its initialized value of -1 rather than being
set to the negative error code, which is semantically cleaner (though
functionally equivalent given the >= 0 check in udp_flow_close()).
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit e0fdfccc1c1a56c58a96d7fd6cc5d532cd780b6f)
---
udp_flow.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/udp_flow.c b/udp_flow.c
index 0ba7880..c4cf35c 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -105,6 +105,7 @@ static int udp_flow_sock(const struct ctx *c,
flow_dbg_perror(uflow, "Couldn't connect flow socket");
return rc;
}
+ uflow->s[sidei] = s;
/* It's possible, if unlikely, that we could receive some packets in
* between the bind() and connect() which may or may not be for this
@@ -159,7 +160,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
flow_foreach_sidei(sidei) {
if (pif_is_socket(uflow->f.pif[sidei]))
- if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0)
+ if (udp_flow_sock(c, uflow, sidei) < 0)
goto cancel;
}
--
2.47.1

View File

@ -0,0 +1,94 @@
From 766e42ea2c6f57547cfee4289ca27168149bb174 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 9 Jan 2026 17:54:37 +0100
Subject: [PATCH 10/18] tcp_splice: Refactor tcp_splice_conn_epoll_events() to
per-side computation
The function tcp_splice_conn_epoll_events() currently takes an array of
struct epoll_event and fills in the .events field for both sides using
flow_foreach_sidei() loops.
This works, but the function is doing two conceptually separate things
at once: computing events for side 0 and computing events for side 1.
The OUT_WAIT handling is particularly subtle, as it has cross-side
effects: when OUT_WAIT(sidei) is set, we add EPOLLOUT to ev[sidei] but
also remove EPOLLIN from ev[!sidei].
Refactor to make the function compute events for a single side at a
time, taking sidei as a parameter and returning uint32_t. This makes
the logic more focused and easier to follow. The cross-side effects of
OUT_WAIT are preserved by checking both OUT_WAIT(sidei) and
OUT_WAIT(!sidei) within each call.
The caller tcp_splice_epoll_ctl() now invokes the function twice, once
for each side, making the two-sided nature of the operation explicit.
No functional change.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit 23da651ab08e564b84c532f6f93b0817d2ae850f)
---
tcp_splice.c | 33 ++++++++++++++-------------------
1 file changed, 14 insertions(+), 19 deletions(-)
diff --git a/tcp_splice.c b/tcp_splice.c
index 4405224..bf4ff46 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -114,29 +114,23 @@ static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
* @events: Connection event flags
* @ev: Events to fill in, 0 is accepted socket, 1 is connecting socket
*/
-static void tcp_splice_conn_epoll_events(uint16_t events,
- struct epoll_event ev[])
+static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei)
{
- unsigned sidei;
-
- flow_foreach_sidei(sidei)
- ev[sidei].events = 0;
+ uint32_t e = 0;
if (events & SPLICE_ESTABLISHED) {
- flow_foreach_sidei(sidei) {
- if (!(events & FIN_SENT(!sidei)))
- ev[sidei].events = EPOLLIN | EPOLLRDHUP;
- }
- } else if (events & SPLICE_CONNECT) {
- ev[1].events = EPOLLOUT;
+ if (!(events & FIN_SENT(!sidei)))
+ e = EPOLLIN | EPOLLRDHUP;
+ } else if (sidei == 1 && events & SPLICE_CONNECT) {
+ e = EPOLLOUT;
}
- flow_foreach_sidei(sidei) {
- if (events & OUT_WAIT(sidei)) {
- ev[sidei].events |= EPOLLOUT;
- ev[!sidei].events &= ~EPOLLIN;
- }
- }
+ if (events & OUT_WAIT(sidei))
+ e |= EPOLLOUT;
+ if (events & OUT_WAIT(!sidei))
+ e &= ~EPOLLIN;
+
+ return e;
}
/**
@@ -161,7 +155,8 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 },
{ .data.u64 = ref[1].u64 } };
- tcp_splice_conn_epoll_events(conn->events, ev);
+ ev[0].events = tcp_splice_conn_epoll_events(conn->events, 0);
+ ev[1].events = tcp_splice_conn_epoll_events(conn->events, 1);
if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) ||
--
2.47.1

View File

@ -0,0 +1,489 @@
From 79dab11a029025e485faf4a3f5ea1ed4538fb64b Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Fri, 9 Jan 2026 17:54:38 +0100
Subject: [PATCH 11/18] flow: Introduce flow_epoll_set() to centralize epoll
operations
Currently, each flow type (TCP, TCP_SPLICE, PING, UDP) has its own
code to add or modify file descriptors in epoll. This leads to
duplicated boilerplate code across icmp.c, tcp.c, tcp_splice.c, and
udp_flow.c, each setting up epoll_ref unions and calling epoll_ctl()
with flow-type-specific details.
Introduce flow_epoll_set() in flow.c to handle epoll operations for
all flow types in a unified way.
This will be needed to migrate a queue pair from one epollfd to another.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit c0be730f2aa2243a132b3ee40c2bf05ebc84fedf)
---
flow.c | 37 ++++++++++++++++++++++++
flow.h | 2 ++
icmp.c | 10 ++-----
tcp.c | 48 ++++++++++++++++++------------
tcp_splice.c | 82 ++++++++++++++++++++++++----------------------------
udp_flow.c | 11 ++-----
6 files changed, 111 insertions(+), 79 deletions(-)
diff --git a/flow.c b/flow.c
index 4f53486..cefe6c8 100644
--- a/flow.c
+++ b/flow.c
@@ -20,6 +20,7 @@
#include "flow.h"
#include "flow_table.h"
#include "repair.h"
+#include "epoll_ctl.h"
const char *flow_state_str[] = {
[FLOW_STATE_FREE] = "FREE",
@@ -53,6 +54,16 @@ const uint8_t flow_proto[] = {
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
"flow_proto[] doesn't match enum flow_type");
+static const enum epoll_type flow_epoll[] = {
+ [FLOW_TCP] = EPOLL_TYPE_TCP,
+ [FLOW_TCP_SPLICE] = EPOLL_TYPE_TCP_SPLICE,
+ [FLOW_PING4] = EPOLL_TYPE_PING,
+ [FLOW_PING6] = EPOLL_TYPE_PING,
+ [FLOW_UDP] = EPOLL_TYPE_UDP,
+};
+static_assert(ARRAY_SIZE(flow_epoll) == FLOW_NUM_TYPES,
+ "flow_epoll[] doesn't match enum flow_type");
+
#define foreach_established_tcp_flow(flow) \
flow_foreach_of_type((flow), FLOW_TCP) \
if (!tcp_flow_is_established(&(flow)->tcp)) \
@@ -390,6 +401,32 @@ void flow_epollid_clear(struct flow_common *f)
f->epollid = EPOLLFD_ID_INVALID;
}
+/**
+ * flow_epoll_set() - Add or modify epoll registration for a flow socket
+ * @f: Flow to register socket for
+ * @command: epoll_ctl() command: EPOLL_CTL_ADD or EPOLL_CTL_MOD
+ * @events: epoll events to watch for
+ * @fd: File descriptor to register
+ * @sidei: Side index of the flow
+ *
+ * Return: 0 on success, -1 on error (from epoll_ctl())
+ */
+int flow_epoll_set(const struct flow_common *f, int command, uint32_t events,
+ int fd, unsigned int sidei)
+{
+ struct epoll_event ev;
+ union epoll_ref ref;
+
+ ref.fd = fd;
+ ref.type = flow_epoll[f->type];
+ ref.flowside = flow_sidx(f, sidei);
+
+ ev.events = events;
+ ev.data.u64 = ref.u64;
+
+ return epoll_ctl(flow_epollfd(f), command, fd, &ev);
+}
+
/**
* flow_epollid_register() - Initialize the epoll id -> fd mapping
* @epollid: epoll id to associate to
diff --git a/flow.h b/flow.h
index b43b0b1..1b78d59 100644
--- a/flow.h
+++ b/flow.h
@@ -265,6 +265,8 @@ bool flow_in_epoll(const struct flow_common *f);
int flow_epollfd(const struct flow_common *f);
void flow_epollid_set(struct flow_common *f, int epollid);
void flow_epollid_clear(struct flow_common *f);
+int flow_epoll_set(const struct flow_common *f, int command, uint32_t events,
+ int fd, unsigned int sidei);
void flow_epollid_register(int epollid, int epollfd);
void flow_defer_handler(const struct ctx *c, const struct timespec *now);
int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
diff --git a/icmp.c b/icmp.c
index 9564c49..eb7f11b 100644
--- a/icmp.c
+++ b/icmp.c
@@ -177,7 +177,6 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
union flow *flow = flow_alloc();
struct icmp_ping_flow *pingf;
const struct flowside *tgt;
- union epoll_ref ref;
if (!flow)
return NULL;
@@ -211,13 +210,10 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
goto cancel;
flow_epollid_set(&pingf->f, EPOLLFD_ID_DEFAULT);
-
- ref.type = EPOLL_TYPE_PING;
- ref.flowside = FLOW_SIDX(flow, TGTSIDE);
- ref.fd = pingf->sock;
-
- if (epoll_add(flow_epollfd(&pingf->f), EPOLLIN, ref) < 0) {
+ if (flow_epoll_set(&pingf->f, EPOLL_CTL_ADD, EPOLLIN, pingf->sock,
+ TGTSIDE) < 0) {
close(pingf->sock);
+ flow_epollid_clear(&pingf->f);
goto cancel;
}
diff --git a/tcp.c b/tcp.c
index 8f4f087..146d460 100644
--- a/tcp.c
+++ b/tcp.c
@@ -523,34 +523,44 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
/**
* tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
- * @c: Execution context
* @conn: Connection pointer
*
* Return: 0 on success, negative error code on failure (not on deletion)
*/
-static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_epoll_ctl(struct tcp_tap_conn *conn)
{
- int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
- union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
- .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
- struct epoll_event ev = { .data.u64 = ref.u64 };
- int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f)
- : c->epollfd;
+ uint32_t events;
+ int m;
if (conn->events == CLOSED) {
- if (flow_in_epoll(&conn->f))
+ if (flow_in_epoll(&conn->f)) {
+ int epollfd = flow_epollfd(&conn->f);
+
epoll_del(epollfd, conn->sock);
- if (conn->timer != -1)
- epoll_del(epollfd, conn->timer);
+ if (conn->timer != -1)
+ epoll_del(epollfd, conn->timer);
+ }
+
return 0;
}
- ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
+ events = tcp_conn_epoll_events(conn->events, conn->flags);
- if (epoll_ctl(epollfd, m, conn->sock, &ev))
- return -errno;
+ if (flow_in_epoll(&conn->f)) {
+ m = EPOLL_CTL_MOD;
+ } else {
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+ m = EPOLL_CTL_ADD;
+ }
- flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+ if (flow_epoll_set(&conn->f, m, events, conn->sock,
+ !TAPSIDE(conn)) < 0) {
+ int ret = -errno;
+
+ if (m == EPOLL_CTL_ADD)
+ flow_epollid_clear(&conn->f);
+ return ret;
+ }
if (conn->timer != -1) {
union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
@@ -681,7 +691,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
}
if (flag == STALLED || flag == ~STALLED)
- tcp_epoll_ctl(c, conn);
+ tcp_epoll_ctl(conn);
if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE ||
(flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
@@ -738,7 +748,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
} else {
if (event == CLOSED)
flow_hash_remove(c, TAP_SIDX(conn));
- tcp_epoll_ctl(c, conn);
+ tcp_epoll_ctl(conn);
}
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
@@ -1753,7 +1763,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
conn_event(c, conn, TAP_SYN_ACK_SENT);
}
- tcp_epoll_ctl(c, conn);
+ tcp_epoll_ctl(conn);
if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
socklen_t sl = sizeof(sa);
@@ -4021,7 +4031,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
tcp_send_flag(c, conn, ACK);
tcp_data_from_sock(c, conn);
- if ((rc = tcp_epoll_ctl(c, conn))) {
+ if ((rc = tcp_epoll_ctl(conn))) {
flow_dbg(conn,
"Failed to subscribe to epoll for migrated socket: %s",
strerror_(-rc));
diff --git a/tcp_splice.c b/tcp_splice.c
index bf4ff46..a7c04ca 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -135,37 +135,31 @@ static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei)
/**
* tcp_splice_epoll_ctl() - Add/modify/delete epoll state from connection events
- * @c: Execution context
* @conn: Connection pointer
*
* Return: 0 on success, negative error code on failure (not on deletion)
*/
-static int tcp_splice_epoll_ctl(const struct ctx *c,
- struct tcp_splice_conn *conn)
+static int tcp_splice_epoll_ctl(struct tcp_splice_conn *conn)
{
- int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f)
- : c->epollfd;
- int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
- const union epoll_ref ref[SIDES] = {
- { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0],
- .flowside = FLOW_SIDX(conn, 0) },
- { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[1],
- .flowside = FLOW_SIDX(conn, 1) }
- };
- struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 },
- { .data.u64 = ref[1].u64 } };
-
- ev[0].events = tcp_splice_conn_epoll_events(conn->events, 0);
- ev[1].events = tcp_splice_conn_epoll_events(conn->events, 1);
-
-
- if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) ||
- epoll_ctl(epollfd, m, conn->s[1], &ev[1])) {
+ uint32_t events[2];
+ int m;
+
+ if (flow_in_epoll(&conn->f)) {
+ m = EPOLL_CTL_MOD;
+ } else {
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+ m = EPOLL_CTL_ADD;
+ }
+
+ events[0] = tcp_splice_conn_epoll_events(conn->events, 0);
+ events[1] = tcp_splice_conn_epoll_events(conn->events, 1);
+
+ if (flow_epoll_set(&conn->f, m, events[0], conn->s[0], 0) ||
+ flow_epoll_set(&conn->f, m, events[1], conn->s[1], 1)) {
int ret = -errno;
flow_perror(conn, "ERROR on epoll_ctl()");
return ret;
}
- flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
return 0;
}
@@ -205,7 +199,7 @@ static void conn_flag_do(struct tcp_splice_conn *conn,
}
}
-#define conn_flag(c, conn, flag) \
+#define conn_flag(conn, flag) \
do { \
flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
conn_flag_do(conn, flag); \
@@ -213,12 +207,10 @@ static void conn_flag_do(struct tcp_splice_conn *conn,
/**
* conn_event_do() - Set and log connection events, update epoll state
- * @c: Execution context
* @conn: Connection pointer
* @event: Connection event
*/
-static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
- unsigned long event)
+static void conn_event_do(struct tcp_splice_conn *conn, unsigned long event)
{
if (event & (event - 1)) {
int flag_index = fls(~event);
@@ -240,14 +232,14 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
flow_dbg(conn, "%s", tcp_splice_event_str[flag_index]);
}
- if (tcp_splice_epoll_ctl(c, conn))
- conn_flag(c, conn, CLOSING);
+ if (tcp_splice_epoll_ctl(conn))
+ conn_flag(conn, CLOSING);
}
-#define conn_event(c, conn, event) \
+#define conn_event(conn, event) \
do { \
flow_trace(conn, "event at %s:%i",__func__, __LINE__); \
- conn_event_do(c, conn, event); \
+ conn_event_do(conn, event); \
} while (0)
@@ -315,7 +307,7 @@ static int tcp_splice_connect_finish(const struct ctx *c,
if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
flow_perror(conn, "cannot create %d->%d pipe",
sidei, !sidei);
- conn_flag(c, conn, CLOSING);
+ conn_flag(conn, CLOSING);
return -EIO;
}
@@ -329,7 +321,7 @@ static int tcp_splice_connect_finish(const struct ctx *c,
}
if (!(conn->events & SPLICE_ESTABLISHED))
- conn_event(c, conn, SPLICE_ESTABLISHED);
+ conn_event(conn, SPLICE_ESTABLISHED);
return 0;
}
@@ -376,7 +368,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
pif_sockaddr(c, &sa, tgtpif, &tgt->eaddr, tgt->eport);
- conn_event(c, conn, SPLICE_CONNECT);
+ conn_event(conn, SPLICE_CONNECT);
if (connect(conn->s[1], &sa.sa, socklen_inany(&sa))) {
if (errno != EINPROGRESS) {
@@ -385,7 +377,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
return -errno;
}
} else {
- conn_event(c, conn, SPLICE_ESTABLISHED);
+ conn_event(conn, SPLICE_ESTABLISHED);
return tcp_splice_connect_finish(c, conn);
}
@@ -445,7 +437,7 @@ void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0)
flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0);
if (tcp_splice_connect(c, conn))
- conn_flag(c, conn, CLOSING);
+ conn_flag(conn, CLOSING);
FLOW_ACTIVATE(conn);
}
@@ -494,14 +486,14 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
if (events & EPOLLOUT) {
fromsidei = !evsidei;
- conn_event(c, conn, ~OUT_WAIT(evsidei));
+ conn_event(conn, ~OUT_WAIT(evsidei));
} else {
fromsidei = evsidei;
}
if (events & EPOLLRDHUP)
/* For side 0 this is fake, but implied */
- conn_event(c, conn, FIN_RCVD(evsidei));
+ conn_event(conn, FIN_RCVD(evsidei));
swap:
eof = 0;
@@ -536,7 +528,7 @@ retry:
more = SPLICE_F_MORE;
if (conn->flags & lowat_set_flag)
- conn_flag(c, conn, lowat_act_flag);
+ conn_flag(conn, lowat_act_flag);
}
do
@@ -568,8 +560,8 @@ retry:
"Setting SO_RCVLOWAT %i: %s",
lowat, strerror_(errno));
} else {
- conn_flag(c, conn, lowat_set_flag);
- conn_flag(c, conn, lowat_act_flag);
+ conn_flag(conn, lowat_set_flag);
+ conn_flag(conn, lowat_act_flag);
}
}
@@ -583,7 +575,7 @@ retry:
if (conn->read[fromsidei] == conn->written[fromsidei])
break;
- conn_event(c, conn, OUT_WAIT(!fromsidei));
+ conn_event(conn, OUT_WAIT(!fromsidei));
break;
}
@@ -605,7 +597,7 @@ retry:
if ((conn->events & FIN_RCVD(sidei)) &&
!(conn->events & FIN_SENT(!sidei))) {
shutdown(conn->s[!sidei], SHUT_WR);
- conn_event(c, conn, FIN_SENT(!sidei));
+ conn_event(conn, FIN_SENT(!sidei));
}
}
}
@@ -626,7 +618,7 @@ retry:
return;
close:
- conn_flag(c, conn, CLOSING);
+ conn_flag(conn, CLOSING);
}
/**
@@ -762,10 +754,10 @@ void tcp_splice_timer(struct tcp_splice_conn *conn)
flow_trace(conn, "can't set SO_RCVLOWAT on %d",
conn->s[sidei]);
}
- conn_flag(c, conn, ~RCVLOWAT_SET(sidei));
+ conn_flag(conn, ~RCVLOWAT_SET(sidei));
}
}
flow_foreach_sidei(sidei)
- conn_flag(c, conn, ~RCVLOWAT_ACT(sidei));
+ conn_flag(conn, ~RCVLOWAT_ACT(sidei));
}
diff --git a/udp_flow.c b/udp_flow.c
index c4cf35c..80b1543 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -74,7 +74,6 @@ static int udp_flow_sock(const struct ctx *c,
{
const struct flowside *side = &uflow->f.side[sidei];
uint8_t pif = uflow->f.pif[sidei];
- union epoll_ref ref;
int rc;
int s;
@@ -84,14 +83,10 @@ static int udp_flow_sock(const struct ctx *c,
return s;
}
- ref.type = EPOLL_TYPE_UDP;
- ref.flowside = FLOW_SIDX(uflow, sidei);
- ref.fd = s;
-
flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT);
-
- rc = epoll_add(flow_epollfd(&uflow->f), EPOLLIN, ref);
- if (rc < 0) {
+ if (flow_epoll_set(&uflow->f, EPOLL_CTL_ADD, EPOLLIN, s, sidei) < 0) {
+ rc = -errno;
+ flow_epollid_clear(&uflow->f);
close(s);
return rc;
}
--
2.47.1

View File

@ -0,0 +1,99 @@
From 73a9bee3e1ffe447cb041c4826465a71730c2ecf Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 27 Jan 2026 19:39:52 +1100
Subject: [PATCH 12/18] tcp: Properly propagate tap-side RST to socket side
When the guest sends a TCP RST, or on certain error conditions, we want to
signal the abnormal termination of a TCP connection to the peer with an
RST as well. We attempt to do that by close()ing the socket.
That doesn't work: a close() will usually send a FIN, rather than an RST.
The standard method of forcing an RST on a socket is to set the SO_LINGER
socket option with a 0 timeout, then close().
Update the tcp_rst() path to do this, so it forces a socket side RST.
Update the handling of a guest side RST to use the same path (minus
sending a tap side RST) so that we properly propagate guest RSTs to the
peer.
Link: https://bugs.passt.top/show_bug.cgi?id=191
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit cce94e92fb3d2a90730c125f2bad32c9ed51da3f)
---
tcp.c | 37 +++++++++++++++++++++++++++++++++----
1 file changed, 33 insertions(+), 4 deletions(-)
diff --git a/tcp.c b/tcp.c
index 146d460..602e810 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1417,7 +1417,34 @@ static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
}
/**
- * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
+ * tcp_sock_rst() - Close TCP connection forcing RST on socket side
+ * @c: Execution context
+ * @conn: Connection pointer
+ */
+static void tcp_sock_rst(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+ const struct linger linger0 = {
+ .l_onoff = 1,
+ .l_linger = 0,
+ };
+
+ /* Force RST on socket to inform the peer
+ *
+ * We do this by setting SO_LINGER with 0 timeout, which means that
+ * close() will send an RST (unless the connection is already closed in
+ * both directions).
+ */
+ if (setsockopt(conn->sock, SOL_SOCKET,
+ SO_LINGER, &linger0, sizeof(linger0)) < 0) {
+ flow_dbg_perror(conn,
+ "SO_LINGER failed, may not send RST to peer");
+ }
+
+ conn_event(c, conn, CLOSED);
+}
+
+/**
+ * tcp_rst_do() - Reset a tap connection: send RST segment on both sides, close
* @c: Execution context
* @conn: Connection pointer
*/
@@ -1426,8 +1453,10 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
if (conn->events == CLOSED)
return;
+ /* Send RST on tap */
tcp_send_flag(c, conn, RST);
- conn_event(c, conn, CLOSED);
+
+ tcp_sock_rst(c, conn);
}
/**
@@ -1898,7 +1927,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
return -1;
if (th->rst) {
- conn_event(c, conn, CLOSED);
+ tcp_sock_rst(c, conn);
return 1;
}
@@ -2262,7 +2291,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
flow_trace(conn, "packet length %zu from tap", l4len);
if (th->rst) {
- conn_event(c, conn, CLOSED);
+ tcp_sock_rst(c, conn);
return 1;
}
--
2.47.1

View File

@ -0,0 +1,239 @@
From 3d6804c07d1b9ed26fea88d680a1734ea1069d91 Mon Sep 17 00:00:00 2001
From: Yumei Huang <yuhuang@redhat.com>
Date: Sat, 14 Feb 2026 15:31:36 +0800
Subject: [PATCH 13/18] udp: Split activity timeouts for UDP flows
Frequent DNS queries over UDP from a container or guest can result
in many sockets shown in ss(8), typically one per flow. This is
expected and harmless, but it can make the output of ss(8) look
noisy and potentially concern users.
This patch splits UDP flow timeouts into two, mirroring the Linux
kernel, and sources the values from kernel parameters. The shorter
timeout is applied to unidirectional flows and minimal bidirectional
exchanges (single datagram and reply), while the longer timeout is
used for bidirectional flows with multiple datagrams on either side.
Link: https://bugs.passt.top/show_bug.cgi?id=197
Suggested-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Yumei Huang <yuhuang@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit bebafa72a982784164a7d556bd860ec0ed1e02c7)
---
contrib/apparmor/abstractions/passt | 4 ++++
udp.c | 34 +++++++++++++++++++++++++++--
udp.h | 4 ++++
udp_flow.c | 30 ++++++++++++++++++++-----
udp_flow.h | 4 ++++
5 files changed, 69 insertions(+), 7 deletions(-)
diff --git a/contrib/apparmor/abstractions/passt b/contrib/apparmor/abstractions/passt
index 43fd63f..e8ed513 100644
--- a/contrib/apparmor/abstractions/passt
+++ b/contrib/apparmor/abstractions/passt
@@ -36,6 +36,10 @@
@{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral()
+ # udp_get_timeout_params(), udp.c
+ @{PROC}/sys/net/netfilter/nf_conntrack_udp_timeout r,
+ @{PROC}/sys/net/netfilter/nf_conntrack_udp_timeout_stream r,
+
network netlink raw, # nl_sock_init_do(), netlink.c
network inet stream, # tcp.c
diff --git a/udp.c b/udp.c
index 08bec50..32d70b6 100644
--- a/udp.c
+++ b/udp.c
@@ -26,7 +26,10 @@
*
* We track pseudo-connections of this type as flow table entries of type
* FLOW_UDP. We store the time of the last traffic on the flow in uflow->ts,
- * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds.
+ * and let the flow expire if there is no traffic for UDP_TIMEOUT seconds for
+ * unidirectional flows and flows with only one datagram and one reply, or
+ * UDP_TIMEOUT_STREAM seconds for bidirectional flows with more than one
+ * datagram on either side.
*
* NOTE: This won't handle multicast protocols, or some protocols with different
* port usage. We'll need specific logic if we want to handle those.
@@ -118,6 +121,13 @@
#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
+#define UDP_TIMEOUT "/proc/sys/net/netfilter/nf_conntrack_udp_timeout"
+#define UDP_TIMEOUT_STREAM \
+ "/proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream"
+
+#define UDP_TIMEOUT_DEFAULT 30 /* s */
+#define UDP_TIMEOUT_STREAM_DEFAULT 120 /* s */
+
/* Maximum UDP data to be returned in ICMP messages */
#define ICMP4_MAX_DLEN 8
#define ICMP6_MAX_DLEN (IPV6_MIN_MTU \
@@ -966,7 +976,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
int s = ref.fd;
flow_trace(uflow, "Received data on reply socket");
- uflow->ts = now->tv_sec;
+ udp_flow_activity(uflow, !tosidx.sidei, now);
if (pif_is_socket(topif)) {
udp_sock_to_sock(c, ref.fd, n, tosidx);
@@ -1301,6 +1311,24 @@ void udp_port_rebind_all(struct ctx *c)
udp_port_rebind(c, false);
}
+/**
+ * udp_get_timeout_params() - Get host kernel UDP timeout parameters
+ * @c: Execution context
+ */
+static void udp_get_timeout_params(struct ctx *c)
+{
+ intmax_t v;
+
+ v = read_file_integer(UDP_TIMEOUT, UDP_TIMEOUT_DEFAULT);
+ c->udp.timeout = v;
+
+ v = read_file_integer(UDP_TIMEOUT_STREAM, UDP_TIMEOUT_STREAM_DEFAULT);
+ c->udp.stream_timeout = v;
+
+ debug("Using UDP timeout parameters, timeout: %d, stream_timeout: %d",
+ c->udp.timeout, c->udp.stream_timeout);
+}
+
/**
* udp_init() - Initialise per-socket data, and sockets in namespace
* @c: Execution context
@@ -1311,6 +1339,8 @@ int udp_init(struct ctx *c)
{
ASSERT(!c->no_udp);
+ udp_get_timeout_params(c);
+
udp_iov_init(c);
if (c->mode == MODE_PASTA) {
diff --git a/udp.h b/udp.h
index 03e8dc5..618f258 100644
--- a/udp.h
+++ b/udp.h
@@ -42,11 +42,15 @@ union udp_listen_epoll_ref {
* @fwd_in: Port forwarding configuration for inbound packets
* @fwd_out: Port forwarding configuration for outbound packets
* @timer_run: Timestamp of most recent timer run
+ * @timeout: Timeout for unidirectional flows (in s)
+ * @stream_timeout: Timeout for stream-like flows (in s)
*/
struct udp_ctx {
struct fwd_ports fwd_in;
struct fwd_ports fwd_out;
struct timespec timer_run;
+ int timeout;
+ int stream_timeout;
};
#endif /* UDP_H */
diff --git a/udp_flow.c b/udp_flow.c
index 80b1543..4a8d4b6 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -17,8 +17,6 @@
#include "udp_internal.h"
#include "epoll_ctl.h"
-#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
-
/**
* udp_at_sidx() - Get UDP specific flow at given sidx
* @sidx: Flow and side to retrieve
@@ -152,6 +150,8 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
uflow->ts = now->tv_sec;
uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;
+ uflow->activity[INISIDE] = 1;
+ uflow->activity[TGTSIDE] = 0;
flow_foreach_sidei(sidei) {
if (pif_is_socket(uflow->f.pif[sidei]))
@@ -227,7 +227,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port);
if ((uflow = udp_at_sidx(sidx))) {
- uflow->ts = now->tv_sec;
+ udp_flow_activity(uflow, sidx.sidei, now);
return flow_sidx_opposite(sidx);
}
@@ -284,7 +284,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr,
srcport, dstport);
if ((uflow = udp_at_sidx(sidx))) {
- uflow->ts = now->tv_sec;
+ udp_flow_activity(uflow, sidx.sidei, now);
return flow_sidx_opposite(sidx);
}
@@ -361,9 +361,29 @@ bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
const struct timespec *now)
{
- if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT)
+ int timeout = c->udp.timeout;
+
+ if (uflow->activity[TGTSIDE] &&
+ (uflow->activity[INISIDE] > 1 || uflow->activity[TGTSIDE] > 1))
+ timeout = c->udp.stream_timeout;
+
+ if (now->tv_sec - uflow->ts <= timeout)
return false;
udp_flow_close(c, uflow);
return true;
}
+
+/**
+ * udp_flow_activity() - Track activity of a UDP flow
+ * @uflow: UDP flow
+ * @sidei: Side index of the flow (INISIDE or TGTSIDE)
+ * @now: Current timestamp
+ */
+void udp_flow_activity(struct udp_flow *uflow, unsigned int sidei,
+ const struct timespec *now)
+{
+ uflow->ts = now->tv_sec;
+ if (uflow->activity[sidei] < UINT8_MAX)
+ uflow->activity[sidei]++;
+}
diff --git a/udp_flow.h b/udp_flow.h
index 4c528e9..183a429 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -16,6 +16,7 @@
* @flush1: @s[1] may have datagrams queued for other flows
* @ts: Activity timestamp
* @s: Socket fd (or -1) for each side of the flow
+ * @activity: Packets seen from each side of the flow, up to UINT8_MAX
*/
struct udp_flow {
/* Must be first element */
@@ -29,6 +30,7 @@ struct udp_flow {
time_t ts;
int s[SIDES];
+ uint8_t activity[SIDES];
};
struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
@@ -46,5 +48,7 @@ bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
const struct timespec *now);
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
const struct timespec *now);
+void udp_flow_activity(struct udp_flow *uflow, unsigned int sidei,
+ const struct timespec *now);
#endif /* UDP_FLOW_H */
--
2.47.1

View File

@ -0,0 +1,80 @@
From 79430cb183b70aee127dfc68846e1f8661820a43 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 4 Feb 2026 21:41:34 +1000
Subject: [PATCH 14/18] tcp: Remove non-working activity timeout mechanism
This mechanism was intended to remove connections which have had no
activity for two hours, even if they haven't closed or been reset
internally. It operated by setting the two hour timeout if there are
no sooner TCP timeouts to schedule.
However, when the timer fires, the way we detect the case of the activity
timeout doesn't work: it resets the timer for another two hours, then
checks if the old timeout was two hours. But the old timeout returned
by timerfd_settime() is not the original value of the timer, but the
remaining time. Since the timer has just fired it will essentially always
be 0.
For now, just remove the mechanism, disarming the timer entirely if there
isn't another upcoming event. We'll re-introduce some sort of activity
timeout by a different means later.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit e48ce41a1ec2f05846fb66d3847c2c2b6448ca71)
---
tcp.c | 24 +++---------------------
1 file changed, 3 insertions(+), 21 deletions(-)
diff --git a/tcp.c b/tcp.c
index 602e810..de2ad38 100644
--- a/tcp.c
+++ b/tcp.c
@@ -199,9 +199,6 @@
* TAP_FIN_ACKED), but no socket activity is detected from the socket within
* this time, reset the connection
*
- * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
- * either side, the connection is reset
- *
* - RTT / 2 elapsed after data segment received from tap without having
* sent an ACK segment, or zero-sized window advertised to tap/guest (flag
* ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent.
@@ -632,7 +629,9 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
it.it_value.tv_sec = FIN_TIMEOUT;
} else {
- it.it_value.tv_sec = ACT_TIMEOUT;
+ /* Disarm */
+ it.it_value.tv_sec = 0;
+ it.it_value.tv_nsec = 0;
}
if (conn->flags & ACK_TO_TAP_DUE) {
@@ -2628,23 +2627,6 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
tcp_data_from_sock(c, conn);
tcp_timer_ctl(c, conn);
}
- } else {
- struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
- struct itimerspec old = { { 0 }, { 0 } };
-
- /* Activity timeout: if it was already set, reset the
- * connection, otherwise, it was a left-over from ACK_TO_TAP_DUE
- * or ACK_FROM_TAP_DUE, so just set the long timeout in that
- * case. This avoids having to preemptively reset the timer on
- * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
- */
- if (timerfd_settime(conn->timer, 0, &new, &old))
- flow_perror(conn, "failed to set timer");
-
- if (old.it_value.tv_sec == ACT_TIMEOUT) {
- flow_dbg(conn, "activity timeout");
- tcp_rst(c, conn);
- }
}
}
--
2.47.1

View File

@ -0,0 +1,191 @@
From a2b1ad31a4d56a59e4d407263a22dee270973ea4 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 4 Feb 2026 21:41:35 +1000
Subject: [PATCH 15/18] tcp: Re-introduce inactivity timeouts based on a clock
algorithm
We previously had a mechanism to remove TCP connections which were
inactive for 2 hours. That was broken for a long time, due to poor
interactions with the timerfd handling, so we removed it.
Adding this long scale timer onto the timerfd handling, which mostly
handles much shorter timeouts is tricky to reason about. However, for the
inactivity timeouts, we don't require precision. Instead, we can use
a 1-bit page replacement / "clock" algorithm. Every INACTIVITY_INTERVAL
(2 hours), a global timer marks every TCP connection as tentatively
inactive. That flag is cleared if we get any events, either tap side or
socket side.
If the inactive flag is still set when the next INACTIVITY_INTERVAL expires
then the connection has been inactive for an extended period and we reset
and close it. In practice this means that connections will be removed
after 2-4 hours of inactivity.
This is not a true fix for bug 179, but it does mitigate the damage, by
limiting the time that inactive connections will remain around.
Link: https://bugs.passt.top/show_bug.cgi?id=179
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit 1820103fbbf13df98257a3f5c3ba625de624b0b3)
---
tcp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++----
tcp.h | 4 +++-
tcp_conn.h | 3 +++
3 files changed, 54 insertions(+), 5 deletions(-)
diff --git a/tcp.c b/tcp.c
index de2ad38..dd58550 100644
--- a/tcp.c
+++ b/tcp.c
@@ -207,6 +207,13 @@
* TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to
* RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly.
*
+ * We also use a global interval timer for an activity timeout which doesn't
+ * require precision:
+ *
+ * - INACTIVITY_INTERVAL: if a connection has had no activity for an entire
+ * interval, close and reset it. This means that idle connections (without
+ * keepalives) will be removed between INACTIVITY_INTERVAL s and
+ * 2*INACTIVITY_INTERVAL s after the last activity.
*
* Summary of data flows (with ESTABLISHED event)
* ----------------------------------------------
@@ -345,7 +352,8 @@ enum {
#define RTO_INIT 1 /* s, RFC 6298 */
#define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */
#define FIN_TIMEOUT 60
-#define ACT_TIMEOUT 7200
+
+#define INACTIVITY_INTERVAL 7200 /* s */
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
@@ -2294,6 +2302,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
return 1;
}
+ conn->inactive = false;
+
if (th->ack && !(conn->events & ESTABLISHED))
tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
@@ -2652,6 +2662,8 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
return;
}
+ conn->inactive = false;
+
if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) {
conn_event(c, conn, CLOSED);
return;
@@ -3030,6 +3042,38 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
}
}
+/**
+ * tcp_inactivity() - Scan for and close long-inactive connections
+ * @: Execution context
+ */
+static void tcp_inactivity(struct ctx *c, const struct timespec *now)
+{
+ union flow *flow;
+
+ if (now->tv_sec - c->tcp.inactivity_run < INACTIVITY_INTERVAL)
+ return;
+
+ debug("TCP inactivity scan");
+ c->tcp.inactivity_run = now->tv_sec;
+
+ flow_foreach(flow) {
+ struct tcp_tap_conn *conn = &flow->tcp;
+
+ if (flow->f.type != FLOW_TCP)
+ continue;
+
+ if (conn->inactive) {
+ /* No activity in this interval, reset */
+ flow_dbg(conn, "Inactive for at least %us, resetting",
+ INACTIVITY_INTERVAL);
+ tcp_rst(c, conn);
+ }
+
+ /* Ready to check for next interval */
+ conn->inactive = true;
+ }
+}
+
/**
* tcp_port_rebind_outbound() - Rebind ports in namespace
* @arg: Execution context
@@ -3068,13 +3112,13 @@ void tcp_port_rebind_all(struct ctx *c)
* @c: Execution context
* @now: Current timestamp
*/
-void tcp_timer(const struct ctx *c, const struct timespec *now)
+void tcp_timer(struct ctx *c, const struct timespec *now)
{
- (void)now;
-
tcp_sock_refill_init(c);
if (c->mode == MODE_PASTA)
tcp_splice_refill(c);
+
+ tcp_inactivity(c, now);
}
/**
diff --git a/tcp.h b/tcp.h
index 3f21e75..37cfc5b 100644
--- a/tcp.h
+++ b/tcp.h
@@ -23,7 +23,7 @@ int tcp_sock_init(const struct ctx *c, uint8_t pif,
in_port_t port);
int tcp_init(struct ctx *c);
void tcp_port_rebind_all(struct ctx *c);
-void tcp_timer(const struct ctx *c, const struct timespec *now);
+void tcp_timer(struct ctx *c, const struct timespec *now);
void tcp_defer_handler(struct ctx *c);
void tcp_update_l2_buf(const unsigned char *eth_d);
@@ -64,6 +64,7 @@ union tcp_listen_epoll_ref {
* @rto_max: Maximum retry timeout (in s)
* @syn_retries: SYN retries using exponential backoff timeout
* @syn_linear_timeouts: SYN retries before using exponential backoff timeout
+ * @inactivity_run: Time we last scanned for inactive connections
*/
struct tcp_ctx {
struct fwd_ports fwd_in;
@@ -73,6 +74,7 @@ struct tcp_ctx {
int rto_max;
uint8_t syn_retries;
uint8_t syn_linear_timeouts;
+ time_t inactivity_run;
};
#endif /* TCP_H */
diff --git a/tcp_conn.h b/tcp_conn.h
index 9c6ff9e..2e70d39 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -16,6 +16,7 @@
* @ws_from_tap: Window scaling factor advertised from tap/guest
* @ws_to_tap: Window scaling factor advertised to tap/guest
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
+ * @inactive: No activity within the current INACTIVITY_INTERVAL
* @sock: Socket descriptor number
* @events: Connection events, implying connection states
* @listening_sock: Listening socket this socket was accept()ed from, or -1
@@ -58,6 +59,8 @@ struct tcp_tap_conn {
(conn->rtt_exp = MIN(RTT_EXP_MAX, ilog2(MAX(1, rtt / RTT_STORE_MIN))))
#define RTT_GET(conn) (RTT_STORE_MIN << conn->rtt_exp)
+ bool inactive :1;
+
int sock :FD_REF_BITS;
uint8_t events;
--
2.47.1

View File

@ -0,0 +1,66 @@
From 4600f95f99f12eb0680277da971a3af0ba27d5c1 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 4 Feb 2026 21:41:36 +1000
Subject: [PATCH 16/18] tcp: Extend tcp_send_flag() to send TCP keepalive
segments
TCP keepalives aren't technically a flag, but they are a zero-data segment
so they can be generated with only a small modification to
tcp_{buf,vu}_send_flag(). Implement this, using a new "pseudo-flag"
value (similar to DUP_ACK), KEEPALIVE.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[sbrivio: Fix trivial merge conflict with 812cdb802c6e]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit a681e44ec60179567fb10f34351d7dfdbd2e7c7e)
---
tcp_buf.c | 4 ++++
tcp_internal.h | 2 ++
tcp_vu.c | 3 +++
3 files changed, 9 insertions(+)
diff --git a/tcp_buf.c b/tcp_buf.c
index 5d419d3..75a020f 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -227,6 +227,10 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
tcp_frame_conns[tcp_payload_used++] = conn;
l4len = optlen + sizeof(struct tcphdr);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+
+ if (flags & KEEPALIVE)
+ seq--;
+
tcp_l2_buf_fill_headers(c, conn, iov, NULL, seq, false);
tcp_l2_buf_pad(iov);
diff --git a/tcp_internal.h b/tcp_internal.h
index 5f8fb35..36f443b 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -38,6 +38,8 @@
/* Flags for internal usage */
#define DUP_ACK (1 << 5)
+#define KEEPALIVE (1 << 6)
+
#define OPT_EOL 0
#define OPT_NOP 1
#define OPT_MSS 2
diff --git a/tcp_vu.c b/tcp_vu.c
index db9db78..dd50241 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -135,6 +135,9 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
+ if (flags & KEEPALIVE)
+ seq--;
+
tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
NULL, seq, !*c->pcap);
--
2.47.1

View File

@ -0,0 +1,161 @@
From b911ba6899bac381e795e26d9bebfac69b1a5748 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 4 Feb 2026 21:41:37 +1000
Subject: [PATCH 17/18] tcp: Send TCP keepalive segments after a period of
tap-side inactivity
There are several circumstances in which a live, but idle TCP connection
can be forgotten by a guest, with no "on the wire" indication that this has
happened. The most obvious is if the guest abruptly reboots. A more
subtle case can happen with a half-closed connection, specifically one
in FIN_WAIT_2 state on the guest. A connection can, legitimately, remain
in this state indefinitely. If however, a socket in this state is closed
by userspace, Linux at least will remove the kernel socket after 60s
(or as configured in the net.ipv4.tcp_fin_timeout sysctl).
Because there's no on the wire indication in these cases, passt will
pointlessly retain the connection in its flow table, at least until it is
removed by the inactivity timeout after several hours.
To avoid keeping connections around for so long in this state, add
functionality to periodically send TCP keepalive segments to the guest if
we've seen no activity on the tap interface. If the guest is no longer
aware of the connection, it should respond with an RST which will let
passt remove the stale entry.
To do this we use a method similar to the inactivity timeout - a 1-bit
page replacement / clock algorithm, but with a shorter interval, and only
checking for tap side activity. Currently we use a 300s interval, meaning
we'll send a keepalive after 5-10 minutes of (tap side) inactivity.
Link: https://bugs.passt.top/show_bug.cgi?id=179
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit d2f7c21cfb949f2b1587b9475917efdd6ac549fd)
---
tcp.c | 39 +++++++++++++++++++++++++++++++++++++++
tcp.h | 2 ++
tcp_conn.h | 2 ++
3 files changed, 43 insertions(+)
diff --git a/tcp.c b/tcp.c
index dd58550..1691987 100644
--- a/tcp.c
+++ b/tcp.c
@@ -215,6 +215,12 @@
* keepalives) will be removed between INACTIVITY_INTERVAL s and
* 2*INACTIVITY_INTERVAL s after the last activity.
*
+ * - KEEPALIVE_INTERVAL: if a connection has had no tap-side activity for an
+ * entire interval, send a tap-side keepalive. If the endpoint is no longer
+ * aware of the connection (due to a reboot, or a kernel timeout in FIN_WAIT_2
+ * state) that should trigger an RST, so we won't keep track of connections
+ * that the guest endpoint no longer cares about.
+ *
* Summary of data flows (with ESTABLISHED event)
* ----------------------------------------------
*
@@ -354,6 +360,7 @@ enum {
#define FIN_TIMEOUT 60
#define INACTIVITY_INTERVAL 7200 /* s */
+#define KEEPALIVE_INTERVAL 30 /* s */
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
@@ -2303,6 +2310,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
}
conn->inactive = false;
+ conn->tap_inactive = false;
if (th->ack && !(conn->events & ESTABLISHED))
tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
@@ -3042,6 +3050,36 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
}
}
+/**
+ * tcp_keepalive() - Send keepalives for connections which need it
+ * @c: Execution context
+ */
+static void tcp_keepalive(struct ctx *c, const struct timespec *now)
+{
+ union flow *flow;
+
+ if (now->tv_sec - c->tcp.keepalive_run < KEEPALIVE_INTERVAL)
+ return;
+
+ c->tcp.keepalive_run = now->tv_sec;
+
+ flow_foreach(flow) {
+ struct tcp_tap_conn *conn = &flow->tcp;
+
+ if (flow->f.type != FLOW_TCP)
+ continue;
+
+ if (conn->tap_inactive) {
+ flow_dbg(conn, "No tap activity for least %us, send keepalive",
+ KEEPALIVE_INTERVAL);
+ tcp_send_flag(c, conn, KEEPALIVE);
+ }
+
+ /* Ready to check for next interval */
+ conn->tap_inactive = true;
+ }
+}
+
/**
* tcp_inactivity() - Scan for and close long-inactive connections
* @: Execution context
@@ -3118,6 +3156,7 @@ void tcp_timer(struct ctx *c, const struct timespec *now)
if (c->mode == MODE_PASTA)
tcp_splice_refill(c);
+ tcp_keepalive(c, now);
tcp_inactivity(c, now);
}
diff --git a/tcp.h b/tcp.h
index 37cfc5b..505f21a 100644
--- a/tcp.h
+++ b/tcp.h
@@ -64,6 +64,7 @@ union tcp_listen_epoll_ref {
* @rto_max: Maximum retry timeout (in s)
* @syn_retries: SYN retries using exponential backoff timeout
* @syn_linear_timeouts: SYN retries before using exponential backoff timeout
+ * @keepalive_run: Time we last issued tap-side keepalives
* @inactivity_run: Time we last scanned for inactive connections
*/
struct tcp_ctx {
@@ -74,6 +75,7 @@ struct tcp_ctx {
int rto_max;
uint8_t syn_retries;
uint8_t syn_linear_timeouts;
+ time_t keepalive_run;
time_t inactivity_run;
};
diff --git a/tcp_conn.h b/tcp_conn.h
index 2e70d39..2ff76ed 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -16,6 +16,7 @@
* @ws_from_tap: Window scaling factor advertised from tap/guest
* @ws_to_tap: Window scaling factor advertised to tap/guest
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
+ * @tap_inactive: No tap activity within the current KEEPALIVE_INTERVAL
* @inactive: No activity within the current INACTIVITY_INTERVAL
* @sock: Socket descriptor number
* @events: Connection events, implying connection states
@@ -59,6 +60,7 @@ struct tcp_tap_conn {
(conn->rtt_exp = MIN(RTT_EXP_MAX, ilog2(MAX(1, rtt / RTT_STORE_MIN))))
#define RTT_GET(conn) (RTT_STORE_MIN << conn->rtt_exp)
+ bool tap_inactive :1;
bool inactive :1;
int sock :FD_REF_BITS;
--
2.47.1

View File

@ -0,0 +1,133 @@
From 4d1c8b11460cfe05372e572f33e046a8e98e242c Mon Sep 17 00:00:00 2001
From: Yumei Huang <yuhuang@redhat.com>
Date: Fri, 20 Mar 2026 18:32:14 +0800
Subject: [PATCH 18/18] tcp: Replace send buffer boost with EPOLLOUT monitoring
Currently we use the SNDBUF boost mechanism to force TCP auto-tuning.
However, it doesn't always work, and sometimes causes a lot of
retransmissions. As a result, the throughput suffers.
This patch replaces it with monitoring EPOLLOUT when sendmsg() failure
(with EAGAIN and EWOULDBLOCK) and partial sends occur.
Tested with iperf3 inside pasta: throughput is now comparable to running
iperf3 directly on the host without pasta. However, retransmissions can
still be elevated when RTT >= 50ms. For example, when RTT is between
200ms and 500ms, retransmission count varies from 30 to 120 in roughly
80% of test runs.
Link: https://bugs.passt.top/show_bug.cgi?id=138
Link: https://github.com/containers/podman/issues/28219
Suggested-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Yumei Huang <yuhuang@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
(cherry picked from commit 831857e9b547ac27f868b6c24049c4da435b63fe)
---
tcp.c | 57 +++++++++++++++++----------------------------------------
1 file changed, 17 insertions(+), 40 deletions(-)
diff --git a/tcp.c b/tcp.c
index 1691987..920af70 100644
--- a/tcp.c
+++ b/tcp.c
@@ -365,13 +365,6 @@ enum {
#define LOW_RTT_TABLE_SIZE 8
#define LOW_RTT_THRESHOLD 10 /* us */
-/* Parameters to temporarily exceed sending buffer to force TCP auto-tuning */
-#define SNDBUF_BOOST_BYTES_RTT_LO 2500 /* B * s: no boost until here */
-/* ...examples: 5 MB sent * 500 ns RTT, 250 kB * 10 ms, 8 kB * 300 ms */
-#define SNDBUF_BOOST_FACTOR 150 /* % */
-#define SNDBUF_BOOST_BYTES_RTT_HI 6000 /* apply full boost factor */
-/* 12 MB sent * 500 ns RTT, 600 kB * 10 ms, 20 kB * 300 ms */
-
/* Ratio of buffer to bandwidth * delay product implying interactive traffic */
#define SNDBUF_TO_BW_DELAY_INTERACTIVE /* > */ 20 /* (i.e. < 5% of buffer) */
@@ -1067,35 +1060,6 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
tap_hdr_update(taph, MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN));
}
-/**
- * tcp_sndbuf_boost() - Calculate limit of sending buffer to force auto-tuning
- * @conn: Connection pointer
- * @tinfo: tcp_info from kernel, must be pre-fetched
- *
- * Return: increased sending buffer to use as a limit for advertised window
- */
-static unsigned long tcp_sndbuf_boost(const struct tcp_tap_conn *conn,
- const struct tcp_info_linux *tinfo)
-{
- unsigned long bytes_rtt_product;
-
- if (!bytes_acked_cap)
- return SNDBUF_GET(conn);
-
- /* This is *not* a bandwidth-delay product, but it's somewhat related:
- * as we send more data (usually at the beginning of a connection), we
- * try to make the sending buffer progressively grow, with the RTT as a
- * factor (longer delay, bigger buffer needed).
- */
- bytes_rtt_product = (long long)tinfo->tcpi_bytes_acked *
- tinfo->tcpi_rtt / 1000 / 1000;
-
- return clamped_scale(SNDBUF_GET(conn), bytes_rtt_product,
- SNDBUF_BOOST_BYTES_RTT_LO,
- SNDBUF_BOOST_BYTES_RTT_HI,
- SNDBUF_BOOST_FACTOR);
-}
-
/**
* tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap
* @c: Execution context
@@ -1216,8 +1180,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? */
limit = 0;
- else if ((int)tinfo->tcpi_snd_wnd > SNDBUF_GET(conn))
- limit = tcp_sndbuf_boost(conn, tinfo) - (int)sendq;
else
limit = SNDBUF_GET(conn) - (int)sendq;
@@ -2088,14 +2050,28 @@ eintr:
if (errno == EAGAIN || errno == EWOULDBLOCK) {
tcp_send_flag(c, conn, ACK | DUP_ACK);
+ uint32_t events = tcp_conn_epoll_events(conn->events,
+ conn->flags);
+ events |= EPOLLOUT;
+ if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events,
+ conn->sock, !TAPSIDE(conn)) < 0)
+ debug("Failed to add EPOLLOUT");
return p->count - idx;
-
}
return -1;
}
- if (n < (int)(seq_from_tap - conn->seq_from_tap))
+ if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
partial_send = 1;
+ uint32_t events = tcp_conn_epoll_events(conn->events,
+ conn->flags);
+ events |= EPOLLOUT;
+ if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events, conn->sock,
+ !TAPSIDE(conn)) < 0)
+ debug("Failed to add EPOLLOUT");
+ } else {
+ tcp_epoll_ctl(conn);
+ }
conn->seq_from_tap += n;
@@ -2688,6 +2664,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
tcp_data_from_sock(c, conn);
if (events & EPOLLOUT) {
+ tcp_epoll_ctl(conn);
if (tcp_update_seqack_wnd(c, conn, false, NULL))
tcp_send_flag(c, conn, ACK);
}
--
2.47.1

View File

@ -7,11 +7,12 @@
# Copyright (c) 2022 Red Hat GmbH
# Author: Stefano Brivio <sbrivio@redhat.com>
%global git_hash 8ec134109eb136432a29bdf5a14f8b1fd4e46208
%global git_hash d04c48032bcf724550d0b8f652fd00efcd2dfad0
%global selinuxtype targeted
%global selinux_policy_version 41.41
Name: passt
Version: 0^20250512.g8ec1341
Version: 0^20251210.gd04c480
Release: 4%{?dist}
Summary: User-mode networking daemons for virtual machines and namespaces
License: GPL-2.0-or-later AND BSD-3-Clause
@ -20,9 +21,23 @@ URL: https://passt.top/
Source: https://passt.top/passt/snapshot/passt-%{git_hash}.tar.xz
Patch1: 0001-selinux-Drop-user_namespace-create-allow-rules.patch
Patch2: 0002-treewide-By-default-don-t-quit-source-after-migratio.patch
Patch3: 0003-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch
Patch4: 0004-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch
Patch2: 0002-selinux-Use-systemd_logind_exec_t-instead-of-systemd.patch
Patch3: 0003-tcp-Use-less-than-MSS-window-on-no-queued-data-or-no.patch
Patch4: 0004-pasta-Warn-disable-matching-IP-version-if-not-suppor.patch
Patch5: 0005-selinux-Enable-read-and-watch-permissions-on-netns-d.patch
Patch6: 0006-selinux-Enable-open-permissions-on-netns-directory-o.patch
Patch7: 0007-tcp-Fix-rounding-issue-in-check-for-approximating-wi.patch
Patch8: 0008-udp_flow-remove-unneeded-epoll_ref-indirection.patch
Patch9: 0009-udp_flow-Assign-socket-to-flow-inside-udp_flow_sock.patch
Patch10: 0010-tcp_splice-Refactor-tcp_splice_conn_epoll_events-to-.patch
Patch11: 0011-flow-Introduce-flow_epoll_set-to-centralize-epoll-op.patch
Patch12: 0012-tcp-Properly-propagate-tap-side-RST-to-socket-side.patch
Patch13: 0013-udp-Split-activity-timeouts-for-UDP-flows.patch
Patch14: 0014-tcp-Remove-non-working-activity-timeout-mechanism.patch
Patch15: 0015-tcp-Re-introduce-inactivity-timeouts-based-on-a-cloc.patch
Patch16: 0016-tcp-Extend-tcp_send_flag-to-send-TCP-keepalive-segme.patch
Patch17: 0017-tcp-Send-TCP-keepalive-segments-after-a-period-of-ta.patch
Patch18: 0018-tcp-Replace-send-buffer-boost-with-EPOLLOUT-monitori.patch
BuildRequires: gcc, make, git, checkpolicy, selinux-policy-devel
Requires: (%{name}-selinux = %{version}-%{release} if selinux-policy-%{selinuxtype})
@ -38,15 +53,21 @@ for network namespaces: traffic is forwarded using a tap interface inside the
namespace, without the need to create further interfaces on the host, hence not
requiring any capabilities or privileges.
%package selinux
BuildArch: noarch
Summary: SELinux support for passt and pasta
Requires: %{name} = %{version}-%{release}
Requires: selinux-policy
Requires(post): %{name}
Requires(post): policycoreutils
Requires(preun): %{name}
Requires(preun): policycoreutils
%package selinux
BuildArch: noarch
Summary: SELinux support for passt and pasta
%if 0%{?fedora} > 43
BuildRequires: selinux-policy-devel
%selinux_requires_min
%else
BuildRequires: pkgconfig(systemd)
Requires(post): libselinux-utils
Requires(post): policycoreutils
%endif
Requires: container-selinux
Requires: selinux-policy-%{selinuxtype}
Requires(post): container-selinux
Requires(post): selinux-policy-%{selinuxtype}
%description selinux
This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1).
@ -94,15 +115,11 @@ popd
%selinux_relabel_pre -s %{selinuxtype}
%post selinux
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
%postun selinux
if [ $1 -eq 0 ]; then
%selinux_modules_uninstall -s %{selinuxtype} passt
%selinux_modules_uninstall -s %{selinuxtype} pasta
%selinux_modules_uninstall -s %{selinuxtype} passt-repair
%selinux_modules_uninstall -s %{selinuxtype} passt pasta passt-repair
fi
%posttrans selinux
@ -135,8 +152,23 @@ fi
%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
%changelog
* Thu Oct 23 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20250512.g8ec1341-4
- Resolves: RHEL-123413 RHEL-123419
* Tue Apr 21 2026 Stefano Brivio <sbrivio@redhat.com> - 0^20251210.gd04c480-4
- Resolves: RHEL-169637 RHEL-169639 RHEL-169648
* Wed Feb 11 2026 Stefano Brivio <sbrivio@redhat.com> - 0^20251210.gd04c480-3
- Resolves: RHEL-137588 RHEL-136313
* Wed Dec 24 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20251210.gd04c480-2
- Resolves: RHEL-136313 RHEL-136461 RHEL-137439 RHEL-137588
* Wed Dec 10 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20251210.gd04c480-1
- Resolves: RHEL-134942 RHEL-134943
* Tue Dec 9 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20251209.gc3f1ba7-1
- Resolves: RHEL-134119
* Thu Oct 23 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20250512.g8ec1341-3
- Resolves: RHEL-123376 RHEL-123438
* Tue Jul 29 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20250512.g8ec1341-2
- Resolves: RHEL-106326