From b396701ee3d212176cbdf09f911f8137ea012fa1 Mon Sep 17 00:00:00 2001 From: AlmaLinux RelEng Bot Date: Tue, 19 May 2026 20:35:01 -0400 Subject: [PATCH] import UBI passt-0^20251210.gd04c480-4.el9_8 --- .gitignore | 2 +- .passt.metadata | 2 +- ...op-user_namespace-create-allow-rules.patch | 21 +- ...emd_logind_exec_t-instead-of-systemd.patch | 41 ++ ...ult-don-t-quit-source-after-migratio.patch | 264 ---------- ...s-of-sequence-comparison-macros-to-u.patch | 48 -- ...n-MSS-window-on-no-queued-data-or-no.patch | 110 ++++ ...le-matching-IP-version-if-not-suppor.patch | 90 ++++ ...er-FIN-flags-with-mismatching-sequen.patch | 76 --- ...ead-and-watch-permissions-on-netns-d.patch | 58 +++ ...pen-permissions-on-netns-directory-o.patch | 68 +++ ...-issue-in-check-for-approximating-wi.patch | 74 +++ ...emove-unneeded-epoll_ref-indirection.patch | 48 ++ ...-socket-to-flow-inside-udp_flow_sock.patch | 47 ++ ...tor-tcp_splice_conn_epoll_events-to-.patch | 94 ++++ ...low_epoll_set-to-centralize-epoll-op.patch | 489 ++++++++++++++++++ ...ropagate-tap-side-RST-to-socket-side.patch | 99 ++++ ...plit-activity-timeouts-for-UDP-flows.patch | 239 +++++++++ ...n-working-activity-timeout-mechanism.patch | 80 +++ ...-inactivity-timeouts-based-on-a-cloc.patch | 191 +++++++ ...end_flag-to-send-TCP-keepalive-segme.patch | 66 +++ ...palive-segments-after-a-period-of-ta.patch | 161 ++++++ ...-buffer-boost-with-EPOLLOUT-monitori.patch | 133 +++++ SPECS/passt.spec | 76 ++- 24 files changed, 2155 insertions(+), 422 deletions(-) create mode 100644 SOURCES/0002-selinux-Use-systemd_logind_exec_t-instead-of-systemd.patch delete mode 100644 SOURCES/0002-treewide-By-default-don-t-quit-source-after-migratio.patch delete mode 100644 SOURCES/0003-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch create mode 100644 SOURCES/0003-tcp-Use-less-than-MSS-window-on-no-queued-data-or-no.patch create mode 100644 SOURCES/0004-pasta-Warn-disable-matching-IP-version-if-not-suppor.patch delete mode 100644 
SOURCES/0004-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch create mode 100644 SOURCES/0005-selinux-Enable-read-and-watch-permissions-on-netns-d.patch create mode 100644 SOURCES/0006-selinux-Enable-open-permissions-on-netns-directory-o.patch create mode 100644 SOURCES/0007-tcp-Fix-rounding-issue-in-check-for-approximating-wi.patch create mode 100644 SOURCES/0008-udp_flow-remove-unneeded-epoll_ref-indirection.patch create mode 100644 SOURCES/0009-udp_flow-Assign-socket-to-flow-inside-udp_flow_sock.patch create mode 100644 SOURCES/0010-tcp_splice-Refactor-tcp_splice_conn_epoll_events-to-.patch create mode 100644 SOURCES/0011-flow-Introduce-flow_epoll_set-to-centralize-epoll-op.patch create mode 100644 SOURCES/0012-tcp-Properly-propagate-tap-side-RST-to-socket-side.patch create mode 100644 SOURCES/0013-udp-Split-activity-timeouts-for-UDP-flows.patch create mode 100644 SOURCES/0014-tcp-Remove-non-working-activity-timeout-mechanism.patch create mode 100644 SOURCES/0015-tcp-Re-introduce-inactivity-timeouts-based-on-a-cloc.patch create mode 100644 SOURCES/0016-tcp-Extend-tcp_send_flag-to-send-TCP-keepalive-segme.patch create mode 100644 SOURCES/0017-tcp-Send-TCP-keepalive-segments-after-a-period-of-ta.patch create mode 100644 SOURCES/0018-tcp-Replace-send-buffer-boost-with-EPOLLOUT-monitori.patch diff --git a/.gitignore b/.gitignore index 7d7ac68..0a79d09 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -SOURCES/passt-8ec134109eb136432a29bdf5a14f8b1fd4e46208.tar.xz +SOURCES/passt-d04c48032bcf724550d0b8f652fd00efcd2dfad0.tar.xz diff --git a/.passt.metadata b/.passt.metadata index 674e16c..c00c0d5 100644 --- a/.passt.metadata +++ b/.passt.metadata @@ -1 +1 @@ -7b91876dcd65569ddf775b2da567345500ec8862 SOURCES/passt-8ec134109eb136432a29bdf5a14f8b1fd4e46208.tar.xz +ec2fcde158b88b1ed9786565025380d03aa32d56 SOURCES/passt-d04c48032bcf724550d0b8f652fd00efcd2dfad0.tar.xz diff --git a/SOURCES/0001-selinux-Drop-user_namespace-create-allow-rules.patch 
b/SOURCES/0001-selinux-Drop-user_namespace-create-allow-rules.patch index 4149192..5b5fea6 100644 --- a/SOURCES/0001-selinux-Drop-user_namespace-create-allow-rules.patch +++ b/SOURCES/0001-selinux-Drop-user_namespace-create-allow-rules.patch @@ -1,6 +1,6 @@ -From 6977619743bbc602a865f79562b59a80921d6063 Mon Sep 17 00:00:00 2001 +From 7087adfbab35354f9def7edee87385b82416c722 Mon Sep 17 00:00:00 2001 From: Stefano Brivio -Date: Mon, 21 Aug 2023 17:52:28 +0200 +Date: Mon, 8 Dec 2025 22:32:50 -0500 Subject: [PATCH] selinux: Drop user_namespace create allow rules Those are incompatible with current el9 kernels. I introduced them @@ -24,10 +24,10 @@ Signed-off-by: Stefano Brivio 2 files changed, 2 deletions(-) diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te -index c6cea34..131fadc 100644 +index 6995df8..76d23e8 100644 --- a/contrib/selinux/passt.te +++ b/contrib/selinux/passt.te -@@ -92,7 +92,6 @@ allow syslogd_t self:cap_userns sys_ptrace; +@@ -105,7 +105,6 @@ allow syslogd_t self:cap_userns sys_ptrace; allow passt_t self:process setcap; allow passt_t self:capability { sys_tty_config setpcap net_bind_service setuid setgid}; allow passt_t self:cap_userns { setpcap sys_admin sys_ptrace }; @@ -36,16 +36,17 @@ index c6cea34..131fadc 100644 auth_read_passwd(passt_t) diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te -index 69be081..892edae 100644 +index 95fe42a..7e1e821 100644 --- a/contrib/selinux/pasta.te +++ b/contrib/selinux/pasta.te -@@ -110,7 +110,6 @@ init_daemon_domain(pasta_t, pasta_exec_t) - - allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read_search net_admin sys_resource setuid setgid }; - allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service }; +@@ -126,7 +126,6 @@ allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_ + # pasta only calls setuid and setgid with the current UID and GID, so this + # denial is harmless. 
See https://bugzilla.redhat.com/show_bug.cgi?id=2330512#c10 + dontaudit pasta_t self:cap_userns { setgid setuid }; -allow pasta_t self:user_namespace create; auth_read_passwd(pasta_t) -- -2.39.2 +2.47.1 + diff --git a/SOURCES/0002-selinux-Use-systemd_logind_exec_t-instead-of-systemd.patch b/SOURCES/0002-selinux-Use-systemd_logind_exec_t-instead-of-systemd.patch new file mode 100644 index 0000000..7f3736a --- /dev/null +++ b/SOURCES/0002-selinux-Use-systemd_logind_exec_t-instead-of-systemd.patch @@ -0,0 +1,41 @@ +From 2244df26b2cb63acb51a20485e1ca7ad0649b152 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Mon, 22 Dec 2025 21:48:32 -0500 +Subject: [PATCH] selinux: Use systemd_logind_exec_t instead of + systemd_user_runtimedir_exec_t + +On CentOS Stream 9, selinux-policy doesn't contain commit +700b3622d575 ("Confine /usr/lib/systemd/systemd-user-runtime-dir"), +so the file context of /usr/lib/systemd/systemd-user-runtime-dir is +still systemd_logind_exec_t there. + +Signed-off-by: Stefano Brivio +--- + contrib/selinux/pasta.te | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te +index 7e1e821..d29d6c4 100644 +--- a/contrib/selinux/pasta.te ++++ b/contrib/selinux/pasta.te +@@ -98,7 +98,7 @@ require { + type container_runtime_t; + type container_var_run_t; + type container_t; +- type systemd_user_runtimedir_t; ++ type systemd_logind_exec_t; + } + + type pasta_t; +@@ -250,7 +250,7 @@ type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootles + type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "rootless-netns"; + allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write }; + allow pasta_t ifconfig_var_run_t:file { create open write }; +-allow systemd_user_runtimedir_t ifconfig_var_run_t:dir rmdir; ++allow systemd_logind_exec_t ifconfig_var_run_t:dir rmdir; + + # Allow pasta to bind to any port + bool pasta_bind_all_ports true; +-- 
+2.47.1 + diff --git a/SOURCES/0002-treewide-By-default-don-t-quit-source-after-migratio.patch b/SOURCES/0002-treewide-By-default-don-t-quit-source-after-migratio.patch deleted file mode 100644 index 95e79e7..0000000 --- a/SOURCES/0002-treewide-By-default-don-t-quit-source-after-migratio.patch +++ /dev/null @@ -1,264 +0,0 @@ -From b0b5ce0a76cf7fec0b00405732fd94e0b34e8d84 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Thu, 17 Jul 2025 10:38:17 +0200 -Subject: [PATCH] treewide: By default, don't quit source after migration, keep - sockets open - -We are hitting an issue in the KubeVirt integration where some data is -still sent to the source instance even after migration is complete. As -we exit, the kernel closes our sockets and resets connections. The -resulting RST segments are sent to peers, effectively terminating -connections that were meanwhile migrated. - -At the moment, this is not done intentionally, but in the future -KubeVirt might enable OVN-Kubernetes features where source and -destination nodes are explicitly getting mirrored traffic for a while, -in order to decrease migration downtime. - -By default, don't quit after migration is completed on the source: the -previous behaviour can be enabled with the new, but deprecated, ---migrate-exit option. After migration (as source), the -1 / --one-off -option has no effect. - -Also, by default, keep migrated TCP sockets open (in repair mode) as -long as we're running, and ignore events on any epoll descriptor -representing data channels. The previous behaviour can be enabled with -the new, equally deprecated, --migrate-no-linger option. - -By keeping sockets open, and not exiting, we prevent the kernel -running on the source node to send out RST segments if further data -reaches us. 
- -Reported-by: Nir Dothan -Signed-off-by: Stefano Brivio -(cherry picked from commit a8782865c342eb2682cca292d5bf92b567344351) ---- - conf.c | 22 ++++++++++++++++++++++ - flow.c | 2 +- - passt.1 | 29 +++++++++++++++++++++++++++++ - passt.h | 4 ++++ - tcp.c | 9 +++++++-- - tcp_conn.h | 3 ++- - test/lib/setup | 4 ++-- - vhost_user.c | 9 +++++++-- - 8 files changed, 74 insertions(+), 8 deletions(-) - -diff --git a/conf.c b/conf.c -index a6d7e22..1295d89 100644 ---- a/conf.c -+++ b/conf.c -@@ -864,6 +864,14 @@ static void usage(const char *name, FILE *f, int status) - FPRINTF(f, - " --repair-path PATH path for passt-repair(1)\n" - " default: append '.repair' to UNIX domain path\n"); -+ FPRINTF(f, -+ " --migrate-exit DEPRECATED:\n" -+ " source quits after migration\n" -+ " default: source keeps running after migration\n"); -+ FPRINTF(f, -+ " --migrate-no-linger DEPRECATED:\n" -+ " close sockets on migration\n" -+ " default: keep sockets open, ignore events\n"); - } - - FPRINTF(f, -@@ -1468,6 +1476,8 @@ void conf(struct ctx *c, int argc, char **argv) - {"socket-path", required_argument, NULL, 's' }, - {"fqdn", required_argument, NULL, 27 }, - {"repair-path", required_argument, NULL, 28 }, -+ {"migrate-exit", no_argument, NULL, 29 }, -+ {"migrate-no-linger", no_argument, NULL, 30 }, - { 0 }, - }; - const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:"; -@@ -1683,6 +1693,18 @@ void conf(struct ctx *c, int argc, char **argv) - optarg)) - die("Invalid passt-repair path: %s", optarg); - -+ break; -+ case 29: -+ if (c->mode != MODE_VU) -+ die("--migrate-exit is for vhost-user mode only"); -+ c->migrate_exit = true; -+ -+ break; -+ case 30: -+ if (c->mode != MODE_VU) -+ die("--migrate-no-linger is for vhost-user mode only"); -+ c->migrate_no_linger = true; -+ - break; - case 'd': - c->debug = 1; -diff --git a/flow.c b/flow.c -index 6a5c8aa..a4b65ea 100644 ---- a/flow.c -+++ b/flow.c -@@ -1089,7 +1089,7 @@ int flow_migrate_source(struct ctx *c, const 
struct migrate_stage *stage, - * as EIO). - */ - foreach_established_tcp_flow(flow) { -- rc = tcp_flow_migrate_source_ext(fd, &flow->tcp); -+ rc = tcp_flow_migrate_source_ext(c, fd, &flow->tcp); - if (rc) { - flow_err(flow, "Can't send extended data: %s", - strerror_(-rc)); -diff --git a/passt.1 b/passt.1 -index 60066c2..cef98b2 100644 ---- a/passt.1 -+++ b/passt.1 -@@ -439,6 +439,30 @@ Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path - chosen for the hypervisor UNIX domain socket. No socket is created if not in - \-\-vhost-user mode. - -+.TP -+.BR \-\-migrate-exit (DEPRECATED) -+Exit after a completed migration as source. By default, \fBpasst\fR keeps -+running and the migrated guest can continue using its connection, or a new guest -+can connect. -+ -+Note that this configuration option is \fBdeprecated\fR and will be removed in a -+future version. It is not expected to be of any use, and it simply reflects a -+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR -+below. -+ -+.TP -+.BR \-\-migrate-no-linger (DEPRECATED) -+Close TCP sockets on the source instance once migration completes. -+ -+By default, sockets are kept open, and events on data sockets are ignored, so -+that any further message reaching sockets after the source migrated is silently -+ignored, to avoid connection resets in case data is received after migration. -+ -+Note that this configuration option is \fBdeprecated\fR and will be removed in a -+future version. It is not expected to be of any use, and it simply reflects a -+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR -+below. -+ - .TP - .BR \-F ", " \-\-fd " " \fIFD - Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened -@@ -454,6 +478,11 @@ is closed. - Quit after handling a single client connection, that is, once the client closes - the socket, or once we get a socket error. 
- -+\fBNote\fR: this option has no effect after \fBpasst\fR completes a migration as -+source, because, in that case, exiting would close sockets for active -+connections, which would in turn cause connection resets if any further data is -+received. See also the description of \fI\-\-migrate-no-linger\fR. -+ - .TP - .BR \-t ", " \-\-tcp-ports " " \fIspec - Configure TCP port forwarding to guest. \fIspec\fR can be one of: -diff --git a/passt.h b/passt.h -index 8693794..4cfd6eb 100644 ---- a/passt.h -+++ b/passt.h -@@ -241,6 +241,8 @@ struct ip6_ctx { - * @device_state_fd: Device state migration channel - * @device_state_result: Device state migration result - * @migrate_target: Are we the target, on the next migration request? -+ * @migrate_no_linger: Close sockets as we migrate them -+ * @migrate_exit: Exit (on source) once migration is complete - */ - struct ctx { - enum passt_modes mode; -@@ -318,6 +320,8 @@ struct ctx { - int device_state_fd; - int device_state_result; - bool migrate_target; -+ bool migrate_no_linger; -+ bool migrate_exit; - }; - - void proto_update_l2_buf(const unsigned char *eth_d, -diff --git a/tcp.c b/tcp.c -index 0ac298a..1b22f70 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -3284,12 +3284,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn) - - /** - * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data -+ * @c: Execution context - * @fd: Descriptor for state migration - * @conn: Pointer to the TCP connection structure - * - * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure - */ --int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) -+int tcp_flow_migrate_source_ext(const struct ctx *c, -+ int fd, const struct tcp_tap_conn *conn) - { - uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; - struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)]; -@@ -3334,7 +3336,10 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) 
- if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv))) - goto fail; - -- close(s); -+ if (c->migrate_no_linger) -+ close(s); -+ else -+ epoll_del(c, s); - - /* Adjustments unrelated to FIN segments: sequence numbers we dumped are - * based on the end of the queues. -diff --git a/tcp_conn.h b/tcp_conn.h -index 35d813d..38b5c54 100644 ---- a/tcp_conn.h -+++ b/tcp_conn.h -@@ -236,7 +236,8 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn); - int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn); - - int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn); --int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn); -+int tcp_flow_migrate_source_ext(const struct ctx *c, int fd, -+ const struct tcp_tap_conn *conn); - - int tcp_flow_migrate_target(struct ctx *c, int fd); - int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd); -diff --git a/test/lib/setup b/test/lib/setup -index 575bc21..5994598 100755 ---- a/test/lib/setup -+++ b/test/lib/setup -@@ -350,7 +350,7 @@ setup_migrate() { - - sleep 1 - -- __opts="--vhost-user" -+ __opts="--vhost-user --migrate-exit --migrate-no-linger" - [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap" - [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" - [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" -@@ -360,7 +360,7 @@ setup_migrate() { - - context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair" - -- __opts="--vhost-user" -+ __opts="--vhost-user --migrate-exit --migrate-no-linger" - [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap" - [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" - [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" -diff --git a/vhost_user.c b/vhost_user.c -index 105f77a..c4d3a52 100644 ---- a/vhost_user.c -+++ b/vhost_user.c -@@ -1208,7 +1208,12 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) - if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE && - 
vdev->context->device_state_result == 0 && - !vdev->context->migrate_target) { -- info("Migration complete, exiting"); -- _exit(EXIT_SUCCESS); -+ if (vdev->context->migrate_exit) { -+ info("Migration complete, exiting"); -+ _exit(EXIT_SUCCESS); -+ } -+ -+ info("Migration complete"); -+ vdev->context->one_off = false; - } - } --- -2.47.1 - diff --git a/SOURCES/0003-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch b/SOURCES/0003-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch deleted file mode 100644 index ec6b394..0000000 --- a/SOURCES/0003-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch +++ /dev/null @@ -1,48 +0,0 @@ -From bd90a820852ff8966aeb83231c29e48849db3493 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Fri, 29 Aug 2025 22:11:31 +0200 -Subject: [PATCH 3/4] tcp: Cast operands of sequence comparison macros to - uint32_t before using them - -Otherwise, passing signed types causes automatic promotion of the -result of the subtractions as well, which is not what we want, as -these macros rely on unsigned 32-bit arithmetic. - -The next patch introduces a ssize_t operand for SEQ_LE, illustrating -the issue. 
- -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -Tested-by: Paul Holzinger -Reviewed-by: Jon Maloy -(cherry picked from commit 660cd6907e14a41ad9bc77d317140c70ab416fce) ---- - tcp_internal.h | 12 ++++++++---- - 1 file changed, 8 insertions(+), 4 deletions(-) - -diff --git a/tcp_internal.h b/tcp_internal.h -index 36c6533..c80ba40 100644 ---- a/tcp_internal.h -+++ b/tcp_internal.h -@@ -18,10 +18,14 @@ - sizeof(struct ipv6hdr), \ - sizeof(uint32_t)) - --#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) --#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) --#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) --#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) -+#define SEQ_LE(a, b) \ -+ ((uint32_t)(b) - (uint32_t)(a) < MAX_WINDOW) -+#define SEQ_LT(a, b) \ -+ ((uint32_t)(b) - (uint32_t)(a) - 1 < MAX_WINDOW) -+#define SEQ_GE(a, b) \ -+ ((uint32_t)(a) - (uint32_t)(b) < MAX_WINDOW) -+#define SEQ_GT(a, b) \ -+ ((uint32_t)(a) - (uint32_t)(b) - 1 < MAX_WINDOW) - - #define FIN (1 << 0) - #define SYN (1 << 1) --- -2.47.1 - diff --git a/SOURCES/0003-tcp-Use-less-than-MSS-window-on-no-queued-data-or-no.patch b/SOURCES/0003-tcp-Use-less-than-MSS-window-on-no-queued-data-or-no.patch new file mode 100644 index 0000000..50f80a3 --- /dev/null +++ b/SOURCES/0003-tcp-Use-less-than-MSS-window-on-no-queued-data-or-no.patch @@ -0,0 +1,110 @@ +From b40f5cd8c8e16c6eceb1f26eb895527fda84068b Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Sat, 13 Dec 2025 14:19:13 +0100 +Subject: [PATCH] tcp: Use less-than-MSS window on no queued data, or no data + sent recently + +We limit the advertised window to guests and containers to the +available length of the sending buffer, and if it's less than the MSS, +since commit cf1925fb7b77 ("tcp: Don't limit window to less-than-MSS +values, use zero instead"), we approximate that limit to zero. 
+ +This way, we'll trigger a window update as soon as we realise that we +can advertise a larger value, just like we do in all other cases where +we advertise a zero-sized window. + +By doing that, we don't wait for the peer to send us data before we +update the window. This matters because the guest or container might +be trying to aggregate more data and won't send us anything at all if +the advertised window is too small. + +However, this might be problematic in two situations: + +1. one, reported by Tyler, where the remote (receiving) peer + advertises a window that's smaller than what we usually get and + very close to the MSS, causing the kernel to give us a starting + size of the buffer that's less than the MSS we advertise to the + guest or container. + + If this happens, we'll never advertise a non-zero window after + the handshake, and the container or guest will never send us any + data at all. + + With a simple 'curl https://cloudflare.com/', we get, with default + TCP memory parameters, a 65535-byte window from the peer, and 46080 + bytes of initial sending buffer from the kernel. But we advertised + a 65480-byte MSS, and we'll never actually receive the client + request. + + This seems to be specific to Cloudflare for some reason, probably + deriving from a particular tuning of TCP parameters on their + servers. + +2. another one, hypothesised by David, where the peer might only be + willing to process (and acknowledge) data in batches. + + We might have queued outbound data which is, at the same time, not + enough to fill one of these batches and be acknowledged and removed + from the sending queue, but enough to make our available buffer + smaller than the MSS, and the connection will hang. + +Take care of both cases by: + +a. 
not approximating the sending buffer to zero if we have no outbound + queued data at all, because in that case we don't expect the + available buffer to increase if we don't send any data, so there's + no point in waiting for it to grow larger than the MSS. + + This fixes problem 1. above. + +b. also using the full sending buffer size if we haven't sent data to + the socket for a while (reported by tcpi_last_data_sent). This part + was already suggested by David in: + + https://archives.passt.top/passt-dev/aTZzgtcKWLb28zrf@zatzit/ + + and I'm now picking ten times the RTT as a somewhat arbitrary + threshold. + + This is meant to take care of potential problem 2. above, but it + also happens to fix 1. + +Reported-by: Tyler Cloud +Link: https://bugs.passt.top/show_bug.cgi?id=183 +Suggested-by: David Gibson +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +--- + tcp.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +diff --git a/tcp.c b/tcp.c +index 81bc114..b179e39 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -1211,8 +1211,21 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + * the MSS to zero, as we already have mechanisms in place to + * force updates after the window becomes zero. This matches the + * suggestion from RFC 813, Section 4. 
++ * ++ * But don't do this if, either: ++ * ++ * - there's nothing in the outbound queue: the size of the ++ * sending buffer is limiting us, and it won't increase if we ++ * don't send data, so there's no point in waiting, or ++ * ++ * - we haven't sent data in a while (somewhat arbitrarily, ten ++ * times the RTT), as that might indicate that the receiver ++ * will only process data in batches that are large enough, ++ * but we won't send enough to fill one because we're stuck ++ * with pending data in the outbound queue + */ +- if (limit < MSS_GET(conn)) ++ if (limit < MSS_GET(conn) && sendq && ++ tinfo->tcpi_last_data_sent < tinfo->tcpi_rtt / 1000 * 10) + limit = 0; + + new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit); +-- +2.47.1 + diff --git a/SOURCES/0004-pasta-Warn-disable-matching-IP-version-if-not-suppor.patch b/SOURCES/0004-pasta-Warn-disable-matching-IP-version-if-not-suppor.patch new file mode 100644 index 0000000..d85c03d --- /dev/null +++ b/SOURCES/0004-pasta-Warn-disable-matching-IP-version-if-not-suppor.patch @@ -0,0 +1,90 @@ +From 75dcbc300bf09c3649823b12d30c4f24de7271d4 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Tue, 23 Dec 2025 13:39:17 +0100 +Subject: [PATCH] pasta: Warn, disable matching IP version if not supported, in + local mode + +...instead of exiting, but only if local mode is enabled, that is, if +we couldn't find a template interface or if the user didn't specify +one. + +With IPv4, we always try to set or copy an address, so check if that +fails. + +With IPv6, in local mode, we rely on the link-local address that's +automatically generated inside the target namespace, and only fail +later, as we try to set up routes. Check if that fails, instead. + +Otherwise, we'll fail to start if IPv6 support is not built in or +disabled by the kernel ("ipv6.disable=1" on the command line), +because, in that case, we'll try to enable local mode by default, and +then fail to set any address or route. 
+ +It would probably be more elegant to check for IP version support in +conf_ip4_local() and conf_ip6_local(), and not even try to enable +connectivity for unsupported versions, but it looks less robust than +trying and failing, as there might be other ways to disable a given +IP version. + +Note that there's currently no way to disable IPv4 support on the +kernel command line, that is, there's no such thing as an +ipv4.disable boot parameter. But I guess that's due to be eventually +implemented, one day, so let's cover that case as well, also for +consistency. + +Reported-by: Iyan +Link: https://bugzilla.redhat.com/show_bug.cgi?id=2424192 +Fixes: 4ddd59bc6085 ("conf: Separate local mode for each IP version, don't enable disabled IP version") +Signed-off-by: Stefano Brivio +--- + pasta.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/pasta.c b/pasta.c +index c307b8a..0ddd6b0 100644 +--- a/pasta.c ++++ b/pasta.c +@@ -348,6 +348,12 @@ void pasta_ns_conf(struct ctx *c) + AF_INET); + } + ++ if (c->ifi4 == -1 && rc == -ENOTSUP) { ++ warn("IPv4 not supported, disabling"); ++ c->ifi4 = 0; ++ goto ipv4_done; ++ } ++ + if (rc < 0) { + die("Couldn't set IPv4 address(es) in namespace: %s", + strerror_(-rc)); +@@ -367,6 +373,7 @@ void pasta_ns_conf(struct ctx *c) + strerror_(-rc)); + } + } ++ipv4_done: + + if (c->ifi6) { + rc = nl_addr_get_ll(nl_sock_ns, c->pasta_ifi, +@@ -413,12 +420,19 @@ void pasta_ns_conf(struct ctx *c) + AF_INET6); + } + ++ if (c->ifi6 == -1 && rc == -ENOTSUP) { ++ warn("IPv6 not supported, disabling"); ++ c->ifi6 = 0; ++ goto ipv6_done; ++ } ++ + if (rc < 0) { + die("Couldn't set IPv6 route(s) in guest: %s", + strerror_(-rc)); + } + } + } ++ipv6_done: + + proto_update_l2_buf(c->guest_mac); + } +-- +2.47.1 + diff --git a/SOURCES/0004-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch b/SOURCES/0004-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch deleted file mode 100644 index 77a5cc4..0000000 --- 
a/SOURCES/0004-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch +++ /dev/null @@ -1,76 +0,0 @@ -From f9278aab878ef58cf8502ea8f904dbb40fbbb16a Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Thu, 2 Oct 2025 00:41:54 +0200 -Subject: [PATCH 4/4] tcp: Don't consider FIN flags with mismatching sequence -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -If a guest or container sends us a FIN segment but its sequence number -doesn't match the highest sequence of data we *accepted* (not -necessarily the highest sequence we received), that is, -conn->seq_from_tap, plus any data we're accepting in the current -batch, we should discard the flag (not necessarily the segment), -because there's still data we need to receive (again) before the end -of the stream. - -If we consider those FIN flags as such, we'll end up in the -situation described below. - -Here, 192.168.10.102 is a HTTP server in a Podman container, and -192.168.10.44 is a client fetching approximately 121 KB of data from -it: - - 82 2.026811 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [FIN, ACK] Seq=121441 Ack=143 Win=65536 Len=0 - -the server is done sending - - 83 2.026898 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [ACK] Seq=143 Ack=114394 Win=216192 Len=0 - -pasta (client) acknowledges a previous sequence, because of -a short sendmsg() - - 84 2.027324 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [FIN, ACK] Seq=143 Ack=114394 Win=216192 Len=0 - -pasta (client) sends FIN, ACK as the client has no more data to -send (a single GET request), while still acknowledging a previous -sequence, because the retransmission didn't happen yet - - 85 2.027349 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [ACK] Seq=121442 Ack=144 Win=65536 Len=0 - -the server acknowledges the FIN, ACK - - 86 2.224125 192.168.10.102 → 192.168.10.44 4150 TCP [TCP Retransmission] 55414 → 44992 [ACK] Seq=114394 Ack=144 Win=65536 Len=4096 [TCP segment of a 
reassembled PDU] - -and finally a retransmission comes, but as we wrongly switched to -the CLOSE-WAIT state, - - 87 2.224202 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [RST] Seq=144 Win=0 Len=0 - -we consider frame #86 as an acknowledgement for the FIN segment we -sent, and close the connection, while we still had to re-receive -(and finally send) the missing data segment, instead. - -Link: https://github.com/containers/podman/issues/27179 -Signed-off-by: Stefano Brivio -(cherry picked from commit b145441913eef6f8885b6b84531e944ff593790c) ---- - tcp.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tcp.c b/tcp.c -index 0ac298a..4428305 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -1696,7 +1696,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, - } - } - -- if (th->fin) -+ if (th->fin && seq == seq_from_tap) - fin = 1; - - if (!len) --- -2.47.1 - diff --git a/SOURCES/0005-selinux-Enable-read-and-watch-permissions-on-netns-d.patch b/SOURCES/0005-selinux-Enable-read-and-watch-permissions-on-netns-d.patch new file mode 100644 index 0000000..f3d94ad --- /dev/null +++ b/SOURCES/0005-selinux-Enable-read-and-watch-permissions-on-netns-d.patch @@ -0,0 +1,58 @@ +From d2c5133990a7758bfa567fc73216393498949e9b Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Tue, 23 Dec 2025 01:59:34 +0100 +Subject: [PATCH] selinux: Enable read and watch permissions on netns directory + as well + +With commit 7aeda16a7818 ("selinux: Transition to pasta_t in +containers"), we need to make sure that pasta can access the target +namespace directory passed by Podman, and, in a general case, we have +all the permissions we need. 
+ +But if we now start a container without the Podman changes referenced +by commit fd1bcc30af07 ("selinux: add container_var_run_t type +transition"), or with them, but with the container being created +before those and without a reboot in between, we'll additionally need +'read' and 'watch' permissions on user_tmp_t directory as well, as +user_tmp_t is still the (inconsistent) context of the namespace entry. + +Otherwise, on a container start/restart, we'll get SELinux denials: + + type=AVC msg=audit(1766451401.296:184): avc: denied { read } for pid=2159 comm="pasta.avx2" name="netns" dev="tmpfs" ino=60 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:obje +ct_r:user_tmp_t:s0 tclass=dir permissive=1 + type=AVC msg=audit(1766451401.298:185): avc: denied { watch } for pid=2159 comm="pasta.avx2" path="/run/user/1001/netns" dev="tmpfs" ino=60 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:object_r:user_tmp_t:s0 tclass=dir permissive=1 + +This can be reproduced quite simply: + + $ podman create -q --name hello hello + 6c4eaf15a03edf799673a97d84d0331f3a3f34a11015b58c69318101a3232770 + + [upgrade passt's SELinux policy to a version including 7aeda16a7818] + + $ podman start hello + Error: unable to start container "6c4eaf15a03edf799673a97d84d0331f3a3f34a11015b58c69318101a3232770": pasta failed with exit code 1: + netns dir open: Permission denied, exiting + +Reported-by: Tuomo Soini +Fixes: 7aeda16a7818 ("selinux: Transition to pasta_t in containers") +Signed-off-by: Stefano Brivio +--- + contrib/selinux/pasta.te | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te +index 95fe42a..3eb58f6 100644 +--- a/contrib/selinux/pasta.te ++++ b/contrib/selinux/pasta.te +@@ -149,7 +149,7 @@ allow pasta_t root_t:dir mounton; + manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t) + files_pid_filetrans(pasta_t, pasta_pid_t, file) + +-allow pasta_t 
user_tmp_t:dir { add_name remove_name search write }; ++allow pasta_t user_tmp_t:dir { add_name read remove_name search watch write }; + allow pasta_t user_tmp_t:fifo_file append; + allow pasta_t user_tmp_t:file { create open write }; + allow pasta_t user_tmp_t:sock_file { create unlink }; +-- +2.47.1 + diff --git a/SOURCES/0006-selinux-Enable-open-permissions-on-netns-directory-o.patch b/SOURCES/0006-selinux-Enable-open-permissions-on-netns-directory-o.patch new file mode 100644 index 0000000..3072efd --- /dev/null +++ b/SOURCES/0006-selinux-Enable-open-permissions-on-netns-directory-o.patch @@ -0,0 +1,68 @@ +From 6babaa8a88eb337e4b81aeff673fcebb28015f36 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Fri, 16 Jan 2026 16:48:46 +0100 +Subject: [PATCH 6/7] selinux: Enable open permissions on netns directory, + operations on container_var_run_t + +Tuomo reports two further SELinux denials after upgrading to a +passt-selinux version that includes the transition to pasta_t for +containers, one I could reproduce: + + denied { open } for pid=3343050 comm="pasta.avx2" path="/run/user/1000/netns" dev="tmpfs" ino=51 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:object_r:user_tmp_t:s0 tclass=dir permissive=1 + +which I didn't take care of in the previous commit, d2c5133990a7 +("selinux: Enable read and watch permissions on netns directory as +well"), as it didn't appear in my quick test. But I can make pasta use +"open" on the network namespace entry by simply using it to make +connections. + +So, for that, add "open" to the existing rule for user_tmp_t:dir. 
+ +Then, another one I couldn't reproduce instead: + + denied { write } for pid=3589324 comm="pasta.avx2" name="rootless-netns" dev="tmpfs" ino=36 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:object_r:container_var_run_t:s0 tclass=dir permissive=0 + +which, I think, comes from a specific combination of versions of +container-selinux, Podman, and passt-selinux packages, which +prevents the expected type transition on container_var_run_t unless +restorecon is invoked manually, or until a reboot. + +Allowing the same permissions on container_var_run_t as we do on +ifconfig_var_run_t is harmless, so do that to prevent this further +denial. + +Reported-by: Tuomo Soini +Fixes: d2c5133990a7 ("selinux: Enable read and watch permissions on netns directory as well") +Fixes: 7aeda16a7818 ("selinux: Transition to pasta_t in containers") +Signed-off-by: Stefano Brivio +(cherry picked from commit a6d92ca82c9ea0b395aa56c568ee6b6e6d4ac81e) +--- + contrib/selinux/pasta.te | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te +index 22daa77..abeafa4 100644 +--- a/contrib/selinux/pasta.te ++++ b/contrib/selinux/pasta.te +@@ -148,7 +148,7 @@ allow pasta_t root_t:dir mounton; + manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t) + files_pid_filetrans(pasta_t, pasta_pid_t, file) + +-allow pasta_t user_tmp_t:dir { add_name read remove_name search watch write }; ++allow pasta_t user_tmp_t:dir { add_name open read remove_name search watch write }; + allow pasta_t user_tmp_t:fifo_file append; + allow pasta_t user_tmp_t:file { create open write }; + allow pasta_t user_tmp_t:sock_file { create unlink }; +@@ -248,7 +248,9 @@ type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "netns"; + type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "netns"; + type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootless-netns"; 
+ type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "rootless-netns"; ++allow pasta_t container_var_run_t:dir { add_name open rmdir write }; + allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write }; ++allow pasta_t container_var_run_t:file { create open write }; + allow pasta_t ifconfig_var_run_t:file { create open write }; + allow systemd_logind_exec_t ifconfig_var_run_t:dir rmdir; + +-- +2.47.1 + diff --git a/SOURCES/0007-tcp-Fix-rounding-issue-in-check-for-approximating-wi.patch b/SOURCES/0007-tcp-Fix-rounding-issue-in-check-for-approximating-wi.patch new file mode 100644 index 0000000..b7cb942 --- /dev/null +++ b/SOURCES/0007-tcp-Fix-rounding-issue-in-check-for-approximating-wi.patch @@ -0,0 +1,74 @@ +From dbfbc33776290260b87bb29bb5572750f9709b35 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Fri, 9 Jan 2026 13:52:00 +0100 +Subject: [PATCH 7/7] tcp: Fix rounding issue in check for approximating window + to zero + +In general, we approximate the advertised window to zero if we would +otherwise advertise less than a MSS worth, and the reasoning behind +that is explained in cf1925fb7b77 ("tcp: Don't limit window to +less-than-MSS values, use zero instead"). + +Then, in commit b40f5cd8c8e1 ("tcp: Use less-than-MSS window on no +queued data, or no data sent recently"), I introduced some conditions +under which we won't do that, including a check on whether any data +was sent recently. + +As an arbitrary but probably reasonable threshold, we consider data to +have recently been sent if that occurred less than ten times the +round-trip time (RTT) ago. + +The time elapsed since the last data transmission is reported by the +kernel in milliseconds, in the tcpi_last_data_sent field of struct +tcp_info, and the RTT is reported in microseconds instead, in +tcpi_rtt. 
+ +To avoid the risk of overflow in a simple way, for the purpose of this +comparison, I converted tcpi_rtt to milliseconds first, but this means +that the check will always be false (and we'll never approximate the +window to zero) if the RTT is below one millisecond. + +This, in turn, reintroduces nasty delay issues in transfers in +non-local connections which have however almost-local (low) latency. + +Given that we want to use ten times the RTT as an arbitrary "long +enough" upper bound, round the RTT up while converting it to +milliseconds. + +As an alternative, we could perform the comparison in microseconds, +but we would need a slightly more complicated implementation to +exclude overflows, and it's definitely not worth it given the nature +of this threshold. + +Fixes: b40f5cd8c8e1 ("tcp: Use less-than-MSS window on no queued data, or no data sent recently") +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +(cherry picked from commit 2be0e790804f99580b1c8a1781c49913440607f2) +--- + tcp.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/tcp.c b/tcp.c +index 23fcbc3..8f4f087 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -1180,6 +1180,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { + new_wnd_to_tap = tinfo->tcpi_snd_wnd; + } else { ++ unsigned rtt_ms_ceiling = DIV_ROUND_UP(tinfo->tcpi_rtt, 1000); + uint32_t sendq; + int limit; + +@@ -1223,7 +1224,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + * with pending data in the outbound queue + */ + if (limit < MSS_GET(conn) && sendq && +- tinfo->tcpi_last_data_sent < tinfo->tcpi_rtt / 1000 * 10) ++ tinfo->tcpi_last_data_sent < rtt_ms_ceiling * 10) + limit = 0; + + new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit); +-- +2.47.1 + diff --git a/SOURCES/0008-udp_flow-remove-unneeded-epoll_ref-indirection.patch b/SOURCES/0008-udp_flow-remove-unneeded-epoll_ref-indirection.patch 
new file mode 100644 index 0000000..c3c73e0 --- /dev/null +++ b/SOURCES/0008-udp_flow-remove-unneeded-epoll_ref-indirection.patch @@ -0,0 +1,48 @@ +From 768e38c4ab9f7bb328897577368084faf9ee41df Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Fri, 9 Jan 2026 17:54:35 +0100 +Subject: [PATCH 08/18] udp_flow: remove unneeded epoll_ref indirection + +The fref union was used to convert flow_sidx_t to uint32_t for +assignment to ref.data. This is unnecessary since epoll_ref already +contains a flowside member of type flow_sidx_t, so we can assign +directly. + +This aligns with how icmp.c and other callers assign flow_sidx_t to +epoll_ref. + +Signed-off-by: Laurent Vivier +Reviewed-by: David Gibson +Signed-off-by: Stefano Brivio +(cherry picked from commit ab27852d0eebcd96d33c3699b44596a827b83bc6) +--- + udp_flow.c | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +diff --git a/udp_flow.c b/udp_flow.c +index 8907f2f..0ba7880 100644 +--- a/udp_flow.c ++++ b/udp_flow.c +@@ -74,10 +74,6 @@ static int udp_flow_sock(const struct ctx *c, + { + const struct flowside *side = &uflow->f.side[sidei]; + uint8_t pif = uflow->f.pif[sidei]; +- union { +- flow_sidx_t sidx; +- uint32_t data; +- } fref = { .sidx = FLOW_SIDX(uflow, sidei) }; + union epoll_ref ref; + int rc; + int s; +@@ -89,7 +85,7 @@ static int udp_flow_sock(const struct ctx *c, + } + + ref.type = EPOLL_TYPE_UDP; +- ref.data = fref.data; ++ ref.flowside = FLOW_SIDX(uflow, sidei); + ref.fd = s; + + flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT); +-- +2.47.1 + diff --git a/SOURCES/0009-udp_flow-Assign-socket-to-flow-inside-udp_flow_sock.patch b/SOURCES/0009-udp_flow-Assign-socket-to-flow-inside-udp_flow_sock.patch new file mode 100644 index 0000000..02f81e6 --- /dev/null +++ b/SOURCES/0009-udp_flow-Assign-socket-to-flow-inside-udp_flow_sock.patch @@ -0,0 +1,47 @@ +From 059a31c28aa6e5053846ee931b97eb1344a9ce17 Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Fri, 9 Jan 2026 17:54:36 +0100 +Subject: 
[PATCH 09/18] udp_flow: Assign socket to flow inside udp_flow_sock() + +Move the assignment of uflow->s[sidei] from the caller (udp_flow_new()) +into udp_flow_sock() itself, placing it after the successful connect(). + +This is a pure refactoring with no functional change. The socket fd is +now assigned within udp_flow_sock() where the socket is created, rather +than requiring the caller to capture the return value. On error paths, +uflow->s[sidei] remains at its initialized value of -1 rather than being +set to the negative error code, which is semantically cleaner (though +functionally equivalent given the >= 0 check in udp_flow_close()). + +Signed-off-by: Laurent Vivier +Reviewed-by: David Gibson +Signed-off-by: Stefano Brivio +(cherry picked from commit e0fdfccc1c1a56c58a96d7fd6cc5d532cd780b6f) +--- + udp_flow.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/udp_flow.c b/udp_flow.c +index 0ba7880..c4cf35c 100644 +--- a/udp_flow.c ++++ b/udp_flow.c +@@ -105,6 +105,7 @@ static int udp_flow_sock(const struct ctx *c, + flow_dbg_perror(uflow, "Couldn't connect flow socket"); + return rc; + } ++ uflow->s[sidei] = s; + + /* It's possible, if unlikely, that we could receive some packets in + * between the bind() and connect() which may or may not be for this +@@ -159,7 +160,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, + + flow_foreach_sidei(sidei) { + if (pif_is_socket(uflow->f.pif[sidei])) +- if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0) ++ if (udp_flow_sock(c, uflow, sidei) < 0) + goto cancel; + } + +-- +2.47.1 + diff --git a/SOURCES/0010-tcp_splice-Refactor-tcp_splice_conn_epoll_events-to-.patch b/SOURCES/0010-tcp_splice-Refactor-tcp_splice_conn_epoll_events-to-.patch new file mode 100644 index 0000000..0a6a8f9 --- /dev/null +++ b/SOURCES/0010-tcp_splice-Refactor-tcp_splice_conn_epoll_events-to-.patch @@ -0,0 +1,94 @@ +From 766e42ea2c6f57547cfee4289ca27168149bb174 Mon Sep 17 00:00:00 2001 +From: 
Laurent Vivier +Date: Fri, 9 Jan 2026 17:54:37 +0100 +Subject: [PATCH 10/18] tcp_splice: Refactor tcp_splice_conn_epoll_events() to + per-side computation + +The function tcp_splice_conn_epoll_events() currently takes an array of +struct epoll_event and fills in the .events field for both sides using +flow_foreach_sidei() loops. + +This works, but the function is doing two conceptually separate things +at once: computing events for side 0 and computing events for side 1. +The OUT_WAIT handling is particularly subtle, as it has cross-side +effects: when OUT_WAIT(sidei) is set, we add EPOLLOUT to ev[sidei] but +also remove EPOLLIN from ev[!sidei]. + +Refactor to make the function compute events for a single side at a +time, taking sidei as a parameter and returning uint32_t. This makes +the logic more focused and easier to follow. The cross-side effects of +OUT_WAIT are preserved by checking both OUT_WAIT(sidei) and +OUT_WAIT(!sidei) within each call. + +The caller tcp_splice_epoll_ctl() now invokes the function twice, once +for each side, making the two-sided nature of the operation explicit. + +No functional change. 
+ +Signed-off-by: Laurent Vivier +Reviewed-by: David Gibson +Signed-off-by: Stefano Brivio +(cherry picked from commit 23da651ab08e564b84c532f6f93b0817d2ae850f) +--- + tcp_splice.c | 33 ++++++++++++++------------------- + 1 file changed, 14 insertions(+), 19 deletions(-) + +diff --git a/tcp_splice.c b/tcp_splice.c +index 4405224..bf4ff46 100644 +--- a/tcp_splice.c ++++ b/tcp_splice.c +@@ -114,29 +114,23 @@ static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx) + * @events: Connection event flags + * @ev: Events to fill in, 0 is accepted socket, 1 is connecting socket + */ +-static void tcp_splice_conn_epoll_events(uint16_t events, +- struct epoll_event ev[]) ++static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei) + { +- unsigned sidei; +- +- flow_foreach_sidei(sidei) +- ev[sidei].events = 0; ++ uint32_t e = 0; + + if (events & SPLICE_ESTABLISHED) { +- flow_foreach_sidei(sidei) { +- if (!(events & FIN_SENT(!sidei))) +- ev[sidei].events = EPOLLIN | EPOLLRDHUP; +- } +- } else if (events & SPLICE_CONNECT) { +- ev[1].events = EPOLLOUT; ++ if (!(events & FIN_SENT(!sidei))) ++ e = EPOLLIN | EPOLLRDHUP; ++ } else if (sidei == 1 && events & SPLICE_CONNECT) { ++ e = EPOLLOUT; + } + +- flow_foreach_sidei(sidei) { +- if (events & OUT_WAIT(sidei)) { +- ev[sidei].events |= EPOLLOUT; +- ev[!sidei].events &= ~EPOLLIN; +- } +- } ++ if (events & OUT_WAIT(sidei)) ++ e |= EPOLLOUT; ++ if (events & OUT_WAIT(!sidei)) ++ e &= ~EPOLLIN; ++ ++ return e; + } + + /** +@@ -161,7 +155,8 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, + struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, + { .data.u64 = ref[1].u64 } }; + +- tcp_splice_conn_epoll_events(conn->events, ev); ++ ev[0].events = tcp_splice_conn_epoll_events(conn->events, 0); ++ ev[1].events = tcp_splice_conn_epoll_events(conn->events, 1); + + + if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) || +-- +2.47.1 + diff --git 
a/SOURCES/0011-flow-Introduce-flow_epoll_set-to-centralize-epoll-op.patch b/SOURCES/0011-flow-Introduce-flow_epoll_set-to-centralize-epoll-op.patch new file mode 100644 index 0000000..82bf451 --- /dev/null +++ b/SOURCES/0011-flow-Introduce-flow_epoll_set-to-centralize-epoll-op.patch @@ -0,0 +1,489 @@ +From 79dab11a029025e485faf4a3f5ea1ed4538fb64b Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Fri, 9 Jan 2026 17:54:38 +0100 +Subject: [PATCH 11/18] flow: Introduce flow_epoll_set() to centralize epoll + operations + +Currently, each flow type (TCP, TCP_SPLICE, PING, UDP) has its own +code to add or modify file descriptors in epoll. This leads to +duplicated boilerplate code across icmp.c, tcp.c, tcp_splice.c, and +udp_flow.c, each setting up epoll_ref unions and calling epoll_ctl() +with flow-type-specific details. + +Introduce flow_epoll_set() in flow.c to handle epoll operations for +all flow types in a unified way. + +This will be needed to migrate queue pair from an epollfd to another. 
+ +Signed-off-by: Laurent Vivier +Reviewed-by: David Gibson +Signed-off-by: Stefano Brivio +(cherry picked from commit c0be730f2aa2243a132b3ee40c2bf05ebc84fedf) +--- + flow.c | 37 ++++++++++++++++++++++++ + flow.h | 2 ++ + icmp.c | 10 ++----- + tcp.c | 48 ++++++++++++++++++------------ + tcp_splice.c | 82 ++++++++++++++++++++++++---------------------------- + udp_flow.c | 11 ++----- + 6 files changed, 111 insertions(+), 79 deletions(-) + +diff --git a/flow.c b/flow.c +index 4f53486..cefe6c8 100644 +--- a/flow.c ++++ b/flow.c +@@ -20,6 +20,7 @@ + #include "flow.h" + #include "flow_table.h" + #include "repair.h" ++#include "epoll_ctl.h" + + const char *flow_state_str[] = { + [FLOW_STATE_FREE] = "FREE", +@@ -53,6 +54,16 @@ const uint8_t flow_proto[] = { + static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, + "flow_proto[] doesn't match enum flow_type"); + ++static const enum epoll_type flow_epoll[] = { ++ [FLOW_TCP] = EPOLL_TYPE_TCP, ++ [FLOW_TCP_SPLICE] = EPOLL_TYPE_TCP_SPLICE, ++ [FLOW_PING4] = EPOLL_TYPE_PING, ++ [FLOW_PING6] = EPOLL_TYPE_PING, ++ [FLOW_UDP] = EPOLL_TYPE_UDP, ++}; ++static_assert(ARRAY_SIZE(flow_epoll) == FLOW_NUM_TYPES, ++ "flow_epoll[] doesn't match enum flow_type"); ++ + #define foreach_established_tcp_flow(flow) \ + flow_foreach_of_type((flow), FLOW_TCP) \ + if (!tcp_flow_is_established(&(flow)->tcp)) \ +@@ -390,6 +401,32 @@ void flow_epollid_clear(struct flow_common *f) + f->epollid = EPOLLFD_ID_INVALID; + } + ++/** ++ * flow_epoll_set() - Add or modify epoll registration for a flow socket ++ * @f: Flow to register socket for ++ * @command: epoll_ctl() command: EPOLL_CTL_ADD or EPOLL_CTL_MOD ++ * @events: epoll events to watch for ++ * @fd: File descriptor to register ++ * @sidei: Side index of the flow ++ * ++ * Return: 0 on success, -1 on error (from epoll_ctl()) ++ */ ++int flow_epoll_set(const struct flow_common *f, int command, uint32_t events, ++ int fd, unsigned int sidei) ++{ ++ struct epoll_event ev; ++ union epoll_ref ref; ++ ++ 
ref.fd = fd; ++ ref.type = flow_epoll[f->type]; ++ ref.flowside = flow_sidx(f, sidei); ++ ++ ev.events = events; ++ ev.data.u64 = ref.u64; ++ ++ return epoll_ctl(flow_epollfd(f), command, fd, &ev); ++} ++ + /** + * flow_epollid_register() - Initialize the epoll id -> fd mapping + * @epollid: epoll id to associate to +diff --git a/flow.h b/flow.h +index b43b0b1..1b78d59 100644 +--- a/flow.h ++++ b/flow.h +@@ -265,6 +265,8 @@ bool flow_in_epoll(const struct flow_common *f); + int flow_epollfd(const struct flow_common *f); + void flow_epollid_set(struct flow_common *f, int epollid); + void flow_epollid_clear(struct flow_common *f); ++int flow_epoll_set(const struct flow_common *f, int command, uint32_t events, ++ int fd, unsigned int sidei); + void flow_epollid_register(int epollid, int epollfd); + void flow_defer_handler(const struct ctx *c, const struct timespec *now); + int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage, +diff --git a/icmp.c b/icmp.c +index 9564c49..eb7f11b 100644 +--- a/icmp.c ++++ b/icmp.c +@@ -177,7 +177,6 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, + union flow *flow = flow_alloc(); + struct icmp_ping_flow *pingf; + const struct flowside *tgt; +- union epoll_ref ref; + + if (!flow) + return NULL; +@@ -211,13 +210,10 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, + goto cancel; + + flow_epollid_set(&pingf->f, EPOLLFD_ID_DEFAULT); +- +- ref.type = EPOLL_TYPE_PING; +- ref.flowside = FLOW_SIDX(flow, TGTSIDE); +- ref.fd = pingf->sock; +- +- if (epoll_add(flow_epollfd(&pingf->f), EPOLLIN, ref) < 0) { ++ if (flow_epoll_set(&pingf->f, EPOLL_CTL_ADD, EPOLLIN, pingf->sock, ++ TGTSIDE) < 0) { + close(pingf->sock); ++ flow_epollid_clear(&pingf->f); + goto cancel; + } + +diff --git a/tcp.c b/tcp.c +index 8f4f087..146d460 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -523,34 +523,44 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) + + /** + * tcp_epoll_ctl() - 
Add/modify/delete epoll state from connection events +- * @c: Execution context + * @conn: Connection pointer + * + * Return: 0 on success, negative error code on failure (not on deletion) + */ +-static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) ++static int tcp_epoll_ctl(struct tcp_tap_conn *conn) + { +- int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; +- union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, +- .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; +- struct epoll_event ev = { .data.u64 = ref.u64 }; +- int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) +- : c->epollfd; ++ uint32_t events; ++ int m; + + if (conn->events == CLOSED) { +- if (flow_in_epoll(&conn->f)) ++ if (flow_in_epoll(&conn->f)) { ++ int epollfd = flow_epollfd(&conn->f); ++ + epoll_del(epollfd, conn->sock); +- if (conn->timer != -1) +- epoll_del(epollfd, conn->timer); ++ if (conn->timer != -1) ++ epoll_del(epollfd, conn->timer); ++ } ++ + return 0; + } + +- ev.events = tcp_conn_epoll_events(conn->events, conn->flags); ++ events = tcp_conn_epoll_events(conn->events, conn->flags); + +- if (epoll_ctl(epollfd, m, conn->sock, &ev)) +- return -errno; ++ if (flow_in_epoll(&conn->f)) { ++ m = EPOLL_CTL_MOD; ++ } else { ++ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); ++ m = EPOLL_CTL_ADD; ++ } + +- flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); ++ if (flow_epoll_set(&conn->f, m, events, conn->sock, ++ !TAPSIDE(conn)) < 0) { ++ int ret = -errno; ++ ++ if (m == EPOLL_CTL_ADD) ++ flow_epollid_clear(&conn->f); ++ return ret; ++ } + + if (conn->timer != -1) { + union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, +@@ -681,7 +691,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, + } + + if (flag == STALLED || flag == ~STALLED) +- tcp_epoll_ctl(c, conn); ++ tcp_epoll_ctl(conn); + + if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE || + (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) || +@@ -738,7 
+748,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, + } else { + if (event == CLOSED) + flow_hash_remove(c, TAP_SIDX(conn)); +- tcp_epoll_ctl(c, conn); ++ tcp_epoll_ctl(conn); + } + + if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) +@@ -1753,7 +1763,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, + conn_event(c, conn, TAP_SYN_ACK_SENT); + } + +- tcp_epoll_ctl(c, conn); ++ tcp_epoll_ctl(conn); + + if (c->mode == MODE_VU) { /* To rebind to same oport after migration */ + socklen_t sl = sizeof(sa); +@@ -4021,7 +4031,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd + tcp_send_flag(c, conn, ACK); + tcp_data_from_sock(c, conn); + +- if ((rc = tcp_epoll_ctl(c, conn))) { ++ if ((rc = tcp_epoll_ctl(conn))) { + flow_dbg(conn, + "Failed to subscribe to epoll for migrated socket: %s", + strerror_(-rc)); +diff --git a/tcp_splice.c b/tcp_splice.c +index bf4ff46..a7c04ca 100644 +--- a/tcp_splice.c ++++ b/tcp_splice.c +@@ -135,37 +135,31 @@ static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei) + + /** + * tcp_splice_epoll_ctl() - Add/modify/delete epoll state from connection events +- * @c: Execution context + * @conn: Connection pointer + * + * Return: 0 on success, negative error code on failure (not on deletion) + */ +-static int tcp_splice_epoll_ctl(const struct ctx *c, +- struct tcp_splice_conn *conn) ++static int tcp_splice_epoll_ctl(struct tcp_splice_conn *conn) + { +- int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) +- : c->epollfd; +- int m = flow_in_epoll(&conn->f) ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD; +- const union epoll_ref ref[SIDES] = { +- { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0], +- .flowside = FLOW_SIDX(conn, 0) }, +- { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[1], +- .flowside = FLOW_SIDX(conn, 1) } +- }; +- struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, +- { .data.u64 = ref[1].u64 } }; +- +- ev[0].events = tcp_splice_conn_epoll_events(conn->events, 0); +- ev[1].events = tcp_splice_conn_epoll_events(conn->events, 1); +- +- +- if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) || +- epoll_ctl(epollfd, m, conn->s[1], &ev[1])) { ++ uint32_t events[2]; ++ int m; ++ ++ if (flow_in_epoll(&conn->f)) { ++ m = EPOLL_CTL_MOD; ++ } else { ++ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); ++ m = EPOLL_CTL_ADD; ++ } ++ ++ events[0] = tcp_splice_conn_epoll_events(conn->events, 0); ++ events[1] = tcp_splice_conn_epoll_events(conn->events, 1); ++ ++ if (flow_epoll_set(&conn->f, m, events[0], conn->s[0], 0) || ++ flow_epoll_set(&conn->f, m, events[1], conn->s[1], 1)) { + int ret = -errno; + flow_perror(conn, "ERROR on epoll_ctl()"); + return ret; + } +- flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); + + return 0; + } +@@ -205,7 +199,7 @@ static void conn_flag_do(struct tcp_splice_conn *conn, + } + } + +-#define conn_flag(c, conn, flag) \ ++#define conn_flag(conn, flag) \ + do { \ + flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ + conn_flag_do(conn, flag); \ +@@ -213,12 +207,10 @@ static void conn_flag_do(struct tcp_splice_conn *conn, + + /** + * conn_event_do() - Set and log connection events, update epoll state +- * @c: Execution context + * @conn: Connection pointer + * @event: Connection event + */ +-static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, +- unsigned long event) ++static void conn_event_do(struct tcp_splice_conn *conn, unsigned long event) + { + if (event & (event - 1)) { + int flag_index = fls(~event); +@@ -240,14 +232,14 @@ static void conn_event_do(const struct ctx 
*c, struct tcp_splice_conn *conn, + flow_dbg(conn, "%s", tcp_splice_event_str[flag_index]); + } + +- if (tcp_splice_epoll_ctl(c, conn)) +- conn_flag(c, conn, CLOSING); ++ if (tcp_splice_epoll_ctl(conn)) ++ conn_flag(conn, CLOSING); + } + +-#define conn_event(c, conn, event) \ ++#define conn_event(conn, event) \ + do { \ + flow_trace(conn, "event at %s:%i",__func__, __LINE__); \ +- conn_event_do(c, conn, event); \ ++ conn_event_do(conn, event); \ + } while (0) + + +@@ -315,7 +307,7 @@ static int tcp_splice_connect_finish(const struct ctx *c, + if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) { + flow_perror(conn, "cannot create %d->%d pipe", + sidei, !sidei); +- conn_flag(c, conn, CLOSING); ++ conn_flag(conn, CLOSING); + return -EIO; + } + +@@ -329,7 +321,7 @@ static int tcp_splice_connect_finish(const struct ctx *c, + } + + if (!(conn->events & SPLICE_ESTABLISHED)) +- conn_event(c, conn, SPLICE_ESTABLISHED); ++ conn_event(conn, SPLICE_ESTABLISHED); + + return 0; + } +@@ -376,7 +368,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) + + pif_sockaddr(c, &sa, tgtpif, &tgt->eaddr, tgt->eport); + +- conn_event(c, conn, SPLICE_CONNECT); ++ conn_event(conn, SPLICE_CONNECT); + + if (connect(conn->s[1], &sa.sa, socklen_inany(&sa))) { + if (errno != EINPROGRESS) { +@@ -385,7 +377,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) + return -errno; + } + } else { +- conn_event(c, conn, SPLICE_ESTABLISHED); ++ conn_event(conn, SPLICE_ESTABLISHED); + return tcp_splice_connect_finish(c, conn); + } + +@@ -445,7 +437,7 @@ void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0) + flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); + + if (tcp_splice_connect(c, conn)) +- conn_flag(c, conn, CLOSING); ++ conn_flag(conn, CLOSING); + + FLOW_ACTIVATE(conn); + } +@@ -494,14 +486,14 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, + + if (events & EPOLLOUT) { + fromsidei 
= !evsidei; +- conn_event(c, conn, ~OUT_WAIT(evsidei)); ++ conn_event(conn, ~OUT_WAIT(evsidei)); + } else { + fromsidei = evsidei; + } + + if (events & EPOLLRDHUP) + /* For side 0 this is fake, but implied */ +- conn_event(c, conn, FIN_RCVD(evsidei)); ++ conn_event(conn, FIN_RCVD(evsidei)); + + swap: + eof = 0; +@@ -536,7 +528,7 @@ retry: + more = SPLICE_F_MORE; + + if (conn->flags & lowat_set_flag) +- conn_flag(c, conn, lowat_act_flag); ++ conn_flag(conn, lowat_act_flag); + } + + do +@@ -568,8 +560,8 @@ retry: + "Setting SO_RCVLOWAT %i: %s", + lowat, strerror_(errno)); + } else { +- conn_flag(c, conn, lowat_set_flag); +- conn_flag(c, conn, lowat_act_flag); ++ conn_flag(conn, lowat_set_flag); ++ conn_flag(conn, lowat_act_flag); + } + } + +@@ -583,7 +575,7 @@ retry: + if (conn->read[fromsidei] == conn->written[fromsidei]) + break; + +- conn_event(c, conn, OUT_WAIT(!fromsidei)); ++ conn_event(conn, OUT_WAIT(!fromsidei)); + break; + } + +@@ -605,7 +597,7 @@ retry: + if ((conn->events & FIN_RCVD(sidei)) && + !(conn->events & FIN_SENT(!sidei))) { + shutdown(conn->s[!sidei], SHUT_WR); +- conn_event(c, conn, FIN_SENT(!sidei)); ++ conn_event(conn, FIN_SENT(!sidei)); + } + } + } +@@ -626,7 +618,7 @@ retry: + return; + + close: +- conn_flag(c, conn, CLOSING); ++ conn_flag(conn, CLOSING); + } + + /** +@@ -762,10 +754,10 @@ void tcp_splice_timer(struct tcp_splice_conn *conn) + flow_trace(conn, "can't set SO_RCVLOWAT on %d", + conn->s[sidei]); + } +- conn_flag(c, conn, ~RCVLOWAT_SET(sidei)); ++ conn_flag(conn, ~RCVLOWAT_SET(sidei)); + } + } + + flow_foreach_sidei(sidei) +- conn_flag(c, conn, ~RCVLOWAT_ACT(sidei)); ++ conn_flag(conn, ~RCVLOWAT_ACT(sidei)); + } +diff --git a/udp_flow.c b/udp_flow.c +index c4cf35c..80b1543 100644 +--- a/udp_flow.c ++++ b/udp_flow.c +@@ -74,7 +74,6 @@ static int udp_flow_sock(const struct ctx *c, + { + const struct flowside *side = &uflow->f.side[sidei]; + uint8_t pif = uflow->f.pif[sidei]; +- union epoll_ref ref; + int rc; + int s; + +@@ -84,14 
+83,10 @@ static int udp_flow_sock(const struct ctx *c, + return s; + } + +- ref.type = EPOLL_TYPE_UDP; +- ref.flowside = FLOW_SIDX(uflow, sidei); +- ref.fd = s; +- + flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT); +- +- rc = epoll_add(flow_epollfd(&uflow->f), EPOLLIN, ref); +- if (rc < 0) { ++ if (flow_epoll_set(&uflow->f, EPOLL_CTL_ADD, EPOLLIN, s, sidei) < 0) { ++ rc = -errno; ++ flow_epollid_clear(&uflow->f); + close(s); + return rc; + } +-- +2.47.1 + diff --git a/SOURCES/0012-tcp-Properly-propagate-tap-side-RST-to-socket-side.patch b/SOURCES/0012-tcp-Properly-propagate-tap-side-RST-to-socket-side.patch new file mode 100644 index 0000000..7027242 --- /dev/null +++ b/SOURCES/0012-tcp-Properly-propagate-tap-side-RST-to-socket-side.patch @@ -0,0 +1,99 @@ +From 73a9bee3e1ffe447cb041c4826465a71730c2ecf Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Tue, 27 Jan 2026 19:39:52 +1100 +Subject: [PATCH 12/18] tcp: Properly propagate tap-side RST to socket side + +When the guest sends a TCP RST, or on certain error conditions, we want to +signal the abnormal termination of a TCP connection to the peer with an +RST as well. We attempt to do that by close()ing the socket. + +That doesn't work: a close() will usually send a FIN, rather than an RST. +The standard method of forcing an RST on a socket is to set the SO_LINGER +socket option with a 0 timeout, then close(). + +Update the tcp_rst() path to do this, so it forces a socket side RST. +Update the handling of a guest side RST to use the same path (minus +sending a tap side RST) so that we properly propagate guest RSTs to the +peer. 
+ +Link: https://bugs.passt.top/show_bug.cgi?id=191 +Signed-off-by: David Gibson +Signed-off-by: Stefano Brivio +(cherry picked from commit cce94e92fb3d2a90730c125f2bad32c9ed51da3f) +--- + tcp.c | 37 +++++++++++++++++++++++++++++++++---- + 1 file changed, 33 insertions(+), 4 deletions(-) + +diff --git a/tcp.c b/tcp.c +index 146d460..602e810 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -1417,7 +1417,34 @@ static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, + } + + /** +- * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket ++ * tcp_sock_rst() - Close TCP connection forcing RST on socket side ++ * @c: Execution context ++ * @conn: Connection pointer ++ */ ++static void tcp_sock_rst(const struct ctx *c, struct tcp_tap_conn *conn) ++{ ++ const struct linger linger0 = { ++ .l_onoff = 1, ++ .l_linger = 0, ++ }; ++ ++ /* Force RST on socket to inform the peer ++ * ++ * We do this by setting SO_LINGER with 0 timeout, which means that ++ * close() will send an RST (unless the connection is already closed in ++ * both directions). 
++ */ ++ if (setsockopt(conn->sock, SOL_SOCKET, ++ SO_LINGER, &linger0, sizeof(linger0)) < 0) { ++ flow_dbg_perror(conn, ++ "SO_LINGER failed, may not send RST to peer"); ++ } ++ ++ conn_event(c, conn, CLOSED); ++} ++ ++/** ++ * tcp_rst_do() - Reset a tap connection: send RST segment on both sides, close + * @c: Execution context + * @conn: Connection pointer + */ +@@ -1426,8 +1453,10 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) + if (conn->events == CLOSED) + return; + ++ /* Send RST on tap */ + tcp_send_flag(c, conn, RST); +- conn_event(c, conn, CLOSED); ++ ++ tcp_sock_rst(c, conn); + } + + /** +@@ -1898,7 +1927,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, + return -1; + + if (th->rst) { +- conn_event(c, conn, CLOSED); ++ tcp_sock_rst(c, conn); + return 1; + } + +@@ -2262,7 +2291,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + flow_trace(conn, "packet length %zu from tap", l4len); + + if (th->rst) { +- conn_event(c, conn, CLOSED); ++ tcp_sock_rst(c, conn); + return 1; + } + +-- +2.47.1 + diff --git a/SOURCES/0013-udp-Split-activity-timeouts-for-UDP-flows.patch b/SOURCES/0013-udp-Split-activity-timeouts-for-UDP-flows.patch new file mode 100644 index 0000000..e26e9ab --- /dev/null +++ b/SOURCES/0013-udp-Split-activity-timeouts-for-UDP-flows.patch @@ -0,0 +1,239 @@ +From 3d6804c07d1b9ed26fea88d680a1734ea1069d91 Mon Sep 17 00:00:00 2001 +From: Yumei Huang +Date: Sat, 14 Feb 2026 15:31:36 +0800 +Subject: [PATCH 13/18] udp: Split activity timeouts for UDP flows + +Frequent DNS queries over UDP from a container or guest can result +in many sockets shown in ss(8), typically one per flow. This is +expected and harmless, but it can make the output of ss(8) look +noisy and potentially concern users. + +This patch splits UDP flow timeouts into two, mirroring the Linux +kernel, and sources the values from kernel parameters. 
The shorter +timeout is applied to unidirectional flows and minimal bidirectional +exchanges (single datagram and reply), while the longer timeout is +used for bidirectional flows with multiple datagrams on either side. + +Link: https://bugs.passt.top/show_bug.cgi?id=197 +Suggested-by: Stefano Brivio +Signed-off-by: Yumei Huang +Signed-off-by: Stefano Brivio +(cherry picked from commit bebafa72a982784164a7d556bd860ec0ed1e02c7) +--- + contrib/apparmor/abstractions/passt | 4 ++++ + udp.c | 34 +++++++++++++++++++++++++++-- + udp.h | 4 ++++ + udp_flow.c | 30 ++++++++++++++++++++----- + udp_flow.h | 4 ++++ + 5 files changed, 69 insertions(+), 7 deletions(-) + +diff --git a/contrib/apparmor/abstractions/passt b/contrib/apparmor/abstractions/passt +index 43fd63f..e8ed513 100644 +--- a/contrib/apparmor/abstractions/passt ++++ b/contrib/apparmor/abstractions/passt +@@ -36,6 +36,10 @@ + + @{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral() + ++ # udp_get_timeout_params(), udp.c ++ @{PROC}/sys/net/netfilter/nf_conntrack_udp_timeout r, ++ @{PROC}/sys/net/netfilter/nf_conntrack_udp_timeout_stream r, ++ + network netlink raw, # nl_sock_init_do(), netlink.c + + network inet stream, # tcp.c +diff --git a/udp.c b/udp.c +index 08bec50..32d70b6 100644 +--- a/udp.c ++++ b/udp.c +@@ -26,7 +26,10 @@ + * + * We track pseudo-connections of this type as flow table entries of type + * FLOW_UDP. We store the time of the last traffic on the flow in uflow->ts, +- * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds. ++ * and let the flow expire if there is no traffic for UDP_TIMEOUT seconds for ++ * unidirectional flows and flows with only one datagram and one reply, or ++ * UDP_TIMEOUT_STREAM seconds for bidirectional flows with more than one ++ * datagram on either side. + * + * NOTE: This won't handle multicast protocols, or some protocols with different + * port usage. We'll need specific logic if we want to handle those. 
+@@ -118,6 +121,13 @@ + + #define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ + ++#define UDP_TIMEOUT "/proc/sys/net/netfilter/nf_conntrack_udp_timeout" ++#define UDP_TIMEOUT_STREAM \ ++ "/proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream" ++ ++#define UDP_TIMEOUT_DEFAULT 30 /* s */ ++#define UDP_TIMEOUT_STREAM_DEFAULT 120 /* s */ ++ + /* Maximum UDP data to be returned in ICMP messages */ + #define ICMP4_MAX_DLEN 8 + #define ICMP6_MAX_DLEN (IPV6_MIN_MTU \ +@@ -966,7 +976,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, + int s = ref.fd; + + flow_trace(uflow, "Received data on reply socket"); +- uflow->ts = now->tv_sec; ++ udp_flow_activity(uflow, !tosidx.sidei, now); + + if (pif_is_socket(topif)) { + udp_sock_to_sock(c, ref.fd, n, tosidx); +@@ -1301,6 +1311,24 @@ void udp_port_rebind_all(struct ctx *c) + udp_port_rebind(c, false); + } + ++/** ++ * udp_get_timeout_params() - Get host kernel UDP timeout parameters ++ * @c: Execution context ++ */ ++static void udp_get_timeout_params(struct ctx *c) ++{ ++ intmax_t v; ++ ++ v = read_file_integer(UDP_TIMEOUT, UDP_TIMEOUT_DEFAULT); ++ c->udp.timeout = v; ++ ++ v = read_file_integer(UDP_TIMEOUT_STREAM, UDP_TIMEOUT_STREAM_DEFAULT); ++ c->udp.stream_timeout = v; ++ ++ debug("Using UDP timeout parameters, timeout: %d, stream_timeout: %d", ++ c->udp.timeout, c->udp.stream_timeout); ++} ++ + /** + * udp_init() - Initialise per-socket data, and sockets in namespace + * @c: Execution context +@@ -1311,6 +1339,8 @@ int udp_init(struct ctx *c) + { + ASSERT(!c->no_udp); + ++ udp_get_timeout_params(c); ++ + udp_iov_init(c); + + if (c->mode == MODE_PASTA) { +diff --git a/udp.h b/udp.h +index 03e8dc5..618f258 100644 +--- a/udp.h ++++ b/udp.h +@@ -42,11 +42,15 @@ union udp_listen_epoll_ref { + * @fwd_in: Port forwarding configuration for inbound packets + * @fwd_out: Port forwarding configuration for outbound packets + * @timer_run: Timestamp of most recent timer run ++ * @timeout: Timeout for 
unidirectional flows (in s) ++ * @stream_timeout: Timeout for stream-like flows (in s) + */ + struct udp_ctx { + struct fwd_ports fwd_in; + struct fwd_ports fwd_out; + struct timespec timer_run; ++ int timeout; ++ int stream_timeout; + }; + + #endif /* UDP_H */ +diff --git a/udp_flow.c b/udp_flow.c +index 80b1543..4a8d4b6 100644 +--- a/udp_flow.c ++++ b/udp_flow.c +@@ -17,8 +17,6 @@ + #include "udp_internal.h" + #include "epoll_ctl.h" + +-#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ +- + /** + * udp_at_sidx() - Get UDP specific flow at given sidx + * @sidx: Flow and side to retrieve +@@ -152,6 +150,8 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, + uflow->ts = now->tv_sec; + uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1; + uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0; ++ uflow->activity[INISIDE] = 1; ++ uflow->activity[TGTSIDE] = 0; + + flow_foreach_sidei(sidei) { + if (pif_is_socket(uflow->f.pif[sidei])) +@@ -227,7 +227,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, + + sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port); + if ((uflow = udp_at_sidx(sidx))) { +- uflow->ts = now->tv_sec; ++ udp_flow_activity(uflow, sidx.sidei, now); + return flow_sidx_opposite(sidx); + } + +@@ -284,7 +284,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, + sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr, + srcport, dstport); + if ((uflow = udp_at_sidx(sidx))) { +- uflow->ts = now->tv_sec; ++ udp_flow_activity(uflow, sidx.sidei, now); + return flow_sidx_opposite(sidx); + } + +@@ -361,9 +361,29 @@ bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow, + bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, + const struct timespec *now) + { +- if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT) ++ int timeout = c->udp.timeout; ++ ++ if (uflow->activity[TGTSIDE] && ++ (uflow->activity[INISIDE] > 1 || uflow->activity[TGTSIDE] > 1)) ++ timeout = c->udp.stream_timeout; 
++ ++ if (now->tv_sec - uflow->ts <= timeout) + return false; + + udp_flow_close(c, uflow); + return true; + } ++ ++/** ++ * udp_flow_activity() - Track activity of a UDP flow ++ * @uflow: UDP flow ++ * @sidei: Side index of the flow (INISIDE or TGTSIDE) ++ * @now: Current timestamp ++ */ ++void udp_flow_activity(struct udp_flow *uflow, unsigned int sidei, ++ const struct timespec *now) ++{ ++ uflow->ts = now->tv_sec; ++ if (uflow->activity[sidei] < UINT8_MAX) ++ uflow->activity[sidei]++; ++} +diff --git a/udp_flow.h b/udp_flow.h +index 4c528e9..183a429 100644 +--- a/udp_flow.h ++++ b/udp_flow.h +@@ -16,6 +16,7 @@ + * @flush1: @s[1] may have datagrams queued for other flows + * @ts: Activity timestamp + * @s: Socket fd (or -1) for each side of the flow ++ * @activity: Packets seen from each side of the flow, up to UINT8_MAX + */ + struct udp_flow { + /* Must be first element */ +@@ -29,6 +30,7 @@ struct udp_flow { + + time_t ts; + int s[SIDES]; ++ uint8_t activity[SIDES]; + }; + + struct udp_flow *udp_at_sidx(flow_sidx_t sidx); +@@ -46,5 +48,7 @@ bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow, + const struct timespec *now); + bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, + const struct timespec *now); ++void udp_flow_activity(struct udp_flow *uflow, unsigned int sidei, ++ const struct timespec *now); + + #endif /* UDP_FLOW_H */ +-- +2.47.1 + diff --git a/SOURCES/0014-tcp-Remove-non-working-activity-timeout-mechanism.patch b/SOURCES/0014-tcp-Remove-non-working-activity-timeout-mechanism.patch new file mode 100644 index 0000000..d2846ad --- /dev/null +++ b/SOURCES/0014-tcp-Remove-non-working-activity-timeout-mechanism.patch @@ -0,0 +1,80 @@ +From 79430cb183b70aee127dfc68846e1f8661820a43 Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Wed, 4 Feb 2026 21:41:34 +1000 +Subject: [PATCH 14/18] tcp: Remove non-working activity timeout mechanism + +This mechanism was intended to remove connections which have had no +activity for two 
hours, even if they haven't closed or been reset +internally. It operated by setting the two hour timeout if there are +no sooner TCP timeouts to schedule. + +However, when the timer fires, the way we detect the case of the activity +timeout doesn't work: it resets the timer for another two hours, then +checks if the old timeout was two hours. But the old timeout returned +by timerfd_settime() is not the original value of the timer, but the +remaining time. Since the timer has just fired it will essentially always +be 0. + +For now, just remove the mechanism, disarming the timer entirely if there +isn't another upcoming event. We'll re-introduce some sort of activity +timeout by a different means later. + +Signed-off-by: David Gibson +Signed-off-by: Stefano Brivio +(cherry picked from commit e48ce41a1ec2f05846fb66d3847c2c2b6448ca71) +--- + tcp.c | 24 +++--------------------- + 1 file changed, 3 insertions(+), 21 deletions(-) + +diff --git a/tcp.c b/tcp.c +index 602e810..de2ad38 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -199,9 +199,6 @@ + * TAP_FIN_ACKED), but no socket activity is detected from the socket within + * this time, reset the connection + * +- * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on +- * either side, the connection is reset +- * + * - RTT / 2 elapsed after data segment received from tap without having + * sent an ACK segment, or zero-sized window advertised to tap/guest (flag + * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent. 
+@@ -632,7 +629,9 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) + } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { + it.it_value.tv_sec = FIN_TIMEOUT; + } else { +- it.it_value.tv_sec = ACT_TIMEOUT; ++ /* Disarm */ ++ it.it_value.tv_sec = 0; ++ it.it_value.tv_nsec = 0; + } + + if (conn->flags & ACK_TO_TAP_DUE) { +@@ -2628,23 +2627,6 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) + tcp_data_from_sock(c, conn); + tcp_timer_ctl(c, conn); + } +- } else { +- struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; +- struct itimerspec old = { { 0 }, { 0 } }; +- +- /* Activity timeout: if it was already set, reset the +- * connection, otherwise, it was a left-over from ACK_TO_TAP_DUE +- * or ACK_FROM_TAP_DUE, so just set the long timeout in that +- * case. This avoids having to preemptively reset the timer on +- * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. +- */ +- if (timerfd_settime(conn->timer, 0, &new, &old)) +- flow_perror(conn, "failed to set timer"); +- +- if (old.it_value.tv_sec == ACT_TIMEOUT) { +- flow_dbg(conn, "activity timeout"); +- tcp_rst(c, conn); +- } + } + } + +-- +2.47.1 + diff --git a/SOURCES/0015-tcp-Re-introduce-inactivity-timeouts-based-on-a-cloc.patch b/SOURCES/0015-tcp-Re-introduce-inactivity-timeouts-based-on-a-cloc.patch new file mode 100644 index 0000000..d12e6b4 --- /dev/null +++ b/SOURCES/0015-tcp-Re-introduce-inactivity-timeouts-based-on-a-cloc.patch @@ -0,0 +1,191 @@ +From a2b1ad31a4d56a59e4d407263a22dee270973ea4 Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Wed, 4 Feb 2026 21:41:35 +1000 +Subject: [PATCH 15/18] tcp: Re-introduce inactivity timeouts based on a clock + algorithm + +We previously had a mechanism to remove TCP connections which were +inactive for 2 hours. That was broken for a long time, due to poor +interactions with the timerfd handling, so we removed it. 
+ +Adding this long scale timer onto the timerfd handling, which mostly +handles much shorter timeouts is tricky to reason about. However, for the +inactivity timeouts, we don't require precision. Instead, we can use +a 1-bit page replacement / "clock" algorithm. Every INACTIVITY_INTERVAL +(2 hours), a global timer marks every TCP connection as tentatively +inactive. That flag is cleared if we get any events, either tap side or +socket side. + +If the inactive flag is still set when the next INACTIVITY_INTERVAL expires +then the connection has been inactive for an extended period and we reset +and close it. In practice this means that connections will be removed +after 2-4 hours of inactivity. + +This is not a true fix for bug 179, but it does mitigate the damage, by +limiting the time that inactive connections will remain around, + +Link: https://bugs.passt.top/show_bug.cgi?id=179 +Signed-off-by: David Gibson +Signed-off-by: Stefano Brivio +(cherry picked from commit 1820103fbbf13df98257a3f5c3ba625de624b0b3) +--- + tcp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++---- + tcp.h | 4 +++- + tcp_conn.h | 3 +++ + 3 files changed, 54 insertions(+), 5 deletions(-) + +diff --git a/tcp.c b/tcp.c +index de2ad38..dd58550 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -207,6 +207,13 @@ + * TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to + * RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly. + * ++ * We also use a global interval timer for an activity timeout which doesn't ++ * require precision: ++ * ++ * - INACTIVITY_INTERVAL: if a connection has had no activity for an entire ++ * interval, close and reset it. This means that idle connections (without ++ * keepalives) will be removed between INACTIVITY_INTERVAL s and ++ * 2*INACTIVITY_INTERVAL s after the last activity. 
+ * + * Summary of data flows (with ESTABLISHED event) + * ---------------------------------------------- +@@ -345,7 +352,8 @@ enum { + #define RTO_INIT 1 /* s, RFC 6298 */ + #define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */ + #define FIN_TIMEOUT 60 +-#define ACT_TIMEOUT 7200 ++ ++#define INACTIVITY_INTERVAL 7200 /* s */ + + #define LOW_RTT_TABLE_SIZE 8 + #define LOW_RTT_THRESHOLD 10 /* us */ +@@ -2294,6 +2302,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + return 1; + } + ++ conn->inactive = false; ++ + if (th->ack && !(conn->events & ESTABLISHED)) + tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); + +@@ -2652,6 +2662,8 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, + return; + } + ++ conn->inactive = false; ++ + if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) { + conn_event(c, conn, CLOSED); + return; +@@ -3030,6 +3042,38 @@ static void tcp_port_rebind(struct ctx *c, bool outbound) + } + } + ++/** ++ * tcp_inactivity() - Scan for and close long-inactive connections ++ * @: Execution context ++ */ ++static void tcp_inactivity(struct ctx *c, const struct timespec *now) ++{ ++ union flow *flow; ++ ++ if (now->tv_sec - c->tcp.inactivity_run < INACTIVITY_INTERVAL) ++ return; ++ ++ debug("TCP inactivity scan"); ++ c->tcp.inactivity_run = now->tv_sec; ++ ++ flow_foreach(flow) { ++ struct tcp_tap_conn *conn = &flow->tcp; ++ ++ if (flow->f.type != FLOW_TCP) ++ continue; ++ ++ if (conn->inactive) { ++ /* No activity in this interval, reset */ ++ flow_dbg(conn, "Inactive for at least %us, resetting", ++ INACTIVITY_INTERVAL); ++ tcp_rst(c, conn); ++ } ++ ++ /* Ready to check fot next interval */ ++ conn->inactive = true; ++ } ++} ++ + /** + * tcp_port_rebind_outbound() - Rebind ports in namespace + * @arg: Execution context +@@ -3068,13 +3112,13 @@ void tcp_port_rebind_all(struct ctx *c) + * @c: Execution context + * @now: Current timestamp + */ +-void tcp_timer(const struct ctx *c, const struct 
timespec *now) ++void tcp_timer(struct ctx *c, const struct timespec *now) + { +- (void)now; +- + tcp_sock_refill_init(c); + if (c->mode == MODE_PASTA) + tcp_splice_refill(c); ++ ++ tcp_inactivity(c, now); + } + + /** +diff --git a/tcp.h b/tcp.h +index 3f21e75..37cfc5b 100644 +--- a/tcp.h ++++ b/tcp.h +@@ -23,7 +23,7 @@ int tcp_sock_init(const struct ctx *c, uint8_t pif, + in_port_t port); + int tcp_init(struct ctx *c); + void tcp_port_rebind_all(struct ctx *c); +-void tcp_timer(const struct ctx *c, const struct timespec *now); ++void tcp_timer(struct ctx *c, const struct timespec *now); + void tcp_defer_handler(struct ctx *c); + + void tcp_update_l2_buf(const unsigned char *eth_d); +@@ -64,6 +64,7 @@ union tcp_listen_epoll_ref { + * @rto_max: Maximum retry timeout (in s) + * @syn_retries: SYN retries using exponential backoff timeout + * @syn_linear_timeouts: SYN retries before using exponential backoff timeout ++ * @inactivity_run: Time we last scanned for inactive connections + */ + struct tcp_ctx { + struct fwd_ports fwd_in; +@@ -73,6 +74,7 @@ struct tcp_ctx { + int rto_max; + uint8_t syn_retries; + uint8_t syn_linear_timeouts; ++ time_t inactivity_run; + }; + + #endif /* TCP_H */ +diff --git a/tcp_conn.h b/tcp_conn.h +index 9c6ff9e..2e70d39 100644 +--- a/tcp_conn.h ++++ b/tcp_conn.h +@@ -16,6 +16,7 @@ + * @ws_from_tap: Window scaling factor advertised from tap/guest + * @ws_to_tap: Window scaling factor advertised to tap/guest + * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS ++ * @inactive: No activity within the current INACTIVITY_INTERVAL + * @sock: Socket descriptor number + * @events: Connection events, implying connection states + * @listening_sock: Listening socket this socket was accept()ed from, or -1 +@@ -58,6 +59,8 @@ struct tcp_tap_conn { + (conn->rtt_exp = MIN(RTT_EXP_MAX, ilog2(MAX(1, rtt / RTT_STORE_MIN)))) + #define RTT_GET(conn) (RTT_STORE_MIN << conn->rtt_exp) + ++ bool inactive :1; ++ + int sock :FD_REF_BITS; + + uint8_t 
events; +-- +2.47.1 + diff --git a/SOURCES/0016-tcp-Extend-tcp_send_flag-to-send-TCP-keepalive-segme.patch b/SOURCES/0016-tcp-Extend-tcp_send_flag-to-send-TCP-keepalive-segme.patch new file mode 100644 index 0000000..037d3e8 --- /dev/null +++ b/SOURCES/0016-tcp-Extend-tcp_send_flag-to-send-TCP-keepalive-segme.patch @@ -0,0 +1,66 @@ +From 4600f95f99f12eb0680277da971a3af0ba27d5c1 Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Wed, 4 Feb 2026 21:41:36 +1000 +Subject: [PATCH 16/18] tcp: Extend tcp_send_flag() to send TCP keepalive + segments + +TCP keepalives aren't technically a flag, but they are a zero-data segment +so they can be generated with only a small modification to +tcp_{buf,vu}_send_flag(). Implement this, using a new "pseudo-flag" +value (similar to DUP_ACK), KEEPALIVE. + +Signed-off-by: David Gibson +[sbrivio: Fix trivial merge conflict with 812cdb802c6e] +Signed-off-by: Stefano Brivio +(cherry picked from commit a681e44ec60179567fb10f34351d7dfdbd2e7c7e) +--- + tcp_buf.c | 4 ++++ + tcp_internal.h | 2 ++ + tcp_vu.c | 3 +++ + 3 files changed, 9 insertions(+) + +diff --git a/tcp_buf.c b/tcp_buf.c +index 5d419d3..75a020f 100644 +--- a/tcp_buf.c ++++ b/tcp_buf.c +@@ -227,6 +227,10 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) + tcp_frame_conns[tcp_payload_used++] = conn; + l4len = optlen + sizeof(struct tcphdr); + iov[TCP_IOV_PAYLOAD].iov_len = l4len; ++ ++ if (flags & KEEPALIVE) ++ seq--; ++ + tcp_l2_buf_fill_headers(c, conn, iov, NULL, seq, false); + + tcp_l2_buf_pad(iov); +diff --git a/tcp_internal.h b/tcp_internal.h +index 5f8fb35..36f443b 100644 +--- a/tcp_internal.h ++++ b/tcp_internal.h +@@ -38,6 +38,8 @@ + + /* Flags for internal usage */ + #define DUP_ACK (1 << 5) ++#define KEEPALIVE (1 << 6) ++ + #define OPT_EOL 0 + #define OPT_NOP 1 + #define OPT_MSS 2 +diff --git a/tcp_vu.c b/tcp_vu.c +index db9db78..dd50241 100644 +--- a/tcp_vu.c ++++ b/tcp_vu.c +@@ -135,6 +135,9 @@ int tcp_vu_send_flag(const struct ctx 
*c, struct tcp_tap_conn *conn, int flags) + flags_elem[0].in_sg[0].iov_len = hdrlen + optlen; + payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen); + ++ if (flags & KEEPALIVE) ++ seq--; ++ + tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload, + NULL, seq, !*c->pcap); + +-- +2.47.1 + diff --git a/SOURCES/0017-tcp-Send-TCP-keepalive-segments-after-a-period-of-ta.patch b/SOURCES/0017-tcp-Send-TCP-keepalive-segments-after-a-period-of-ta.patch new file mode 100644 index 0000000..00ca4fd --- /dev/null +++ b/SOURCES/0017-tcp-Send-TCP-keepalive-segments-after-a-period-of-ta.patch @@ -0,0 +1,161 @@ +From b911ba6899bac381e795e26d9bebfac69b1a5748 Mon Sep 17 00:00:00 2001 +From: David Gibson +Date: Wed, 4 Feb 2026 21:41:37 +1000 +Subject: [PATCH 17/18] tcp: Send TCP keepalive segments after a period of + tap-side inactivity + +There are several circumstances in which a live, but idle TCP connection +can be forgotten by a guest, with no "on the wire" indication that this has +happened. The most obvious is if the guest abruptly reboots. A more +subtle case can happen with a half-closed connection, specifically one +in FIN_WAIT_2 state on the guest. A connection can, legitimately, remain +in this state indefinitely. If however, a socket in this state is closed +by userspace, Linux at least will remove the kernel socket after 60s +(or as configured in the net.ipv4.tcp_fin_timeout sysctl). + +Because there's no on the wire indication in these cases, passt will +pointlessly retain the connection in its flow table, at least until it is +removed by the inactivity timeout after several hours. + +To avoid keeping connections around for so long in this state, add +functionality to periodically send TCP keepalive segments to the guest if +we've seen no activity on the tap interface. If the guest is no longer +aware of the connection, it should respond with an RST which will let +passt remove the stale entry. 
+ +To do this we use a method similar to the inactivity timeout - a 1-bit +page replacement / clock algorithm, but with a shorter interval, and only +checking for tap side activity. Currently we use a 300s interval, meaning +we'll send a keepalive after 5-10 minutes of (tap side) inactivity. + +Link: https://bugs.passt.top/show_bug.cgi?id=179 +Signed-off-by: David Gibson +Signed-off-by: Stefano Brivio +(cherry picked from commit d2f7c21cfb949f2b1587b9475917efdd6ac549fd) +--- + tcp.c | 39 +++++++++++++++++++++++++++++++++++++++ + tcp.h | 2 ++ + tcp_conn.h | 2 ++ + 3 files changed, 43 insertions(+) + +diff --git a/tcp.c b/tcp.c +index dd58550..1691987 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -215,6 +215,12 @@ + * keepalives) will be removed between INACTIVITY_INTERVAL s and + * 2*INACTIVITY_INTERVAL s after the last activity. + * ++ * - KEEPALIVE_INTERVAL: if a connection has had no tap-side activity for an ++ * entire interval, send a tap-side keepalive. If the endpoint is no longer ++ * aware of the connection (due to a reboot, or a kernel timeout in FIN_WAIT_2 ++ * state) that should trigger an RST, so we won't keep track of connections ++ * that the guest endpoint no longer cares about. 
++ * + * Summary of data flows (with ESTABLISHED event) + * ---------------------------------------------- + * +@@ -354,6 +360,7 @@ enum { + #define FIN_TIMEOUT 60 + + #define INACTIVITY_INTERVAL 7200 /* s */ ++#define KEEPALIVE_INTERVAL 30 /* s */ + + #define LOW_RTT_TABLE_SIZE 8 + #define LOW_RTT_THRESHOLD 10 /* us */ +@@ -2303,6 +2310,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + } + + conn->inactive = false; ++ conn->tap_inactive = false; + + if (th->ack && !(conn->events & ESTABLISHED)) + tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); +@@ -3042,6 +3050,36 @@ static void tcp_port_rebind(struct ctx *c, bool outbound) + } + } + ++/** ++ * tcp_keepalive() - Send keepalives for connections which need it ++ * @: Execution context ++ */ ++static void tcp_keepalive(struct ctx *c, const struct timespec *now) ++{ ++ union flow *flow; ++ ++ if (now->tv_sec - c->tcp.keepalive_run < KEEPALIVE_INTERVAL) ++ return; ++ ++ c->tcp.keepalive_run = now->tv_sec; ++ ++ flow_foreach(flow) { ++ struct tcp_tap_conn *conn = &flow->tcp; ++ ++ if (flow->f.type != FLOW_TCP) ++ continue; ++ ++ if (conn->tap_inactive) { ++ flow_dbg(conn, "No tap activity for least %us, send keepalive", ++ KEEPALIVE_INTERVAL); ++ tcp_send_flag(c, conn, KEEPALIVE); ++ } ++ ++ /* Ready to check fot next interval */ ++ conn->tap_inactive = true; ++ } ++} ++ + /** + * tcp_inactivity() - Scan for and close long-inactive connections + * @: Execution context +@@ -3118,6 +3156,7 @@ void tcp_timer(struct ctx *c, const struct timespec *now) + if (c->mode == MODE_PASTA) + tcp_splice_refill(c); + ++ tcp_keepalive(c, now); + tcp_inactivity(c, now); + } + +diff --git a/tcp.h b/tcp.h +index 37cfc5b..505f21a 100644 +--- a/tcp.h ++++ b/tcp.h +@@ -64,6 +64,7 @@ union tcp_listen_epoll_ref { + * @rto_max: Maximum retry timeout (in s) + * @syn_retries: SYN retries using exponential backoff timeout + * @syn_linear_timeouts: SYN retries before using exponential backoff timeout ++ * 
@keepalive_run: Time we last issued tap-side keepalives + * @inactivity_run: Time we last scanned for inactive connections + */ + struct tcp_ctx { +@@ -74,6 +75,7 @@ struct tcp_ctx { + int rto_max; + uint8_t syn_retries; + uint8_t syn_linear_timeouts; ++ time_t keepalive_run; + time_t inactivity_run; + }; + +diff --git a/tcp_conn.h b/tcp_conn.h +index 2e70d39..2ff76ed 100644 +--- a/tcp_conn.h ++++ b/tcp_conn.h +@@ -16,6 +16,7 @@ + * @ws_from_tap: Window scaling factor advertised from tap/guest + * @ws_to_tap: Window scaling factor advertised to tap/guest + * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS ++ * @tapinactive: No tao activity within the current KEEPALIVE_INTERVAL + * @inactive: No activity within the current INACTIVITY_INTERVAL + * @sock: Socket descriptor number + * @events: Connection events, implying connection states +@@ -59,6 +60,7 @@ struct tcp_tap_conn { + (conn->rtt_exp = MIN(RTT_EXP_MAX, ilog2(MAX(1, rtt / RTT_STORE_MIN)))) + #define RTT_GET(conn) (RTT_STORE_MIN << conn->rtt_exp) + ++ bool tap_inactive :1; + bool inactive :1; + + int sock :FD_REF_BITS; +-- +2.47.1 + diff --git a/SOURCES/0018-tcp-Replace-send-buffer-boost-with-EPOLLOUT-monitori.patch b/SOURCES/0018-tcp-Replace-send-buffer-boost-with-EPOLLOUT-monitori.patch new file mode 100644 index 0000000..8fe641e --- /dev/null +++ b/SOURCES/0018-tcp-Replace-send-buffer-boost-with-EPOLLOUT-monitori.patch @@ -0,0 +1,133 @@ +From 4d1c8b11460cfe05372e572f33e046a8e98e242c Mon Sep 17 00:00:00 2001 +From: Yumei Huang +Date: Fri, 20 Mar 2026 18:32:14 +0800 +Subject: [PATCH 18/18] tcp: Replace send buffer boost with EPOLLOUT monitoring + +Currently we use the SNDBUF boost mechanism to force TCP auto-tuning. +However, it doesn't always work, and sometimes causes a lot of +retransmissions. As a result, the throughput suffers. + +This patch replaces it with monitoring EPOLLOUT when sendmsg() failure +(with EAGAIN and EWOULDBLOCK) and partial sends occur. 
+ +Tested with iperf3 inside pasta: throughput is now comparable to running +iperf3 directly on the host without pasta. However, retransmissions can +still be elevated when RTT >= 50ms. For example, when RTT is between +200ms and 500ms, retransmission count varies from 30 to 120 in roughly +80% of test runs. + +Link: https://bugs.passt.top/show_bug.cgi?id=138 +Link: https://github.com/containers/podman/issues/28219 +Suggested-by: Stefano Brivio +Signed-off-by: Yumei Huang +Signed-off-by: Stefano Brivio +(cherry picked from commit 831857e9b547ac27f868b6c24049c4da435b63fe) +--- + tcp.c | 57 +++++++++++++++++---------------------------------------- + 1 file changed, 17 insertions(+), 40 deletions(-) + +diff --git a/tcp.c b/tcp.c +index 1691987..920af70 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -365,13 +365,6 @@ enum { + #define LOW_RTT_TABLE_SIZE 8 + #define LOW_RTT_THRESHOLD 10 /* us */ + +-/* Parameters to temporarily exceed sending buffer to force TCP auto-tuning */ +-#define SNDBUF_BOOST_BYTES_RTT_LO 2500 /* B * s: no boost until here */ +-/* ...examples: 5 MB sent * 500 ns RTT, 250 kB * 10 ms, 8 kB * 300 ms */ +-#define SNDBUF_BOOST_FACTOR 150 /* % */ +-#define SNDBUF_BOOST_BYTES_RTT_HI 6000 /* apply full boost factor */ +-/* 12 MB sent * 500 ns RTT, 600 kB * 10 ms, 20 kB * 300 ms */ +- + /* Ratio of buffer to bandwidth * delay product implying interactive traffic */ + #define SNDBUF_TO_BW_DELAY_INTERACTIVE /* > */ 20 /* (i.e. 
< 5% of buffer) */ + +@@ -1067,35 +1060,6 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, + tap_hdr_update(taph, MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN)); + } + +-/** +- * tcp_sndbuf_boost() - Calculate limit of sending buffer to force auto-tuning +- * @conn: Connection pointer +- * @tinfo: tcp_info from kernel, must be pre-fetched +- * +- * Return: increased sending buffer to use as a limit for advertised window +- */ +-static unsigned long tcp_sndbuf_boost(const struct tcp_tap_conn *conn, +- const struct tcp_info_linux *tinfo) +-{ +- unsigned long bytes_rtt_product; +- +- if (!bytes_acked_cap) +- return SNDBUF_GET(conn); +- +- /* This is *not* a bandwidth-delay product, but it's somewhat related: +- * as we send more data (usually at the beginning of a connection), we +- * try to make the sending buffer progressively grow, with the RTT as a +- * factor (longer delay, bigger buffer needed). +- */ +- bytes_rtt_product = (long long)tinfo->tcpi_bytes_acked * +- tinfo->tcpi_rtt / 1000 / 1000; +- +- return clamped_scale(SNDBUF_GET(conn), bytes_rtt_product, +- SNDBUF_BOOST_BYTES_RTT_LO, +- SNDBUF_BOOST_BYTES_RTT_HI, +- SNDBUF_BOOST_FACTOR); +-} +- + /** + * tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap + * @c: Execution context +@@ -1216,8 +1180,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, + + if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? 
*/ + limit = 0; +- else if ((int)tinfo->tcpi_snd_wnd > SNDBUF_GET(conn)) +- limit = tcp_sndbuf_boost(conn, tinfo) - (int)sendq; + else + limit = SNDBUF_GET(conn) - (int)sendq; + +@@ -2088,14 +2050,28 @@ eintr: + + if (errno == EAGAIN || errno == EWOULDBLOCK) { + tcp_send_flag(c, conn, ACK | DUP_ACK); ++ uint32_t events = tcp_conn_epoll_events(conn->events, ++ conn->flags); ++ events |= EPOLLOUT; ++ if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events, ++ conn->sock, !TAPSIDE(conn)) < 0) ++ debug("Failed to add EPOLLOUT"); + return p->count - idx; +- + } + return -1; + } + +- if (n < (int)(seq_from_tap - conn->seq_from_tap)) ++ if (n < (int)(seq_from_tap - conn->seq_from_tap)) { + partial_send = 1; ++ uint32_t events = tcp_conn_epoll_events(conn->events, ++ conn->flags); ++ events |= EPOLLOUT; ++ if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events, conn->sock, ++ !TAPSIDE(conn)) < 0) ++ debug("Failed to add EPOLLOUT"); ++ } else { ++ tcp_epoll_ctl(conn); ++ } + + conn->seq_from_tap += n; + +@@ -2688,6 +2664,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, + tcp_data_from_sock(c, conn); + + if (events & EPOLLOUT) { ++ tcp_epoll_ctl(conn); + if (tcp_update_seqack_wnd(c, conn, false, NULL)) + tcp_send_flag(c, conn, ACK); + } +-- +2.47.1 + diff --git a/SPECS/passt.spec b/SPECS/passt.spec index 0845696..9157c79 100644 --- a/SPECS/passt.spec +++ b/SPECS/passt.spec @@ -7,11 +7,12 @@ # Copyright (c) 2022 Red Hat GmbH # Author: Stefano Brivio -%global git_hash 8ec134109eb136432a29bdf5a14f8b1fd4e46208 +%global git_hash d04c48032bcf724550d0b8f652fd00efcd2dfad0 %global selinuxtype targeted +%global selinux_policy_version 41.41 Name: passt -Version: 0^20250512.g8ec1341 +Version: 0^20251210.gd04c480 Release: 4%{?dist} Summary: User-mode networking daemons for virtual machines and namespaces License: GPL-2.0-or-later AND BSD-3-Clause @@ -20,9 +21,23 @@ URL: https://passt.top/ Source: https://passt.top/passt/snapshot/passt-%{git_hash}.tar.xz Patch1: 
0001-selinux-Drop-user_namespace-create-allow-rules.patch -Patch2: 0002-treewide-By-default-don-t-quit-source-after-migratio.patch -Patch3: 0003-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch -Patch4: 0004-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch +Patch2: 0002-selinux-Use-systemd_logind_exec_t-instead-of-systemd.patch +Patch3: 0003-tcp-Use-less-than-MSS-window-on-no-queued-data-or-no.patch +Patch4: 0004-pasta-Warn-disable-matching-IP-version-if-not-suppor.patch +Patch5: 0005-selinux-Enable-read-and-watch-permissions-on-netns-d.patch +Patch6: 0006-selinux-Enable-open-permissions-on-netns-directory-o.patch +Patch7: 0007-tcp-Fix-rounding-issue-in-check-for-approximating-wi.patch +Patch8: 0008-udp_flow-remove-unneeded-epoll_ref-indirection.patch +Patch9: 0009-udp_flow-Assign-socket-to-flow-inside-udp_flow_sock.patch +Patch10: 0010-tcp_splice-Refactor-tcp_splice_conn_epoll_events-to-.patch +Patch11: 0011-flow-Introduce-flow_epoll_set-to-centralize-epoll-op.patch +Patch12: 0012-tcp-Properly-propagate-tap-side-RST-to-socket-side.patch +Patch13: 0013-udp-Split-activity-timeouts-for-UDP-flows.patch +Patch14: 0014-tcp-Remove-non-working-activity-timeout-mechanism.patch +Patch15: 0015-tcp-Re-introduce-inactivity-timeouts-based-on-a-cloc.patch +Patch16: 0016-tcp-Extend-tcp_send_flag-to-send-TCP-keepalive-segme.patch +Patch17: 0017-tcp-Send-TCP-keepalive-segments-after-a-period-of-ta.patch +Patch18: 0018-tcp-Replace-send-buffer-boost-with-EPOLLOUT-monitori.patch BuildRequires: gcc, make, git, checkpolicy, selinux-policy-devel Requires: (%{name}-selinux = %{version}-%{release} if selinux-policy-%{selinuxtype}) @@ -38,15 +53,21 @@ for network namespaces: traffic is forwarded using a tap interface inside the namespace, without the need to create further interfaces on the host, hence not requiring any capabilities or privileges. 
-%package selinux -BuildArch: noarch -Summary: SELinux support for passt and pasta -Requires: %{name} = %{version}-%{release} -Requires: selinux-policy -Requires(post): %{name} -Requires(post): policycoreutils -Requires(preun): %{name} -Requires(preun): policycoreutils +%package selinux +BuildArch: noarch +Summary: SELinux support for passt and pasta +%if 0%{?fedora} > 43 +BuildRequires: selinux-policy-devel +%selinux_requires_min +%else +BuildRequires: pkgconfig(systemd) +Requires(post): libselinux-utils +Requires(post): policycoreutils +%endif +Requires: container-selinux +Requires: selinux-policy-%{selinuxtype} +Requires(post): container-selinux +Requires(post): selinux-policy-%{selinuxtype} %description selinux This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1). @@ -94,15 +115,11 @@ popd %selinux_relabel_pre -s %{selinuxtype} %post selinux -%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp -%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp -%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp +%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp %postun selinux if [ $1 -eq 0 ]; then - %selinux_modules_uninstall -s %{selinuxtype} passt - %selinux_modules_uninstall -s %{selinuxtype} pasta - %selinux_modules_uninstall -s %{selinuxtype} passt-repair + %selinux_modules_uninstall -s %{selinuxtype} passt pasta passt-repair fi %posttrans selinux @@ -135,8 +152,23 @@ fi %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp %changelog -* Thu Oct 23 2025 Stefano Brivio - 0^20250512.g8ec1341-4 -- Resolves: RHEL-123413 RHEL-123419 +* Tue Apr 21 2026 Stefano Brivio - 0^20251210.gd04c480-4 +- Resolves: RHEL-169637 RHEL-169639 RHEL-169648 + +* Wed Feb 
11 2026 Stefano Brivio - 0^20251210.gd04c480-3 +- Resolves: RHEL-137588 RHEL-136313 + +* Wed Dec 24 2025 Stefano Brivio - 0^20251210.gd04c480-2 +- Resolves: RHEL-136313 RHEL-136461 RHEL-137439 RHEL-137588 + +* Wed Dec 10 2025 Stefano Brivio - 0^20251210.gd04c480-1 +- Resolves: RHEL-134942 RHEL-134943 + +* Tue Dec 9 2025 Stefano Brivio - 0^20251209.gc3f1ba7-1 +- Resolves: RHEL-134119 + +* Thu Oct 23 2025 Stefano Brivio - 0^20250512.g8ec1341-3 +- Resolves: RHEL-123376 RHEL-123438 * Tue Jul 29 2025 Stefano Brivio - 0^20250512.g8ec1341-2 - Resolves: RHEL-106326