import UBI passt-0^20251210.gd04c480-4.el9_8
This commit is contained in:
parent
5362eb6d20
commit
b396701ee3
2
.gitignore
vendored
2
.gitignore
vendored
@ -1 +1 @@
|
||||
SOURCES/passt-8ec134109eb136432a29bdf5a14f8b1fd4e46208.tar.xz
|
||||
SOURCES/passt-d04c48032bcf724550d0b8f652fd00efcd2dfad0.tar.xz
|
||||
|
||||
@ -1 +1 @@
|
||||
7b91876dcd65569ddf775b2da567345500ec8862 SOURCES/passt-8ec134109eb136432a29bdf5a14f8b1fd4e46208.tar.xz
|
||||
ec2fcde158b88b1ed9786565025380d03aa32d56 SOURCES/passt-d04c48032bcf724550d0b8f652fd00efcd2dfad0.tar.xz
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
From 6977619743bbc602a865f79562b59a80921d6063 Mon Sep 17 00:00:00 2001
|
||||
From 7087adfbab35354f9def7edee87385b82416c722 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Mon, 21 Aug 2023 17:52:28 +0200
|
||||
Date: Mon, 8 Dec 2025 22:32:50 -0500
|
||||
Subject: [PATCH] selinux: Drop user_namespace create allow rules
|
||||
|
||||
Those are incompatible with current el9 kernels. I introduced them
|
||||
@ -24,10 +24,10 @@ Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
2 files changed, 2 deletions(-)
|
||||
|
||||
diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
|
||||
index c6cea34..131fadc 100644
|
||||
index 6995df8..76d23e8 100644
|
||||
--- a/contrib/selinux/passt.te
|
||||
+++ b/contrib/selinux/passt.te
|
||||
@@ -92,7 +92,6 @@ allow syslogd_t self:cap_userns sys_ptrace;
|
||||
@@ -105,7 +105,6 @@ allow syslogd_t self:cap_userns sys_ptrace;
|
||||
allow passt_t self:process setcap;
|
||||
allow passt_t self:capability { sys_tty_config setpcap net_bind_service setuid setgid};
|
||||
allow passt_t self:cap_userns { setpcap sys_admin sys_ptrace };
|
||||
@ -36,16 +36,17 @@ index c6cea34..131fadc 100644
|
||||
auth_read_passwd(passt_t)
|
||||
|
||||
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
|
||||
index 69be081..892edae 100644
|
||||
index 95fe42a..7e1e821 100644
|
||||
--- a/contrib/selinux/pasta.te
|
||||
+++ b/contrib/selinux/pasta.te
|
||||
@@ -110,7 +110,6 @@ init_daemon_domain(pasta_t, pasta_exec_t)
|
||||
|
||||
allow pasta_t self:capability { setpcap net_bind_service sys_tty_config dac_read_search net_admin sys_resource setuid setgid };
|
||||
allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_service };
|
||||
@@ -126,7 +126,6 @@ allow pasta_t self:cap_userns { setpcap sys_admin sys_ptrace net_admin net_bind_
|
||||
# pasta only calls setuid and setgid with the current UID and GID, so this
|
||||
# denial is harmless. See https://bugzilla.redhat.com/show_bug.cgi?id=2330512#c10
|
||||
dontaudit pasta_t self:cap_userns { setgid setuid };
|
||||
-allow pasta_t self:user_namespace create;
|
||||
|
||||
auth_read_passwd(pasta_t)
|
||||
|
||||
--
|
||||
2.39.2
|
||||
2.47.1
|
||||
|
||||
|
||||
@ -0,0 +1,41 @@
|
||||
From 2244df26b2cb63acb51a20485e1ca7ad0649b152 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Mon, 22 Dec 2025 21:48:32 -0500
|
||||
Subject: [PATCH] selinux: Use systemd_logind_exec_t instead of
|
||||
systemd_user_runtimedir_exec_t
|
||||
|
||||
On CentOS Stream 9, selinux-policy doesn't contain commit
|
||||
700b3622d575 ("Confine /usr/lib/systemd/systemd-user-runtime-dir"),
|
||||
so the file context of /usr/lib/systemd/systemd-user-runtime-dir is
|
||||
still systemd_logind_exec_t there.
|
||||
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
---
|
||||
contrib/selinux/pasta.te | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
|
||||
index 7e1e821..d29d6c4 100644
|
||||
--- a/contrib/selinux/pasta.te
|
||||
+++ b/contrib/selinux/pasta.te
|
||||
@@ -98,7 +98,7 @@ require {
|
||||
type container_runtime_t;
|
||||
type container_var_run_t;
|
||||
type container_t;
|
||||
- type systemd_user_runtimedir_t;
|
||||
+ type systemd_logind_exec_t;
|
||||
}
|
||||
|
||||
type pasta_t;
|
||||
@@ -250,7 +250,7 @@ type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootles
|
||||
type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "rootless-netns";
|
||||
allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write };
|
||||
allow pasta_t ifconfig_var_run_t:file { create open write };
|
||||
-allow systemd_user_runtimedir_t ifconfig_var_run_t:dir rmdir;
|
||||
+allow systemd_logind_exec_t ifconfig_var_run_t:dir rmdir;
|
||||
|
||||
# Allow pasta to bind to any port
|
||||
bool pasta_bind_all_ports true;
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,264 +0,0 @@
|
||||
From b0b5ce0a76cf7fec0b00405732fd94e0b34e8d84 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Thu, 17 Jul 2025 10:38:17 +0200
|
||||
Subject: [PATCH] treewide: By default, don't quit source after migration, keep
|
||||
sockets open
|
||||
|
||||
We are hitting an issue in the KubeVirt integration where some data is
|
||||
still sent to the source instance even after migration is complete. As
|
||||
we exit, the kernel closes our sockets and resets connections. The
|
||||
resulting RST segments are sent to peers, effectively terminating
|
||||
connections that were meanwhile migrated.
|
||||
|
||||
At the moment, this is not done intentionally, but in the future
|
||||
KubeVirt might enable OVN-Kubernetes features where source and
|
||||
destination nodes are explicitly getting mirrored traffic for a while,
|
||||
in order to decrease migration downtime.
|
||||
|
||||
By default, don't quit after migration is completed on the source: the
|
||||
previous behaviour can be enabled with the new, but deprecated,
|
||||
--migrate-exit option. After migration (as source), the -1 / --one-off
|
||||
option has no effect.
|
||||
|
||||
Also, by default, keep migrated TCP sockets open (in repair mode) as
|
||||
long as we're running, and ignore events on any epoll descriptor
|
||||
representing data channels. The previous behaviour can be enabled with
|
||||
the new, equally deprecated, --migrate-no-linger option.
|
||||
|
||||
By keeping sockets open, and not exiting, we prevent the kernel
|
||||
running on the source node from sending out RST segments if further data
|
||||
reaches us.
|
||||
|
||||
Reported-by: Nir Dothan <ndothan@redhat.com>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit a8782865c342eb2682cca292d5bf92b567344351)
|
||||
---
|
||||
conf.c | 22 ++++++++++++++++++++++
|
||||
flow.c | 2 +-
|
||||
passt.1 | 29 +++++++++++++++++++++++++++++
|
||||
passt.h | 4 ++++
|
||||
tcp.c | 9 +++++++--
|
||||
tcp_conn.h | 3 ++-
|
||||
test/lib/setup | 4 ++--
|
||||
vhost_user.c | 9 +++++++--
|
||||
8 files changed, 74 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/conf.c b/conf.c
|
||||
index a6d7e22..1295d89 100644
|
||||
--- a/conf.c
|
||||
+++ b/conf.c
|
||||
@@ -864,6 +864,14 @@ static void usage(const char *name, FILE *f, int status)
|
||||
FPRINTF(f,
|
||||
" --repair-path PATH path for passt-repair(1)\n"
|
||||
" default: append '.repair' to UNIX domain path\n");
|
||||
+ FPRINTF(f,
|
||||
+ " --migrate-exit DEPRECATED:\n"
|
||||
+ " source quits after migration\n"
|
||||
+ " default: source keeps running after migration\n");
|
||||
+ FPRINTF(f,
|
||||
+ " --migrate-no-linger DEPRECATED:\n"
|
||||
+ " close sockets on migration\n"
|
||||
+ " default: keep sockets open, ignore events\n");
|
||||
}
|
||||
|
||||
FPRINTF(f,
|
||||
@@ -1468,6 +1476,8 @@ void conf(struct ctx *c, int argc, char **argv)
|
||||
{"socket-path", required_argument, NULL, 's' },
|
||||
{"fqdn", required_argument, NULL, 27 },
|
||||
{"repair-path", required_argument, NULL, 28 },
|
||||
+ {"migrate-exit", no_argument, NULL, 29 },
|
||||
+ {"migrate-no-linger", no_argument, NULL, 30 },
|
||||
{ 0 },
|
||||
};
|
||||
const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
|
||||
@@ -1683,6 +1693,18 @@ void conf(struct ctx *c, int argc, char **argv)
|
||||
optarg))
|
||||
die("Invalid passt-repair path: %s", optarg);
|
||||
|
||||
+ break;
|
||||
+ case 29:
|
||||
+ if (c->mode != MODE_VU)
|
||||
+ die("--migrate-exit is for vhost-user mode only");
|
||||
+ c->migrate_exit = true;
|
||||
+
|
||||
+ break;
|
||||
+ case 30:
|
||||
+ if (c->mode != MODE_VU)
|
||||
+ die("--migrate-no-linger is for vhost-user mode only");
|
||||
+ c->migrate_no_linger = true;
|
||||
+
|
||||
break;
|
||||
case 'd':
|
||||
c->debug = 1;
|
||||
diff --git a/flow.c b/flow.c
|
||||
index 6a5c8aa..a4b65ea 100644
|
||||
--- a/flow.c
|
||||
+++ b/flow.c
|
||||
@@ -1089,7 +1089,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
|
||||
* as EIO).
|
||||
*/
|
||||
foreach_established_tcp_flow(flow) {
|
||||
- rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
|
||||
+ rc = tcp_flow_migrate_source_ext(c, fd, &flow->tcp);
|
||||
if (rc) {
|
||||
flow_err(flow, "Can't send extended data: %s",
|
||||
strerror_(-rc));
|
||||
diff --git a/passt.1 b/passt.1
|
||||
index 60066c2..cef98b2 100644
|
||||
--- a/passt.1
|
||||
+++ b/passt.1
|
||||
@@ -439,6 +439,30 @@ Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
|
||||
chosen for the hypervisor UNIX domain socket. No socket is created if not in
|
||||
\-\-vhost-user mode.
|
||||
|
||||
+.TP
|
||||
+.BR \-\-migrate-exit (DEPRECATED)
|
||||
+Exit after a completed migration as source. By default, \fBpasst\fR keeps
|
||||
+running and the migrated guest can continue using its connection, or a new guest
|
||||
+can connect.
|
||||
+
|
||||
+Note that this configuration option is \fBdeprecated\fR and will be removed in a
|
||||
+future version. It is not expected to be of any use, and it simply reflects a
|
||||
+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR
|
||||
+below.
|
||||
+
|
||||
+.TP
|
||||
+.BR \-\-migrate-no-linger (DEPRECATED)
|
||||
+Close TCP sockets on the source instance once migration completes.
|
||||
+
|
||||
+By default, sockets are kept open, and events on data sockets are ignored, so
|
||||
+that any further message reaching sockets after the source migrated is silently
|
||||
+ignored, to avoid connection resets in case data is received after migration.
|
||||
+
|
||||
+Note that this configuration option is \fBdeprecated\fR and will be removed in a
|
||||
+future version. It is not expected to be of any use, and it simply reflects a
|
||||
+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR
|
||||
+below.
|
||||
+
|
||||
.TP
|
||||
.BR \-F ", " \-\-fd " " \fIFD
|
||||
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
|
||||
@@ -454,6 +478,11 @@ is closed.
|
||||
Quit after handling a single client connection, that is, once the client closes
|
||||
the socket, or once we get a socket error.
|
||||
|
||||
+\fBNote\fR: this option has no effect after \fBpasst\fR completes a migration as
|
||||
+source, because, in that case, exiting would close sockets for active
|
||||
+connections, which would in turn cause connection resets if any further data is
|
||||
+received. See also the description of \fI\-\-migrate-no-linger\fR.
|
||||
+
|
||||
.TP
|
||||
.BR \-t ", " \-\-tcp-ports " " \fIspec
|
||||
Configure TCP port forwarding to guest. \fIspec\fR can be one of:
|
||||
diff --git a/passt.h b/passt.h
|
||||
index 8693794..4cfd6eb 100644
|
||||
--- a/passt.h
|
||||
+++ b/passt.h
|
||||
@@ -241,6 +241,8 @@ struct ip6_ctx {
|
||||
* @device_state_fd: Device state migration channel
|
||||
* @device_state_result: Device state migration result
|
||||
* @migrate_target: Are we the target, on the next migration request?
|
||||
+ * @migrate_no_linger: Close sockets as we migrate them
|
||||
+ * @migrate_exit: Exit (on source) once migration is complete
|
||||
*/
|
||||
struct ctx {
|
||||
enum passt_modes mode;
|
||||
@@ -318,6 +320,8 @@ struct ctx {
|
||||
int device_state_fd;
|
||||
int device_state_result;
|
||||
bool migrate_target;
|
||||
+ bool migrate_no_linger;
|
||||
+ bool migrate_exit;
|
||||
};
|
||||
|
||||
void proto_update_l2_buf(const unsigned char *eth_d,
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 0ac298a..1b22f70 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -3284,12 +3284,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
|
||||
|
||||
/**
|
||||
* tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
|
||||
+ * @c: Execution context
|
||||
* @fd: Descriptor for state migration
|
||||
* @conn: Pointer to the TCP connection structure
|
||||
*
|
||||
* Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
|
||||
*/
|
||||
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
|
||||
+int tcp_flow_migrate_source_ext(const struct ctx *c,
|
||||
+ int fd, const struct tcp_tap_conn *conn)
|
||||
{
|
||||
uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
|
||||
struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
|
||||
@@ -3334,7 +3336,10 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
|
||||
if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
|
||||
goto fail;
|
||||
|
||||
- close(s);
|
||||
+ if (c->migrate_no_linger)
|
||||
+ close(s);
|
||||
+ else
|
||||
+ epoll_del(c, s);
|
||||
|
||||
/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
|
||||
* based on the end of the queues.
|
||||
diff --git a/tcp_conn.h b/tcp_conn.h
|
||||
index 35d813d..38b5c54 100644
|
||||
--- a/tcp_conn.h
|
||||
+++ b/tcp_conn.h
|
||||
@@ -236,7 +236,8 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
|
||||
int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
|
||||
|
||||
int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
|
||||
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
|
||||
+int tcp_flow_migrate_source_ext(const struct ctx *c, int fd,
|
||||
+ const struct tcp_tap_conn *conn);
|
||||
|
||||
int tcp_flow_migrate_target(struct ctx *c, int fd);
|
||||
int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
|
||||
diff --git a/test/lib/setup b/test/lib/setup
|
||||
index 575bc21..5994598 100755
|
||||
--- a/test/lib/setup
|
||||
+++ b/test/lib/setup
|
||||
@@ -350,7 +350,7 @@ setup_migrate() {
|
||||
|
||||
sleep 1
|
||||
|
||||
- __opts="--vhost-user"
|
||||
+ __opts="--vhost-user --migrate-exit --migrate-no-linger"
|
||||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
@@ -360,7 +360,7 @@ setup_migrate() {
|
||||
|
||||
context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
|
||||
|
||||
- __opts="--vhost-user"
|
||||
+ __opts="--vhost-user --migrate-exit --migrate-no-linger"
|
||||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
diff --git a/vhost_user.c b/vhost_user.c
|
||||
index 105f77a..c4d3a52 100644
|
||||
--- a/vhost_user.c
|
||||
+++ b/vhost_user.c
|
||||
@@ -1208,7 +1208,12 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
|
||||
if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
|
||||
vdev->context->device_state_result == 0 &&
|
||||
!vdev->context->migrate_target) {
|
||||
- info("Migration complete, exiting");
|
||||
- _exit(EXIT_SUCCESS);
|
||||
+ if (vdev->context->migrate_exit) {
|
||||
+ info("Migration complete, exiting");
|
||||
+ _exit(EXIT_SUCCESS);
|
||||
+ }
|
||||
+
|
||||
+ info("Migration complete");
|
||||
+ vdev->context->one_off = false;
|
||||
}
|
||||
}
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,48 +0,0 @@
|
||||
From bd90a820852ff8966aeb83231c29e48849db3493 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Fri, 29 Aug 2025 22:11:31 +0200
|
||||
Subject: [PATCH 3/4] tcp: Cast operands of sequence comparison macros to
|
||||
uint32_t before using them
|
||||
|
||||
Otherwise, passing signed types causes automatic promotion of the
|
||||
result of the subtractions as well, which is not what we want, as
|
||||
these macros rely on unsigned 32-bit arithmetic.
|
||||
|
||||
The next patch introduces a ssize_t operand for SEQ_LE, illustrating
|
||||
the issue.
|
||||
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Tested-by: Paul Holzinger <pholzing@redhat.com>
|
||||
Reviewed-by: Jon Maloy <jmaloy@redhat.com>
|
||||
(cherry picked from commit 660cd6907e14a41ad9bc77d317140c70ab416fce)
|
||||
---
|
||||
tcp_internal.h | 12 ++++++++----
|
||||
1 file changed, 8 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/tcp_internal.h b/tcp_internal.h
|
||||
index 36c6533..c80ba40 100644
|
||||
--- a/tcp_internal.h
|
||||
+++ b/tcp_internal.h
|
||||
@@ -18,10 +18,14 @@
|
||||
sizeof(struct ipv6hdr), \
|
||||
sizeof(uint32_t))
|
||||
|
||||
-#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
|
||||
-#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
|
||||
-#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
|
||||
-#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
|
||||
+#define SEQ_LE(a, b) \
|
||||
+ ((uint32_t)(b) - (uint32_t)(a) < MAX_WINDOW)
|
||||
+#define SEQ_LT(a, b) \
|
||||
+ ((uint32_t)(b) - (uint32_t)(a) - 1 < MAX_WINDOW)
|
||||
+#define SEQ_GE(a, b) \
|
||||
+ ((uint32_t)(a) - (uint32_t)(b) < MAX_WINDOW)
|
||||
+#define SEQ_GT(a, b) \
|
||||
+ ((uint32_t)(a) - (uint32_t)(b) - 1 < MAX_WINDOW)
|
||||
|
||||
#define FIN (1 << 0)
|
||||
#define SYN (1 << 1)
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,110 @@
|
||||
From b40f5cd8c8e16c6eceb1f26eb895527fda84068b Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Sat, 13 Dec 2025 14:19:13 +0100
|
||||
Subject: [PATCH] tcp: Use less-than-MSS window on no queued data, or no data
|
||||
sent recently
|
||||
|
||||
We limit the advertised window to guests and containers to the
|
||||
available length of the sending buffer, and if it's less than the MSS,
|
||||
since commit cf1925fb7b77 ("tcp: Don't limit window to less-than-MSS
|
||||
values, use zero instead"), we approximate that limit to zero.
|
||||
|
||||
This way, we'll trigger a window update as soon as we realise that we
|
||||
can advertise a larger value, just like we do in all other cases where
|
||||
we advertise a zero-sized window.
|
||||
|
||||
By doing that, we don't wait for the peer to send us data before we
|
||||
update the window. This matters because the guest or container might
|
||||
be trying to aggregate more data and won't send us anything at all if
|
||||
the advertised window is too small.
|
||||
|
||||
However, this might be problematic in two situations:
|
||||
|
||||
1. one, reported by Tyler, where the remote (receiving) peer
|
||||
advertises a window that's smaller than what we usually get and
|
||||
very close to the MSS, causing the kernel to give us a starting
|
||||
size of the buffer that's less than the MSS we advertise to the
|
||||
guest or container.
|
||||
|
||||
If this happens, we'll never advertise a non-zero window after
|
||||
the handshake, and the container or guest will never send us any
|
||||
data at all.
|
||||
|
||||
With a simple 'curl https://cloudflare.com/', we get, with default
|
||||
TCP memory parameters, a 65535-byte window from the peer, and 46080
|
||||
bytes of initial sending buffer from the kernel. But we advertised
|
||||
a 65480-byte MSS, and we'll never actually receive the client
|
||||
request.
|
||||
|
||||
This seems to be specific to Cloudflare for some reason, probably
|
||||
deriving from a particular tuning of TCP parameters on their
|
||||
servers.
|
||||
|
||||
2. another one, hypothesised by David, where the peer might only be
|
||||
willing to process (and acknowledge) data in batches.
|
||||
|
||||
We might have queued outbound data which is, at the same time, not
|
||||
enough to fill one of these batches and be acknowledged and removed
|
||||
from the sending queue, but enough to make our available buffer
|
||||
smaller than the MSS, and the connection will hang.
|
||||
|
||||
Take care of both cases by:
|
||||
|
||||
a. not approximating the sending buffer to zero if we have no outbound
|
||||
queued data at all, because in that case we don't expect the
|
||||
available buffer to increase if we don't send any data, so there's
|
||||
no point in waiting for it to grow larger than the MSS.
|
||||
|
||||
This fixes problem 1. above.
|
||||
|
||||
b. also using the full sending buffer size if we haven't sent data to
|
||||
the socket for a while (reported by tcpi_last_data_sent). This part
|
||||
was already suggested by David in:
|
||||
|
||||
https://archives.passt.top/passt-dev/aTZzgtcKWLb28zrf@zatzit/
|
||||
|
||||
and I'm now picking ten times the RTT as a somewhat arbitrary
|
||||
threshold.
|
||||
|
||||
This is meant to take care of potential problem 2. above, but it
|
||||
also happens to fix 1.
|
||||
|
||||
Reported-by: Tyler Cloud <tcloud@redhat.com>
|
||||
Link: https://bugs.passt.top/show_bug.cgi?id=183
|
||||
Suggested-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
---
|
||||
tcp.c | 15 ++++++++++++++-
|
||||
1 file changed, 14 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 81bc114..b179e39 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1211,8 +1211,21 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
* the MSS to zero, as we already have mechanisms in place to
|
||||
* force updates after the window becomes zero. This matches the
|
||||
* suggestion from RFC 813, Section 4.
|
||||
+ *
|
||||
+ * But don't do this if, either:
|
||||
+ *
|
||||
+ * - there's nothing in the outbound queue: the size of the
|
||||
+ * sending buffer is limiting us, and it won't increase if we
|
||||
+ * don't send data, so there's no point in waiting, or
|
||||
+ *
|
||||
+ * - we haven't sent data in a while (somewhat arbitrarily, ten
|
||||
+ * times the RTT), as that might indicate that the receiver
|
||||
+ * will only process data in batches that are large enough,
|
||||
+ * but we won't send enough to fill one because we're stuck
|
||||
+ * with pending data in the outbound queue
|
||||
*/
|
||||
- if (limit < MSS_GET(conn))
|
||||
+ if (limit < MSS_GET(conn) && sendq &&
|
||||
+ tinfo->tcpi_last_data_sent < tinfo->tcpi_rtt / 1000 * 10)
|
||||
limit = 0;
|
||||
|
||||
new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit);
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,90 @@
|
||||
From 75dcbc300bf09c3649823b12d30c4f24de7271d4 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Tue, 23 Dec 2025 13:39:17 +0100
|
||||
Subject: [PATCH] pasta: Warn, disable matching IP version if not supported, in
|
||||
local mode
|
||||
|
||||
...instead of exiting, but only if local mode is enabled, that is, if
|
||||
we couldn't find a template interface or if the user didn't specify
|
||||
one.
|
||||
|
||||
With IPv4, we always try to set or copy an address, so check if that
|
||||
fails.
|
||||
|
||||
With IPv6, in local mode, we rely on the link-local address that's
|
||||
automatically generated inside the target namespace, and only fail
|
||||
later, as we try to set up routes. Check if that fails, instead.
|
||||
|
||||
Otherwise, we'll fail to start if IPv6 support is not built in or
|
||||
disabled by the kernel ("ipv6.disable=1" on the command line),
|
||||
because, in that case, we'll try to enable local mode by default, and
|
||||
then fail to set any address or route.
|
||||
|
||||
It would probably be more elegant to check for IP version support in
|
||||
conf_ip4_local() and conf_ip6_local(), and not even try to enable
|
||||
connectivity for unsupported versions, but it looks less robust than
|
||||
trying and failing, as there might be other ways to disable a given
|
||||
IP version.
|
||||
|
||||
Note that there's currently no way to disable IPv4 support on the
|
||||
kernel command line, that is, there's no such thing as an
|
||||
ipv4.disable boot parameter. But I guess that's due to be eventually
|
||||
implemented, one day, so let's cover that case as well, also for
|
||||
consistency.
|
||||
|
||||
Reported-by: Iyan <iyanmv@gmail.com>
|
||||
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2424192
|
||||
Fixes: 4ddd59bc6085 ("conf: Separate local mode for each IP version, don't enable disabled IP version")
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
---
|
||||
pasta.c | 14 ++++++++++++++
|
||||
1 file changed, 14 insertions(+)
|
||||
|
||||
diff --git a/pasta.c b/pasta.c
|
||||
index c307b8a..0ddd6b0 100644
|
||||
--- a/pasta.c
|
||||
+++ b/pasta.c
|
||||
@@ -348,6 +348,12 @@ void pasta_ns_conf(struct ctx *c)
|
||||
AF_INET);
|
||||
}
|
||||
|
||||
+ if (c->ifi4 == -1 && rc == -ENOTSUP) {
|
||||
+ warn("IPv4 not supported, disabling");
|
||||
+ c->ifi4 = 0;
|
||||
+ goto ipv4_done;
|
||||
+ }
|
||||
+
|
||||
if (rc < 0) {
|
||||
die("Couldn't set IPv4 address(es) in namespace: %s",
|
||||
strerror_(-rc));
|
||||
@@ -367,6 +373,7 @@ void pasta_ns_conf(struct ctx *c)
|
||||
strerror_(-rc));
|
||||
}
|
||||
}
|
||||
+ipv4_done:
|
||||
|
||||
if (c->ifi6) {
|
||||
rc = nl_addr_get_ll(nl_sock_ns, c->pasta_ifi,
|
||||
@@ -413,12 +420,19 @@ void pasta_ns_conf(struct ctx *c)
|
||||
AF_INET6);
|
||||
}
|
||||
|
||||
+ if (c->ifi6 == -1 && rc == -ENOTSUP) {
|
||||
+ warn("IPv6 not supported, disabling");
|
||||
+ c->ifi6 = 0;
|
||||
+ goto ipv6_done;
|
||||
+ }
|
||||
+
|
||||
if (rc < 0) {
|
||||
die("Couldn't set IPv6 route(s) in guest: %s",
|
||||
strerror_(-rc));
|
||||
}
|
||||
}
|
||||
}
|
||||
+ipv6_done:
|
||||
|
||||
proto_update_l2_buf(c->guest_mac);
|
||||
}
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,76 +0,0 @@
|
||||
From f9278aab878ef58cf8502ea8f904dbb40fbbb16a Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Thu, 2 Oct 2025 00:41:54 +0200
|
||||
Subject: [PATCH 4/4] tcp: Don't consider FIN flags with mismatching sequence
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
If a guest or container sends us a FIN segment but its sequence number
|
||||
doesn't match the highest sequence of data we *accepted* (not
|
||||
necessarily the highest sequence we received), that is,
|
||||
conn->seq_from_tap, plus any data we're accepting in the current
|
||||
batch, we should discard the flag (not necessarily the segment),
|
||||
because there's still data we need to receive (again) before the end
|
||||
of the stream.
|
||||
|
||||
If we consider those FIN flags as such, we'll end up in the
|
||||
situation described below.
|
||||
|
||||
Here, 192.168.10.102 is a HTTP server in a Podman container, and
|
||||
192.168.10.44 is a client fetching approximately 121 KB of data from
|
||||
it:
|
||||
|
||||
82 2.026811 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [FIN, ACK] Seq=121441 Ack=143 Win=65536 Len=0
|
||||
|
||||
the server is done sending
|
||||
|
||||
83 2.026898 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [ACK] Seq=143 Ack=114394 Win=216192 Len=0
|
||||
|
||||
pasta (client) acknowledges a previous sequence, because of
|
||||
a short sendmsg()
|
||||
|
||||
84 2.027324 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [FIN, ACK] Seq=143 Ack=114394 Win=216192 Len=0
|
||||
|
||||
pasta (client) sends FIN, ACK as the client has no more data to
|
||||
send (a single GET request), while still acknowledging a previous
|
||||
sequence, because the retransmission didn't happen yet
|
||||
|
||||
85 2.027349 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [ACK] Seq=121442 Ack=144 Win=65536 Len=0
|
||||
|
||||
the server acknowledges the FIN, ACK
|
||||
|
||||
86 2.224125 192.168.10.102 → 192.168.10.44 4150 TCP [TCP Retransmission] 55414 → 44992 [ACK] Seq=114394 Ack=144 Win=65536 Len=4096 [TCP segment of a reassembled PDU]
|
||||
|
||||
and finally a retransmission comes, but as we wrongly switched to
|
||||
the CLOSE-WAIT state,
|
||||
|
||||
87 2.224202 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [RST] Seq=144 Win=0 Len=0
|
||||
|
||||
we consider frame #86 as an acknowledgement for the FIN segment we
|
||||
sent, and close the connection, while we still had to re-receive
|
||||
(and finally send) the missing data segment, instead.
|
||||
|
||||
Link: https://github.com/containers/podman/issues/27179
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit b145441913eef6f8885b6b84531e944ff593790c)
|
||||
---
|
||||
tcp.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 0ac298a..4428305 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1696,7 +1696,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
}
|
||||
}
|
||||
|
||||
- if (th->fin)
|
||||
+ if (th->fin && seq == seq_from_tap)
|
||||
fin = 1;
|
||||
|
||||
if (!len)
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,58 @@
|
||||
From d2c5133990a7758bfa567fc73216393498949e9b Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Tue, 23 Dec 2025 01:59:34 +0100
|
||||
Subject: [PATCH] selinux: Enable read and watch permissions on netns directory
|
||||
as well
|
||||
|
||||
With commit 7aeda16a7818 ("selinux: Transition to pasta_t in
|
||||
containers"), we need to make sure that pasta can access the target
|
||||
namespace directory passed by Podman, and, in a general case, we have
|
||||
all the permissions we need.
|
||||
|
||||
But if we now start a container without the Podman changes referenced
|
||||
by commit fd1bcc30af07 ("selinux: add container_var_run_t type
|
||||
transition"), or with them, but with the container being created
|
||||
before those and without a reboot in between, we'll additionally need
|
||||
'read' and 'watch' permissions on user_tmp_t directory as well, as
|
||||
user_tmp_t is still the (inconsistent) context of the namespace entry.
|
||||
|
||||
Otherwise, on a container start/restart, we'll get SELinux denials:
|
||||
|
||||
type=AVC msg=audit(1766451401.296:184): avc: denied { read } for pid=2159 comm="pasta.avx2" name="netns" dev="tmpfs" ino=60 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:obje
|
||||
ct_r:user_tmp_t:s0 tclass=dir permissive=1
|
||||
type=AVC msg=audit(1766451401.298:185): avc: denied { watch } for pid=2159 comm="pasta.avx2" path="/run/user/1001/netns" dev="tmpfs" ino=60 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:object_r:user_tmp_t:s0 tclass=dir permissive=1
|
||||
|
||||
This can be reproduced quite simply:
|
||||
|
||||
$ podman create -q --name hello hello
|
||||
6c4eaf15a03edf799673a97d84d0331f3a3f34a11015b58c69318101a3232770
|
||||
|
||||
[upgrade passt's SELinux policy to a version including 7aeda16a7818]
|
||||
|
||||
$ podman start hello
|
||||
Error: unable to start container "6c4eaf15a03edf799673a97d84d0331f3a3f34a11015b58c69318101a3232770": pasta failed with exit code 1:
|
||||
netns dir open: Permission denied, exiting
|
||||
|
||||
Reported-by: Tuomo Soini <tis@foobar.fi>
|
||||
Fixes: 7aeda16a7818 ("selinux: Transition to pasta_t in containers")
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
---
|
||||
contrib/selinux/pasta.te | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
|
||||
index 95fe42a..3eb58f6 100644
|
||||
--- a/contrib/selinux/pasta.te
|
||||
+++ b/contrib/selinux/pasta.te
|
||||
@@ -149,7 +149,7 @@ allow pasta_t root_t:dir mounton;
|
||||
manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t)
|
||||
files_pid_filetrans(pasta_t, pasta_pid_t, file)
|
||||
|
||||
-allow pasta_t user_tmp_t:dir { add_name remove_name search write };
|
||||
+allow pasta_t user_tmp_t:dir { add_name read remove_name search watch write };
|
||||
allow pasta_t user_tmp_t:fifo_file append;
|
||||
allow pasta_t user_tmp_t:file { create open write };
|
||||
allow pasta_t user_tmp_t:sock_file { create unlink };
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,68 @@
|
||||
From 6babaa8a88eb337e4b81aeff673fcebb28015f36 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Fri, 16 Jan 2026 16:48:46 +0100
|
||||
Subject: [PATCH 6/7] selinux: Enable open permissions on netns directory,
|
||||
operations on container_var_run_t
|
||||
|
||||
Tuomo reports two further SELinux denials after upgrading to a
|
||||
passt-selinux version that includes the transition to pasta_t for
|
||||
containers, one I could reproduce:
|
||||
|
||||
denied { open } for pid=3343050 comm="pasta.avx2" path="/run/user/1000/netns" dev="tmpfs" ino=51 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:object_r:user_tmp_t:s0 tclass=dir permissive=1
|
||||
|
||||
which I didn't take care of in the previous commit, d2c5133990a7
|
||||
("selinux: Enable read and watch permissions on netns directory as
|
||||
well"), as it didn't appear in my quick test. But I can make pasta use
|
||||
"open" on the network namespace entry by simply using it to make
|
||||
connections.
|
||||
|
||||
So, for that, add "open" to the existing rule for user_tmp_t:dir.
|
||||
|
||||
Then, there is another one, which I couldn't reproduce:
|
||||
|
||||
denied { write } for pid=3589324 comm="pasta.avx2" name="rootless-netns" dev="tmpfs" ino=36 scontext=unconfined_u:unconfined_r:pasta_t:s0-s0:c0.c1023 tcontext=unconfined_u:object_r:container_var_run_t:s0 tclass=dir permissive=0
|
||||
|
||||
which, I think, comes from a specific combination of versions of
|
||||
container-selinux, Podman, and passt-selinux packages, which
|
||||
prevents the expected type transition on container_var_run_t unless
|
||||
restorecon is invoked manually, or until a reboot.
|
||||
|
||||
Allowing the same permissions on container_var_run_t as we do on
|
||||
ifconfig_var_run_t is harmless, so do that to prevent this further
|
||||
denial.
|
||||
|
||||
Reported-by: Tuomo Soini <tis@foobar.fi>
|
||||
Fixes: d2c5133990a7 ("selinux: Enable read and watch permissions on netns directory as well")
|
||||
Fixes: 7aeda16a7818 ("selinux: Transition to pasta_t in containers")
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit a6d92ca82c9ea0b395aa56c568ee6b6e6d4ac81e)
|
||||
---
|
||||
contrib/selinux/pasta.te | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/contrib/selinux/pasta.te b/contrib/selinux/pasta.te
|
||||
index 22daa77..abeafa4 100644
|
||||
--- a/contrib/selinux/pasta.te
|
||||
+++ b/contrib/selinux/pasta.te
|
||||
@@ -148,7 +148,7 @@ allow pasta_t root_t:dir mounton;
|
||||
manage_files_pattern(pasta_t, pasta_pid_t, pasta_pid_t)
|
||||
files_pid_filetrans(pasta_t, pasta_pid_t, file)
|
||||
|
||||
-allow pasta_t user_tmp_t:dir { add_name read remove_name search watch write };
|
||||
+allow pasta_t user_tmp_t:dir { add_name open read remove_name search watch write };
|
||||
allow pasta_t user_tmp_t:fifo_file append;
|
||||
allow pasta_t user_tmp_t:file { create open write };
|
||||
allow pasta_t user_tmp_t:sock_file { create unlink };
|
||||
@@ -248,7 +248,9 @@ type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "netns";
|
||||
type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "netns";
|
||||
type_transition container_runtime_t user_tmp_t : dir ifconfig_var_run_t "rootless-netns";
|
||||
type_transition container_runtime_t container_var_run_t : dir ifconfig_var_run_t "rootless-netns";
|
||||
+allow pasta_t container_var_run_t:dir { add_name open rmdir write };
|
||||
allow pasta_t ifconfig_var_run_t:dir { add_name open rmdir write };
|
||||
+allow pasta_t container_var_run_t:file { create open write };
|
||||
allow pasta_t ifconfig_var_run_t:file { create open write };
|
||||
allow systemd_logind_exec_t ifconfig_var_run_t:dir rmdir;
|
||||
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,74 @@
|
||||
From dbfbc33776290260b87bb29bb5572750f9709b35 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Fri, 9 Jan 2026 13:52:00 +0100
|
||||
Subject: [PATCH 7/7] tcp: Fix rounding issue in check for approximating window
|
||||
to zero
|
||||
|
||||
In general, we approximate the advertised window to zero if we would
|
||||
otherwise advertise less than a MSS worth, and the reasoning behind
|
||||
that is explained in cf1925fb7b77 ("tcp: Don't limit window to
|
||||
less-than-MSS values, use zero instead").
|
||||
|
||||
Then, in commit b40f5cd8c8e1 ("tcp: Use less-than-MSS window on no
|
||||
queued data, or no data sent recently"), I introduced some conditions
|
||||
under which we won't do that, including a check on whether any data
|
||||
was sent recently.
|
||||
|
||||
As an arbitrary but probably reasonable threshold, we consider data to
|
||||
have recently been sent if that occurred less than ten times the
|
||||
round-trip time (RTT) ago.
|
||||
|
||||
The time elapsed since the last data transmission is reported by the
|
||||
kernel in milliseconds, in the tcpi_last_data_sent field of struct
|
||||
tcp_info, and the RTT is reported in microseconds instead, in
|
||||
tcpi_rtt.
|
||||
|
||||
To avoid the risk of overflow in a simple way, for the purpose of this
|
||||
comparison, I converted tcpi_rtt to milliseconds first, but this means
|
||||
that the check will always be false (and we'll never approximate the
|
||||
window to zero) if the RTT is below one millisecond.
|
||||
|
||||
This, in turn, reintroduces nasty delay issues in transfers in
|
||||
non-local connections which nevertheless have almost-local (low) latency.
|
||||
|
||||
Given that we want to use ten times the RTT as an arbitrary "long
|
||||
enough" upper bound, round the RTT up while converting it to
|
||||
milliseconds.
|
||||
|
||||
As an alternative, we could perform the comparison in microseconds,
|
||||
but we would need a slightly more complicated implementation to
|
||||
exclude overflows, and it's definitely not worth it given the nature
|
||||
of this threshold.
|
||||
|
||||
Fixes: b40f5cd8c8e1 ("tcp: Use less-than-MSS window on no queued data, or no data sent recently")
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
(cherry picked from commit 2be0e790804f99580b1c8a1781c49913440607f2)
|
||||
---
|
||||
tcp.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 23fcbc3..8f4f087 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1180,6 +1180,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) {
|
||||
new_wnd_to_tap = tinfo->tcpi_snd_wnd;
|
||||
} else {
|
||||
+ unsigned rtt_ms_ceiling = DIV_ROUND_UP(tinfo->tcpi_rtt, 1000);
|
||||
uint32_t sendq;
|
||||
int limit;
|
||||
|
||||
@@ -1223,7 +1224,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
* with pending data in the outbound queue
|
||||
*/
|
||||
if (limit < MSS_GET(conn) && sendq &&
|
||||
- tinfo->tcpi_last_data_sent < tinfo->tcpi_rtt / 1000 * 10)
|
||||
+ tinfo->tcpi_last_data_sent < rtt_ms_ceiling * 10)
|
||||
limit = 0;
|
||||
|
||||
new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, limit);
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,48 @@
|
||||
From 768e38c4ab9f7bb328897577368084faf9ee41df Mon Sep 17 00:00:00 2001
|
||||
From: Laurent Vivier <lvivier@redhat.com>
|
||||
Date: Fri, 9 Jan 2026 17:54:35 +0100
|
||||
Subject: [PATCH 08/18] udp_flow: remove unneeded epoll_ref indirection
|
||||
|
||||
The fref union was used to convert flow_sidx_t to uint32_t for
|
||||
assignment to ref.data. This is unnecessary since epoll_ref already
|
||||
contains a flowside member of type flow_sidx_t, so we can assign
|
||||
directly.
|
||||
|
||||
This aligns with how icmp.c and other callers assign flow_sidx_t to
|
||||
epoll_ref.
|
||||
|
||||
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit ab27852d0eebcd96d33c3699b44596a827b83bc6)
|
||||
---
|
||||
udp_flow.c | 6 +-----
|
||||
1 file changed, 1 insertion(+), 5 deletions(-)
|
||||
|
||||
diff --git a/udp_flow.c b/udp_flow.c
|
||||
index 8907f2f..0ba7880 100644
|
||||
--- a/udp_flow.c
|
||||
+++ b/udp_flow.c
|
||||
@@ -74,10 +74,6 @@ static int udp_flow_sock(const struct ctx *c,
|
||||
{
|
||||
const struct flowside *side = &uflow->f.side[sidei];
|
||||
uint8_t pif = uflow->f.pif[sidei];
|
||||
- union {
|
||||
- flow_sidx_t sidx;
|
||||
- uint32_t data;
|
||||
- } fref = { .sidx = FLOW_SIDX(uflow, sidei) };
|
||||
union epoll_ref ref;
|
||||
int rc;
|
||||
int s;
|
||||
@@ -89,7 +85,7 @@ static int udp_flow_sock(const struct ctx *c,
|
||||
}
|
||||
|
||||
ref.type = EPOLL_TYPE_UDP;
|
||||
- ref.data = fref.data;
|
||||
+ ref.flowside = FLOW_SIDX(uflow, sidei);
|
||||
ref.fd = s;
|
||||
|
||||
flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT);
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,47 @@
|
||||
From 059a31c28aa6e5053846ee931b97eb1344a9ce17 Mon Sep 17 00:00:00 2001
|
||||
From: Laurent Vivier <lvivier@redhat.com>
|
||||
Date: Fri, 9 Jan 2026 17:54:36 +0100
|
||||
Subject: [PATCH 09/18] udp_flow: Assign socket to flow inside udp_flow_sock()
|
||||
|
||||
Move the assignment of uflow->s[sidei] from the caller (udp_flow_new())
|
||||
into udp_flow_sock() itself, placing it after the successful connect().
|
||||
|
||||
This is a pure refactoring with no functional change. The socket fd is
|
||||
now assigned within udp_flow_sock() where the socket is created, rather
|
||||
than requiring the caller to capture the return value. On error paths,
|
||||
uflow->s[sidei] remains at its initialized value of -1 rather than being
|
||||
set to the negative error code, which is semantically cleaner (though
|
||||
functionally equivalent given the >= 0 check in udp_flow_close()).
|
||||
|
||||
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit e0fdfccc1c1a56c58a96d7fd6cc5d532cd780b6f)
|
||||
---
|
||||
udp_flow.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/udp_flow.c b/udp_flow.c
|
||||
index 0ba7880..c4cf35c 100644
|
||||
--- a/udp_flow.c
|
||||
+++ b/udp_flow.c
|
||||
@@ -105,6 +105,7 @@ static int udp_flow_sock(const struct ctx *c,
|
||||
flow_dbg_perror(uflow, "Couldn't connect flow socket");
|
||||
return rc;
|
||||
}
|
||||
+ uflow->s[sidei] = s;
|
||||
|
||||
/* It's possible, if unlikely, that we could receive some packets in
|
||||
* between the bind() and connect() which may or may not be for this
|
||||
@@ -159,7 +160,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
|
||||
|
||||
flow_foreach_sidei(sidei) {
|
||||
if (pif_is_socket(uflow->f.pif[sidei]))
|
||||
- if ((uflow->s[sidei] = udp_flow_sock(c, uflow, sidei)) < 0)
|
||||
+ if (udp_flow_sock(c, uflow, sidei) < 0)
|
||||
goto cancel;
|
||||
}
|
||||
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,94 @@
|
||||
From 766e42ea2c6f57547cfee4289ca27168149bb174 Mon Sep 17 00:00:00 2001
|
||||
From: Laurent Vivier <lvivier@redhat.com>
|
||||
Date: Fri, 9 Jan 2026 17:54:37 +0100
|
||||
Subject: [PATCH 10/18] tcp_splice: Refactor tcp_splice_conn_epoll_events() to
|
||||
per-side computation
|
||||
|
||||
The function tcp_splice_conn_epoll_events() currently takes an array of
|
||||
struct epoll_event and fills in the .events field for both sides using
|
||||
flow_foreach_sidei() loops.
|
||||
|
||||
This works, but the function is doing two conceptually separate things
|
||||
at once: computing events for side 0 and computing events for side 1.
|
||||
The OUT_WAIT handling is particularly subtle, as it has cross-side
|
||||
effects: when OUT_WAIT(sidei) is set, we add EPOLLOUT to ev[sidei] but
|
||||
also remove EPOLLIN from ev[!sidei].
|
||||
|
||||
Refactor to make the function compute events for a single side at a
|
||||
time, taking sidei as a parameter and returning uint32_t. This makes
|
||||
the logic more focused and easier to follow. The cross-side effects of
|
||||
OUT_WAIT are preserved by checking both OUT_WAIT(sidei) and
|
||||
OUT_WAIT(!sidei) within each call.
|
||||
|
||||
The caller tcp_splice_epoll_ctl() now invokes the function twice, once
|
||||
for each side, making the two-sided nature of the operation explicit.
|
||||
|
||||
No functional change.
|
||||
|
||||
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit 23da651ab08e564b84c532f6f93b0817d2ae850f)
|
||||
---
|
||||
tcp_splice.c | 33 ++++++++++++++-------------------
|
||||
1 file changed, 14 insertions(+), 19 deletions(-)
|
||||
|
||||
diff --git a/tcp_splice.c b/tcp_splice.c
|
||||
index 4405224..bf4ff46 100644
|
||||
--- a/tcp_splice.c
|
||||
+++ b/tcp_splice.c
|
||||
@@ -114,29 +114,23 @@ static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
|
||||
* @events: Connection event flags
|
||||
* @ev: Events to fill in, 0 is accepted socket, 1 is connecting socket
|
||||
*/
|
||||
-static void tcp_splice_conn_epoll_events(uint16_t events,
|
||||
- struct epoll_event ev[])
|
||||
+static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei)
|
||||
{
|
||||
- unsigned sidei;
|
||||
-
|
||||
- flow_foreach_sidei(sidei)
|
||||
- ev[sidei].events = 0;
|
||||
+ uint32_t e = 0;
|
||||
|
||||
if (events & SPLICE_ESTABLISHED) {
|
||||
- flow_foreach_sidei(sidei) {
|
||||
- if (!(events & FIN_SENT(!sidei)))
|
||||
- ev[sidei].events = EPOLLIN | EPOLLRDHUP;
|
||||
- }
|
||||
- } else if (events & SPLICE_CONNECT) {
|
||||
- ev[1].events = EPOLLOUT;
|
||||
+ if (!(events & FIN_SENT(!sidei)))
|
||||
+ e = EPOLLIN | EPOLLRDHUP;
|
||||
+ } else if (sidei == 1 && events & SPLICE_CONNECT) {
|
||||
+ e = EPOLLOUT;
|
||||
}
|
||||
|
||||
- flow_foreach_sidei(sidei) {
|
||||
- if (events & OUT_WAIT(sidei)) {
|
||||
- ev[sidei].events |= EPOLLOUT;
|
||||
- ev[!sidei].events &= ~EPOLLIN;
|
||||
- }
|
||||
- }
|
||||
+ if (events & OUT_WAIT(sidei))
|
||||
+ e |= EPOLLOUT;
|
||||
+ if (events & OUT_WAIT(!sidei))
|
||||
+ e &= ~EPOLLIN;
|
||||
+
|
||||
+ return e;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -161,7 +155,8 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
|
||||
struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 },
|
||||
{ .data.u64 = ref[1].u64 } };
|
||||
|
||||
- tcp_splice_conn_epoll_events(conn->events, ev);
|
||||
+ ev[0].events = tcp_splice_conn_epoll_events(conn->events, 0);
|
||||
+ ev[1].events = tcp_splice_conn_epoll_events(conn->events, 1);
|
||||
|
||||
|
||||
if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) ||
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,489 @@
|
||||
From 79dab11a029025e485faf4a3f5ea1ed4538fb64b Mon Sep 17 00:00:00 2001
|
||||
From: Laurent Vivier <lvivier@redhat.com>
|
||||
Date: Fri, 9 Jan 2026 17:54:38 +0100
|
||||
Subject: [PATCH 11/18] flow: Introduce flow_epoll_set() to centralize epoll
|
||||
operations
|
||||
|
||||
Currently, each flow type (TCP, TCP_SPLICE, PING, UDP) has its own
|
||||
code to add or modify file descriptors in epoll. This leads to
|
||||
duplicated boilerplate code across icmp.c, tcp.c, tcp_splice.c, and
|
||||
udp_flow.c, each setting up epoll_ref unions and calling epoll_ctl()
|
||||
with flow-type-specific details.
|
||||
|
||||
Introduce flow_epoll_set() in flow.c to handle epoll operations for
|
||||
all flow types in a unified way.
|
||||
|
||||
This will be needed to migrate a queue pair from one epollfd to another.
|
||||
|
||||
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit c0be730f2aa2243a132b3ee40c2bf05ebc84fedf)
|
||||
---
|
||||
flow.c | 37 ++++++++++++++++++++++++
|
||||
flow.h | 2 ++
|
||||
icmp.c | 10 ++-----
|
||||
tcp.c | 48 ++++++++++++++++++------------
|
||||
tcp_splice.c | 82 ++++++++++++++++++++++++----------------------------
|
||||
udp_flow.c | 11 ++-----
|
||||
6 files changed, 111 insertions(+), 79 deletions(-)
|
||||
|
||||
diff --git a/flow.c b/flow.c
|
||||
index 4f53486..cefe6c8 100644
|
||||
--- a/flow.c
|
||||
+++ b/flow.c
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "flow.h"
|
||||
#include "flow_table.h"
|
||||
#include "repair.h"
|
||||
+#include "epoll_ctl.h"
|
||||
|
||||
const char *flow_state_str[] = {
|
||||
[FLOW_STATE_FREE] = "FREE",
|
||||
@@ -53,6 +54,16 @@ const uint8_t flow_proto[] = {
|
||||
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
|
||||
"flow_proto[] doesn't match enum flow_type");
|
||||
|
||||
+static const enum epoll_type flow_epoll[] = {
|
||||
+ [FLOW_TCP] = EPOLL_TYPE_TCP,
|
||||
+ [FLOW_TCP_SPLICE] = EPOLL_TYPE_TCP_SPLICE,
|
||||
+ [FLOW_PING4] = EPOLL_TYPE_PING,
|
||||
+ [FLOW_PING6] = EPOLL_TYPE_PING,
|
||||
+ [FLOW_UDP] = EPOLL_TYPE_UDP,
|
||||
+};
|
||||
+static_assert(ARRAY_SIZE(flow_epoll) == FLOW_NUM_TYPES,
|
||||
+ "flow_epoll[] doesn't match enum flow_type");
|
||||
+
|
||||
#define foreach_established_tcp_flow(flow) \
|
||||
flow_foreach_of_type((flow), FLOW_TCP) \
|
||||
if (!tcp_flow_is_established(&(flow)->tcp)) \
|
||||
@@ -390,6 +401,32 @@ void flow_epollid_clear(struct flow_common *f)
|
||||
f->epollid = EPOLLFD_ID_INVALID;
|
||||
}
|
||||
|
||||
+/**
|
||||
+ * flow_epoll_set() - Add or modify epoll registration for a flow socket
|
||||
+ * @f: Flow to register socket for
|
||||
+ * @command: epoll_ctl() command: EPOLL_CTL_ADD or EPOLL_CTL_MOD
|
||||
+ * @events: epoll events to watch for
|
||||
+ * @fd: File descriptor to register
|
||||
+ * @sidei: Side index of the flow
|
||||
+ *
|
||||
+ * Return: 0 on success, -1 on error (from epoll_ctl())
|
||||
+ */
|
||||
+int flow_epoll_set(const struct flow_common *f, int command, uint32_t events,
|
||||
+ int fd, unsigned int sidei)
|
||||
+{
|
||||
+ struct epoll_event ev;
|
||||
+ union epoll_ref ref;
|
||||
+
|
||||
+ ref.fd = fd;
|
||||
+ ref.type = flow_epoll[f->type];
|
||||
+ ref.flowside = flow_sidx(f, sidei);
|
||||
+
|
||||
+ ev.events = events;
|
||||
+ ev.data.u64 = ref.u64;
|
||||
+
|
||||
+ return epoll_ctl(flow_epollfd(f), command, fd, &ev);
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* flow_epollid_register() - Initialize the epoll id -> fd mapping
|
||||
* @epollid: epoll id to associate to
|
||||
diff --git a/flow.h b/flow.h
|
||||
index b43b0b1..1b78d59 100644
|
||||
--- a/flow.h
|
||||
+++ b/flow.h
|
||||
@@ -265,6 +265,8 @@ bool flow_in_epoll(const struct flow_common *f);
|
||||
int flow_epollfd(const struct flow_common *f);
|
||||
void flow_epollid_set(struct flow_common *f, int epollid);
|
||||
void flow_epollid_clear(struct flow_common *f);
|
||||
+int flow_epoll_set(const struct flow_common *f, int command, uint32_t events,
|
||||
+ int fd, unsigned int sidei);
|
||||
void flow_epollid_register(int epollid, int epollfd);
|
||||
void flow_defer_handler(const struct ctx *c, const struct timespec *now);
|
||||
int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
|
||||
diff --git a/icmp.c b/icmp.c
|
||||
index 9564c49..eb7f11b 100644
|
||||
--- a/icmp.c
|
||||
+++ b/icmp.c
|
||||
@@ -177,7 +177,6 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
|
||||
union flow *flow = flow_alloc();
|
||||
struct icmp_ping_flow *pingf;
|
||||
const struct flowside *tgt;
|
||||
- union epoll_ref ref;
|
||||
|
||||
if (!flow)
|
||||
return NULL;
|
||||
@@ -211,13 +210,10 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
|
||||
goto cancel;
|
||||
|
||||
flow_epollid_set(&pingf->f, EPOLLFD_ID_DEFAULT);
|
||||
-
|
||||
- ref.type = EPOLL_TYPE_PING;
|
||||
- ref.flowside = FLOW_SIDX(flow, TGTSIDE);
|
||||
- ref.fd = pingf->sock;
|
||||
-
|
||||
- if (epoll_add(flow_epollfd(&pingf->f), EPOLLIN, ref) < 0) {
|
||||
+ if (flow_epoll_set(&pingf->f, EPOLL_CTL_ADD, EPOLLIN, pingf->sock,
|
||||
+ TGTSIDE) < 0) {
|
||||
close(pingf->sock);
|
||||
+ flow_epollid_clear(&pingf->f);
|
||||
goto cancel;
|
||||
}
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 8f4f087..146d460 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -523,34 +523,44 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
|
||||
|
||||
/**
|
||||
* tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
|
||||
- * @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure (not on deletion)
|
||||
*/
|
||||
-static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
+static int tcp_epoll_ctl(struct tcp_tap_conn *conn)
|
||||
{
|
||||
- int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
|
||||
- union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
|
||||
- .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
|
||||
- struct epoll_event ev = { .data.u64 = ref.u64 };
|
||||
- int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f)
|
||||
- : c->epollfd;
|
||||
+ uint32_t events;
|
||||
+ int m;
|
||||
|
||||
if (conn->events == CLOSED) {
|
||||
- if (flow_in_epoll(&conn->f))
|
||||
+ if (flow_in_epoll(&conn->f)) {
|
||||
+ int epollfd = flow_epollfd(&conn->f);
|
||||
+
|
||||
epoll_del(epollfd, conn->sock);
|
||||
- if (conn->timer != -1)
|
||||
- epoll_del(epollfd, conn->timer);
|
||||
+ if (conn->timer != -1)
|
||||
+ epoll_del(epollfd, conn->timer);
|
||||
+ }
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
|
||||
- ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
|
||||
+ events = tcp_conn_epoll_events(conn->events, conn->flags);
|
||||
|
||||
- if (epoll_ctl(epollfd, m, conn->sock, &ev))
|
||||
- return -errno;
|
||||
+ if (flow_in_epoll(&conn->f)) {
|
||||
+ m = EPOLL_CTL_MOD;
|
||||
+ } else {
|
||||
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
|
||||
+ m = EPOLL_CTL_ADD;
|
||||
+ }
|
||||
|
||||
- flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
|
||||
+ if (flow_epoll_set(&conn->f, m, events, conn->sock,
|
||||
+ !TAPSIDE(conn)) < 0) {
|
||||
+ int ret = -errno;
|
||||
+
|
||||
+ if (m == EPOLL_CTL_ADD)
|
||||
+ flow_epollid_clear(&conn->f);
|
||||
+ return ret;
|
||||
+ }
|
||||
|
||||
if (conn->timer != -1) {
|
||||
union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
|
||||
@@ -681,7 +691,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
}
|
||||
|
||||
if (flag == STALLED || flag == ~STALLED)
|
||||
- tcp_epoll_ctl(c, conn);
|
||||
+ tcp_epoll_ctl(conn);
|
||||
|
||||
if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE ||
|
||||
(flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
|
||||
@@ -738,7 +748,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
} else {
|
||||
if (event == CLOSED)
|
||||
flow_hash_remove(c, TAP_SIDX(conn));
|
||||
- tcp_epoll_ctl(c, conn);
|
||||
+ tcp_epoll_ctl(conn);
|
||||
}
|
||||
|
||||
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
|
||||
@@ -1753,7 +1763,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
|
||||
conn_event(c, conn, TAP_SYN_ACK_SENT);
|
||||
}
|
||||
|
||||
- tcp_epoll_ctl(c, conn);
|
||||
+ tcp_epoll_ctl(conn);
|
||||
|
||||
if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
|
||||
socklen_t sl = sizeof(sa);
|
||||
@@ -4021,7 +4031,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
|
||||
tcp_send_flag(c, conn, ACK);
|
||||
tcp_data_from_sock(c, conn);
|
||||
|
||||
- if ((rc = tcp_epoll_ctl(c, conn))) {
|
||||
+ if ((rc = tcp_epoll_ctl(conn))) {
|
||||
flow_dbg(conn,
|
||||
"Failed to subscribe to epoll for migrated socket: %s",
|
||||
strerror_(-rc));
|
||||
diff --git a/tcp_splice.c b/tcp_splice.c
|
||||
index bf4ff46..a7c04ca 100644
|
||||
--- a/tcp_splice.c
|
||||
+++ b/tcp_splice.c
|
||||
@@ -135,37 +135,31 @@ static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei)
|
||||
|
||||
/**
|
||||
* tcp_splice_epoll_ctl() - Add/modify/delete epoll state from connection events
|
||||
- * @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure (not on deletion)
|
||||
*/
|
||||
-static int tcp_splice_epoll_ctl(const struct ctx *c,
|
||||
- struct tcp_splice_conn *conn)
|
||||
+static int tcp_splice_epoll_ctl(struct tcp_splice_conn *conn)
|
||||
{
|
||||
- int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f)
|
||||
- : c->epollfd;
|
||||
- int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
|
||||
- const union epoll_ref ref[SIDES] = {
|
||||
- { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0],
|
||||
- .flowside = FLOW_SIDX(conn, 0) },
|
||||
- { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[1],
|
||||
- .flowside = FLOW_SIDX(conn, 1) }
|
||||
- };
|
||||
- struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 },
|
||||
- { .data.u64 = ref[1].u64 } };
|
||||
-
|
||||
- ev[0].events = tcp_splice_conn_epoll_events(conn->events, 0);
|
||||
- ev[1].events = tcp_splice_conn_epoll_events(conn->events, 1);
|
||||
-
|
||||
-
|
||||
- if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) ||
|
||||
- epoll_ctl(epollfd, m, conn->s[1], &ev[1])) {
|
||||
+ uint32_t events[2];
|
||||
+ int m;
|
||||
+
|
||||
+ if (flow_in_epoll(&conn->f)) {
|
||||
+ m = EPOLL_CTL_MOD;
|
||||
+ } else {
|
||||
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
|
||||
+ m = EPOLL_CTL_ADD;
|
||||
+ }
|
||||
+
|
||||
+ events[0] = tcp_splice_conn_epoll_events(conn->events, 0);
|
||||
+ events[1] = tcp_splice_conn_epoll_events(conn->events, 1);
|
||||
+
|
||||
+ if (flow_epoll_set(&conn->f, m, events[0], conn->s[0], 0) ||
|
||||
+ flow_epoll_set(&conn->f, m, events[1], conn->s[1], 1)) {
|
||||
int ret = -errno;
|
||||
flow_perror(conn, "ERROR on epoll_ctl()");
|
||||
return ret;
|
||||
}
|
||||
- flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -205,7 +199,7 @@ static void conn_flag_do(struct tcp_splice_conn *conn,
|
||||
}
|
||||
}
|
||||
|
||||
-#define conn_flag(c, conn, flag) \
|
||||
+#define conn_flag(conn, flag) \
|
||||
do { \
|
||||
flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
|
||||
conn_flag_do(conn, flag); \
|
||||
@@ -213,12 +207,10 @@ static void conn_flag_do(struct tcp_splice_conn *conn,
|
||||
|
||||
/**
|
||||
* conn_event_do() - Set and log connection events, update epoll state
|
||||
- * @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
* @event: Connection event
|
||||
*/
|
||||
-static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
|
||||
- unsigned long event)
|
||||
+static void conn_event_do(struct tcp_splice_conn *conn, unsigned long event)
|
||||
{
|
||||
if (event & (event - 1)) {
|
||||
int flag_index = fls(~event);
|
||||
@@ -240,14 +232,14 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
|
||||
flow_dbg(conn, "%s", tcp_splice_event_str[flag_index]);
|
||||
}
|
||||
|
||||
- if (tcp_splice_epoll_ctl(c, conn))
|
||||
- conn_flag(c, conn, CLOSING);
|
||||
+ if (tcp_splice_epoll_ctl(conn))
|
||||
+ conn_flag(conn, CLOSING);
|
||||
}
|
||||
|
||||
-#define conn_event(c, conn, event) \
|
||||
+#define conn_event(conn, event) \
|
||||
do { \
|
||||
flow_trace(conn, "event at %s:%i",__func__, __LINE__); \
|
||||
- conn_event_do(c, conn, event); \
|
||||
+ conn_event_do(conn, event); \
|
||||
} while (0)
|
||||
|
||||
|
||||
@@ -315,7 +307,7 @@ static int tcp_splice_connect_finish(const struct ctx *c,
|
||||
if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
|
||||
flow_perror(conn, "cannot create %d->%d pipe",
|
||||
sidei, !sidei);
|
||||
- conn_flag(c, conn, CLOSING);
|
||||
+ conn_flag(conn, CLOSING);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
@@ -329,7 +321,7 @@ static int tcp_splice_connect_finish(const struct ctx *c,
|
||||
}
|
||||
|
||||
if (!(conn->events & SPLICE_ESTABLISHED))
|
||||
- conn_event(c, conn, SPLICE_ESTABLISHED);
|
||||
+ conn_event(conn, SPLICE_ESTABLISHED);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -376,7 +368,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
|
||||
|
||||
pif_sockaddr(c, &sa, tgtpif, &tgt->eaddr, tgt->eport);
|
||||
|
||||
- conn_event(c, conn, SPLICE_CONNECT);
|
||||
+ conn_event(conn, SPLICE_CONNECT);
|
||||
|
||||
if (connect(conn->s[1], &sa.sa, socklen_inany(&sa))) {
|
||||
if (errno != EINPROGRESS) {
|
||||
@@ -385,7 +377,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
|
||||
return -errno;
|
||||
}
|
||||
} else {
|
||||
- conn_event(c, conn, SPLICE_ESTABLISHED);
|
||||
+ conn_event(conn, SPLICE_ESTABLISHED);
|
||||
return tcp_splice_connect_finish(c, conn);
|
||||
}
|
||||
|
||||
@@ -445,7 +437,7 @@ void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0)
|
||||
flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0);
|
||||
|
||||
if (tcp_splice_connect(c, conn))
|
||||
- conn_flag(c, conn, CLOSING);
|
||||
+ conn_flag(conn, CLOSING);
|
||||
|
||||
FLOW_ACTIVATE(conn);
|
||||
}
|
||||
@@ -494,14 +486,14 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
|
||||
|
||||
if (events & EPOLLOUT) {
|
||||
fromsidei = !evsidei;
|
||||
- conn_event(c, conn, ~OUT_WAIT(evsidei));
|
||||
+ conn_event(conn, ~OUT_WAIT(evsidei));
|
||||
} else {
|
||||
fromsidei = evsidei;
|
||||
}
|
||||
|
||||
if (events & EPOLLRDHUP)
|
||||
/* For side 0 this is fake, but implied */
|
||||
- conn_event(c, conn, FIN_RCVD(evsidei));
|
||||
+ conn_event(conn, FIN_RCVD(evsidei));
|
||||
|
||||
swap:
|
||||
eof = 0;
|
||||
@@ -536,7 +528,7 @@ retry:
|
||||
more = SPLICE_F_MORE;
|
||||
|
||||
if (conn->flags & lowat_set_flag)
|
||||
- conn_flag(c, conn, lowat_act_flag);
|
||||
+ conn_flag(conn, lowat_act_flag);
|
||||
}
|
||||
|
||||
do
|
||||
@@ -568,8 +560,8 @@ retry:
|
||||
"Setting SO_RCVLOWAT %i: %s",
|
||||
lowat, strerror_(errno));
|
||||
} else {
|
||||
- conn_flag(c, conn, lowat_set_flag);
|
||||
- conn_flag(c, conn, lowat_act_flag);
|
||||
+ conn_flag(conn, lowat_set_flag);
|
||||
+ conn_flag(conn, lowat_act_flag);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -583,7 +575,7 @@ retry:
|
||||
if (conn->read[fromsidei] == conn->written[fromsidei])
|
||||
break;
|
||||
|
||||
- conn_event(c, conn, OUT_WAIT(!fromsidei));
|
||||
+ conn_event(conn, OUT_WAIT(!fromsidei));
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -605,7 +597,7 @@ retry:
|
||||
if ((conn->events & FIN_RCVD(sidei)) &&
|
||||
!(conn->events & FIN_SENT(!sidei))) {
|
||||
shutdown(conn->s[!sidei], SHUT_WR);
|
||||
- conn_event(c, conn, FIN_SENT(!sidei));
|
||||
+ conn_event(conn, FIN_SENT(!sidei));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -626,7 +618,7 @@ retry:
|
||||
return;
|
||||
|
||||
close:
|
||||
- conn_flag(c, conn, CLOSING);
|
||||
+ conn_flag(conn, CLOSING);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -762,10 +754,10 @@ void tcp_splice_timer(struct tcp_splice_conn *conn)
|
||||
flow_trace(conn, "can't set SO_RCVLOWAT on %d",
|
||||
conn->s[sidei]);
|
||||
}
|
||||
- conn_flag(c, conn, ~RCVLOWAT_SET(sidei));
|
||||
+ conn_flag(conn, ~RCVLOWAT_SET(sidei));
|
||||
}
|
||||
}
|
||||
|
||||
flow_foreach_sidei(sidei)
|
||||
- conn_flag(c, conn, ~RCVLOWAT_ACT(sidei));
|
||||
+ conn_flag(conn, ~RCVLOWAT_ACT(sidei));
|
||||
}
|
||||
diff --git a/udp_flow.c b/udp_flow.c
|
||||
index c4cf35c..80b1543 100644
|
||||
--- a/udp_flow.c
|
||||
+++ b/udp_flow.c
|
||||
@@ -74,7 +74,6 @@ static int udp_flow_sock(const struct ctx *c,
|
||||
{
|
||||
const struct flowside *side = &uflow->f.side[sidei];
|
||||
uint8_t pif = uflow->f.pif[sidei];
|
||||
- union epoll_ref ref;
|
||||
int rc;
|
||||
int s;
|
||||
|
||||
@@ -84,14 +83,10 @@ static int udp_flow_sock(const struct ctx *c,
|
||||
return s;
|
||||
}
|
||||
|
||||
- ref.type = EPOLL_TYPE_UDP;
|
||||
- ref.flowside = FLOW_SIDX(uflow, sidei);
|
||||
- ref.fd = s;
|
||||
-
|
||||
flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT);
|
||||
-
|
||||
- rc = epoll_add(flow_epollfd(&uflow->f), EPOLLIN, ref);
|
||||
- if (rc < 0) {
|
||||
+ if (flow_epoll_set(&uflow->f, EPOLL_CTL_ADD, EPOLLIN, s, sidei) < 0) {
|
||||
+ rc = -errno;
|
||||
+ flow_epollid_clear(&uflow->f);
|
||||
close(s);
|
||||
return rc;
|
||||
}
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,99 @@
|
||||
From 73a9bee3e1ffe447cb041c4826465a71730c2ecf Mon Sep 17 00:00:00 2001
|
||||
From: David Gibson <david@gibson.dropbear.id.au>
|
||||
Date: Tue, 27 Jan 2026 19:39:52 +1100
|
||||
Subject: [PATCH 12/18] tcp: Properly propagate tap-side RST to socket side
|
||||
|
||||
When the guest sends a TCP RST, or on certain error conditions, we want to
|
||||
signal the abnormal termination of a TCP connection to the peer with an
|
||||
RST as well. We attempt to do that by close()ing the socket.
|
||||
|
||||
That doesn't work: a close() will usually send a FIN, rather than an RST.
|
||||
The standard method of forcing an RST on a socket is to set the SO_LINGER
|
||||
socket option with a 0 timeout, then close().
|
||||
|
||||
Update the tcp_rst() path to do this, so it forces a socket side RST.
|
||||
Update the handling of a guest side RST to use the same path (minus
|
||||
sending a tap side RST) so that we properly propagate guest RSTs to the
|
||||
peer.
|
||||
|
||||
Link: https://bugs.passt.top/show_bug.cgi?id=191
|
||||
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit cce94e92fb3d2a90730c125f2bad32c9ed51da3f)
|
||||
---
|
||||
tcp.c | 37 +++++++++++++++++++++++++++++++++----
|
||||
1 file changed, 33 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 146d460..602e810 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1417,7 +1417,34 @@ static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
}
|
||||
|
||||
/**
|
||||
- * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
|
||||
+ * tcp_sock_rst() - Close TCP connection forcing RST on socket side
|
||||
+ * @c: Execution context
|
||||
+ * @conn: Connection pointer
|
||||
+ */
|
||||
+static void tcp_sock_rst(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
+{
|
||||
+ const struct linger linger0 = {
|
||||
+ .l_onoff = 1,
|
||||
+ .l_linger = 0,
|
||||
+ };
|
||||
+
|
||||
+ /* Force RST on socket to inform the peer
|
||||
+ *
|
||||
+ * We do this by setting SO_LINGER with 0 timeout, which means that
|
||||
+ * close() will send an RST (unless the connection is already closed in
|
||||
+ * both directions).
|
||||
+ */
|
||||
+ if (setsockopt(conn->sock, SOL_SOCKET,
|
||||
+ SO_LINGER, &linger0, sizeof(linger0)) < 0) {
|
||||
+ flow_dbg_perror(conn,
|
||||
+ "SO_LINGER failed, may not send RST to peer");
|
||||
+ }
|
||||
+
|
||||
+ conn_event(c, conn, CLOSED);
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * tcp_rst_do() - Reset a tap connection: send RST segment on both sides, close
|
||||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
*/
|
||||
@@ -1426,8 +1453,10 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
if (conn->events == CLOSED)
|
||||
return;
|
||||
|
||||
+ /* Send RST on tap */
|
||||
tcp_send_flag(c, conn, RST);
|
||||
- conn_event(c, conn, CLOSED);
|
||||
+
|
||||
+ tcp_sock_rst(c, conn);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1898,7 +1927,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
return -1;
|
||||
|
||||
if (th->rst) {
|
||||
- conn_event(c, conn, CLOSED);
|
||||
+ tcp_sock_rst(c, conn);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -2262,7 +2291,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
flow_trace(conn, "packet length %zu from tap", l4len);
|
||||
|
||||
if (th->rst) {
|
||||
- conn_event(c, conn, CLOSED);
|
||||
+ tcp_sock_rst(c, conn);
|
||||
return 1;
|
||||
}
|
||||
|
||||
--
|
||||
2.47.1
|
||||
|
||||
239
SOURCES/0013-udp-Split-activity-timeouts-for-UDP-flows.patch
Normal file
239
SOURCES/0013-udp-Split-activity-timeouts-for-UDP-flows.patch
Normal file
@ -0,0 +1,239 @@
|
||||
From 3d6804c07d1b9ed26fea88d680a1734ea1069d91 Mon Sep 17 00:00:00 2001
|
||||
From: Yumei Huang <yuhuang@redhat.com>
|
||||
Date: Sat, 14 Feb 2026 15:31:36 +0800
|
||||
Subject: [PATCH 13/18] udp: Split activity timeouts for UDP flows
|
||||
|
||||
Frequent DNS queries over UDP from a container or guest can result
|
||||
in many sockets shown in ss(8), typically one per flow. This is
|
||||
expected and harmless, but it can make the output of ss(8) look
|
||||
noisy and potentially concern users.
|
||||
|
||||
This patch splits UDP flow timeouts into two, mirroring the Linux
|
||||
kernel, and sources the values from kernel parameters. The shorter
|
||||
timeout is applied to unidirectional flows and minimal bidirectional
|
||||
exchanges (single datagram and reply), while the longer timeout is
|
||||
used for bidirectional flows with multiple datagrams on either side.
|
||||
|
||||
Link: https://bugs.passt.top/show_bug.cgi?id=197
|
||||
Suggested-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Signed-off-by: Yumei Huang <yuhuang@redhat.com>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit bebafa72a982784164a7d556bd860ec0ed1e02c7)
|
||||
---
|
||||
contrib/apparmor/abstractions/passt | 4 ++++
|
||||
udp.c | 34 +++++++++++++++++++++++++++--
|
||||
udp.h | 4 ++++
|
||||
udp_flow.c | 30 ++++++++++++++++++++-----
|
||||
udp_flow.h | 4 ++++
|
||||
5 files changed, 69 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/contrib/apparmor/abstractions/passt b/contrib/apparmor/abstractions/passt
|
||||
index 43fd63f..e8ed513 100644
|
||||
--- a/contrib/apparmor/abstractions/passt
|
||||
+++ b/contrib/apparmor/abstractions/passt
|
||||
@@ -36,6 +36,10 @@
|
||||
|
||||
@{PROC}/sys/net/ipv4/ip_local_port_range r, # fwd_probe_ephemeral()
|
||||
|
||||
+ # udp_get_timeout_params(), udp.c
|
||||
+ @{PROC}/sys/net/netfilter/nf_conntrack_udp_timeout r,
|
||||
+ @{PROC}/sys/net/netfilter/nf_conntrack_udp_timeout_stream r,
|
||||
+
|
||||
network netlink raw, # nl_sock_init_do(), netlink.c
|
||||
|
||||
network inet stream, # tcp.c
|
||||
diff --git a/udp.c b/udp.c
|
||||
index 08bec50..32d70b6 100644
|
||||
--- a/udp.c
|
||||
+++ b/udp.c
|
||||
@@ -26,7 +26,10 @@
|
||||
*
|
||||
* We track pseudo-connections of this type as flow table entries of type
|
||||
* FLOW_UDP. We store the time of the last traffic on the flow in uflow->ts,
|
||||
- * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds.
|
||||
+ * and let the flow expire if there is no traffic for UDP_TIMEOUT seconds for
|
||||
+ * unidirectional flows and flows with only one datagram and one reply, or
|
||||
+ * UDP_TIMEOUT_STREAM seconds for bidirectional flows with more than one
|
||||
+ * datagram on either side.
|
||||
*
|
||||
* NOTE: This won't handle multicast protocols, or some protocols with different
|
||||
* port usage. We'll need specific logic if we want to handle those.
|
||||
@@ -118,6 +121,13 @@
|
||||
|
||||
#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
|
||||
|
||||
+#define UDP_TIMEOUT "/proc/sys/net/netfilter/nf_conntrack_udp_timeout"
|
||||
+#define UDP_TIMEOUT_STREAM \
|
||||
+ "/proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream"
|
||||
+
|
||||
+#define UDP_TIMEOUT_DEFAULT 30 /* s */
|
||||
+#define UDP_TIMEOUT_STREAM_DEFAULT 120 /* s */
|
||||
+
|
||||
/* Maximum UDP data to be returned in ICMP messages */
|
||||
#define ICMP4_MAX_DLEN 8
|
||||
#define ICMP6_MAX_DLEN (IPV6_MIN_MTU \
|
||||
@@ -966,7 +976,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
int s = ref.fd;
|
||||
|
||||
flow_trace(uflow, "Received data on reply socket");
|
||||
- uflow->ts = now->tv_sec;
|
||||
+ udp_flow_activity(uflow, !tosidx.sidei, now);
|
||||
|
||||
if (pif_is_socket(topif)) {
|
||||
udp_sock_to_sock(c, ref.fd, n, tosidx);
|
||||
@@ -1301,6 +1311,24 @@ void udp_port_rebind_all(struct ctx *c)
|
||||
udp_port_rebind(c, false);
|
||||
}
|
||||
|
||||
+/**
|
||||
+ * udp_get_timeout_params() - Get host kernel UDP timeout parameters
|
||||
+ * @c: Execution context
|
||||
+ */
|
||||
+static void udp_get_timeout_params(struct ctx *c)
|
||||
+{
|
||||
+ intmax_t v;
|
||||
+
|
||||
+ v = read_file_integer(UDP_TIMEOUT, UDP_TIMEOUT_DEFAULT);
|
||||
+ c->udp.timeout = v;
|
||||
+
|
||||
+ v = read_file_integer(UDP_TIMEOUT_STREAM, UDP_TIMEOUT_STREAM_DEFAULT);
|
||||
+ c->udp.stream_timeout = v;
|
||||
+
|
||||
+ debug("Using UDP timeout parameters, timeout: %d, stream_timeout: %d",
|
||||
+ c->udp.timeout, c->udp.stream_timeout);
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* udp_init() - Initialise per-socket data, and sockets in namespace
|
||||
* @c: Execution context
|
||||
@@ -1311,6 +1339,8 @@ int udp_init(struct ctx *c)
|
||||
{
|
||||
ASSERT(!c->no_udp);
|
||||
|
||||
+ udp_get_timeout_params(c);
|
||||
+
|
||||
udp_iov_init(c);
|
||||
|
||||
if (c->mode == MODE_PASTA) {
|
||||
diff --git a/udp.h b/udp.h
|
||||
index 03e8dc5..618f258 100644
|
||||
--- a/udp.h
|
||||
+++ b/udp.h
|
||||
@@ -42,11 +42,15 @@ union udp_listen_epoll_ref {
|
||||
* @fwd_in: Port forwarding configuration for inbound packets
|
||||
* @fwd_out: Port forwarding configuration for outbound packets
|
||||
* @timer_run: Timestamp of most recent timer run
|
||||
+ * @timeout: Timeout for unidirectional flows (in s)
|
||||
+ * @stream_timeout: Timeout for stream-like flows (in s)
|
||||
*/
|
||||
struct udp_ctx {
|
||||
struct fwd_ports fwd_in;
|
||||
struct fwd_ports fwd_out;
|
||||
struct timespec timer_run;
|
||||
+ int timeout;
|
||||
+ int stream_timeout;
|
||||
};
|
||||
|
||||
#endif /* UDP_H */
|
||||
diff --git a/udp_flow.c b/udp_flow.c
|
||||
index 80b1543..4a8d4b6 100644
|
||||
--- a/udp_flow.c
|
||||
+++ b/udp_flow.c
|
||||
@@ -17,8 +17,6 @@
|
||||
#include "udp_internal.h"
|
||||
#include "epoll_ctl.h"
|
||||
|
||||
-#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
|
||||
-
|
||||
/**
|
||||
* udp_at_sidx() - Get UDP specific flow at given sidx
|
||||
* @sidx: Flow and side to retrieve
|
||||
@@ -152,6 +150,8 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
|
||||
uflow->ts = now->tv_sec;
|
||||
uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
|
||||
uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = 0;
|
||||
+ uflow->activity[INISIDE] = 1;
|
||||
+ uflow->activity[TGTSIDE] = 0;
|
||||
|
||||
flow_foreach_sidei(sidei) {
|
||||
if (pif_is_socket(uflow->f.pif[sidei]))
|
||||
@@ -227,7 +227,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
|
||||
|
||||
sidx = flow_lookup_sa(c, IPPROTO_UDP, pif, s_in, dst, port);
|
||||
if ((uflow = udp_at_sidx(sidx))) {
|
||||
- uflow->ts = now->tv_sec;
|
||||
+ udp_flow_activity(uflow, sidx.sidei, now);
|
||||
return flow_sidx_opposite(sidx);
|
||||
}
|
||||
|
||||
@@ -284,7 +284,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
|
||||
sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr,
|
||||
srcport, dstport);
|
||||
if ((uflow = udp_at_sidx(sidx))) {
|
||||
- uflow->ts = now->tv_sec;
|
||||
+ udp_flow_activity(uflow, sidx.sidei, now);
|
||||
return flow_sidx_opposite(sidx);
|
||||
}
|
||||
|
||||
@@ -361,9 +361,29 @@ bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
|
||||
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
|
||||
const struct timespec *now)
|
||||
{
|
||||
- if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT)
|
||||
+ int timeout = c->udp.timeout;
|
||||
+
|
||||
+ if (uflow->activity[TGTSIDE] &&
|
||||
+ (uflow->activity[INISIDE] > 1 || uflow->activity[TGTSIDE] > 1))
|
||||
+ timeout = c->udp.stream_timeout;
|
||||
+
|
||||
+ if (now->tv_sec - uflow->ts <= timeout)
|
||||
return false;
|
||||
|
||||
udp_flow_close(c, uflow);
|
||||
return true;
|
||||
}
|
||||
+
|
||||
+/**
|
||||
+ * udp_flow_activity() - Track activity of a UDP flow
|
||||
+ * @uflow: UDP flow
|
||||
+ * @sidei: Side index of the flow (INISIDE or TGTSIDE)
|
||||
+ * @now: Current timestamp
|
||||
+ */
|
||||
+void udp_flow_activity(struct udp_flow *uflow, unsigned int sidei,
|
||||
+ const struct timespec *now)
|
||||
+{
|
||||
+ uflow->ts = now->tv_sec;
|
||||
+ if (uflow->activity[sidei] < UINT8_MAX)
|
||||
+ uflow->activity[sidei]++;
|
||||
+}
|
||||
diff --git a/udp_flow.h b/udp_flow.h
|
||||
index 4c528e9..183a429 100644
|
||||
--- a/udp_flow.h
|
||||
+++ b/udp_flow.h
|
||||
@@ -16,6 +16,7 @@
|
||||
* @flush1: @s[1] may have datagrams queued for other flows
|
||||
* @ts: Activity timestamp
|
||||
* @s: Socket fd (or -1) for each side of the flow
|
||||
+ * @activity: Packets seen from each side of the flow, up to UINT8_MAX
|
||||
*/
|
||||
struct udp_flow {
|
||||
/* Must be first element */
|
||||
@@ -29,6 +30,7 @@ struct udp_flow {
|
||||
|
||||
time_t ts;
|
||||
int s[SIDES];
|
||||
+ uint8_t activity[SIDES];
|
||||
};
|
||||
|
||||
struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
|
||||
@@ -46,5 +48,7 @@ bool udp_flow_defer(const struct ctx *c, struct udp_flow *uflow,
|
||||
const struct timespec *now);
|
||||
bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow,
|
||||
const struct timespec *now);
|
||||
+void udp_flow_activity(struct udp_flow *uflow, unsigned int sidei,
|
||||
+ const struct timespec *now);
|
||||
|
||||
#endif /* UDP_FLOW_H */
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,80 @@
|
||||
From 79430cb183b70aee127dfc68846e1f8661820a43 Mon Sep 17 00:00:00 2001
|
||||
From: David Gibson <david@gibson.dropbear.id.au>
|
||||
Date: Wed, 4 Feb 2026 21:41:34 +1000
|
||||
Subject: [PATCH 14/18] tcp: Remove non-working activity timeout mechanism
|
||||
|
||||
This mechanism was intended to remove connections which have had no
|
||||
activity for two hours, even if they haven't closed or been reset
|
||||
internally. It operated by setting the two hour timeout if there are
|
||||
no sooner TCP timeouts to schedule.
|
||||
|
||||
However, when the timer fires, the way we detect the case of the activity
|
||||
timeout doesn't work: it resets the timer for another two hours, then
|
||||
checks if the old timeout was two hours. But the old timeout returned
|
||||
by timerfd_settime() is not the original value of the timer, but the
|
||||
remaining time. Since the timer has just fired it will essentially always
|
||||
be 0.
|
||||
|
||||
For now, just remove the mechanism, disarming the timer entirely if there
|
||||
isn't another upcoming event. We'll re-introduce some sort of activity
|
||||
timeout by a different means later.
|
||||
|
||||
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit e48ce41a1ec2f05846fb66d3847c2c2b6448ca71)
|
||||
---
|
||||
tcp.c | 24 +++---------------------
|
||||
1 file changed, 3 insertions(+), 21 deletions(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 602e810..de2ad38 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -199,9 +199,6 @@
|
||||
* TAP_FIN_ACKED), but no socket activity is detected from the socket within
|
||||
* this time, reset the connection
|
||||
*
|
||||
- * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
|
||||
- * either side, the connection is reset
|
||||
- *
|
||||
* - RTT / 2 elapsed after data segment received from tap without having
|
||||
* sent an ACK segment, or zero-sized window advertised to tap/guest (flag
|
||||
* ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent.
|
||||
@@ -632,7 +629,9 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
|
||||
it.it_value.tv_sec = FIN_TIMEOUT;
|
||||
} else {
|
||||
- it.it_value.tv_sec = ACT_TIMEOUT;
|
||||
+ /* Disarm */
|
||||
+ it.it_value.tv_sec = 0;
|
||||
+ it.it_value.tv_nsec = 0;
|
||||
}
|
||||
|
||||
if (conn->flags & ACK_TO_TAP_DUE) {
|
||||
@@ -2628,23 +2627,6 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
|
||||
tcp_data_from_sock(c, conn);
|
||||
tcp_timer_ctl(c, conn);
|
||||
}
|
||||
- } else {
|
||||
- struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
|
||||
- struct itimerspec old = { { 0 }, { 0 } };
|
||||
-
|
||||
- /* Activity timeout: if it was already set, reset the
|
||||
- * connection, otherwise, it was a left-over from ACK_TO_TAP_DUE
|
||||
- * or ACK_FROM_TAP_DUE, so just set the long timeout in that
|
||||
- * case. This avoids having to preemptively reset the timer on
|
||||
- * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE.
|
||||
- */
|
||||
- if (timerfd_settime(conn->timer, 0, &new, &old))
|
||||
- flow_perror(conn, "failed to set timer");
|
||||
-
|
||||
- if (old.it_value.tv_sec == ACT_TIMEOUT) {
|
||||
- flow_dbg(conn, "activity timeout");
|
||||
- tcp_rst(c, conn);
|
||||
- }
|
||||
}
|
||||
}
|
||||
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,191 @@
|
||||
From a2b1ad31a4d56a59e4d407263a22dee270973ea4 Mon Sep 17 00:00:00 2001
|
||||
From: David Gibson <david@gibson.dropbear.id.au>
|
||||
Date: Wed, 4 Feb 2026 21:41:35 +1000
|
||||
Subject: [PATCH 15/18] tcp: Re-introduce inactivity timeouts based on a clock
|
||||
algorithm
|
||||
|
||||
We previously had a mechanism to remove TCP connections which were
|
||||
inactive for 2 hours. That was broken for a long time, due to poor
|
||||
interactions with the timerfd handling, so we removed it.
|
||||
|
||||
Adding this long scale timer onto the timerfd handling, which mostly
|
||||
handles much shorter timeouts, is tricky to reason about. However, for the
|
||||
inactivity timeouts, we don't require precision. Instead, we can use
|
||||
a 1-bit page replacement / "clock" algorithm. Every INACTIVITY_INTERVAL
|
||||
(2 hours), a global timer marks every TCP connection as tentatively
|
||||
inactive. That flag is cleared if we get any events, either tap side or
|
||||
socket side.
|
||||
|
||||
If the inactive flag is still set when the next INACTIVITY_INTERVAL expires
|
||||
then the connection has been inactive for an extended period and we reset
|
||||
and close it. In practice this means that connections will be removed
|
||||
after 2-4 hours of inactivity.
|
||||
|
||||
This is not a true fix for bug 179, but it does mitigate the damage, by
|
||||
limiting the time that inactive connections will remain around,
|
||||
|
||||
Link: https://bugs.passt.top/show_bug.cgi?id=179
|
||||
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit 1820103fbbf13df98257a3f5c3ba625de624b0b3)
|
||||
---
|
||||
tcp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++----
|
||||
tcp.h | 4 +++-
|
||||
tcp_conn.h | 3 +++
|
||||
3 files changed, 54 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index de2ad38..dd58550 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -207,6 +207,13 @@
|
||||
* TCP_INFO, with a representable range from RTT_STORE_MIN (100 us) to
|
||||
* RTT_STORE_MAX (3276.8 ms). The timeout value is clamped accordingly.
|
||||
*
|
||||
+ * We also use a global interval timer for an activity timeout which doesn't
|
||||
+ * require precision:
|
||||
+ *
|
||||
+ * - INACTIVITY_INTERVAL: if a connection has had no activity for an entire
|
||||
+ * interval, close and reset it. This means that idle connections (without
|
||||
+ * keepalives) will be removed between INACTIVITY_INTERVAL s and
|
||||
+ * 2*INACTIVITY_INTERVAL s after the last activity.
|
||||
*
|
||||
* Summary of data flows (with ESTABLISHED event)
|
||||
* ----------------------------------------------
|
||||
@@ -345,7 +352,8 @@ enum {
|
||||
#define RTO_INIT 1 /* s, RFC 6298 */
|
||||
#define RTO_INIT_AFTER_SYN_RETRIES 3 /* s, RFC 6298 */
|
||||
#define FIN_TIMEOUT 60
|
||||
-#define ACT_TIMEOUT 7200
|
||||
+
|
||||
+#define INACTIVITY_INTERVAL 7200 /* s */
|
||||
|
||||
#define LOW_RTT_TABLE_SIZE 8
|
||||
#define LOW_RTT_THRESHOLD 10 /* us */
|
||||
@@ -2294,6 +2302,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
return 1;
|
||||
}
|
||||
|
||||
+ conn->inactive = false;
|
||||
+
|
||||
if (th->ack && !(conn->events & ESTABLISHED))
|
||||
tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
|
||||
|
||||
@@ -2652,6 +2662,8 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
return;
|
||||
}
|
||||
|
||||
+ conn->inactive = false;
|
||||
+
|
||||
if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) {
|
||||
conn_event(c, conn, CLOSED);
|
||||
return;
|
||||
@@ -3030,6 +3042,38 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
|
||||
}
|
||||
}
|
||||
|
||||
+/**
|
||||
+ * tcp_inactivity() - Scan for and close long-inactive connections
|
||||
+ * @: Execution context
|
||||
+ */
|
||||
+static void tcp_inactivity(struct ctx *c, const struct timespec *now)
|
||||
+{
|
||||
+ union flow *flow;
|
||||
+
|
||||
+ if (now->tv_sec - c->tcp.inactivity_run < INACTIVITY_INTERVAL)
|
||||
+ return;
|
||||
+
|
||||
+ debug("TCP inactivity scan");
|
||||
+ c->tcp.inactivity_run = now->tv_sec;
|
||||
+
|
||||
+ flow_foreach(flow) {
|
||||
+ struct tcp_tap_conn *conn = &flow->tcp;
|
||||
+
|
||||
+ if (flow->f.type != FLOW_TCP)
|
||||
+ continue;
|
||||
+
|
||||
+ if (conn->inactive) {
|
||||
+ /* No activity in this interval, reset */
|
||||
+ flow_dbg(conn, "Inactive for at least %us, resetting",
|
||||
+ INACTIVITY_INTERVAL);
|
||||
+ tcp_rst(c, conn);
|
||||
+ }
|
||||
+
|
||||
+ /* Ready to check fot next interval */
|
||||
+ conn->inactive = true;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* tcp_port_rebind_outbound() - Rebind ports in namespace
|
||||
* @arg: Execution context
|
||||
@@ -3068,13 +3112,13 @@ void tcp_port_rebind_all(struct ctx *c)
|
||||
* @c: Execution context
|
||||
* @now: Current timestamp
|
||||
*/
|
||||
-void tcp_timer(const struct ctx *c, const struct timespec *now)
|
||||
+void tcp_timer(struct ctx *c, const struct timespec *now)
|
||||
{
|
||||
- (void)now;
|
||||
-
|
||||
tcp_sock_refill_init(c);
|
||||
if (c->mode == MODE_PASTA)
|
||||
tcp_splice_refill(c);
|
||||
+
|
||||
+ tcp_inactivity(c, now);
|
||||
}
|
||||
|
||||
/**
|
||||
diff --git a/tcp.h b/tcp.h
|
||||
index 3f21e75..37cfc5b 100644
|
||||
--- a/tcp.h
|
||||
+++ b/tcp.h
|
||||
@@ -23,7 +23,7 @@ int tcp_sock_init(const struct ctx *c, uint8_t pif,
|
||||
in_port_t port);
|
||||
int tcp_init(struct ctx *c);
|
||||
void tcp_port_rebind_all(struct ctx *c);
|
||||
-void tcp_timer(const struct ctx *c, const struct timespec *now);
|
||||
+void tcp_timer(struct ctx *c, const struct timespec *now);
|
||||
void tcp_defer_handler(struct ctx *c);
|
||||
|
||||
void tcp_update_l2_buf(const unsigned char *eth_d);
|
||||
@@ -64,6 +64,7 @@ union tcp_listen_epoll_ref {
|
||||
* @rto_max: Maximum retry timeout (in s)
|
||||
* @syn_retries: SYN retries using exponential backoff timeout
|
||||
* @syn_linear_timeouts: SYN retries before using exponential backoff timeout
|
||||
+ * @inactivity_run: Time we last scanned for inactive connections
|
||||
*/
|
||||
struct tcp_ctx {
|
||||
struct fwd_ports fwd_in;
|
||||
@@ -73,6 +74,7 @@ struct tcp_ctx {
|
||||
int rto_max;
|
||||
uint8_t syn_retries;
|
||||
uint8_t syn_linear_timeouts;
|
||||
+ time_t inactivity_run;
|
||||
};
|
||||
|
||||
#endif /* TCP_H */
|
||||
diff --git a/tcp_conn.h b/tcp_conn.h
|
||||
index 9c6ff9e..2e70d39 100644
|
||||
--- a/tcp_conn.h
|
||||
+++ b/tcp_conn.h
|
||||
@@ -16,6 +16,7 @@
|
||||
* @ws_from_tap: Window scaling factor advertised from tap/guest
|
||||
* @ws_to_tap: Window scaling factor advertised to tap/guest
|
||||
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
|
||||
+ * @inactive: No activity within the current INACTIVITY_INTERVAL
|
||||
* @sock: Socket descriptor number
|
||||
* @events: Connection events, implying connection states
|
||||
* @listening_sock: Listening socket this socket was accept()ed from, or -1
|
||||
@@ -58,6 +59,8 @@ struct tcp_tap_conn {
|
||||
(conn->rtt_exp = MIN(RTT_EXP_MAX, ilog2(MAX(1, rtt / RTT_STORE_MIN))))
|
||||
#define RTT_GET(conn) (RTT_STORE_MIN << conn->rtt_exp)
|
||||
|
||||
+ bool inactive :1;
|
||||
+
|
||||
int sock :FD_REF_BITS;
|
||||
|
||||
uint8_t events;
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,66 @@
|
||||
From 4600f95f99f12eb0680277da971a3af0ba27d5c1 Mon Sep 17 00:00:00 2001
|
||||
From: David Gibson <david@gibson.dropbear.id.au>
|
||||
Date: Wed, 4 Feb 2026 21:41:36 +1000
|
||||
Subject: [PATCH 16/18] tcp: Extend tcp_send_flag() to send TCP keepalive
|
||||
segments
|
||||
|
||||
TCP keepalives aren't technically a flag, but they are a zero-data segment
|
||||
so they can be generated with only a small modification to
|
||||
tcp_{buf,vu}_send_flag(). Implement this, using a new "pseudo-flag"
|
||||
value (similar to DUP_ACK), KEEPALIVE.
|
||||
|
||||
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
[sbrivio: Fix trivial merge conflict with 812cdb802c6e]
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit a681e44ec60179567fb10f34351d7dfdbd2e7c7e)
|
||||
---
|
||||
tcp_buf.c | 4 ++++
|
||||
tcp_internal.h | 2 ++
|
||||
tcp_vu.c | 3 +++
|
||||
3 files changed, 9 insertions(+)
|
||||
|
||||
diff --git a/tcp_buf.c b/tcp_buf.c
|
||||
index 5d419d3..75a020f 100644
|
||||
--- a/tcp_buf.c
|
||||
+++ b/tcp_buf.c
|
||||
@@ -227,6 +227,10 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||
tcp_frame_conns[tcp_payload_used++] = conn;
|
||||
l4len = optlen + sizeof(struct tcphdr);
|
||||
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
|
||||
+
|
||||
+ if (flags & KEEPALIVE)
|
||||
+ seq--;
|
||||
+
|
||||
tcp_l2_buf_fill_headers(c, conn, iov, NULL, seq, false);
|
||||
|
||||
tcp_l2_buf_pad(iov);
|
||||
diff --git a/tcp_internal.h b/tcp_internal.h
|
||||
index 5f8fb35..36f443b 100644
|
||||
--- a/tcp_internal.h
|
||||
+++ b/tcp_internal.h
|
||||
@@ -38,6 +38,8 @@
|
||||
|
||||
/* Flags for internal usage */
|
||||
#define DUP_ACK (1 << 5)
|
||||
+#define KEEPALIVE (1 << 6)
|
||||
+
|
||||
#define OPT_EOL 0
|
||||
#define OPT_NOP 1
|
||||
#define OPT_MSS 2
|
||||
diff --git a/tcp_vu.c b/tcp_vu.c
|
||||
index db9db78..dd50241 100644
|
||||
--- a/tcp_vu.c
|
||||
+++ b/tcp_vu.c
|
||||
@@ -135,6 +135,9 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
|
||||
flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
|
||||
payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
|
||||
|
||||
+ if (flags & KEEPALIVE)
|
||||
+ seq--;
|
||||
+
|
||||
tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
|
||||
NULL, seq, !*c->pcap);
|
||||
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,161 @@
|
||||
From b911ba6899bac381e795e26d9bebfac69b1a5748 Mon Sep 17 00:00:00 2001
|
||||
From: David Gibson <david@gibson.dropbear.id.au>
|
||||
Date: Wed, 4 Feb 2026 21:41:37 +1000
|
||||
Subject: [PATCH 17/18] tcp: Send TCP keepalive segments after a period of
|
||||
tap-side inactivity
|
||||
|
||||
There are several circumstances in which a live, but idle TCP connection
|
||||
can be forgotten by a guest, with no "on the wire" indication that this has
|
||||
happened. The most obvious is if the guest abruptly reboots. A more
|
||||
subtle case can happen with a half-closed connection, specifically one
|
||||
in FIN_WAIT_2 state on the guest. A connection can, legitimately, remain
|
||||
in this state indefinitely. If, however, a socket in this state is closed
|
||||
by userspace, Linux at least will remove the kernel socket after 60s
|
||||
(or as configured in the net.ipv4.tcp_fin_timeout sysctl).
|
||||
|
||||
Because there's no on the wire indication in these cases, passt will
|
||||
pointlessly retain the connection in its flow table, at least until it is
|
||||
removed by the inactivity timeout after several hours.
|
||||
|
||||
To avoid keeping connections around for so long in this state, add
|
||||
functionality to periodically send TCP keepalive segments to the guest if
|
||||
we've seen no activity on the tap interface. If the guest is no longer
|
||||
aware of the connection, it should respond with an RST which will let
|
||||
passt remove the stale entry.
|
||||
|
||||
To do this we use a method similar to the inactivity timeout - a 1-bit
|
||||
page replacement / clock algorithm, but with a shorter interval, and only
|
||||
checking for tap side activity. Currently we use a 30s interval, meaning
|
||||
we'll send a keepalive after 30-60 seconds of (tap side) inactivity.
|
||||
|
||||
Link: https://bugs.passt.top/show_bug.cgi?id=179
|
||||
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit d2f7c21cfb949f2b1587b9475917efdd6ac549fd)
|
||||
---
|
||||
tcp.c | 39 +++++++++++++++++++++++++++++++++++++++
|
||||
tcp.h | 2 ++
|
||||
tcp_conn.h | 2 ++
|
||||
3 files changed, 43 insertions(+)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index dd58550..1691987 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -215,6 +215,12 @@
|
||||
* keepalives) will be removed between INACTIVITY_INTERVAL s and
|
||||
* 2*INACTIVITY_INTERVAL s after the last activity.
|
||||
*
|
||||
+ * - KEEPALIVE_INTERVAL: if a connection has had no tap-side activity for an
|
||||
+ * entire interval, send a tap-side keepalive. If the endpoint is no longer
|
||||
+ * aware of the connection (due to a reboot, or a kernel timeout in FIN_WAIT_2
|
||||
+ * state) that should trigger an RST, so we won't keep track of connections
|
||||
+ * that the guest endpoint no longer cares about.
|
||||
+ *
|
||||
* Summary of data flows (with ESTABLISHED event)
|
||||
* ----------------------------------------------
|
||||
*
|
||||
@@ -354,6 +360,7 @@ enum {
|
||||
#define FIN_TIMEOUT 60
|
||||
|
||||
#define INACTIVITY_INTERVAL 7200 /* s */
|
||||
+#define KEEPALIVE_INTERVAL 30 /* s */
|
||||
|
||||
#define LOW_RTT_TABLE_SIZE 8
|
||||
#define LOW_RTT_THRESHOLD 10 /* us */
|
||||
@@ -2303,6 +2310,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
}
|
||||
|
||||
conn->inactive = false;
|
||||
+ conn->tap_inactive = false;
|
||||
|
||||
if (th->ack && !(conn->events & ESTABLISHED))
|
||||
tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
|
||||
@@ -3042,6 +3050,36 @@ static void tcp_port_rebind(struct ctx *c, bool outbound)
|
||||
}
|
||||
}
|
||||
|
||||
+/**
|
||||
+ * tcp_keepalive() - Send keepalives for connections which need it
|
||||
+ * @c: Execution context
|
||||
+ */
|
||||
+static void tcp_keepalive(struct ctx *c, const struct timespec *now)
|
||||
+{
|
||||
+ union flow *flow;
|
||||
+
|
||||
+ if (now->tv_sec - c->tcp.keepalive_run < KEEPALIVE_INTERVAL)
|
||||
+ return;
|
||||
+
|
||||
+ c->tcp.keepalive_run = now->tv_sec;
|
||||
+
|
||||
+ flow_foreach(flow) {
|
||||
+ struct tcp_tap_conn *conn = &flow->tcp;
|
||||
+
|
||||
+ if (flow->f.type != FLOW_TCP)
|
||||
+ continue;
|
||||
+
|
||||
+ if (conn->tap_inactive) {
|
||||
+ flow_dbg(conn, "No tap activity for least %us, send keepalive",
|
||||
+ KEEPALIVE_INTERVAL);
|
||||
+ tcp_send_flag(c, conn, KEEPALIVE);
|
||||
+ }
|
||||
+
|
||||
+ /* Ready to check for next interval */
|
||||
+ conn->tap_inactive = true;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* tcp_inactivity() - Scan for and close long-inactive connections
|
||||
* @: Execution context
|
||||
@@ -3118,6 +3156,7 @@ void tcp_timer(struct ctx *c, const struct timespec *now)
|
||||
if (c->mode == MODE_PASTA)
|
||||
tcp_splice_refill(c);
|
||||
|
||||
+ tcp_keepalive(c, now);
|
||||
tcp_inactivity(c, now);
|
||||
}
|
||||
|
||||
diff --git a/tcp.h b/tcp.h
|
||||
index 37cfc5b..505f21a 100644
|
||||
--- a/tcp.h
|
||||
+++ b/tcp.h
|
||||
@@ -64,6 +64,7 @@ union tcp_listen_epoll_ref {
|
||||
* @rto_max: Maximum retry timeout (in s)
|
||||
* @syn_retries: SYN retries using exponential backoff timeout
|
||||
* @syn_linear_timeouts: SYN retries before using exponential backoff timeout
|
||||
+ * @keepalive_run: Time we last issued tap-side keepalives
|
||||
* @inactivity_run: Time we last scanned for inactive connections
|
||||
*/
|
||||
struct tcp_ctx {
|
||||
@@ -74,6 +75,7 @@ struct tcp_ctx {
|
||||
int rto_max;
|
||||
uint8_t syn_retries;
|
||||
uint8_t syn_linear_timeouts;
|
||||
+ time_t keepalive_run;
|
||||
time_t inactivity_run;
|
||||
};
|
||||
|
||||
diff --git a/tcp_conn.h b/tcp_conn.h
|
||||
index 2e70d39..2ff76ed 100644
|
||||
--- a/tcp_conn.h
|
||||
+++ b/tcp_conn.h
|
||||
@@ -16,6 +16,7 @@
|
||||
* @ws_from_tap: Window scaling factor advertised from tap/guest
|
||||
* @ws_to_tap: Window scaling factor advertised to tap/guest
|
||||
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
|
||||
+ * @tap_inactive: No tap activity within the current KEEPALIVE_INTERVAL
|
||||
* @inactive: No activity within the current INACTIVITY_INTERVAL
|
||||
* @sock: Socket descriptor number
|
||||
* @events: Connection events, implying connection states
|
||||
@@ -59,6 +60,7 @@ struct tcp_tap_conn {
|
||||
(conn->rtt_exp = MIN(RTT_EXP_MAX, ilog2(MAX(1, rtt / RTT_STORE_MIN))))
|
||||
#define RTT_GET(conn) (RTT_STORE_MIN << conn->rtt_exp)
|
||||
|
||||
+ bool tap_inactive :1;
|
||||
bool inactive :1;
|
||||
|
||||
int sock :FD_REF_BITS;
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -0,0 +1,133 @@
|
||||
From 4d1c8b11460cfe05372e572f33e046a8e98e242c Mon Sep 17 00:00:00 2001
|
||||
From: Yumei Huang <yuhuang@redhat.com>
|
||||
Date: Fri, 20 Mar 2026 18:32:14 +0800
|
||||
Subject: [PATCH 18/18] tcp: Replace send buffer boost with EPOLLOUT monitoring
|
||||
|
||||
Currently we use the SNDBUF boost mechanism to force TCP auto-tuning.
|
||||
However, it doesn't always work, and sometimes causes a lot of
|
||||
retransmissions. As a result, the throughput suffers.
|
||||
|
||||
This patch replaces it with monitoring EPOLLOUT when sendmsg() failures
|
||||
(with EAGAIN or EWOULDBLOCK) and partial sends occur.
|
||||
|
||||
Tested with iperf3 inside pasta: throughput is now comparable to running
|
||||
iperf3 directly on the host without pasta. However, retransmissions can
|
||||
still be elevated when RTT >= 50ms. For example, when RTT is between
|
||||
200ms and 500ms, retransmission count varies from 30 to 120 in roughly
|
||||
80% of test runs.
|
||||
|
||||
Link: https://bugs.passt.top/show_bug.cgi?id=138
|
||||
Link: https://github.com/containers/podman/issues/28219
|
||||
Suggested-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Signed-off-by: Yumei Huang <yuhuang@redhat.com>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit 831857e9b547ac27f868b6c24049c4da435b63fe)
|
||||
---
|
||||
tcp.c | 57 +++++++++++++++++----------------------------------------
|
||||
1 file changed, 17 insertions(+), 40 deletions(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 1691987..920af70 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -365,13 +365,6 @@ enum {
|
||||
#define LOW_RTT_TABLE_SIZE 8
|
||||
#define LOW_RTT_THRESHOLD 10 /* us */
|
||||
|
||||
-/* Parameters to temporarily exceed sending buffer to force TCP auto-tuning */
|
||||
-#define SNDBUF_BOOST_BYTES_RTT_LO 2500 /* B * s: no boost until here */
|
||||
-/* ...examples: 5 MB sent * 500 ns RTT, 250 kB * 10 ms, 8 kB * 300 ms */
|
||||
-#define SNDBUF_BOOST_FACTOR 150 /* % */
|
||||
-#define SNDBUF_BOOST_BYTES_RTT_HI 6000 /* apply full boost factor */
|
||||
-/* 12 MB sent * 500 ns RTT, 600 kB * 10 ms, 20 kB * 300 ms */
|
||||
-
|
||||
/* Ratio of buffer to bandwidth * delay product implying interactive traffic */
|
||||
#define SNDBUF_TO_BW_DELAY_INTERACTIVE /* > */ 20 /* (i.e. < 5% of buffer) */
|
||||
|
||||
@@ -1067,35 +1060,6 @@ void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
tap_hdr_update(taph, MAX(l3len + sizeof(struct ethhdr), ETH_ZLEN));
|
||||
}
|
||||
|
||||
-/**
|
||||
- * tcp_sndbuf_boost() - Calculate limit of sending buffer to force auto-tuning
|
||||
- * @conn: Connection pointer
|
||||
- * @tinfo: tcp_info from kernel, must be pre-fetched
|
||||
- *
|
||||
- * Return: increased sending buffer to use as a limit for advertised window
|
||||
- */
|
||||
-static unsigned long tcp_sndbuf_boost(const struct tcp_tap_conn *conn,
|
||||
- const struct tcp_info_linux *tinfo)
|
||||
-{
|
||||
- unsigned long bytes_rtt_product;
|
||||
-
|
||||
- if (!bytes_acked_cap)
|
||||
- return SNDBUF_GET(conn);
|
||||
-
|
||||
- /* This is *not* a bandwidth-delay product, but it's somewhat related:
|
||||
- * as we send more data (usually at the beginning of a connection), we
|
||||
- * try to make the sending buffer progressively grow, with the RTT as a
|
||||
- * factor (longer delay, bigger buffer needed).
|
||||
- */
|
||||
- bytes_rtt_product = (long long)tinfo->tcpi_bytes_acked *
|
||||
- tinfo->tcpi_rtt / 1000 / 1000;
|
||||
-
|
||||
- return clamped_scale(SNDBUF_GET(conn), bytes_rtt_product,
|
||||
- SNDBUF_BOOST_BYTES_RTT_LO,
|
||||
- SNDBUF_BOOST_BYTES_RTT_HI,
|
||||
- SNDBUF_BOOST_FACTOR);
|
||||
-}
|
||||
-
|
||||
/**
|
||||
* tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap
|
||||
* @c: Execution context
|
||||
@@ -1216,8 +1180,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
|
||||
if ((int)sendq > SNDBUF_GET(conn)) /* Due to memory pressure? */
|
||||
limit = 0;
|
||||
- else if ((int)tinfo->tcpi_snd_wnd > SNDBUF_GET(conn))
|
||||
- limit = tcp_sndbuf_boost(conn, tinfo) - (int)sendq;
|
||||
else
|
||||
limit = SNDBUF_GET(conn) - (int)sendq;
|
||||
|
||||
@@ -2088,14 +2050,28 @@ eintr:
|
||||
|
||||
if (errno == EAGAIN || errno == EWOULDBLOCK) {
|
||||
tcp_send_flag(c, conn, ACK | DUP_ACK);
|
||||
+ uint32_t events = tcp_conn_epoll_events(conn->events,
|
||||
+ conn->flags);
|
||||
+ events |= EPOLLOUT;
|
||||
+ if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events,
|
||||
+ conn->sock, !TAPSIDE(conn)) < 0)
|
||||
+ debug("Failed to add EPOLLOUT");
|
||||
return p->count - idx;
|
||||
-
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
- if (n < (int)(seq_from_tap - conn->seq_from_tap))
|
||||
+ if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
|
||||
partial_send = 1;
|
||||
+ uint32_t events = tcp_conn_epoll_events(conn->events,
|
||||
+ conn->flags);
|
||||
+ events |= EPOLLOUT;
|
||||
+ if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events, conn->sock,
|
||||
+ !TAPSIDE(conn)) < 0)
|
||||
+ debug("Failed to add EPOLLOUT");
|
||||
+ } else {
|
||||
+ tcp_epoll_ctl(conn);
|
||||
+ }
|
||||
|
||||
conn->seq_from_tap += n;
|
||||
|
||||
@@ -2688,6 +2664,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
tcp_data_from_sock(c, conn);
|
||||
|
||||
if (events & EPOLLOUT) {
|
||||
+ tcp_epoll_ctl(conn);
|
||||
if (tcp_update_seqack_wnd(c, conn, false, NULL))
|
||||
tcp_send_flag(c, conn, ACK);
|
||||
}
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -7,11 +7,12 @@
|
||||
# Copyright (c) 2022 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
%global git_hash 8ec134109eb136432a29bdf5a14f8b1fd4e46208
|
||||
%global git_hash d04c48032bcf724550d0b8f652fd00efcd2dfad0
|
||||
%global selinuxtype targeted
|
||||
%global selinux_policy_version 41.41
|
||||
|
||||
Name: passt
|
||||
Version: 0^20250512.g8ec1341
|
||||
Version: 0^20251210.gd04c480
|
||||
Release: 4%{?dist}
|
||||
Summary: User-mode networking daemons for virtual machines and namespaces
|
||||
License: GPL-2.0-or-later AND BSD-3-Clause
|
||||
@ -20,9 +21,23 @@ URL: https://passt.top/
|
||||
Source: https://passt.top/passt/snapshot/passt-%{git_hash}.tar.xz
|
||||
|
||||
Patch1: 0001-selinux-Drop-user_namespace-create-allow-rules.patch
|
||||
Patch2: 0002-treewide-By-default-don-t-quit-source-after-migratio.patch
|
||||
Patch3: 0003-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch
|
||||
Patch4: 0004-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch
|
||||
Patch2: 0002-selinux-Use-systemd_logind_exec_t-instead-of-systemd.patch
|
||||
Patch3: 0003-tcp-Use-less-than-MSS-window-on-no-queued-data-or-no.patch
|
||||
Patch4: 0004-pasta-Warn-disable-matching-IP-version-if-not-suppor.patch
|
||||
Patch5: 0005-selinux-Enable-read-and-watch-permissions-on-netns-d.patch
|
||||
Patch6: 0006-selinux-Enable-open-permissions-on-netns-directory-o.patch
|
||||
Patch7: 0007-tcp-Fix-rounding-issue-in-check-for-approximating-wi.patch
|
||||
Patch8: 0008-udp_flow-remove-unneeded-epoll_ref-indirection.patch
|
||||
Patch9: 0009-udp_flow-Assign-socket-to-flow-inside-udp_flow_sock.patch
|
||||
Patch10: 0010-tcp_splice-Refactor-tcp_splice_conn_epoll_events-to-.patch
|
||||
Patch11: 0011-flow-Introduce-flow_epoll_set-to-centralize-epoll-op.patch
|
||||
Patch12: 0012-tcp-Properly-propagate-tap-side-RST-to-socket-side.patch
|
||||
Patch13: 0013-udp-Split-activity-timeouts-for-UDP-flows.patch
|
||||
Patch14: 0014-tcp-Remove-non-working-activity-timeout-mechanism.patch
|
||||
Patch15: 0015-tcp-Re-introduce-inactivity-timeouts-based-on-a-cloc.patch
|
||||
Patch16: 0016-tcp-Extend-tcp_send_flag-to-send-TCP-keepalive-segme.patch
|
||||
Patch17: 0017-tcp-Send-TCP-keepalive-segments-after-a-period-of-ta.patch
|
||||
Patch18: 0018-tcp-Replace-send-buffer-boost-with-EPOLLOUT-monitori.patch
|
||||
|
||||
BuildRequires: gcc, make, git, checkpolicy, selinux-policy-devel
|
||||
Requires: (%{name}-selinux = %{version}-%{release} if selinux-policy-%{selinuxtype})
|
||||
@ -38,15 +53,21 @@ for network namespaces: traffic is forwarded using a tap interface inside the
|
||||
namespace, without the need to create further interfaces on the host, hence not
|
||||
requiring any capabilities or privileges.
|
||||
|
||||
%package selinux
|
||||
BuildArch: noarch
|
||||
Summary: SELinux support for passt and pasta
|
||||
Requires: %{name} = %{version}-%{release}
|
||||
Requires: selinux-policy
|
||||
Requires(post): %{name}
|
||||
Requires(post): policycoreutils
|
||||
Requires(preun): %{name}
|
||||
Requires(preun): policycoreutils
|
||||
%package selinux
|
||||
BuildArch: noarch
|
||||
Summary: SELinux support for passt and pasta
|
||||
%if 0%{?fedora} > 43
|
||||
BuildRequires: selinux-policy-devel
|
||||
%selinux_requires_min
|
||||
%else
|
||||
BuildRequires: pkgconfig(systemd)
|
||||
Requires(post): libselinux-utils
|
||||
Requires(post): policycoreutils
|
||||
%endif
|
||||
Requires: container-selinux
|
||||
Requires: selinux-policy-%{selinuxtype}
|
||||
Requires(post): container-selinux
|
||||
Requires(post): selinux-policy-%{selinuxtype}
|
||||
|
||||
%description selinux
|
||||
This package adds SELinux enforcement to passt(1), pasta(1), passt-repair(1).
|
||||
@ -94,15 +115,11 @@ popd
|
||||
%selinux_relabel_pre -s %{selinuxtype}
|
||||
|
||||
%post selinux
|
||||
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp
|
||||
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp
|
||||
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
|
||||
%selinux_modules_install -s %{selinuxtype} %{_datadir}/selinux/packages/%{selinuxtype}/passt.pp %{_datadir}/selinux/packages/%{selinuxtype}/pasta.pp %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
|
||||
|
||||
%postun selinux
|
||||
if [ $1 -eq 0 ]; then
|
||||
%selinux_modules_uninstall -s %{selinuxtype} passt
|
||||
%selinux_modules_uninstall -s %{selinuxtype} pasta
|
||||
%selinux_modules_uninstall -s %{selinuxtype} passt-repair
|
||||
%selinux_modules_uninstall -s %{selinuxtype} passt pasta passt-repair
|
||||
fi
|
||||
|
||||
%posttrans selinux
|
||||
@ -135,8 +152,23 @@ fi
|
||||
%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
|
||||
|
||||
%changelog
|
||||
* Thu Oct 23 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20250512.g8ec1341-4
|
||||
- Resolves: RHEL-123413 RHEL-123419
|
||||
* Tue Apr 21 2026 Stefano Brivio <sbrivio@redhat.com> - 0^20251210.gd04c480-4
|
||||
- Resolves: RHEL-169637 RHEL-169639 RHEL-169648
|
||||
|
||||
* Wed Feb 11 2026 Stefano Brivio <sbrivio@redhat.com> - 0^20251210.gd04c480-3
|
||||
- Resolves: RHEL-137588 RHEL-136313
|
||||
|
||||
* Wed Dec 24 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20251210.gd04c480-2
|
||||
- Resolves: RHEL-136313 RHEL-136461 RHEL-137439 RHEL-137588
|
||||
|
||||
* Wed Dec 10 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20251210.gd04c480-1
|
||||
- Resolves: RHEL-134942 RHEL-134943
|
||||
|
||||
* Tue Dec 9 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20251209.gc3f1ba7-1
|
||||
- Resolves: RHEL-134119
|
||||
|
||||
* Thu Oct 23 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20250512.g8ec1341-3
|
||||
- Resolves: RHEL-123376 RHEL-123438
|
||||
|
||||
* Tue Jul 29 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20250512.g8ec1341-2
|
||||
- Resolves: RHEL-106326
|
||||
|
||||
Loading…
Reference in New Issue
Block a user