passt-0^20251209.gc3f1ba7-1.el10
Resolves: RHEL-134120
This commit is contained in:
parent
75d1f679f8
commit
ee8b8bee3f
1
.gitignore
vendored
1
.gitignore
vendored
@ -42,3 +42,4 @@
|
||||
/passt-a1e48a02ff3550eb7875a7df6726086e9b3a1213.tar.xz
|
||||
/passt-32f6212551c5db3b7b3548e8483e5d73f07a35ac.tar.xz
|
||||
/passt-8ec134109eb136432a29bdf5a14f8b1fd4e46208.tar.xz
|
||||
/passt-c3f1ba70237a9e66822aff3aa5765d0adf6f6307.tar.xz
|
||||
|
||||
@ -1,264 +0,0 @@
|
||||
From b0b5ce0a76cf7fec0b00405732fd94e0b34e8d84 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Thu, 17 Jul 2025 10:38:17 +0200
|
||||
Subject: [PATCH] treewide: By default, don't quit source after migration, keep
|
||||
sockets open
|
||||
|
||||
We are hitting an issue in the KubeVirt integration where some data is
|
||||
still sent to the source instance even after migration is complete. As
|
||||
we exit, the kernel closes our sockets and resets connections. The
|
||||
resulting RST segments are sent to peers, effectively terminating
|
||||
connections that were meanwhile migrated.
|
||||
|
||||
At the moment, this is not done intentionally, but in the future
|
||||
KubeVirt might enable OVN-Kubernetes features where source and
|
||||
destination nodes are explicitly getting mirrored traffic for a while,
|
||||
in order to decrease migration downtime.
|
||||
|
||||
By default, don't quit after migration is completed on the source: the
|
||||
previous behaviour can be enabled with the new, but deprecated,
|
||||
--migrate-exit option. After migration (as source), the -1 / --one-off
|
||||
option has no effect.
|
||||
|
||||
Also, by default, keep migrated TCP sockets open (in repair mode) as
|
||||
long as we're running, and ignore events on any epoll descriptor
|
||||
representing data channels. The previous behaviour can be enabled with
|
||||
the new, equally deprecated, --migrate-no-linger option.
|
||||
|
||||
By keeping sockets open, and not exiting, we prevent the kernel
|
||||
running on the source node to send out RST segments if further data
|
||||
reaches us.
|
||||
|
||||
Reported-by: Nir Dothan <ndothan@redhat.com>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit a8782865c342eb2682cca292d5bf92b567344351)
|
||||
---
|
||||
conf.c | 22 ++++++++++++++++++++++
|
||||
flow.c | 2 +-
|
||||
passt.1 | 29 +++++++++++++++++++++++++++++
|
||||
passt.h | 4 ++++
|
||||
tcp.c | 9 +++++++--
|
||||
tcp_conn.h | 3 ++-
|
||||
test/lib/setup | 4 ++--
|
||||
vhost_user.c | 9 +++++++--
|
||||
8 files changed, 74 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/conf.c b/conf.c
|
||||
index a6d7e22..1295d89 100644
|
||||
--- a/conf.c
|
||||
+++ b/conf.c
|
||||
@@ -864,6 +864,14 @@ static void usage(const char *name, FILE *f, int status)
|
||||
FPRINTF(f,
|
||||
" --repair-path PATH path for passt-repair(1)\n"
|
||||
" default: append '.repair' to UNIX domain path\n");
|
||||
+ FPRINTF(f,
|
||||
+ " --migrate-exit DEPRECATED:\n"
|
||||
+ " source quits after migration\n"
|
||||
+ " default: source keeps running after migration\n");
|
||||
+ FPRINTF(f,
|
||||
+ " --migrate-no-linger DEPRECATED:\n"
|
||||
+ " close sockets on migration\n"
|
||||
+ " default: keep sockets open, ignore events\n");
|
||||
}
|
||||
|
||||
FPRINTF(f,
|
||||
@@ -1468,6 +1476,8 @@ void conf(struct ctx *c, int argc, char **argv)
|
||||
{"socket-path", required_argument, NULL, 's' },
|
||||
{"fqdn", required_argument, NULL, 27 },
|
||||
{"repair-path", required_argument, NULL, 28 },
|
||||
+ {"migrate-exit", no_argument, NULL, 29 },
|
||||
+ {"migrate-no-linger", no_argument, NULL, 30 },
|
||||
{ 0 },
|
||||
};
|
||||
const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
|
||||
@@ -1683,6 +1693,18 @@ void conf(struct ctx *c, int argc, char **argv)
|
||||
optarg))
|
||||
die("Invalid passt-repair path: %s", optarg);
|
||||
|
||||
+ break;
|
||||
+ case 29:
|
||||
+ if (c->mode != MODE_VU)
|
||||
+ die("--migrate-exit is for vhost-user mode only");
|
||||
+ c->migrate_exit = true;
|
||||
+
|
||||
+ break;
|
||||
+ case 30:
|
||||
+ if (c->mode != MODE_VU)
|
||||
+ die("--migrate-no-linger is for vhost-user mode only");
|
||||
+ c->migrate_no_linger = true;
|
||||
+
|
||||
break;
|
||||
case 'd':
|
||||
c->debug = 1;
|
||||
diff --git a/flow.c b/flow.c
|
||||
index 6a5c8aa..a4b65ea 100644
|
||||
--- a/flow.c
|
||||
+++ b/flow.c
|
||||
@@ -1089,7 +1089,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
|
||||
* as EIO).
|
||||
*/
|
||||
foreach_established_tcp_flow(flow) {
|
||||
- rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
|
||||
+ rc = tcp_flow_migrate_source_ext(c, fd, &flow->tcp);
|
||||
if (rc) {
|
||||
flow_err(flow, "Can't send extended data: %s",
|
||||
strerror_(-rc));
|
||||
diff --git a/passt.1 b/passt.1
|
||||
index 60066c2..cef98b2 100644
|
||||
--- a/passt.1
|
||||
+++ b/passt.1
|
||||
@@ -439,6 +439,30 @@ Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
|
||||
chosen for the hypervisor UNIX domain socket. No socket is created if not in
|
||||
\-\-vhost-user mode.
|
||||
|
||||
+.TP
|
||||
+.BR \-\-migrate-exit (DEPRECATED)
|
||||
+Exit after a completed migration as source. By default, \fBpasst\fR keeps
|
||||
+running and the migrated guest can continue using its connection, or a new guest
|
||||
+can connect.
|
||||
+
|
||||
+Note that this configuration option is \fBdeprecated\fR and will be removed in a
|
||||
+future version. It is not expected to be of any use, and it simply reflects a
|
||||
+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR
|
||||
+below.
|
||||
+
|
||||
+.TP
|
||||
+.BR \-\-migrate-no-linger (DEPRECATED)
|
||||
+Close TCP sockets on the source instance once migration completes.
|
||||
+
|
||||
+By default, sockets are kept open, and events on data sockets are ignored, so
|
||||
+that any further message reaching sockets after the source migrated is silently
|
||||
+ignored, to avoid connection resets in case data is received after migration.
|
||||
+
|
||||
+Note that this configuration option is \fBdeprecated\fR and will be removed in a
|
||||
+future version. It is not expected to be of any use, and it simply reflects a
|
||||
+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR
|
||||
+below.
|
||||
+
|
||||
.TP
|
||||
.BR \-F ", " \-\-fd " " \fIFD
|
||||
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
|
||||
@@ -454,6 +478,11 @@ is closed.
|
||||
Quit after handling a single client connection, that is, once the client closes
|
||||
the socket, or once we get a socket error.
|
||||
|
||||
+\fBNote\fR: this option has no effect after \fBpasst\fR completes a migration as
|
||||
+source, because, in that case, exiting would close sockets for active
|
||||
+connections, which would in turn cause connection resets if any further data is
|
||||
+received. See also the description of \fI\-\-migrate-no-linger\fR.
|
||||
+
|
||||
.TP
|
||||
.BR \-t ", " \-\-tcp-ports " " \fIspec
|
||||
Configure TCP port forwarding to guest. \fIspec\fR can be one of:
|
||||
diff --git a/passt.h b/passt.h
|
||||
index 8693794..4cfd6eb 100644
|
||||
--- a/passt.h
|
||||
+++ b/passt.h
|
||||
@@ -241,6 +241,8 @@ struct ip6_ctx {
|
||||
* @device_state_fd: Device state migration channel
|
||||
* @device_state_result: Device state migration result
|
||||
* @migrate_target: Are we the target, on the next migration request?
|
||||
+ * @migrate_no_linger: Close sockets as we migrate them
|
||||
+ * @migrate_exit: Exit (on source) once migration is complete
|
||||
*/
|
||||
struct ctx {
|
||||
enum passt_modes mode;
|
||||
@@ -318,6 +320,8 @@ struct ctx {
|
||||
int device_state_fd;
|
||||
int device_state_result;
|
||||
bool migrate_target;
|
||||
+ bool migrate_no_linger;
|
||||
+ bool migrate_exit;
|
||||
};
|
||||
|
||||
void proto_update_l2_buf(const unsigned char *eth_d,
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 0ac298a..1b22f70 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -3284,12 +3284,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
|
||||
|
||||
/**
|
||||
* tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
|
||||
+ * @c: Execution context
|
||||
* @fd: Descriptor for state migration
|
||||
* @conn: Pointer to the TCP connection structure
|
||||
*
|
||||
* Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
|
||||
*/
|
||||
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
|
||||
+int tcp_flow_migrate_source_ext(const struct ctx *c,
|
||||
+ int fd, const struct tcp_tap_conn *conn)
|
||||
{
|
||||
uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
|
||||
struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
|
||||
@@ -3334,7 +3336,10 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
|
||||
if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
|
||||
goto fail;
|
||||
|
||||
- close(s);
|
||||
+ if (c->migrate_no_linger)
|
||||
+ close(s);
|
||||
+ else
|
||||
+ epoll_del(c, s);
|
||||
|
||||
/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
|
||||
* based on the end of the queues.
|
||||
diff --git a/tcp_conn.h b/tcp_conn.h
|
||||
index 35d813d..38b5c54 100644
|
||||
--- a/tcp_conn.h
|
||||
+++ b/tcp_conn.h
|
||||
@@ -236,7 +236,8 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
|
||||
int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
|
||||
|
||||
int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
|
||||
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
|
||||
+int tcp_flow_migrate_source_ext(const struct ctx *c, int fd,
|
||||
+ const struct tcp_tap_conn *conn);
|
||||
|
||||
int tcp_flow_migrate_target(struct ctx *c, int fd);
|
||||
int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
|
||||
diff --git a/test/lib/setup b/test/lib/setup
|
||||
index 575bc21..5994598 100755
|
||||
--- a/test/lib/setup
|
||||
+++ b/test/lib/setup
|
||||
@@ -350,7 +350,7 @@ setup_migrate() {
|
||||
|
||||
sleep 1
|
||||
|
||||
- __opts="--vhost-user"
|
||||
+ __opts="--vhost-user --migrate-exit --migrate-no-linger"
|
||||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
@@ -360,7 +360,7 @@ setup_migrate() {
|
||||
|
||||
context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
|
||||
|
||||
- __opts="--vhost-user"
|
||||
+ __opts="--vhost-user --migrate-exit --migrate-no-linger"
|
||||
[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
|
||||
[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
|
||||
[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
|
||||
diff --git a/vhost_user.c b/vhost_user.c
|
||||
index 105f77a..c4d3a52 100644
|
||||
--- a/vhost_user.c
|
||||
+++ b/vhost_user.c
|
||||
@@ -1208,7 +1208,12 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
|
||||
if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
|
||||
vdev->context->device_state_result == 0 &&
|
||||
!vdev->context->migrate_target) {
|
||||
- info("Migration complete, exiting");
|
||||
- _exit(EXIT_SUCCESS);
|
||||
+ if (vdev->context->migrate_exit) {
|
||||
+ info("Migration complete, exiting");
|
||||
+ _exit(EXIT_SUCCESS);
|
||||
+ }
|
||||
+
|
||||
+ info("Migration complete");
|
||||
+ vdev->context->one_off = false;
|
||||
}
|
||||
}
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,45 +0,0 @@
|
||||
From 585326cf732c8396701b5d3fdf6a7b5f2ab79300 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Fri, 29 Aug 2025 22:11:26 +0200
|
||||
Subject: [PATCH 02/13] tcp: FIN flags have to be retransmitted as well
|
||||
|
||||
If we're retransmitting any data, and we sent a FIN segment to our
|
||||
peer, regardless of whether it was received, we obviously have to
|
||||
retransmit it as well, given that it can only come with the last data
|
||||
segment, or after it.
|
||||
|
||||
Unconditionally clear the internal TAP_FIN_SENT flag whenever we
|
||||
re-transmit, so that we know we have to send it again, in case.
|
||||
|
||||
Reported-by: Paul Holzinger <pholzing@redhat.com>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Tested-by: Paul Holzinger <pholzing@redhat.com>
|
||||
(cherry picked from commit 2e3d93ba7033754301af0d50cd0bb20a38a005ac)
|
||||
---
|
||||
tcp.c | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 0ac298a..c5c430e 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1757,6 +1757,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
"fast re-transmit, ACK: %u, previous sequence: %u",
|
||||
max_ack_seq, conn->seq_to_tap);
|
||||
conn->seq_to_tap = max_ack_seq;
|
||||
+ conn->events &= ~TAP_FIN_SENT;
|
||||
if (tcp_set_peek_offset(conn, 0)) {
|
||||
tcp_rst(c, conn);
|
||||
return -1;
|
||||
@@ -2284,6 +2285,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
|
||||
flow_dbg(conn, "ACK timeout, retry");
|
||||
conn->retrans++;
|
||||
conn->seq_to_tap = conn->seq_ack_from_tap;
|
||||
+ conn->events &= ~TAP_FIN_SENT;
|
||||
if (!conn->wnd_from_tap)
|
||||
conn->wnd_from_tap = 1; /* Zero-window probe */
|
||||
if (tcp_set_peek_offset(conn, 0)) {
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,104 +0,0 @@
|
||||
From 02f5a975d22449a2a940dec91e8e78a79e316693 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Fri, 29 Aug 2025 22:11:27 +0200
|
||||
Subject: [PATCH 03/13] tcp: Factor sequence rewind for retransmissions into a
|
||||
new function
|
||||
|
||||
...as I'm going to need a third occurrence of this in the next change.
|
||||
|
||||
This introduces a small functional change in tcp_data_from_tap(): the
|
||||
sequence was previously rewound to the highest ACK number we found in
|
||||
the current packet batch, and not to the current value of
|
||||
seq_ack_from_tap.
|
||||
|
||||
The two might differ in case tcp_sock_consume() failed, because in
|
||||
that case we're ignoring that ACK altogether. But if we're ignoring
|
||||
it, it looks more correct to me to start retransmitting from an
|
||||
earlier sequence anyway.
|
||||
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Tested-by: Paul Holzinger <pholzing@redhat.com>
|
||||
(cherry picked from commit 1d502be1a809dcdc4614815f4ec0da77a1ff27ec)
|
||||
---
|
||||
tcp.c | 47 ++++++++++++++++++++++++++++++++---------------
|
||||
1 file changed, 32 insertions(+), 15 deletions(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index c5c430e..8b0e707 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1097,6 +1097,26 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
|
||||
}
|
||||
}
|
||||
|
||||
+/**
|
||||
+ * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK
|
||||
+ * @c: Execution context
|
||||
+ * @conn: Connection pointer
|
||||
+ *
|
||||
+ * Return: 0 on success, -1 on failure, with connection reset
|
||||
+ */
|
||||
+static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
+{
|
||||
+ conn->seq_to_tap = conn->seq_ack_from_tap;
|
||||
+ conn->events &= ~TAP_FIN_SENT;
|
||||
+
|
||||
+ if (tcp_set_peek_offset(conn, 0)) {
|
||||
+ tcp_rst(c, conn);
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* tcp_prepare_flags() - Prepare header for flags-only segment (no payload)
|
||||
* @c: Execution context
|
||||
@@ -1755,13 +1775,11 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
if (retr) {
|
||||
flow_trace(conn,
|
||||
"fast re-transmit, ACK: %u, previous sequence: %u",
|
||||
- max_ack_seq, conn->seq_to_tap);
|
||||
- conn->seq_to_tap = max_ack_seq;
|
||||
- conn->events &= ~TAP_FIN_SENT;
|
||||
- if (tcp_set_peek_offset(conn, 0)) {
|
||||
- tcp_rst(c, conn);
|
||||
+ conn->seq_ack_from_tap, conn->seq_to_tap);
|
||||
+
|
||||
+ if (tcp_rewind_seq(c, conn))
|
||||
return -1;
|
||||
- }
|
||||
+
|
||||
tcp_data_from_sock(c, conn);
|
||||
}
|
||||
|
||||
@@ -2283,17 +2301,16 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
|
||||
tcp_rst(c, conn);
|
||||
} else {
|
||||
flow_dbg(conn, "ACK timeout, retry");
|
||||
- conn->retrans++;
|
||||
- conn->seq_to_tap = conn->seq_ack_from_tap;
|
||||
- conn->events &= ~TAP_FIN_SENT;
|
||||
+
|
||||
if (!conn->wnd_from_tap)
|
||||
conn->wnd_from_tap = 1; /* Zero-window probe */
|
||||
- if (tcp_set_peek_offset(conn, 0)) {
|
||||
- tcp_rst(c, conn);
|
||||
- } else {
|
||||
- tcp_data_from_sock(c, conn);
|
||||
- tcp_timer_ctl(c, conn);
|
||||
- }
|
||||
+
|
||||
+ conn->retrans++;
|
||||
+ if (tcp_rewind_seq(c, conn))
|
||||
+ return;
|
||||
+
|
||||
+ tcp_data_from_sock(c, conn);
|
||||
+ tcp_timer_ctl(c, conn);
|
||||
}
|
||||
} else {
|
||||
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,123 +0,0 @@
|
||||
From f6d5e6a614c385bccfd02453de5e04428d9fed9f Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Fri, 29 Aug 2025 22:11:28 +0200
|
||||
Subject: [PATCH 04/13] tcp: Rewind sequence when guest shrinks window to zero
|
||||
|
||||
A window shrunk to zero means by definition that anything else that
|
||||
might be in flight is now out of window. Restart from the currently
|
||||
acknowledged sequence.
|
||||
|
||||
We need to do that both in tcp_tap_window_update(), where we already
|
||||
check for zero-window updates, as well as in tcp_data_from_tap(),
|
||||
because we might get one of those updates in a batch of packets that
|
||||
also contains a non-zero window update.
|
||||
|
||||
Suggested-by: Jon Maloy <jmaloy@redhat.com>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Tested-by: Paul Holzinger <pholzing@redhat.com>
|
||||
Reviewed-by: Jon Maloy <jmaloy@redhat.com>
|
||||
(cherry picked from commit e86d480be4174fdfec592ba689c2ef03019393f6)
|
||||
---
|
||||
tcp.c | 34 +++++++++++++++++++++++++---------
|
||||
1 file changed, 25 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 8b0e707..74ed7c6 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1257,19 +1257,25 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
|
||||
|
||||
/**
|
||||
* tcp_tap_window_update() - Process an updated window from tap side
|
||||
+ * @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
* @window: Window value, host order, unscaled
|
||||
*/
|
||||
-static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
|
||||
+static void tcp_tap_window_update(const struct ctx *c,
|
||||
+ struct tcp_tap_conn *conn, unsigned wnd)
|
||||
{
|
||||
wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
|
||||
|
||||
/* Work-around for bug introduced in peer kernel code, commit
|
||||
- * e2142825c120 ("net: tcp: send zero-window ACK when no memory").
|
||||
- * We don't update if window shrank to zero.
|
||||
+ * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't
|
||||
+ * update the window if it shrank to zero, so that we'll eventually
|
||||
+ * retry to send data, but rewind the sequence as that obviously implies
|
||||
+ * that no data beyond the updated window will be acknowledged.
|
||||
*/
|
||||
- if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap))
|
||||
+ if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
|
||||
+ tcp_rewind_seq(c, conn);
|
||||
return;
|
||||
+ }
|
||||
|
||||
conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
|
||||
|
||||
@@ -1692,7 +1698,8 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
tcp_timer_ctl(c, conn);
|
||||
|
||||
if (p->count == 1) {
|
||||
- tcp_tap_window_update(conn, ntohs(th->window));
|
||||
+ tcp_tap_window_update(c, conn,
|
||||
+ ntohs(th->window));
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1711,6 +1718,15 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
ack_seq == max_ack_seq &&
|
||||
ntohs(th->window) == max_ack_seq_wnd;
|
||||
|
||||
+ /* See tcp_tap_window_update() for details. On
|
||||
+ * top of that, we also need to check here if a
|
||||
+ * zero-window update is contained in a batch of
|
||||
+ * packets that includes a non-zero window as
|
||||
+ * well.
|
||||
+ */
|
||||
+ if (!ntohs(th->window))
|
||||
+ tcp_rewind_seq(c, conn);
|
||||
+
|
||||
max_ack_seq_wnd = ntohs(th->window);
|
||||
max_ack_seq = ack_seq;
|
||||
}
|
||||
@@ -1770,7 +1786,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
if (ack && !tcp_sock_consume(conn, max_ack_seq))
|
||||
tcp_update_seqack_from_tap(c, conn, max_ack_seq);
|
||||
|
||||
- tcp_tap_window_update(conn, max_ack_seq_wnd);
|
||||
+ tcp_tap_window_update(c, conn, max_ack_seq_wnd);
|
||||
|
||||
if (retr) {
|
||||
flow_trace(conn,
|
||||
@@ -1859,7 +1875,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
|
||||
const struct tcphdr *th,
|
||||
const char *opts, size_t optlen)
|
||||
{
|
||||
- tcp_tap_window_update(conn, ntohs(th->window));
|
||||
+ tcp_tap_window_update(c, conn, ntohs(th->window));
|
||||
tcp_get_tap_ws(conn, opts, optlen);
|
||||
|
||||
/* First value is not scaled */
|
||||
@@ -2057,7 +2073,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
if (!th->ack)
|
||||
goto reset;
|
||||
|
||||
- tcp_tap_window_update(conn, ntohs(th->window));
|
||||
+ tcp_tap_window_update(c, conn, ntohs(th->window));
|
||||
|
||||
tcp_data_from_sock(c, conn);
|
||||
|
||||
@@ -2069,7 +2085,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
if (conn->events & TAP_FIN_RCVD) {
|
||||
tcp_sock_consume(conn, ntohl(th->ack_seq));
|
||||
tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
|
||||
- tcp_tap_window_update(conn, ntohs(th->window));
|
||||
+ tcp_tap_window_update(c, conn, ntohs(th->window));
|
||||
tcp_data_from_sock(c, conn);
|
||||
|
||||
if (conn->events & SOCK_FIN_RCVD &&
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,63 +0,0 @@
|
||||
From 7a4c907502ffe8df07ee44d2d17818671dc4ae47 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Fri, 29 Aug 2025 22:11:29 +0200
|
||||
Subject: [PATCH 05/13] tcp: Fix closing logic for half-closed connections
|
||||
|
||||
First off, don't close connections half-closed by the guest before
|
||||
our own FIN is acknowledged by the guest itself.
|
||||
|
||||
That is, after we receive a FIN from the guest (TAP_FIN_RCVD), if we
|
||||
don't have any data left to send from the socket (SOCK_FIN_RCVD, or
|
||||
EPOLLHUP), we send a FIN segment to the guest (TAP_FIN_SENT), but we
|
||||
need to actually have it acknowledged (and have no pending
|
||||
retransmissions) before we can close the connection: check for
|
||||
TAP_FIN_ACKED, first.
|
||||
|
||||
Then, if we set TAP_FIN_SENT, and we receive an ACK segment from the
|
||||
guest, set TAP_FIN_ACKED. This was entirely missing for the
|
||||
TAP_FIN_RCVD case, and as we fix the problem described above, this
|
||||
becomes relevant as well.
|
||||
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Tested-by: Paul Holzinger <pholzing@redhat.com>
|
||||
Reviewed-by: Jon Maloy <jmaloy@redhat.com>
|
||||
(cherry picked from commit c62fb08002ad88079b84d1aa694492746f0d7f22)
|
||||
---
|
||||
tcp.c | 13 +++++++++----
|
||||
1 file changed, 9 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 74ed7c6..ff1b6bf 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -2088,9 +2088,14 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
tcp_tap_window_update(c, conn, ntohs(th->window));
|
||||
tcp_data_from_sock(c, conn);
|
||||
|
||||
- if (conn->events & SOCK_FIN_RCVD &&
|
||||
- conn->seq_ack_from_tap == conn->seq_to_tap)
|
||||
- conn_event(c, conn, CLOSED);
|
||||
+ if (conn->seq_ack_from_tap == conn->seq_to_tap) {
|
||||
+ if (th->ack && conn->events & TAP_FIN_SENT)
|
||||
+ conn_event(c, conn, TAP_FIN_ACKED);
|
||||
+
|
||||
+ if (conn->events & SOCK_FIN_RCVD &&
|
||||
+ conn->events & TAP_FIN_ACKED)
|
||||
+ conn_event(c, conn, CLOSED);
|
||||
+ }
|
||||
|
||||
return 1;
|
||||
}
|
||||
@@ -2370,7 +2375,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
|
||||
return;
|
||||
}
|
||||
|
||||
- if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) {
|
||||
+ if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) {
|
||||
conn_event(c, conn, CLOSED);
|
||||
return;
|
||||
}
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,85 +0,0 @@
|
||||
From b1a28f0dd1a2a31e73c03aefbfc8dbd016f09ac4 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Fri, 29 Aug 2025 22:11:30 +0200
|
||||
Subject: [PATCH 06/13] tcp: Don't try to transmit right after the peer shrank
|
||||
the window to zero
|
||||
|
||||
If the peer shrinks the window to zero, we'll skip storing the new
|
||||
window, as a convenient way to cause window probes (which exceed any
|
||||
zero-sized window, strictly speaking) if we don't get window updates
|
||||
in a while.
|
||||
|
||||
As we do so, though, we need to ensure we don't try to queue more data
|
||||
from the socket right after we process this window update, as the
|
||||
entire point of a zero-window advertisement is to keep us from sending
|
||||
more data.
|
||||
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Tested-by: Paul Holzinger <pholzing@redhat.com>
|
||||
Reviewed-by: Jon Maloy <jmaloy@redhat.com>
|
||||
(cherry picked from commit 25f93545e7232a7dab9a022862514778c18bc85e)
|
||||
---
|
||||
tcp.c | 18 ++++++++++--------
|
||||
1 file changed, 10 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index ff1b6bf..4686846 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1259,9 +1259,11 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn,
|
||||
* tcp_tap_window_update() - Process an updated window from tap side
|
||||
* @c: Execution context
|
||||
* @conn: Connection pointer
|
||||
- * @window: Window value, host order, unscaled
|
||||
+ * @wnd: Window value, host order, unscaled
|
||||
+ *
|
||||
+ * Return: false on zero window (not stored to wnd_from_tap), true otherwise
|
||||
*/
|
||||
-static void tcp_tap_window_update(const struct ctx *c,
|
||||
+static bool tcp_tap_window_update(const struct ctx *c,
|
||||
struct tcp_tap_conn *conn, unsigned wnd)
|
||||
{
|
||||
wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap);
|
||||
@@ -1274,13 +1276,14 @@ static void tcp_tap_window_update(const struct ctx *c,
|
||||
*/
|
||||
if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) {
|
||||
tcp_rewind_seq(c, conn);
|
||||
- return;
|
||||
+ return false;
|
||||
}
|
||||
|
||||
conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX);
|
||||
|
||||
/* FIXME: reflect the tap-side receiver's window back to the sock-side
|
||||
* sender by adjusting SO_RCVBUF? */
|
||||
+ return true;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -2073,9 +2076,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
if (!th->ack)
|
||||
goto reset;
|
||||
|
||||
- tcp_tap_window_update(c, conn, ntohs(th->window));
|
||||
-
|
||||
- tcp_data_from_sock(c, conn);
|
||||
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)))
|
||||
+ tcp_data_from_sock(c, conn);
|
||||
|
||||
if (p->count - idx == 1)
|
||||
return 1;
|
||||
@@ -2085,8 +2087,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
if (conn->events & TAP_FIN_RCVD) {
|
||||
tcp_sock_consume(conn, ntohl(th->ack_seq));
|
||||
tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
|
||||
- tcp_tap_window_update(c, conn, ntohs(th->window));
|
||||
- tcp_data_from_sock(c, conn);
|
||||
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)))
|
||||
+ tcp_data_from_sock(c, conn);
|
||||
|
||||
if (conn->seq_ack_from_tap == conn->seq_to_tap) {
|
||||
if (th->ack && conn->events & TAP_FIN_SENT)
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,48 +0,0 @@
|
||||
From 23331346494e8bd301cd635533812e329f693fdf Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Fri, 29 Aug 2025 22:11:31 +0200
|
||||
Subject: [PATCH 07/13] tcp: Cast operands of sequence comparison macros to
|
||||
uint32_t before using them
|
||||
|
||||
Otherwise, passing signed types causes automatic promotion of the
|
||||
result of the subtractions as well, which is not what we want, as
|
||||
these macros rely on unsigned 32-bit arithmetic.
|
||||
|
||||
The next patch introduces a ssize_t operand for SEQ_LE, illustrating
|
||||
the issue.
|
||||
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Tested-by: Paul Holzinger <pholzing@redhat.com>
|
||||
Reviewed-by: Jon Maloy <jmaloy@redhat.com>
|
||||
(cherry picked from commit 660cd6907e14a41ad9bc77d317140c70ab416fce)
|
||||
---
|
||||
tcp_internal.h | 12 ++++++++----
|
||||
1 file changed, 8 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/tcp_internal.h b/tcp_internal.h
|
||||
index 36c6533..c80ba40 100644
|
||||
--- a/tcp_internal.h
|
||||
+++ b/tcp_internal.h
|
||||
@@ -18,10 +18,14 @@
|
||||
sizeof(struct ipv6hdr), \
|
||||
sizeof(uint32_t))
|
||||
|
||||
-#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
|
||||
-#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
|
||||
-#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
|
||||
-#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
|
||||
+#define SEQ_LE(a, b) \
|
||||
+ ((uint32_t)(b) - (uint32_t)(a) < MAX_WINDOW)
|
||||
+#define SEQ_LT(a, b) \
|
||||
+ ((uint32_t)(b) - (uint32_t)(a) - 1 < MAX_WINDOW)
|
||||
+#define SEQ_GE(a, b) \
|
||||
+ ((uint32_t)(a) - (uint32_t)(b) < MAX_WINDOW)
|
||||
+#define SEQ_GT(a, b) \
|
||||
+ ((uint32_t)(a) - (uint32_t)(b) - 1 < MAX_WINDOW)
|
||||
|
||||
#define FIN (1 << 0)
|
||||
#define SYN (1 << 1)
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,95 +0,0 @@
|
||||
From c1420a41bb6e95f4dc578846568dacd9e0e0bbd2 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Fri, 29 Aug 2025 22:11:32 +0200
|
||||
Subject: [PATCH 08/13] tcp: Fast re-transmit if half-closed, make TAP_FIN_RCVD
|
||||
path consistent
|
||||
|
||||
We currently have a number of discrepancies in the tcp_tap_handler()
|
||||
path between the half-closed connection path and the regular one, and
|
||||
they are mostly a result of code duplication, which comes in turn from
|
||||
the fact that tcp_data_from_tap() deals with data transfers as well as
|
||||
general connection bookkeeping, so we can't use it for half-closed
|
||||
connections.
|
||||
|
||||
This suggests that we should probably rework it into two or more
|
||||
functions, in the long term, but for the moment being I'm just fixing
|
||||
one obvious issue, which is the lack of fast retransmissions in the
|
||||
TAP_FIN_RCVD path, and a potential one, which is the fact we don't
|
||||
handle socket flush failures.
|
||||
|
||||
Add fast re-transmit for half-closed connections, and handle the case
|
||||
of socket flush (tcp_sock_consume()) flush failure in the same way as
|
||||
tcp_data_from_tap() handles it.
|
||||
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Tested-by: Paul Holzinger <pholzing@redhat.com>
|
||||
Reviewed-by: Jon Maloy <jmaloy@redhat.com>
|
||||
(cherry picked from commit bde1847960cfecc46d8bbf203b931705f2922d31)
|
||||
---
|
||||
tcp.c | 42 +++++++++++++++++++++++++++++++++++++++---
|
||||
1 file changed, 39 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 4686846..70798de 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1639,6 +1639,23 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
return tcp_buf_data_from_sock(c, conn);
|
||||
}
|
||||
|
||||
+/**
|
||||
+ * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet
|
||||
+ * @th: Pointer to TCP header
|
||||
+ * @l4len: TCP packet length, including TCP header
|
||||
+ *
|
||||
+ * Return: data length of TCP packet, -1 on invalid value of Data Offset field
|
||||
+ */
|
||||
+static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len)
|
||||
+{
|
||||
+ size_t off = th->doff * 4UL;
|
||||
+
|
||||
+ if (off < sizeof(*th) || off > l4len)
|
||||
+ return -1;
|
||||
+
|
||||
+ return l4len - off;
|
||||
+}
|
||||
+
|
||||
/**
|
||||
* tcp_data_from_tap() - tap/guest data for established connection
|
||||
* @c: Execution context
|
||||
@@ -2085,9 +2102,28 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
|
||||
/* Established connections not accepting data from tap */
|
||||
if (conn->events & TAP_FIN_RCVD) {
|
||||
- tcp_sock_consume(conn, ntohl(th->ack_seq));
|
||||
- tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
|
||||
- if (tcp_tap_window_update(c, conn, ntohs(th->window)))
|
||||
+ bool retr;
|
||||
+
|
||||
+ retr = th->ack && !tcp_packet_data_len(th, len) && !th->fin &&
|
||||
+ ntohl(th->ack_seq) == conn->seq_ack_from_tap &&
|
||||
+ ntohs(th->window) == conn->wnd_from_tap;
|
||||
+
|
||||
+ /* On socket flush failure, pretend there was no ACK, try again
|
||||
+ * later
|
||||
+ */
|
||||
+ if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq)))
|
||||
+ tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq));
|
||||
+
|
||||
+ if (retr) {
|
||||
+ flow_trace(conn,
|
||||
+ "fast re-transmit, ACK: %u, previous sequence: %u",
|
||||
+ ntohl(th->ack_seq), conn->seq_to_tap);
|
||||
+
|
||||
+ if (tcp_rewind_seq(c, conn))
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr)
|
||||
tcp_data_from_sock(c, conn);
|
||||
|
||||
if (conn->seq_ack_from_tap == conn->seq_to_tap) {
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,95 +0,0 @@
|
||||
From 8ec28b4d68a1f070b0b1dffd7f84863adac5c5a1 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Tue, 9 Sep 2025 19:18:41 +0200
|
||||
Subject: [PATCH 09/13] tcp: Don't send FIN segment to guest yet if we have
|
||||
pending unacknowledged data
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
For some reason, tcp_vu_data_from_sock() already takes care of this,
|
||||
but the non-vhost-user version ignores this possibility and just sends
|
||||
out a FIN segment whenever we infer we received one host-side,
|
||||
regardless of the fact that we might have unacknowledged data still to
|
||||
send.
|
||||
|
||||
Somewhat surprisingly, this didn't cause any issue to be reported yet,
|
||||
until 6.17-rc1 and 1d2fbaad7cd8 ("tcp: stronger sk_rcvbuf checks")
|
||||
came around, leading to the following report from Paul, who hit this
|
||||
running Podman tests:
|
||||
|
||||
439 0.033032 169.254.1.2 → 192.168.122.100 65540 TCP 56602 → 5789 [PSH, ACK] Seq=10336361 Ack=1 Win=65536 Len=65480
|
||||
440 0.033055 169.254.1.2 → 192.168.122.100 30324 TCP [TCP Window Full] 56602 → 5789 [PSH, ACK] Seq=10401841 Ack=1 Win=65536 Len=30264
|
||||
|
||||
we're sending data to the container, up to the edge of the window
|
||||
|
||||
441 0.033059 192.168.122.100 → 169.254.1.2 60 TCP 5789 → 56602 [ACK] Seq=1 Ack=10401841 Win=83968 Len=0
|
||||
|
||||
and the container acknowledges it
|
||||
|
||||
442 0.033091 169.254.1.2 → 192.168.122.100 53716 TCP 56602 → 5789 [PSH, ACK] Seq=10432105 Ack=1 Win=65536 Len=53656
|
||||
|
||||
we send more data, all we possibly can, in window
|
||||
|
||||
443 0.033126 192.168.122.100 → 169.254.1.2 60 TCP [TCP ZeroWindow] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=0 Len=0
|
||||
|
||||
and the container shrinks the window due to the issue introduced
|
||||
by kernel commit e2142825c120 ("net: tcp: send zero-window ACK when no
|
||||
memory"). With a previous patch from this series, we rewind the
|
||||
sequence, meaning that we assign conn->seq_to_tap from
|
||||
conn->seq_ack_from_tap, so that we'll retransmit this segment, by
|
||||
reading again from the socket, and increasing conn->seq_to_tap
|
||||
once more.
|
||||
|
||||
However:
|
||||
|
||||
444 0.033144 169.254.1.2 → 192.168.122.100 60 TCP 56602 → 5789 [FIN, PSH, ACK] Seq=10485761 Ack=1 Win=65536 Len=0
|
||||
|
||||
we eventually get a zero-length read from the socket and we miss the
|
||||
fact that conn->seq_to_tap != conn->seq_ack_from_tap, so we send a
|
||||
FIN flag with the most recent sequence. The kernel insists:
|
||||
|
||||
445 0.033147 192.168.122.100 → 169.254.1.2 60 TCP [TCP ZeroWindow] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=0 Len=0
|
||||
|
||||
with its buggy zero-window update, but:
|
||||
|
||||
446 0.033152 192.168.122.100 → 169.254.1.2 60 TCP [TCP Window Update] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=69120 Len=0
|
||||
447 0.033202 192.168.122.100 → 169.254.1.2 60 TCP [TCP Window Update] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=142848 Len=0
|
||||
|
||||
we don't reset the TAP_FIN_SENT flag anymore, and don't resend
|
||||
the FIN segment (nor data), as we already rewound the sequence
|
||||
earlier.
|
||||
|
||||
To solve this, hold off the FIN segment until we get a zero-length
|
||||
read from the socket *and* we know that there's no unacknowledged
|
||||
pending data, also without vhost-user, in tcp_buf_data_from_sock().
|
||||
|
||||
Reported-by: Paul Holzinger <pholzing@redhat.com>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Tested-by: Paul Holzinger <pholzing@redhat.com>
|
||||
Reviewed-by: Jon Maloy <jmaloy@redhat.com>
|
||||
(cherry picked from commit 8d2f8c4d0fb58d6b2011e614bc7d7ff9dab406b3)
|
||||
---
|
||||
tcp_buf.c | 5 ++++-
|
||||
1 file changed, 4 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tcp_buf.c b/tcp_buf.c
|
||||
index 0530563..028f8ba 100644
|
||||
--- a/tcp_buf.c
|
||||
+++ b/tcp_buf.c
|
||||
@@ -369,7 +369,10 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
}
|
||||
|
||||
if (!len) {
|
||||
- if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
|
||||
+ if (already_sent) {
|
||||
+ conn_flag(c, conn, STALLED);
|
||||
+ } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
|
||||
+ SOCK_FIN_RCVD) {
|
||||
int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
|
||||
if (ret) {
|
||||
tcp_rst(c, conn);
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,78 +0,0 @@
|
||||
From 836afc99e59cfd1f6934464a81baaa95d0d78b31 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Tue, 30 Sep 2025 22:26:02 +0200
|
||||
Subject: [PATCH 10/13] tcp: Fix ACK sequence on FIN to tap
|
||||
|
||||
If we reach end-of-file on a socket (or get EPOLLRDHUP / EPOLLHUP) and
|
||||
send a FIN segment to the guest / container acknowledging a sequence
|
||||
number that's behind what we received so far, we won't have any
|
||||
further trigger to send an updated ACK segment, as we are now
|
||||
switching the epoll socket monitoring to edge-triggered mode.
|
||||
|
||||
To avoid this situation, in tcp_update_seqack_wnd(), we set the next
|
||||
acknowledgement sequence to the current observed sequence, regardless
|
||||
of what was acknowledged socket-side.
|
||||
|
||||
However, we don't necessarily call tcp_update_seqack_wnd() before
|
||||
sending the FIN segment, which might potentially lead to a situation,
|
||||
not observed in practice, where we unnecessarily cause a
|
||||
retransmission at some point after our FIN segment.
|
||||
|
||||
Avoid that by setting the ACK sequence to whatever we received from
|
||||
the container / guest, before sending a FIN segment and switching to
|
||||
EPOLLET.
|
||||
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
(cherry picked from commit b3217aa5aec10704774c7142ef07daec5e698415)
|
||||
---
|
||||
tcp_buf.c | 14 +++++++++++++-
|
||||
tcp_vu.c | 7 ++++++-
|
||||
2 files changed, 19 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/tcp_buf.c b/tcp_buf.c
|
||||
index 028f8ba..9527eed 100644
|
||||
--- a/tcp_buf.c
|
||||
+++ b/tcp_buf.c
|
||||
@@ -373,7 +373,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
conn_flag(c, conn, STALLED);
|
||||
} else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
|
||||
SOCK_FIN_RCVD) {
|
||||
- int ret = tcp_buf_send_flag(c, conn, FIN | ACK);
|
||||
+ int ret;
|
||||
+
|
||||
+ /* On TAP_FIN_SENT, we won't get further data events
|
||||
+ * from the socket, and this might be the last ACK
|
||||
+ * segment we send to the tap, so update its sequence to
|
||||
+ * include everything we received until now.
|
||||
+ *
|
||||
+ * See also the special handling on CONN_IS_CLOSING() in
|
||||
+ * tcp_update_seqack_wnd().
|
||||
+ */
|
||||
+ conn->seq_ack_to_tap = conn->seq_from_tap;
|
||||
+
|
||||
+ ret = tcp_buf_send_flag(c, conn, FIN | ACK);
|
||||
if (ret) {
|
||||
tcp_rst(c, conn);
|
||||
return ret;
|
||||
diff --git a/tcp_vu.c b/tcp_vu.c
|
||||
index 57587cc..7a721a2 100644
|
||||
--- a/tcp_vu.c
|
||||
+++ b/tcp_vu.c
|
||||
@@ -415,7 +415,12 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
|
||||
conn_flag(c, conn, STALLED);
|
||||
} else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) ==
|
||||
SOCK_FIN_RCVD) {
|
||||
- int ret = tcp_vu_send_flag(c, conn, FIN | ACK);
|
||||
+ int ret;
|
||||
+
|
||||
+ /* See tcp_buf_data_from_sock() */
|
||||
+ conn->seq_ack_to_tap = conn->seq_from_tap;
|
||||
+
|
||||
+ ret = tcp_vu_send_flag(c, conn, FIN | ACK);
|
||||
if (ret) {
|
||||
tcp_rst(c, conn);
|
||||
return ret;
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,79 +0,0 @@
|
||||
From 188bcb00a0c0bdc1f468e71fd9a42345098c09d5 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Thu, 2 Oct 2025 01:13:27 +0200
|
||||
Subject: [PATCH 11/13] tcp: Completely ignore data segment in CLOSE-WAIT
|
||||
state, log a message
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
According to RFC 9293 we should ignore data (note: not data segments)
|
||||
in CLOSE-WAIT state (indicated by TAP_FIN_RCVD), see 3.10.7.4
|
||||
"Other states":
|
||||
|
||||
[...]
|
||||
|
||||
Seventh, process the segment text:
|
||||
|
||||
[...]
|
||||
|
||||
CLOSE-WAIT STATE
|
||||
|
||||
This should not occur since a FIN has been received from the remote
|
||||
side. Ignore the segment text.
|
||||
|
||||
and we almost do that, except that we would look at the data length
|
||||
to decide whether it's a request for fast re-transmission, so fix
|
||||
that, and while at it, log a message, so that cases such as the
|
||||
following one are more apparent in debug logs:
|
||||
|
||||
28692 0.009758 88.198.0.164 → 93.235.151.95 54 TCP 55414 → 47080 [FIN, ACK] Seq=121441 Ack=141 Win=65536 Len=0
|
||||
|
||||
we should ignore this FIN flag, because we didn't accept data up
|
||||
to this sequence (see next segment), but we don't do it, so, here:
|
||||
|
||||
28693 0.000036 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [ACK] Seq=141 Ack=90722 Win=32128 Len=0
|
||||
28694 0.034597 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [FIN, ACK] Seq=141 Ack=90722 Win=121216 Len=0
|
||||
28695 0.000019 88.198.0.164 → 93.235.151.95 54 TCP 55414 → 47080 [ACK] Seq=121442 Ack=142 Win=65536 Len=0
|
||||
28696 0.162968 88.198.0.164 → 93.235.151.95 30773 TCP [TCP Retransmission] 55414 → 47080 [FIN, PSH, ACK] Seq=90722 Ack=142 Win=65536 Len=30719 [TCP segment of a reassembled PDU]
|
||||
|
||||
we are erroneously in CLOSE-WAIT (TAP_FIN_RCVD) state, and this
|
||||
segment would look pretty strange there.
|
||||
|
||||
This specific case is fixed by the next patch, so it should never
|
||||
happen again.
|
||||
|
||||
Link: https://archives.passt.top/passt-dev/20250910115726.432bbb8d@elisabeth/
|
||||
Link: https://bugs.passt.top/show_bug.cgi?id=126
|
||||
Suggested-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
(cherry picked from commit 8efa80b51f9f52082954aa26719b36a9ec367567)
|
||||
---
|
||||
tcp.c | 8 +++++++-
|
||||
1 file changed, 7 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index 70798de..ff071ee 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -2102,9 +2102,15 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
|
||||
|
||||
/* Established connections not accepting data from tap */
|
||||
if (conn->events & TAP_FIN_RCVD) {
|
||||
+ size_t dlen;
|
||||
bool retr;
|
||||
|
||||
- retr = th->ack && !tcp_packet_data_len(th, len) && !th->fin &&
|
||||
+ if ((dlen = tcp_packet_data_len(th, len))) {
|
||||
+ flow_dbg(conn, "data segment in CLOSE-WAIT (%zu B)",
|
||||
+ dlen);
|
||||
+ }
|
||||
+
|
||||
+ retr = th->ack && !th->fin &&
|
||||
ntohl(th->ack_seq) == conn->seq_ack_from_tap &&
|
||||
ntohs(th->window) == conn->wnd_from_tap;
|
||||
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,76 +0,0 @@
|
||||
From 64ddc157615eadc7e566486746d7e8bf9c86c844 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Thu, 2 Oct 2025 00:41:54 +0200
|
||||
Subject: [PATCH 12/13] tcp: Don't consider FIN flags with mismatching sequence
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
If a guest or container sends us a FIN segment but its sequence number
|
||||
doesn't match the highest sequence of data we *accepted* (not
|
||||
necessarily the highest sequence we received), that is,
|
||||
conn->seq_from_tap, plus any data we're accepting in the current
|
||||
batch, we should discard the flag (not necessarily the segment),
|
||||
because there's still data we need to receive (again) before the end
|
||||
of the stream.
|
||||
|
||||
If we consider those FIN flags as such, we'll end up in the
|
||||
situation described below.
|
||||
|
||||
Here, 192.168.10.102 is a HTTP server in a Podman container, and
|
||||
192.168.10.44 is a client fetching approximately 121 KB of data from
|
||||
it:
|
||||
|
||||
82 2.026811 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [FIN, ACK] Seq=121441 Ack=143 Win=65536 Len=0
|
||||
|
||||
the server is done sending
|
||||
|
||||
83 2.026898 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [ACK] Seq=143 Ack=114394 Win=216192 Len=0
|
||||
|
||||
pasta (client) acknowledges a previous sequence, because of
|
||||
a short sendmsg()
|
||||
|
||||
84 2.027324 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [FIN, ACK] Seq=143 Ack=114394 Win=216192 Len=0
|
||||
|
||||
pasta (client) sends FIN, ACK as the client has no more data to
|
||||
send (a single GET request), while still acknowledging a previous
|
||||
sequence, because the retransmission didn't happen yet
|
||||
|
||||
85 2.027349 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [ACK] Seq=121442 Ack=144 Win=65536 Len=0
|
||||
|
||||
the server acknowledges the FIN, ACK
|
||||
|
||||
86 2.224125 192.168.10.102 → 192.168.10.44 4150 TCP [TCP Retransmission] 55414 → 44992 [ACK] Seq=114394 Ack=144 Win=65536 Len=4096 [TCP segment of a reassembled PDU]
|
||||
|
||||
and finally a retransmission comes, but as we wrongly switched to
|
||||
the CLOSE-WAIT state,
|
||||
|
||||
87 2.224202 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [RST] Seq=144 Win=0 Len=0
|
||||
|
||||
we consider frame #86 as an acknowledgement for the FIN segment we
|
||||
sent, and close the connection, while we still had to re-receive
|
||||
(and finally send) the missing data segment, instead.
|
||||
|
||||
Link: https://github.com/containers/podman/issues/27179
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
(cherry picked from commit b145441913eef6f8885b6b84531e944ff593790c)
|
||||
---
|
||||
tcp.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index ff071ee..e6af26a 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1752,7 +1752,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
|
||||
}
|
||||
}
|
||||
|
||||
- if (th->fin)
|
||||
+ if (th->fin && seq == seq_from_tap)
|
||||
fin = 1;
|
||||
|
||||
if (!len)
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,59 +0,0 @@
|
||||
From abd48f62fe0f1034a27a43ae5eb4e0e5f419b411 Mon Sep 17 00:00:00 2001
|
||||
From: Stefano Brivio <sbrivio@redhat.com>
|
||||
Date: Thu, 2 Oct 2025 00:52:53 +0200
|
||||
Subject: [PATCH 13/13] tcp: On partial send (incomplete sendmsg()), request a
|
||||
retransmission right away
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
...possibly with an updated window value. As we're discarding the
|
||||
remaining data, we'll need to receive it again. If we don't request
|
||||
a retransmission immediately, we'll see an apparent gap in the
|
||||
sequence, and request a retransmission on the next data batch or
|
||||
segment, but we're just wasting time like that. In packets:
|
||||
|
||||
28686 0.000007 88.198.0.164 → 93.235.151.95 16118 TCP 55414 → 47080 [ACK] Seq=80501 Ack=141 Win=65536 Len=16064 [TCP segment of a reassembled PDU]
|
||||
28687 0.000012 88.198.0.164 → 93.235.151.95 16118 TCP [TCP Window Full] 55414 → 47080 [ACK] Seq=96565 Ack=141 Win=65536 Len=16064 [TCP segment of a reassembled PDU]
|
||||
|
||||
on this second data segment, we have a short sendmsg(), and
|
||||
|
||||
28688 0.000026 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0
|
||||
|
||||
consequently acknowledge it, without requesting a retransmission,
|
||||
|
||||
28689 0.000006 88.198.0.164 → 93.235.151.95 8866 HTTP HTTP/1.1 200 ok (text/css)
|
||||
|
||||
so the server proceeds sending a bit more, but
|
||||
|
||||
28690 0.000016 93.235.151.95 → 88.198.0.164 54 TCP [TCP Dup ACK 28688#1] 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0
|
||||
28691 0.000000 93.235.151.95 → 88.198.0.164 54 TCP [TCP Dup ACK 28688#2] 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0
|
||||
|
||||
we'll throw that data (from frame #28689) away, and finally request
|
||||
a retransmission as we spotted the gap now.
|
||||
|
||||
Request a retransmission as soon as we know we'll need it, instead.
|
||||
|
||||
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
(cherry picked from commit a94783951d0d759912dcae0c6289c61fe6a86fc4)
|
||||
---
|
||||
tcp.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tcp.c b/tcp.c
|
||||
index e6af26a..9afc7f7 100644
|
||||
--- a/tcp.c
|
||||
+++ b/tcp.c
|
||||
@@ -1855,7 +1855,7 @@ eintr:
|
||||
}
|
||||
|
||||
out:
|
||||
- if (keep != -1) {
|
||||
+ if (keep != -1 || partial_send) {
|
||||
/* We use an 8-bit approximation here: the associated risk is
|
||||
* that we skip a duplicate ACK on 8-bit sequence number
|
||||
* collision. Fast retransmit is a SHOULD in RFC 5681, 3.2.
|
||||
--
|
||||
2.47.1
|
||||
|
||||
23
passt.spec
23
passt.spec
@ -7,32 +7,18 @@
|
||||
# Copyright (c) 2022 Red Hat GmbH
|
||||
# Author: Stefano Brivio <sbrivio@redhat.com>
|
||||
|
||||
%global git_hash 8ec134109eb136432a29bdf5a14f8b1fd4e46208
|
||||
%global git_hash c3f1ba70237a9e66822aff3aa5765d0adf6f6307
|
||||
%global selinuxtype targeted
|
||||
|
||||
Name: passt
|
||||
Version: 0^20250512.g8ec1341
|
||||
Release: 3%{?dist}
|
||||
Version: 0^20251209.gc3f1ba7
|
||||
Release: 1%{?dist}
|
||||
Summary: User-mode networking daemons for virtual machines and namespaces
|
||||
License: GPL-2.0-or-later AND BSD-3-Clause
|
||||
Group: System Environment/Daemons
|
||||
URL: https://passt.top/
|
||||
Source: https://passt.top/passt/snapshot/passt-%{git_hash}.tar.xz
|
||||
|
||||
Patch1: 0001-treewide-By-default-don-t-quit-source-after-migratio.patch
|
||||
Patch2: 0002-tcp-FIN-flags-have-to-be-retransmitted-as-well.patch
|
||||
Patch3: 0003-tcp-Factor-sequence-rewind-for-retransmissions-into-.patch
|
||||
Patch4: 0004-tcp-Rewind-sequence-when-guest-shrinks-window-to-zer.patch
|
||||
Patch5: 0005-tcp-Fix-closing-logic-for-half-closed-connections.patch
|
||||
Patch6: 0006-tcp-Don-t-try-to-transmit-right-after-the-peer-shran.patch
|
||||
Patch7: 0007-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch
|
||||
Patch8: 0008-tcp-Fast-re-transmit-if-half-closed-make-TAP_FIN_RCV.patch
|
||||
Patch9: 0009-tcp-Don-t-send-FIN-segment-to-guest-yet-if-we-have-p.patch
|
||||
Patch10: 0010-tcp-Fix-ACK-sequence-on-FIN-to-tap.patch
|
||||
Patch11: 0011-tcp-Completely-ignore-data-segment-in-CLOSE-WAIT-sta.patch
|
||||
Patch12: 0012-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch
|
||||
Patch13: 0013-tcp-On-partial-send-incomplete-sendmsg-request-a-ret.patch
|
||||
|
||||
BuildRequires: gcc, make, git, checkpolicy, selinux-policy-devel
|
||||
Requires: (%{name}-selinux = %{version}-%{release} if selinux-policy-%{selinuxtype})
|
||||
|
||||
@ -144,6 +130,9 @@ fi
|
||||
%{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp
|
||||
|
||||
%changelog
|
||||
* Tue Dec 9 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20251209.gc3f1ba7-1
|
||||
- Resolves: RHEL-134120
|
||||
|
||||
* Thu Oct 23 2025 Stefano Brivio <sbrivio@redhat.com> - 0^20250512.g8ec1341-3
|
||||
- Resolves: RHEL-123425 RHEL-123683
|
||||
|
||||
|
||||
2
sources
2
sources
@ -1 +1 @@
|
||||
SHA512 (passt-8ec134109eb136432a29bdf5a14f8b1fd4e46208.tar.xz) = 3a63f3f62aae06ae0da2293808223f539bca1a030131c50499d5de2daa96faf887fd486b6aa71d627b5ede6de6f5310876150983a3e77fbaf9926e69af56bdab
|
||||
SHA512 (passt-c3f1ba70237a9e66822aff3aa5765d0adf6f6307.tar.xz) = 3f522803b1df364250989813d8e7c36e851572044bee39df7abf14f67a56bdfe03db59832c172bda28bbe3d985b3376bf5777770df79f2819b07492c718653ab
|
||||
|
||||
Loading…
Reference in New Issue
Block a user