diff --git a/.gitignore b/.gitignore index cff9ef8..45c0be9 100644 --- a/.gitignore +++ b/.gitignore @@ -42,3 +42,4 @@ /passt-a1e48a02ff3550eb7875a7df6726086e9b3a1213.tar.xz /passt-32f6212551c5db3b7b3548e8483e5d73f07a35ac.tar.xz /passt-8ec134109eb136432a29bdf5a14f8b1fd4e46208.tar.xz +/passt-c3f1ba70237a9e66822aff3aa5765d0adf6f6307.tar.xz diff --git a/0001-treewide-By-default-don-t-quit-source-after-migratio.patch b/0001-treewide-By-default-don-t-quit-source-after-migratio.patch deleted file mode 100644 index 95e79e7..0000000 --- a/0001-treewide-By-default-don-t-quit-source-after-migratio.patch +++ /dev/null @@ -1,264 +0,0 @@ -From b0b5ce0a76cf7fec0b00405732fd94e0b34e8d84 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Thu, 17 Jul 2025 10:38:17 +0200 -Subject: [PATCH] treewide: By default, don't quit source after migration, keep - sockets open - -We are hitting an issue in the KubeVirt integration where some data is -still sent to the source instance even after migration is complete. As -we exit, the kernel closes our sockets and resets connections. The -resulting RST segments are sent to peers, effectively terminating -connections that were meanwhile migrated. - -At the moment, this is not done intentionally, but in the future -KubeVirt might enable OVN-Kubernetes features where source and -destination nodes are explicitly getting mirrored traffic for a while, -in order to decrease migration downtime. - -By default, don't quit after migration is completed on the source: the -previous behaviour can be enabled with the new, but deprecated, ---migrate-exit option. After migration (as source), the -1 / --one-off -option has no effect. - -Also, by default, keep migrated TCP sockets open (in repair mode) as -long as we're running, and ignore events on any epoll descriptor -representing data channels. The previous behaviour can be enabled with -the new, equally deprecated, --migrate-no-linger option. - -By keeping sockets open, and not exiting, we prevent the kernel -running on the source node to send out RST segments if further data -reaches us. - -Reported-by: Nir Dothan -Signed-off-by: Stefano Brivio -(cherry picked from commit a8782865c342eb2682cca292d5bf92b567344351) ---- - conf.c | 22 ++++++++++++++++++++++ - flow.c | 2 +- - passt.1 | 29 +++++++++++++++++++++++++++++ - passt.h | 4 ++++ - tcp.c | 9 +++++++-- - tcp_conn.h | 3 ++- - test/lib/setup | 4 ++-- - vhost_user.c | 9 +++++++-- - 8 files changed, 74 insertions(+), 8 deletions(-) - -diff --git a/conf.c b/conf.c -index a6d7e22..1295d89 100644 ---- a/conf.c -+++ b/conf.c -@@ -864,6 +864,14 @@ static void usage(const char *name, FILE *f, int status) - FPRINTF(f, - " --repair-path PATH path for passt-repair(1)\n" - " default: append '.repair' to UNIX domain path\n"); -+ FPRINTF(f, -+ " --migrate-exit DEPRECATED:\n" -+ " source quits after migration\n" -+ " default: source keeps running after migration\n"); -+ FPRINTF(f, -+ " --migrate-no-linger DEPRECATED:\n" -+ " close sockets on migration\n" -+ " default: keep sockets open, ignore events\n"); - } - - FPRINTF(f, -@@ -1468,6 +1476,8 @@ void conf(struct ctx *c, int argc, char **argv) - {"socket-path", required_argument, NULL, 's' }, - {"fqdn", required_argument, NULL, 27 }, - {"repair-path", required_argument, NULL, 28 }, -+ {"migrate-exit", no_argument, NULL, 29 }, -+ {"migrate-no-linger", no_argument, NULL, 30 }, - { 0 }, - }; - const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:"; -@@ -1683,6 +1693,18 @@ void conf(struct ctx *c, int argc, char **argv) - optarg)) - die("Invalid passt-repair path: %s", optarg); - -+ break; -+ case 29: -+ if (c->mode != MODE_VU) -+ die("--migrate-exit is for vhost-user mode only"); -+ c->migrate_exit = true; -+ -+ break; -+ case 30: -+ if (c->mode != MODE_VU) -+ die("--migrate-no-linger is for vhost-user mode only"); -+ c->migrate_no_linger = true; -+ - break; - case 'd': - c->debug = 1; -diff --git a/flow.c b/flow.c -index 6a5c8aa..a4b65ea 100644 ---- a/flow.c -+++ b/flow.c -@@ -1089,7 +1089,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, - * as EIO). - */ - foreach_established_tcp_flow(flow) { -- rc = tcp_flow_migrate_source_ext(fd, &flow->tcp); -+ rc = tcp_flow_migrate_source_ext(c, fd, &flow->tcp); - if (rc) { - flow_err(flow, "Can't send extended data: %s", - strerror_(-rc)); -diff --git a/passt.1 b/passt.1 -index 60066c2..cef98b2 100644 ---- a/passt.1 -+++ b/passt.1 -@@ -439,6 +439,30 @@ Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path - chosen for the hypervisor UNIX domain socket. No socket is created if not in - \-\-vhost-user mode. - -+.TP -+.BR \-\-migrate-exit (DEPRECATED) -+Exit after a completed migration as source. By default, \fBpasst\fR keeps -+running and the migrated guest can continue using its connection, or a new guest -+can connect. -+ -+Note that this configuration option is \fBdeprecated\fR and will be removed in a -+future version. It is not expected to be of any use, and it simply reflects a -+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR -+below. -+ -+.TP -+.BR \-\-migrate-no-linger (DEPRECATED) -+Close TCP sockets on the source instance once migration completes. -+ -+By default, sockets are kept open, and events on data sockets are ignored, so -+that any further message reaching sockets after the source migrated is silently -+ignored, to avoid connection resets in case data is received after migration. -+ -+Note that this configuration option is \fBdeprecated\fR and will be removed in a -+future version. It is not expected to be of any use, and it simply reflects a -+legacy behaviour. If you have any use for this, refer to \fBREPORTING BUGS\fR -+below. -+ - .TP - .BR \-F ", " \-\-fd " " \fIFD - Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened -@@ -454,6 +478,11 @@ is closed. - Quit after handling a single client connection, that is, once the client closes - the socket, or once we get a socket error. - -+\fBNote\fR: this option has no effect after \fBpasst\fR completes a migration as -+source, because, in that case, exiting would close sockets for active -+connections, which would in turn cause connection resets if any further data is -+received. See also the description of \fI\-\-migrate-no-linger\fR. -+ - .TP - .BR \-t ", " \-\-tcp-ports " " \fIspec - Configure TCP port forwarding to guest. \fIspec\fR can be one of: -diff --git a/passt.h b/passt.h -index 8693794..4cfd6eb 100644 ---- a/passt.h -+++ b/passt.h -@@ -241,6 +241,8 @@ struct ip6_ctx { - * @device_state_fd: Device state migration channel - * @device_state_result: Device state migration result - * @migrate_target: Are we the target, on the next migration request? -+ * @migrate_no_linger: Close sockets as we migrate them -+ * @migrate_exit: Exit (on source) once migration is complete - */ - struct ctx { - enum passt_modes mode; -@@ -318,6 +320,8 @@ struct ctx { - int device_state_fd; - int device_state_result; - bool migrate_target; -+ bool migrate_no_linger; -+ bool migrate_exit; - }; - - void proto_update_l2_buf(const unsigned char *eth_d, -diff --git a/tcp.c b/tcp.c -index 0ac298a..1b22f70 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -3284,12 +3284,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn) - - /** - * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data -+ * @c: Execution context - * @fd: Descriptor for state migration - * @conn: Pointer to the TCP connection structure - * - * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure - */ --int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) -+int tcp_flow_migrate_source_ext(const struct ctx *c, -+ int fd, const struct tcp_tap_conn *conn) - { - uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; - struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)]; -@@ -3334,7 +3336,10 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn) - if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv))) - goto fail; - -- close(s); -+ if (c->migrate_no_linger) -+ close(s); -+ else -+ epoll_del(c, s); - - /* Adjustments unrelated to FIN segments: sequence numbers we dumped are - * based on the end of the queues. -diff --git a/tcp_conn.h b/tcp_conn.h -index 35d813d..38b5c54 100644 ---- a/tcp_conn.h -+++ b/tcp_conn.h -@@ -236,7 +236,8 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn); - int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn); - - int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn); --int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn); -+int tcp_flow_migrate_source_ext(const struct ctx *c, int fd, -+ const struct tcp_tap_conn *conn); - - int tcp_flow_migrate_target(struct ctx *c, int fd); - int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd); -diff --git a/test/lib/setup b/test/lib/setup -index 575bc21..5994598 100755 ---- a/test/lib/setup -+++ b/test/lib/setup -@@ -350,7 +350,7 @@ setup_migrate() { - - sleep 1 - -- __opts="--vhost-user" -+ __opts="--vhost-user --migrate-exit --migrate-no-linger" - [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap" - [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" - [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" -@@ -360,7 +360,7 @@ setup_migrate() { - - context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair" - -- __opts="--vhost-user" -+ __opts="--vhost-user --migrate-exit --migrate-no-linger" - [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap" - [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" - [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" -diff --git a/vhost_user.c b/vhost_user.c -index 105f77a..c4d3a52 100644 ---- a/vhost_user.c -+++ b/vhost_user.c -@@ -1208,7 +1208,12 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) - if (msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE && - vdev->context->device_state_result == 0 && - !vdev->context->migrate_target) { -- info("Migration complete, exiting"); -- _exit(EXIT_SUCCESS); -+ if (vdev->context->migrate_exit) { -+ info("Migration complete, exiting"); -+ _exit(EXIT_SUCCESS); -+ } -+ -+ info("Migration complete"); -+ vdev->context->one_off = false; - } - } --- -2.47.1 - diff --git a/0002-tcp-FIN-flags-have-to-be-retransmitted-as-well.patch b/0002-tcp-FIN-flags-have-to-be-retransmitted-as-well.patch deleted file mode 100644 index 624bec6..0000000 --- a/0002-tcp-FIN-flags-have-to-be-retransmitted-as-well.patch +++ /dev/null @@ -1,45 +0,0 @@ -From 585326cf732c8396701b5d3fdf6a7b5f2ab79300 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Fri, 29 Aug 2025 22:11:26 +0200 -Subject: [PATCH 02/13] tcp: FIN flags have to be retransmitted as well - -If we're retransmitting any data, and we sent a FIN segment to our -peer, regardless of whether it was received, we obviously have to -retransmit it as well, given that it can only come with the last data -segment, or after it. - -Unconditionally clear the internal TAP_FIN_SENT flag whenever we -re-transmit, so that we know we have to send it again, in case. - -Reported-by: Paul Holzinger -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -Tested-by: Paul Holzinger -(cherry picked from commit 2e3d93ba7033754301af0d50cd0bb20a38a005ac) ---- - tcp.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/tcp.c b/tcp.c -index 0ac298a..c5c430e 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -1757,6 +1757,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, - "fast re-transmit, ACK: %u, previous sequence: %u", - max_ack_seq, conn->seq_to_tap); - conn->seq_to_tap = max_ack_seq; -+ conn->events &= ~TAP_FIN_SENT; - if (tcp_set_peek_offset(conn, 0)) { - tcp_rst(c, conn); - return -1; -@@ -2284,6 +2285,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) - flow_dbg(conn, "ACK timeout, retry"); - conn->retrans++; - conn->seq_to_tap = conn->seq_ack_from_tap; -+ conn->events &= ~TAP_FIN_SENT; - if (!conn->wnd_from_tap) - conn->wnd_from_tap = 1; /* Zero-window probe */ - if (tcp_set_peek_offset(conn, 0)) { --- -2.47.1 - diff --git a/0003-tcp-Factor-sequence-rewind-for-retransmissions-into-.patch b/0003-tcp-Factor-sequence-rewind-for-retransmissions-into-.patch deleted file mode 100644 index 62bed28..0000000 --- a/0003-tcp-Factor-sequence-rewind-for-retransmissions-into-.patch +++ /dev/null @@ -1,104 +0,0 @@ -From 02f5a975d22449a2a940dec91e8e78a79e316693 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Fri, 29 Aug 2025 22:11:27 +0200 -Subject: [PATCH 03/13] tcp: Factor sequence rewind for retransmissions into a - new function - -...as I'm going to need a third occurrence of this in the next change. - -This introduces a small functional change in tcp_data_from_tap(): the -sequence was previously rewound to the highest ACK number we found in -the current packet batch, and not to the current value of -seq_ack_from_tap. - -The two might differ in case tcp_sock_consume() failed, because in -that case we're ignoring that ACK altogether. But if we're ignoring -it, it looks more correct to me to start retransmitting from an -earlier sequence anyway. - -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -Tested-by: Paul Holzinger -(cherry picked from commit 1d502be1a809dcdc4614815f4ec0da77a1ff27ec) ---- - tcp.c | 47 ++++++++++++++++++++++++++++++++--------------- - 1 file changed, 32 insertions(+), 15 deletions(-) - -diff --git a/tcp.c b/tcp.c -index c5c430e..8b0e707 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -1097,6 +1097,26 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, - } - } - -+/** -+ * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK -+ * @c: Execution context -+ * @conn: Connection pointer -+ * -+ * Return: 0 on success, -1 on failure, with connection reset -+ */ -+static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn) -+{ -+ conn->seq_to_tap = conn->seq_ack_from_tap; -+ conn->events &= ~TAP_FIN_SENT; -+ -+ if (tcp_set_peek_offset(conn, 0)) { -+ tcp_rst(c, conn); -+ return -1; -+ } -+ -+ return 0; -+} -+ - /** - * tcp_prepare_flags() - Prepare header for flags-only segment (no payload) - * @c: Execution context -@@ -1755,13 +1775,11 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, - if (retr) { - flow_trace(conn, - "fast re-transmit, ACK: %u, previous sequence: %u", -- max_ack_seq, conn->seq_to_tap); -- conn->seq_to_tap = max_ack_seq; -- conn->events &= ~TAP_FIN_SENT; -- if (tcp_set_peek_offset(conn, 0)) { -- tcp_rst(c, conn); -+ conn->seq_ack_from_tap, conn->seq_to_tap); -+ -+ if (tcp_rewind_seq(c, conn)) - return -1; -- } -+ - tcp_data_from_sock(c, conn); - } - -@@ -2283,17 +2301,16 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) - tcp_rst(c, conn); - } else { - flow_dbg(conn, "ACK timeout, retry"); -- conn->retrans++; -- conn->seq_to_tap = conn->seq_ack_from_tap; -- conn->events &= ~TAP_FIN_SENT; -+ - if (!conn->wnd_from_tap) - conn->wnd_from_tap = 1; /* Zero-window probe */ -- if (tcp_set_peek_offset(conn, 0)) { -- tcp_rst(c, conn); -- } else { -- tcp_data_from_sock(c, conn); -- tcp_timer_ctl(c, conn); -- } -+ -+ conn->retrans++; -+ if (tcp_rewind_seq(c, conn)) -+ return; -+ -+ tcp_data_from_sock(c, conn); -+ tcp_timer_ctl(c, conn); - } - } else { - struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; --- -2.47.1 - diff --git a/0004-tcp-Rewind-sequence-when-guest-shrinks-window-to-zer.patch b/0004-tcp-Rewind-sequence-when-guest-shrinks-window-to-zer.patch deleted file mode 100644 index 29bd90c..0000000 --- a/0004-tcp-Rewind-sequence-when-guest-shrinks-window-to-zer.patch +++ /dev/null @@ -1,123 +0,0 @@ -From f6d5e6a614c385bccfd02453de5e04428d9fed9f Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Fri, 29 Aug 2025 22:11:28 +0200 -Subject: [PATCH 04/13] tcp: Rewind sequence when guest shrinks window to zero - -A window shrunk to zero means by definition that anything else that -might be in flight is now out of window. Restart from the currently -acknowledged sequence. - -We need to do that both in tcp_tap_window_update(), where we already -check for zero-window updates, as well as in tcp_data_from_tap(), -because we might get one of those updates in a batch of packets that -also contains a non-zero window update. - -Suggested-by: Jon Maloy -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -Tested-by: Paul Holzinger -Reviewed-by: Jon Maloy -(cherry picked from commit e86d480be4174fdfec592ba689c2ef03019393f6) ---- - tcp.c | 34 +++++++++++++++++++++++++--------- - 1 file changed, 25 insertions(+), 9 deletions(-) - -diff --git a/tcp.c b/tcp.c -index 8b0e707..74ed7c6 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -1257,19 +1257,25 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, - - /** - * tcp_tap_window_update() - Process an updated window from tap side -+ * @c: Execution context - * @conn: Connection pointer - * @window: Window value, host order, unscaled - */ --static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) -+static void tcp_tap_window_update(const struct ctx *c, -+ struct tcp_tap_conn *conn, unsigned wnd) - { - wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); - - /* Work-around for bug introduced in peer kernel code, commit -- * e2142825c120 ("net: tcp: send zero-window ACK when no memory"). -- * We don't update if window shrank to zero. -+ * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't -+ * update the window if it shrank to zero, so that we'll eventually -+ * retry to send data, but rewind the sequence as that obviously implies -+ * that no data beyond the updated window will be acknowledged. - */ -- if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) -+ if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) { -+ tcp_rewind_seq(c, conn); - return; -+ } - - conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); - -@@ -1692,7 +1698,8 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, - tcp_timer_ctl(c, conn); - - if (p->count == 1) { -- tcp_tap_window_update(conn, ntohs(th->window)); -+ tcp_tap_window_update(c, conn, -+ ntohs(th->window)); - return 1; - } - -@@ -1711,6 +1718,15 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, - ack_seq == max_ack_seq && - ntohs(th->window) == max_ack_seq_wnd; - -+ /* See tcp_tap_window_update() for details. On -+ * top of that, we also need to check here if a -+ * zero-window update is contained in a batch of -+ * packets that includes a non-zero window as -+ * well. -+ */ -+ if (!ntohs(th->window)) -+ tcp_rewind_seq(c, conn); -+ - max_ack_seq_wnd = ntohs(th->window); - max_ack_seq = ack_seq; - } -@@ -1770,7 +1786,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, - if (ack && !tcp_sock_consume(conn, max_ack_seq)) - tcp_update_seqack_from_tap(c, conn, max_ack_seq); - -- tcp_tap_window_update(conn, max_ack_seq_wnd); -+ tcp_tap_window_update(c, conn, max_ack_seq_wnd); - - if (retr) { - flow_trace(conn, -@@ -1859,7 +1875,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, - const struct tcphdr *th, - const char *opts, size_t optlen) - { -- tcp_tap_window_update(conn, ntohs(th->window)); -+ tcp_tap_window_update(c, conn, ntohs(th->window)); - tcp_get_tap_ws(conn, opts, optlen); - - /* First value is not scaled */ -@@ -2057,7 +2073,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - if (!th->ack) - goto reset; - -- tcp_tap_window_update(conn, ntohs(th->window)); -+ tcp_tap_window_update(c, conn, ntohs(th->window)); - - tcp_data_from_sock(c, conn); - -@@ -2069,7 +2085,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - if (conn->events & TAP_FIN_RCVD) { - tcp_sock_consume(conn, ntohl(th->ack_seq)); - tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); -- tcp_tap_window_update(conn, ntohs(th->window)); -+ tcp_tap_window_update(c, conn, ntohs(th->window)); - tcp_data_from_sock(c, conn); - - if (conn->events & SOCK_FIN_RCVD && --- -2.47.1 - diff --git a/0005-tcp-Fix-closing-logic-for-half-closed-connections.patch b/0005-tcp-Fix-closing-logic-for-half-closed-connections.patch deleted file mode 100644 index 3d34f7f..0000000 --- a/0005-tcp-Fix-closing-logic-for-half-closed-connections.patch +++ /dev/null @@ -1,63 +0,0 @@ -From 7a4c907502ffe8df07ee44d2d17818671dc4ae47 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Fri, 29 Aug 2025 22:11:29 +0200 -Subject: [PATCH 05/13] tcp: Fix closing logic for half-closed connections - -First off, don't close connections half-closed by the guest before -our own FIN is acknowledged by the guest itself. - -That is, after we receive a FIN from the guest (TAP_FIN_RCVD), if we -don't have any data left to send from the socket (SOCK_FIN_RCVD, or -EPOLLHUP), we send a FIN segment to the guest (TAP_FIN_SENT), but we -need to actually have it acknowledged (and have no pending -retransmissions) before we can close the connection: check for -TAP_FIN_ACKED, first. - -Then, if we set TAP_FIN_SENT, and we receive an ACK segment from the -guest, set TAP_FIN_ACKED. This was entirely missing for the -TAP_FIN_RCVD case, and as we fix the problem described above, this -becomes relevant as well. - -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -Tested-by: Paul Holzinger -Reviewed-by: Jon Maloy -(cherry picked from commit c62fb08002ad88079b84d1aa694492746f0d7f22) ---- - tcp.c | 13 +++++++++---- - 1 file changed, 9 insertions(+), 4 deletions(-) - -diff --git a/tcp.c b/tcp.c -index 74ed7c6..ff1b6bf 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -2088,9 +2088,14 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - tcp_tap_window_update(c, conn, ntohs(th->window)); - tcp_data_from_sock(c, conn); - -- if (conn->events & SOCK_FIN_RCVD && -- conn->seq_ack_from_tap == conn->seq_to_tap) -- conn_event(c, conn, CLOSED); -+ if (conn->seq_ack_from_tap == conn->seq_to_tap) { -+ if (th->ack && conn->events & TAP_FIN_SENT) -+ conn_event(c, conn, TAP_FIN_ACKED); -+ -+ if (conn->events & SOCK_FIN_RCVD && -+ conn->events & TAP_FIN_ACKED) -+ conn_event(c, conn, CLOSED); -+ } - - return 1; - } -@@ -2370,7 +2375,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, - return; - } - -- if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) { -+ if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) { - conn_event(c, conn, CLOSED); - return; - } --- -2.47.1 - diff --git a/0006-tcp-Don-t-try-to-transmit-right-after-the-peer-shran.patch b/0006-tcp-Don-t-try-to-transmit-right-after-the-peer-shran.patch deleted file mode 100644 index 46b452f..0000000 --- a/0006-tcp-Don-t-try-to-transmit-right-after-the-peer-shran.patch +++ /dev/null @@ -1,85 +0,0 @@ -From b1a28f0dd1a2a31e73c03aefbfc8dbd016f09ac4 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Fri, 29 Aug 2025 22:11:30 +0200 -Subject: [PATCH 06/13] tcp: Don't try to transmit right after the peer shrank - the window to zero - -If the peer shrinks the window to zero, we'll skip storing the new -window, as a convenient way to cause window probes (which exceed any -zero-sized window, strictly speaking) if we don't get window updates -in a while. - -As we do so, though, we need to ensure we don't try to queue more data -from the socket right after we process this window update, as the -entire point of a zero-window advertisement is to keep us from sending -more data. - -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -Tested-by: Paul Holzinger -Reviewed-by: Jon Maloy -(cherry picked from commit 25f93545e7232a7dab9a022862514778c18bc85e) ---- - tcp.c | 18 ++++++++++-------- - 1 file changed, 10 insertions(+), 8 deletions(-) - -diff --git a/tcp.c b/tcp.c -index ff1b6bf..4686846 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -1259,9 +1259,11 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, - * tcp_tap_window_update() - Process an updated window from tap side - * @c: Execution context - * @conn: Connection pointer -- * @window: Window value, host order, unscaled -+ * @wnd: Window value, host order, unscaled -+ * -+ * Return: false on zero window (not stored to wnd_from_tap), true otherwise - */ --static void tcp_tap_window_update(const struct ctx *c, -+static bool tcp_tap_window_update(const struct ctx *c, - struct tcp_tap_conn *conn, unsigned wnd) - { - wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); -@@ -1274,13 +1276,14 @@ static void tcp_tap_window_update(const struct ctx *c, - */ - if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) { - tcp_rewind_seq(c, conn); -- return; -+ return false; - } - - conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); - - /* FIXME: reflect the tap-side receiver's window back to the sock-side - * sender by adjusting SO_RCVBUF? */ -+ return true; - } - - /** -@@ -2073,9 +2076,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - if (!th->ack) - goto reset; - -- tcp_tap_window_update(c, conn, ntohs(th->window)); -- -- tcp_data_from_sock(c, conn); -+ if (tcp_tap_window_update(c, conn, ntohs(th->window))) -+ tcp_data_from_sock(c, conn); - - if (p->count - idx == 1) - return 1; -@@ -2085,8 +2087,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - if (conn->events & TAP_FIN_RCVD) { - tcp_sock_consume(conn, ntohl(th->ack_seq)); - tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); -- tcp_tap_window_update(c, conn, ntohs(th->window)); -- tcp_data_from_sock(c, conn); -+ if (tcp_tap_window_update(c, conn, ntohs(th->window))) -+ tcp_data_from_sock(c, conn); - - if (conn->seq_ack_from_tap == conn->seq_to_tap) { - if (th->ack && conn->events & TAP_FIN_SENT) --- -2.47.1 - diff --git a/0007-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch b/0007-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch deleted file mode 100644 index 884a0d7..0000000 --- a/0007-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch +++ /dev/null @@ -1,48 +0,0 @@ -From 23331346494e8bd301cd635533812e329f693fdf Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Fri, 29 Aug 2025 22:11:31 +0200 -Subject: [PATCH 07/13] tcp: Cast operands of sequence comparison macros to - uint32_t before using them - -Otherwise, passing signed types causes automatic promotion of the -result of the subtractions as well, which is not what we want, as -these macros rely on unsigned 32-bit arithmetic. - -The next patch introduces a ssize_t operand for SEQ_LE, illustrating -the issue. - -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -Tested-by: Paul Holzinger -Reviewed-by: Jon Maloy -(cherry picked from commit 660cd6907e14a41ad9bc77d317140c70ab416fce) ---- - tcp_internal.h | 12 ++++++++---- - 1 file changed, 8 insertions(+), 4 deletions(-) - -diff --git a/tcp_internal.h b/tcp_internal.h -index 36c6533..c80ba40 100644 ---- a/tcp_internal.h -+++ b/tcp_internal.h -@@ -18,10 +18,14 @@ - sizeof(struct ipv6hdr), \ - sizeof(uint32_t)) - --#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) --#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) --#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) --#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) -+#define SEQ_LE(a, b) \ -+ ((uint32_t)(b) - (uint32_t)(a) < MAX_WINDOW) -+#define SEQ_LT(a, b) \ -+ ((uint32_t)(b) - (uint32_t)(a) - 1 < MAX_WINDOW) -+#define SEQ_GE(a, b) \ -+ ((uint32_t)(a) - (uint32_t)(b) < MAX_WINDOW) -+#define SEQ_GT(a, b) \ -+ ((uint32_t)(a) - (uint32_t)(b) - 1 < MAX_WINDOW) - - #define FIN (1 << 0) - #define SYN (1 << 1) --- -2.47.1 - diff --git a/0008-tcp-Fast-re-transmit-if-half-closed-make-TAP_FIN_RCV.patch b/0008-tcp-Fast-re-transmit-if-half-closed-make-TAP_FIN_RCV.patch deleted file mode 100644 index 7fda0fe..0000000 --- a/0008-tcp-Fast-re-transmit-if-half-closed-make-TAP_FIN_RCV.patch +++ /dev/null @@ -1,95 +0,0 @@ -From c1420a41bb6e95f4dc578846568dacd9e0e0bbd2 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Fri, 29 Aug 2025 22:11:32 +0200 -Subject: [PATCH 08/13] tcp: Fast re-transmit if half-closed, make TAP_FIN_RCVD - path consistent - -We currently have a number of discrepancies in the tcp_tap_handler() -path between the half-closed connection path and the regular one, and -they are mostly a result of code duplication, which comes in turn from -the fact that tcp_data_from_tap() deals with data transfers as well as -general connection bookkeeping, so we can't use it for half-closed -connections. - -This suggests that we should probably rework it into two or more -functions, in the long term, but for the moment being I'm just fixing -one obvious issue, which is the lack of fast retransmissions in the -TAP_FIN_RCVD path, and a potential one, which is the fact we don't -handle socket flush failures. - -Add fast re-transmit for half-closed connections, and handle the case -of socket flush (tcp_sock_consume()) flush failure in the same way as -tcp_data_from_tap() handles it. - -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -Tested-by: Paul Holzinger -Reviewed-by: Jon Maloy -(cherry picked from commit bde1847960cfecc46d8bbf203b931705f2922d31) ---- - tcp.c | 42 +++++++++++++++++++++++++++++++++++++++--- - 1 file changed, 39 insertions(+), 3 deletions(-) - -diff --git a/tcp.c b/tcp.c -index 4686846..70798de 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -1639,6 +1639,23 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) - return tcp_buf_data_from_sock(c, conn); - } - -+/** -+ * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet -+ * @th: Pointer to TCP header -+ * @l4len: TCP packet length, including TCP header -+ * -+ * Return: data length of TCP packet, -1 on invalid value of Data Offset field -+ */ -+static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len) -+{ -+ size_t off = th->doff * 4UL; -+ -+ if (off < sizeof(*th) || off > l4len) -+ return -1; -+ -+ return l4len - off; -+} -+ - /** - * tcp_data_from_tap() - tap/guest data for established connection - * @c: Execution context -@@ -2085,9 +2102,28 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - - /* Established connections not accepting data from tap */ - if (conn->events & TAP_FIN_RCVD) { -- tcp_sock_consume(conn, ntohl(th->ack_seq)); -- tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); -- if (tcp_tap_window_update(c, conn, ntohs(th->window))) -+ bool retr; -+ -+ retr = th->ack && !tcp_packet_data_len(th, len) && !th->fin && -+ ntohl(th->ack_seq) == conn->seq_ack_from_tap && -+ ntohs(th->window) == conn->wnd_from_tap; -+ -+ /* On socket flush failure, pretend there was no ACK, try again -+ * later -+ */ -+ if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq))) -+ tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); -+ -+ if (retr) { -+ flow_trace(conn, -+ "fast re-transmit, ACK: %u, previous sequence: %u", -+ ntohl(th->ack_seq), conn->seq_to_tap); -+ -+ if (tcp_rewind_seq(c, conn)) -+ return -1; -+ } -+ -+ if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr) - tcp_data_from_sock(c, conn); - - if (conn->seq_ack_from_tap == conn->seq_to_tap) { --- -2.47.1 - diff --git a/0009-tcp-Don-t-send-FIN-segment-to-guest-yet-if-we-have-p.patch b/0009-tcp-Don-t-send-FIN-segment-to-guest-yet-if-we-have-p.patch deleted file mode 100644 index a1e4ae1..0000000 --- a/0009-tcp-Don-t-send-FIN-segment-to-guest-yet-if-we-have-p.patch +++ /dev/null @@ -1,95 +0,0 @@ -From 8ec28b4d68a1f070b0b1dffd7f84863adac5c5a1 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Tue, 9 Sep 2025 19:18:41 +0200 -Subject: [PATCH 09/13] tcp: Don't send FIN segment to guest yet if we have - pending unacknowledged data -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -For some reason, tcp_vu_data_from_sock() already takes care of this, -but the non-vhost-user version ignores this possibility and just sends -out a FIN segment whenever we infer we received one host-side, -regardless of the fact that we might have unacknowledged data still to -send. - -Somewhat surprisingly, this didn't cause any issue to be reported yet, -until 6.17-rc1 and 1d2fbaad7cd8 ("tcp: stronger sk_rcvbuf checks") -came around, leading to the following report from Paul, who hit this -running Podman tests: - - 439 0.033032 169.254.1.2 → 192.168.122.100 65540 TCP 56602 → 5789 [PSH, ACK] Seq=10336361 Ack=1 Win=65536 Len=65480 - 440 0.033055 169.254.1.2 → 192.168.122.100 30324 TCP [TCP Window Full] 56602 → 5789 [PSH, ACK] Seq=10401841 Ack=1 Win=65536 Len=30264 - -we're sending data to the container, up to the edge of the window - - 441 0.033059 192.168.122.100 → 169.254.1.2 60 TCP 5789 → 56602 [ACK] Seq=1 Ack=10401841 Win=83968 Len=0 - -and the container acknowledges it - - 442 0.033091 169.254.1.2 → 192.168.122.100 53716 TCP 56602 → 5789 [PSH, ACK] Seq=10432105 Ack=1 Win=65536 Len=53656 - -we send more data, all we possibly can, in window - - 443 0.033126 192.168.122.100 → 169.254.1.2 60 TCP [TCP ZeroWindow] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=0 Len=0 - -and the container shrinks the window due to the issue introduced -by kernel commit e2142825c120 ("net: tcp: send zero-window ACK when no -memory"). With a previous patch from this series, we rewind the -sequence, meaning that we assign conn->seq_to_tap from -conn->seq_ack_from_tap, so that we'll retransmit this segment, by -reading again from the socket, and increasing conn->seq_to_tap -once more. - -However: - - 444 0.033144 169.254.1.2 → 192.168.122.100 60 TCP 56602 → 5789 [FIN, PSH, ACK] Seq=10485761 Ack=1 Win=65536 Len=0 - -we eventually get a zero-length read from the socket and we miss the -fact that conn->seq_to_tap != conn->seq_ack_from_tap, so we send a -FIN flag with the most recent sequence. The kernel insists: - - 445 0.033147 192.168.122.100 → 169.254.1.2 60 TCP [TCP ZeroWindow] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=0 Len=0 - -with its buggy zero-window update, but: - - 446 0.033152 192.168.122.100 → 169.254.1.2 60 TCP [TCP Window Update] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=69120 Len=0 - 447 0.033202 192.168.122.100 → 169.254.1.2 60 TCP [TCP Window Update] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=142848 Len=0 - -we don't reset the TAP_FIN_SENT flag anymore, and don't resend -the FIN segment (nor data), as we already rewound the sequence -earlier. - -To solve this, hold off the FIN segment until we get a zero-length -read from the socket *and* we know that there's no unacknowledged -pending data, also without vhost-user, in tcp_buf_data_from_sock(). - -Reported-by: Paul Holzinger -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -Tested-by: Paul Holzinger -Reviewed-by: Jon Maloy -(cherry picked from commit 8d2f8c4d0fb58d6b2011e614bc7d7ff9dab406b3) ---- - tcp_buf.c | 5 ++++- - 1 file changed, 4 insertions(+), 1 deletion(-) - -diff --git a/tcp_buf.c b/tcp_buf.c -index 0530563..028f8ba 100644 ---- a/tcp_buf.c -+++ b/tcp_buf.c -@@ -369,7 +369,10 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) - } - - if (!len) { -- if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { -+ if (already_sent) { -+ conn_flag(c, conn, STALLED); -+ } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == -+ SOCK_FIN_RCVD) { - int ret = tcp_buf_send_flag(c, conn, FIN | ACK); - if (ret) { - tcp_rst(c, conn); --- -2.47.1 - diff --git a/0010-tcp-Fix-ACK-sequence-on-FIN-to-tap.patch b/0010-tcp-Fix-ACK-sequence-on-FIN-to-tap.patch deleted file mode 100644 index 4df6342..0000000 --- a/0010-tcp-Fix-ACK-sequence-on-FIN-to-tap.patch +++ /dev/null @@ -1,78 +0,0 @@ -From 836afc99e59cfd1f6934464a81baaa95d0d78b31 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Tue, 30 Sep 2025 22:26:02 +0200 -Subject: [PATCH 10/13] tcp: Fix ACK sequence on FIN to tap - -If we reach end-of-file on a socket (or get EPOLLRDHUP / EPOLLHUP) and -send a FIN segment to the guest / container acknowledging a sequence -number that's behind what we received so far, we won't have any -further trigger to send an updated ACK segment, as we are now -switching the epoll socket monitoring to edge-triggered mode. - -To avoid this situation, in tcp_update_seqack_wnd(), we set the next -acknowledgement sequence to the current observed sequence, regardless -of what was acknowledged socket-side. - -However, we don't necessarily call tcp_update_seqack_wnd() before -sending the FIN segment, which might potentially lead to a situation, -not observed in practice, where we unnecessarily cause a -retransmission at some point after our FIN segment. - -Avoid that by setting the ACK sequence to whatever we received from -the container / guest, before sending a FIN segment and switching to -EPOLLET. - -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -(cherry picked from commit b3217aa5aec10704774c7142ef07daec5e698415) ---- - tcp_buf.c | 14 +++++++++++++- - tcp_vu.c | 7 ++++++- - 2 files changed, 19 insertions(+), 2 deletions(-) - -diff --git a/tcp_buf.c b/tcp_buf.c -index 028f8ba..9527eed 100644 ---- a/tcp_buf.c -+++ b/tcp_buf.c -@@ -373,7 +373,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) - conn_flag(c, conn, STALLED); - } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == - SOCK_FIN_RCVD) { -- int ret = tcp_buf_send_flag(c, conn, FIN | ACK); -+ int ret; -+ -+ /* On TAP_FIN_SENT, we won't get further data events -+ * from the socket, and this might be the last ACK -+ * segment we send to the tap, so update its sequence to -+ * include everything we received until now. -+ * -+ * See also the special handling on CONN_IS_CLOSING() in -+ * tcp_update_seqack_wnd(). -+ */ -+ conn->seq_ack_to_tap = conn->seq_from_tap; -+ -+ ret = tcp_buf_send_flag(c, conn, FIN | ACK); - if (ret) { - tcp_rst(c, conn); - return ret; -diff --git a/tcp_vu.c b/tcp_vu.c -index 57587cc..7a721a2 100644 ---- a/tcp_vu.c -+++ b/tcp_vu.c -@@ -415,7 +415,12 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) - conn_flag(c, conn, STALLED); - } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == - SOCK_FIN_RCVD) { -- int ret = tcp_vu_send_flag(c, conn, FIN | ACK); -+ int ret; -+ -+ /* See tcp_buf_data_from_sock() */ -+ conn->seq_ack_to_tap = conn->seq_from_tap; -+ -+ ret = tcp_vu_send_flag(c, conn, FIN | ACK); - if (ret) { - tcp_rst(c, conn); - return ret; --- -2.47.1 - diff --git a/0011-tcp-Completely-ignore-data-segment-in-CLOSE-WAIT-sta.patch b/0011-tcp-Completely-ignore-data-segment-in-CLOSE-WAIT-sta.patch deleted file mode 100644 index d219f43..0000000 --- a/0011-tcp-Completely-ignore-data-segment-in-CLOSE-WAIT-sta.patch +++ /dev/null @@ -1,79 +0,0 @@ -From 188bcb00a0c0bdc1f468e71fd9a42345098c09d5 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Thu, 2 Oct 2025 01:13:27 +0200 -Subject: [PATCH 11/13] tcp: Completely ignore data segment in CLOSE-WAIT - state, log a message -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -According to RFC 9293 we should ignore data (note: not data segments) -in CLOSE-WAIT state (indicated by TAP_FIN_RCVD), see 3.10.7.4 -"Other states": - - [...] - - Seventh, process the segment text: - - [...] - - CLOSE-WAIT STATE - - This should not occur since a FIN has been received from the remote - side. Ignore the segment text. - -and we almost do that, except that we would look at the data length -to decide whether it's a request for fast re-transmission, so fix -that, and while at it, log a message, so that cases such as the -following one are more apparent in debug logs: - -28692 0.009758 88.198.0.164 → 93.235.151.95 54 TCP 55414 → 47080 [FIN, ACK] Seq=121441 Ack=141 Win=65536 Len=0 - -we should ignore this FIN flag, because we didn't accept data up -to this sequence (see next segment), but we don't do it, so, here: - -28693 0.000036 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [ACK] Seq=141 Ack=90722 Win=32128 Len=0 -28694 0.034597 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [FIN, ACK] Seq=141 Ack=90722 Win=121216 Len=0 -28695 0.000019 88.198.0.164 → 93.235.151.95 54 TCP 55414 → 47080 [ACK] Seq=121442 Ack=142 Win=65536 Len=0 -28696 0.162968 88.198.0.164 → 93.235.151.95 30773 TCP [TCP Retransmission] 55414 → 47080 [FIN, PSH, ACK] Seq=90722 Ack=142 Win=65536 Len=30719 [TCP segment of a reassembled PDU] - -we are erroneously in CLOSE-WAIT (TAP_FIN_RCVD) state, and this -segment would look pretty strange there. - -This specific case is fixed by the next patch, so it should never -happen again. - -Link: https://archives.passt.top/passt-dev/20250910115726.432bbb8d@elisabeth/ -Link: https://bugs.passt.top/show_bug.cgi?id=126 -Suggested-by: David Gibson -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -(cherry picked from commit 8efa80b51f9f52082954aa26719b36a9ec367567) ---- - tcp.c | 8 +++++++- - 1 file changed, 7 insertions(+), 1 deletion(-) - -diff --git a/tcp.c b/tcp.c -index 70798de..ff071ee 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -2102,9 +2102,15 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, - - /* Established connections not accepting data from tap */ - if (conn->events & TAP_FIN_RCVD) { -+ size_t dlen; - bool retr; - -- retr = th->ack && !tcp_packet_data_len(th, len) && !th->fin && -+ if ((dlen = tcp_packet_data_len(th, len))) { -+ flow_dbg(conn, "data segment in CLOSE-WAIT (%zu B)", -+ dlen); -+ } -+ -+ retr = th->ack && !th->fin && - ntohl(th->ack_seq) == conn->seq_ack_from_tap && - ntohs(th->window) == conn->wnd_from_tap; - --- -2.47.1 - diff --git a/0012-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch b/0012-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch deleted file mode 100644 index d1f017d..0000000 --- a/0012-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch +++ /dev/null @@ -1,76 +0,0 @@ -From 64ddc157615eadc7e566486746d7e8bf9c86c844 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Thu, 2 Oct 2025 00:41:54 +0200 -Subject: [PATCH 12/13] tcp: Don't consider FIN flags with mismatching sequence -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -If a guest or container sends us a FIN segment but its sequence number -doesn't match the highest sequence of data we *accepted* (not -necessarily the highest sequence we received), that is, -conn->seq_from_tap, plus any data we're accepting in the current -batch, we should discard the flag (not necessarily the segment), -because there's still data we need to receive (again) before the end -of the stream. - -If we consider those FIN flags as such, we'll end up in the -situation described below. - -Here, 192.168.10.102 is a HTTP server in a Podman container, and -192.168.10.44 is a client fetching approximately 121 KB of data from -it: - - 82 2.026811 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [FIN, ACK] Seq=121441 Ack=143 Win=65536 Len=0 - -the server is done sending - - 83 2.026898 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [ACK] Seq=143 Ack=114394 Win=216192 Len=0 - -pasta (client) acknowledges a previous sequence, because of -a short sendmsg() - - 84 2.027324 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [FIN, ACK] Seq=143 Ack=114394 Win=216192 Len=0 - -pasta (client) sends FIN, ACK as the client has no more data to -send (a single GET request), while still acknowledging a previous -sequence, because the retransmission didn't happen yet - - 85 2.027349 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [ACK] Seq=121442 Ack=144 Win=65536 Len=0 - -the server acknowledges the FIN, ACK - - 86 2.224125 192.168.10.102 → 192.168.10.44 4150 TCP [TCP Retransmission] 55414 → 44992 [ACK] Seq=114394 Ack=144 Win=65536 Len=4096 [TCP segment of a reassembled PDU] - -and finally a retransmission comes, but as we wrongly switched to -the CLOSE-WAIT state, - - 87 2.224202 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [RST] Seq=144 Win=0 Len=0 - -we consider frame #86 as an acknowledgement for the FIN segment we -sent, and close the connection, while we still had to re-receive -(and finally send) the missing data segment, instead. - -Link: https://github.com/containers/podman/issues/27179 -Signed-off-by: Stefano Brivio -(cherry picked from commit b145441913eef6f8885b6b84531e944ff593790c) ---- - tcp.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tcp.c b/tcp.c -index ff071ee..e6af26a 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -1752,7 +1752,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, - } - } - -- if (th->fin) -+ if (th->fin && seq == seq_from_tap) - fin = 1; - - if (!len) --- -2.47.1 - diff --git a/0013-tcp-On-partial-send-incomplete-sendmsg-request-a-ret.patch b/0013-tcp-On-partial-send-incomplete-sendmsg-request-a-ret.patch deleted file mode 100644 index 1569fac..0000000 --- a/0013-tcp-On-partial-send-incomplete-sendmsg-request-a-ret.patch +++ /dev/null @@ -1,59 +0,0 @@ -From abd48f62fe0f1034a27a43ae5eb4e0e5f419b411 Mon Sep 17 00:00:00 2001 -From: Stefano Brivio -Date: Thu, 2 Oct 2025 00:52:53 +0200 -Subject: [PATCH 13/13] tcp: On partial send (incomplete sendmsg()), request a - retransmission right away -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -...possibly with an updated window value. As we're discarding the -remaining data, we'll need to receive it again. If we don't request -a retransmission immediately, we'll see an apparent gap in the -sequence, and request a retransmission on the next data batch or -segment, but we're just wasting time like that. In packets: - -28686 0.000007 88.198.0.164 → 93.235.151.95 16118 TCP 55414 → 47080 [ACK] Seq=80501 Ack=141 Win=65536 Len=16064 [TCP segment of a reassembled PDU] -28687 0.000012 88.198.0.164 → 93.235.151.95 16118 TCP [TCP Window Full] 55414 → 47080 [ACK] Seq=96565 Ack=141 Win=65536 Len=16064 [TCP segment of a reassembled PDU] - -on this second data segment, we have a short sendmsg(), and - -28688 0.000026 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0 - -consequently acknowledge it, without requesting a retransmission, - -28689 0.000006 88.198.0.164 → 93.235.151.95 8866 HTTP HTTP/1.1 200 ok (text/css) - -so the server proceeds sending a bit more, but - -28690 0.000016 93.235.151.95 → 88.198.0.164 54 TCP [TCP Dup ACK 28688#1] 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0 -28691 0.000000 93.235.151.95 → 88.198.0.164 54 TCP [TCP Dup ACK 28688#2] 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0 - -we'll throw that data (from frame #28689) away, and finally request -a retransmission as we spotted the gap now. - -Request a retransmission as soon as we know we'll need it, instead. - -Signed-off-by: Stefano Brivio -Reviewed-by: David Gibson -(cherry picked from commit a94783951d0d759912dcae0c6289c61fe6a86fc4) ---- - tcp.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tcp.c b/tcp.c -index e6af26a..9afc7f7 100644 ---- a/tcp.c -+++ b/tcp.c -@@ -1855,7 +1855,7 @@ eintr: - } - - out: -- if (keep != -1) { -+ if (keep != -1 || partial_send) { - /* We use an 8-bit approximation here: the associated risk is - * that we skip a duplicate ACK on 8-bit sequence number - * collision. Fast retransmit is a SHOULD in RFC 5681, 3.2. --- -2.47.1 - diff --git a/passt.spec b/passt.spec index c9ee35e..36ca7a4 100644 --- a/passt.spec +++ b/passt.spec @@ -7,32 +7,18 @@ # Copyright (c) 2022 Red Hat GmbH # Author: Stefano Brivio -%global git_hash 8ec134109eb136432a29bdf5a14f8b1fd4e46208 +%global git_hash c3f1ba70237a9e66822aff3aa5765d0adf6f6307 %global selinuxtype targeted Name: passt -Version: 0^20250512.g8ec1341 -Release: 3%{?dist} +Version: 0^20251209.gc3f1ba7 +Release: 1%{?dist} Summary: User-mode networking daemons for virtual machines and namespaces License: GPL-2.0-or-later AND BSD-3-Clause Group: System Environment/Daemons URL: https://passt.top/ Source: https://passt.top/passt/snapshot/passt-%{git_hash}.tar.xz -Patch1: 0001-treewide-By-default-don-t-quit-source-after-migratio.patch -Patch2: 0002-tcp-FIN-flags-have-to-be-retransmitted-as-well.patch -Patch3: 0003-tcp-Factor-sequence-rewind-for-retransmissions-into-.patch -Patch4: 0004-tcp-Rewind-sequence-when-guest-shrinks-window-to-zer.patch -Patch5: 0005-tcp-Fix-closing-logic-for-half-closed-connections.patch -Patch6: 0006-tcp-Don-t-try-to-transmit-right-after-the-peer-shran.patch -Patch7: 0007-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch -Patch8: 0008-tcp-Fast-re-transmit-if-half-closed-make-TAP_FIN_RCV.patch -Patch9: 0009-tcp-Don-t-send-FIN-segment-to-guest-yet-if-we-have-p.patch -Patch10: 0010-tcp-Fix-ACK-sequence-on-FIN-to-tap.patch -Patch11: 0011-tcp-Completely-ignore-data-segment-in-CLOSE-WAIT-sta.patch -Patch12: 0012-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch -Patch13: 0013-tcp-On-partial-send-incomplete-sendmsg-request-a-ret.patch - BuildRequires: gcc, make, git, checkpolicy, selinux-policy-devel Requires: (%{name}-selinux = %{version}-%{release} if selinux-policy-%{selinuxtype}) @@ -144,6 +130,9 @@ fi %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp %changelog +* Tue Dec 9 2025 Stefano Brivio - 0^20251209.gc3f1ba7-1 +- Resolves: RHEL-134120 + * Thu Oct 23 2025 Stefano Brivio - 0^20250512.g8ec1341-3 - Resolves: RHEL-123425 RHEL-123683 diff --git a/sources b/sources index 8dba02a..4f75492 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (passt-8ec134109eb136432a29bdf5a14f8b1fd4e46208.tar.xz) = 3a63f3f62aae06ae0da2293808223f539bca1a030131c50499d5de2daa96faf887fd486b6aa71d627b5ede6de6f5310876150983a3e77fbaf9926e69af56bdab +SHA512 (passt-c3f1ba70237a9e66822aff3aa5765d0adf6f6307.tar.xz) = 3f522803b1df364250989813d8e7c36e851572044bee39df7abf14f67a56bdfe03db59832c172bda28bbe3d985b3376bf5777770df79f2819b07492c718653ab