diff --git a/0002-tcp-FIN-flags-have-to-be-retransmitted-as-well.patch b/0002-tcp-FIN-flags-have-to-be-retransmitted-as-well.patch new file mode 100644 index 0000000..624bec6 --- /dev/null +++ b/0002-tcp-FIN-flags-have-to-be-retransmitted-as-well.patch @@ -0,0 +1,45 @@ +From 585326cf732c8396701b5d3fdf6a7b5f2ab79300 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Fri, 29 Aug 2025 22:11:26 +0200 +Subject: [PATCH 02/13] tcp: FIN flags have to be retransmitted as well + +If we're retransmitting any data, and we sent a FIN segment to our +peer, regardless of whether it was received, we obviously have to +retransmit it as well, given that it can only come with the last data +segment, or after it. + +Unconditionally clear the internal TAP_FIN_SENT flag whenever we +re-transmit, so that we know we have to send it again, in case. + +Reported-by: Paul Holzinger +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +Tested-by: Paul Holzinger +(cherry picked from commit 2e3d93ba7033754301af0d50cd0bb20a38a005ac) +--- + tcp.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/tcp.c b/tcp.c +index 0ac298a..c5c430e 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -1757,6 +1757,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, + "fast re-transmit, ACK: %u, previous sequence: %u", + max_ack_seq, conn->seq_to_tap); + conn->seq_to_tap = max_ack_seq; ++ conn->events &= ~TAP_FIN_SENT; + if (tcp_set_peek_offset(conn, 0)) { + tcp_rst(c, conn); + return -1; +@@ -2284,6 +2285,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) + flow_dbg(conn, "ACK timeout, retry"); + conn->retrans++; + conn->seq_to_tap = conn->seq_ack_from_tap; ++ conn->events &= ~TAP_FIN_SENT; + if (!conn->wnd_from_tap) + conn->wnd_from_tap = 1; /* Zero-window probe */ + if (tcp_set_peek_offset(conn, 0)) { +-- +2.47.1 + diff --git a/0003-tcp-Factor-sequence-rewind-for-retransmissions-into-.patch b/0003-tcp-Factor-sequence-rewind-for-retransmissions-into-.patch new file mode 100644 index 0000000..62bed28 --- /dev/null +++ b/0003-tcp-Factor-sequence-rewind-for-retransmissions-into-.patch @@ -0,0 +1,104 @@ +From 02f5a975d22449a2a940dec91e8e78a79e316693 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Fri, 29 Aug 2025 22:11:27 +0200 +Subject: [PATCH 03/13] tcp: Factor sequence rewind for retransmissions into a + new function + +...as I'm going to need a third occurrence of this in the next change. + +This introduces a small functional change in tcp_data_from_tap(): the +sequence was previously rewound to the highest ACK number we found in +the current packet batch, and not to the current value of +seq_ack_from_tap. + +The two might differ in case tcp_sock_consume() failed, because in +that case we're ignoring that ACK altogether. But if we're ignoring +it, it looks more correct to me to start retransmitting from an +earlier sequence anyway. + +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +Tested-by: Paul Holzinger +(cherry picked from commit 1d502be1a809dcdc4614815f4ec0da77a1ff27ec) +--- + tcp.c | 47 ++++++++++++++++++++++++++++++++--------------- + 1 file changed, 32 insertions(+), 15 deletions(-) + +diff --git a/tcp.c b/tcp.c +index c5c430e..8b0e707 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -1097,6 +1097,26 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, + } + } + ++/** ++ * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK ++ * @c: Execution context ++ * @conn: Connection pointer ++ * ++ * Return: 0 on success, -1 on failure, with connection reset ++ */ ++static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn) ++{ ++ conn->seq_to_tap = conn->seq_ack_from_tap; ++ conn->events &= ~TAP_FIN_SENT; ++ ++ if (tcp_set_peek_offset(conn, 0)) { ++ tcp_rst(c, conn); ++ return -1; ++ } ++ ++ return 0; ++} ++ + /** + * tcp_prepare_flags() - Prepare header for flags-only segment (no payload) + * @c: Execution context +@@ -1755,13 +1775,11 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, + if (retr) { + flow_trace(conn, + "fast re-transmit, ACK: %u, previous sequence: %u", +- max_ack_seq, conn->seq_to_tap); +- conn->seq_to_tap = max_ack_seq; +- conn->events &= ~TAP_FIN_SENT; +- if (tcp_set_peek_offset(conn, 0)) { +- tcp_rst(c, conn); ++ conn->seq_ack_from_tap, conn->seq_to_tap); ++ ++ if (tcp_rewind_seq(c, conn)) + return -1; +- } ++ + tcp_data_from_sock(c, conn); + } + +@@ -2283,17 +2301,16 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) + tcp_rst(c, conn); + } else { + flow_dbg(conn, "ACK timeout, retry"); +- conn->retrans++; +- conn->seq_to_tap = conn->seq_ack_from_tap; +- conn->events &= ~TAP_FIN_SENT; ++ + if (!conn->wnd_from_tap) + conn->wnd_from_tap = 1; /* Zero-window probe */ +- if (tcp_set_peek_offset(conn, 0)) { +- tcp_rst(c, conn); +- } else { +- tcp_data_from_sock(c, conn); +- tcp_timer_ctl(c, conn); +- } ++ ++ conn->retrans++; ++ if (tcp_rewind_seq(c, conn)) ++ return; ++ ++ tcp_data_from_sock(c, conn); ++ tcp_timer_ctl(c, conn); + } + } else { + struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; +-- +2.47.1 + diff --git a/0004-tcp-Rewind-sequence-when-guest-shrinks-window-to-zer.patch b/0004-tcp-Rewind-sequence-when-guest-shrinks-window-to-zer.patch new file mode 100644 index 0000000..29bd90c --- /dev/null +++ b/0004-tcp-Rewind-sequence-when-guest-shrinks-window-to-zer.patch @@ -0,0 +1,123 @@ +From f6d5e6a614c385bccfd02453de5e04428d9fed9f Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Fri, 29 Aug 2025 22:11:28 +0200 +Subject: [PATCH 04/13] tcp: Rewind sequence when guest shrinks window to zero + +A window shrunk to zero means by definition that anything else that +might be in flight is now out of window. Restart from the currently +acknowledged sequence. + +We need to do that both in tcp_tap_window_update(), where we already +check for zero-window updates, as well as in tcp_data_from_tap(), +because we might get one of those updates in a batch of packets that +also contains a non-zero window update. + +Suggested-by: Jon Maloy +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +Tested-by: Paul Holzinger +Reviewed-by: Jon Maloy +(cherry picked from commit e86d480be4174fdfec592ba689c2ef03019393f6) +--- + tcp.c | 34 +++++++++++++++++++++++++--------- + 1 file changed, 25 insertions(+), 9 deletions(-) + +diff --git a/tcp.c b/tcp.c +index 8b0e707..74ed7c6 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -1257,19 +1257,25 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, + + /** + * tcp_tap_window_update() - Process an updated window from tap side ++ * @c: Execution context + * @conn: Connection pointer + * @window: Window value, host order, unscaled + */ +-static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) ++static void tcp_tap_window_update(const struct ctx *c, ++ struct tcp_tap_conn *conn, unsigned wnd) + { + wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); + + /* Work-around for bug introduced in peer kernel code, commit +- * e2142825c120 ("net: tcp: send zero-window ACK when no memory"). +- * We don't update if window shrank to zero. ++ * e2142825c120 ("net: tcp: send zero-window ACK when no memory"): don't ++ * update the window if it shrank to zero, so that we'll eventually ++ * retry to send data, but rewind the sequence as that obviously implies ++ * that no data beyond the updated window will be acknowledged. + */ +- if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) ++ if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) { ++ tcp_rewind_seq(c, conn); + return; ++ } + + conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); + +@@ -1692,7 +1698,8 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, + tcp_timer_ctl(c, conn); + + if (p->count == 1) { +- tcp_tap_window_update(conn, ntohs(th->window)); ++ tcp_tap_window_update(c, conn, ++ ntohs(th->window)); + return 1; + } + +@@ -1711,6 +1718,15 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, + ack_seq == max_ack_seq && + ntohs(th->window) == max_ack_seq_wnd; + ++ /* See tcp_tap_window_update() for details. On ++ * top of that, we also need to check here if a ++ * zero-window update is contained in a batch of ++ * packets that includes a non-zero window as ++ * well. ++ */ ++ if (!ntohs(th->window)) ++ tcp_rewind_seq(c, conn); ++ + max_ack_seq_wnd = ntohs(th->window); + max_ack_seq = ack_seq; + } +@@ -1770,7 +1786,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, + if (ack && !tcp_sock_consume(conn, max_ack_seq)) + tcp_update_seqack_from_tap(c, conn, max_ack_seq); + +- tcp_tap_window_update(conn, max_ack_seq_wnd); ++ tcp_tap_window_update(c, conn, max_ack_seq_wnd); + + if (retr) { + flow_trace(conn, +@@ -1859,7 +1875,7 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, + const struct tcphdr *th, + const char *opts, size_t optlen) + { +- tcp_tap_window_update(conn, ntohs(th->window)); ++ tcp_tap_window_update(c, conn, ntohs(th->window)); + tcp_get_tap_ws(conn, opts, optlen); + + /* First value is not scaled */ +@@ -2057,7 +2073,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + if (!th->ack) + goto reset; + +- tcp_tap_window_update(conn, ntohs(th->window)); ++ tcp_tap_window_update(c, conn, ntohs(th->window)); + + tcp_data_from_sock(c, conn); + +@@ -2069,7 +2085,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + if (conn->events & TAP_FIN_RCVD) { + tcp_sock_consume(conn, ntohl(th->ack_seq)); + tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); +- tcp_tap_window_update(conn, ntohs(th->window)); ++ tcp_tap_window_update(c, conn, ntohs(th->window)); + tcp_data_from_sock(c, conn); + + if (conn->events & SOCK_FIN_RCVD && +-- +2.47.1 + diff --git a/0005-tcp-Fix-closing-logic-for-half-closed-connections.patch b/0005-tcp-Fix-closing-logic-for-half-closed-connections.patch new file mode 100644 index 0000000..3d34f7f --- /dev/null +++ b/0005-tcp-Fix-closing-logic-for-half-closed-connections.patch @@ -0,0 +1,63 @@ +From 7a4c907502ffe8df07ee44d2d17818671dc4ae47 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Fri, 29 Aug 2025 22:11:29 +0200 +Subject: [PATCH 05/13] tcp: Fix closing logic for half-closed connections + +First off, don't close connections half-closed by the guest before +our own FIN is acknowledged by the guest itself. + +That is, after we receive a FIN from the guest (TAP_FIN_RCVD), if we +don't have any data left to send from the socket (SOCK_FIN_RCVD, or +EPOLLHUP), we send a FIN segment to the guest (TAP_FIN_SENT), but we +need to actually have it acknowledged (and have no pending +retransmissions) before we can close the connection: check for +TAP_FIN_ACKED, first. + +Then, if we set TAP_FIN_SENT, and we receive an ACK segment from the +guest, set TAP_FIN_ACKED. This was entirely missing for the +TAP_FIN_RCVD case, and as we fix the problem described above, this +becomes relevant as well. + +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +Tested-by: Paul Holzinger +Reviewed-by: Jon Maloy +(cherry picked from commit c62fb08002ad88079b84d1aa694492746f0d7f22) +--- + tcp.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +diff --git a/tcp.c b/tcp.c +index 74ed7c6..ff1b6bf 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -2088,9 +2088,14 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + tcp_tap_window_update(c, conn, ntohs(th->window)); + tcp_data_from_sock(c, conn); + +- if (conn->events & SOCK_FIN_RCVD && +- conn->seq_ack_from_tap == conn->seq_to_tap) +- conn_event(c, conn, CLOSED); ++ if (conn->seq_ack_from_tap == conn->seq_to_tap) { ++ if (th->ack && conn->events & TAP_FIN_SENT) ++ conn_event(c, conn, TAP_FIN_ACKED); ++ ++ if (conn->events & SOCK_FIN_RCVD && ++ conn->events & TAP_FIN_ACKED) ++ conn_event(c, conn, CLOSED); ++ } + + return 1; + } +@@ -2370,7 +2375,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, + return; + } + +- if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) { ++ if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) { + conn_event(c, conn, CLOSED); + return; + } +-- +2.47.1 + diff --git a/0006-tcp-Don-t-try-to-transmit-right-after-the-peer-shran.patch b/0006-tcp-Don-t-try-to-transmit-right-after-the-peer-shran.patch new file mode 100644 index 0000000..46b452f --- /dev/null +++ b/0006-tcp-Don-t-try-to-transmit-right-after-the-peer-shran.patch @@ -0,0 +1,85 @@ +From b1a28f0dd1a2a31e73c03aefbfc8dbd016f09ac4 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Fri, 29 Aug 2025 22:11:30 +0200 +Subject: [PATCH 06/13] tcp: Don't try to transmit right after the peer shrank + the window to zero + +If the peer shrinks the window to zero, we'll skip storing the new +window, as a convenient way to cause window probes (which exceed any +zero-sized window, strictly speaking) if we don't get window updates +in a while. + +As we do so, though, we need to ensure we don't try to queue more data +from the socket right after we process this window update, as the +entire point of a zero-window advertisement is to keep us from sending +more data. + +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +Tested-by: Paul Holzinger +Reviewed-by: Jon Maloy +(cherry picked from commit 25f93545e7232a7dab9a022862514778c18bc85e) +--- + tcp.c | 18 ++++++++++-------- + 1 file changed, 10 insertions(+), 8 deletions(-) + +diff --git a/tcp.c b/tcp.c +index ff1b6bf..4686846 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -1259,9 +1259,11 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, + * tcp_tap_window_update() - Process an updated window from tap side + * @c: Execution context + * @conn: Connection pointer +- * @window: Window value, host order, unscaled ++ * @wnd: Window value, host order, unscaled ++ * ++ * Return: false on zero window (not stored to wnd_from_tap), true otherwise + */ +-static void tcp_tap_window_update(const struct ctx *c, ++static bool tcp_tap_window_update(const struct ctx *c, + struct tcp_tap_conn *conn, unsigned wnd) + { + wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); +@@ -1274,13 +1276,14 @@ static void tcp_tap_window_update(const struct ctx *c, + */ + if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) { + tcp_rewind_seq(c, conn); +- return; ++ return false; + } + + conn->wnd_from_tap = MIN(wnd >> conn->ws_from_tap, USHRT_MAX); + + /* FIXME: reflect the tap-side receiver's window back to the sock-side + * sender by adjusting SO_RCVBUF? */ ++ return true; + } + + /** +@@ -2073,9 +2076,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + if (!th->ack) + goto reset; + +- tcp_tap_window_update(c, conn, ntohs(th->window)); +- +- tcp_data_from_sock(c, conn); ++ if (tcp_tap_window_update(c, conn, ntohs(th->window))) ++ tcp_data_from_sock(c, conn); + + if (p->count - idx == 1) + return 1; +@@ -2085,8 +2087,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + if (conn->events & TAP_FIN_RCVD) { + tcp_sock_consume(conn, ntohl(th->ack_seq)); + tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); +- tcp_tap_window_update(c, conn, ntohs(th->window)); +- tcp_data_from_sock(c, conn); ++ if (tcp_tap_window_update(c, conn, ntohs(th->window))) ++ tcp_data_from_sock(c, conn); + + if (conn->seq_ack_from_tap == conn->seq_to_tap) { + if (th->ack && conn->events & TAP_FIN_SENT) +-- +2.47.1 + diff --git a/0007-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch b/0007-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch new file mode 100644 index 0000000..884a0d7 --- /dev/null +++ b/0007-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch @@ -0,0 +1,48 @@ +From 23331346494e8bd301cd635533812e329f693fdf Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Fri, 29 Aug 2025 22:11:31 +0200 +Subject: [PATCH 07/13] tcp: Cast operands of sequence comparison macros to + uint32_t before using them + +Otherwise, passing signed types causes automatic promotion of the +result of the subtractions as well, which is not what we want, as +these macros rely on unsigned 32-bit arithmetic. + +The next patch introduces a ssize_t operand for SEQ_LE, illustrating +the issue. + +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +Tested-by: Paul Holzinger +Reviewed-by: Jon Maloy +(cherry picked from commit 660cd6907e14a41ad9bc77d317140c70ab416fce) +--- + tcp_internal.h | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/tcp_internal.h b/tcp_internal.h +index 36c6533..c80ba40 100644 +--- a/tcp_internal.h ++++ b/tcp_internal.h +@@ -18,10 +18,14 @@ + sizeof(struct ipv6hdr), \ + sizeof(uint32_t)) + +-#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) +-#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) +-#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) +-#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) ++#define SEQ_LE(a, b) \ ++ ((uint32_t)(b) - (uint32_t)(a) < MAX_WINDOW) ++#define SEQ_LT(a, b) \ ++ ((uint32_t)(b) - (uint32_t)(a) - 1 < MAX_WINDOW) ++#define SEQ_GE(a, b) \ ++ ((uint32_t)(a) - (uint32_t)(b) < MAX_WINDOW) ++#define SEQ_GT(a, b) \ ++ ((uint32_t)(a) - (uint32_t)(b) - 1 < MAX_WINDOW) + + #define FIN (1 << 0) + #define SYN (1 << 1) +-- +2.47.1 + diff --git a/0008-tcp-Fast-re-transmit-if-half-closed-make-TAP_FIN_RCV.patch b/0008-tcp-Fast-re-transmit-if-half-closed-make-TAP_FIN_RCV.patch new file mode 100644 index 0000000..7fda0fe --- /dev/null +++ b/0008-tcp-Fast-re-transmit-if-half-closed-make-TAP_FIN_RCV.patch @@ -0,0 +1,95 @@ +From c1420a41bb6e95f4dc578846568dacd9e0e0bbd2 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Fri, 29 Aug 2025 22:11:32 +0200 +Subject: [PATCH 08/13] tcp: Fast re-transmit if half-closed, make TAP_FIN_RCVD + path consistent + +We currently have a number of discrepancies in the tcp_tap_handler() +path between the half-closed connection path and the regular one, and +they are mostly a result of code duplication, which comes in turn from +the fact that tcp_data_from_tap() deals with data transfers as well as +general connection bookkeeping, so we can't use it for half-closed +connections. + +This suggests that we should probably rework it into two or more +functions, in the long term, but for the moment being I'm just fixing +one obvious issue, which is the lack of fast retransmissions in the +TAP_FIN_RCVD path, and a potential one, which is the fact we don't +handle socket flush failures. + +Add fast re-transmit for half-closed connections, and handle the case +of socket flush (tcp_sock_consume()) flush failure in the same way as +tcp_data_from_tap() handles it. + +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +Tested-by: Paul Holzinger +Reviewed-by: Jon Maloy +(cherry picked from commit bde1847960cfecc46d8bbf203b931705f2922d31) +--- + tcp.c | 42 +++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 39 insertions(+), 3 deletions(-) + +diff --git a/tcp.c b/tcp.c +index 4686846..70798de 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -1639,6 +1639,23 @@ static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) + return tcp_buf_data_from_sock(c, conn); + } + ++/** ++ * tcp_packet_data_len() - Get data (TCP payload) length for a TCP packet ++ * @th: Pointer to TCP header ++ * @l4len: TCP packet length, including TCP header ++ * ++ * Return: data length of TCP packet, -1 on invalid value of Data Offset field ++ */ ++static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len) ++{ ++ size_t off = th->doff * 4UL; ++ ++ if (off < sizeof(*th) || off > l4len) ++ return -1; ++ ++ return l4len - off; ++} ++ + /** + * tcp_data_from_tap() - tap/guest data for established connection + * @c: Execution context +@@ -2085,9 +2102,28 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + + /* Established connections not accepting data from tap */ + if (conn->events & TAP_FIN_RCVD) { +- tcp_sock_consume(conn, ntohl(th->ack_seq)); +- tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); +- if (tcp_tap_window_update(c, conn, ntohs(th->window))) ++ bool retr; ++ ++ retr = th->ack && !tcp_packet_data_len(th, len) && !th->fin && ++ ntohl(th->ack_seq) == conn->seq_ack_from_tap && ++ ntohs(th->window) == conn->wnd_from_tap; ++ ++ /* On socket flush failure, pretend there was no ACK, try again ++ * later ++ */ ++ if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq))) ++ tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); ++ ++ if (retr) { ++ flow_trace(conn, ++ "fast re-transmit, ACK: %u, previous sequence: %u", ++ ntohl(th->ack_seq), conn->seq_to_tap); ++ ++ if (tcp_rewind_seq(c, conn)) ++ return -1; ++ } ++ ++ if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr) + tcp_data_from_sock(c, conn); + + if (conn->seq_ack_from_tap == conn->seq_to_tap) { +-- +2.47.1 + diff --git a/0009-tcp-Don-t-send-FIN-segment-to-guest-yet-if-we-have-p.patch b/0009-tcp-Don-t-send-FIN-segment-to-guest-yet-if-we-have-p.patch new file mode 100644 index 0000000..a1e4ae1 --- /dev/null +++ b/0009-tcp-Don-t-send-FIN-segment-to-guest-yet-if-we-have-p.patch @@ -0,0 +1,95 @@ +From 8ec28b4d68a1f070b0b1dffd7f84863adac5c5a1 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Tue, 9 Sep 2025 19:18:41 +0200 +Subject: [PATCH 09/13] tcp: Don't send FIN segment to guest yet if we have + pending unacknowledged data +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +For some reason, tcp_vu_data_from_sock() already takes care of this, +but the non-vhost-user version ignores this possibility and just sends +out a FIN segment whenever we infer we received one host-side, +regardless of the fact that we might have unacknowledged data still to +send. + +Somewhat surprisingly, this didn't cause any issue to be reported yet, +until 6.17-rc1 and 1d2fbaad7cd8 ("tcp: stronger sk_rcvbuf checks") +came around, leading to the following report from Paul, who hit this +running Podman tests: + + 439 0.033032 169.254.1.2 → 192.168.122.100 65540 TCP 56602 → 5789 [PSH, ACK] Seq=10336361 Ack=1 Win=65536 Len=65480 + 440 0.033055 169.254.1.2 → 192.168.122.100 30324 TCP [TCP Window Full] 56602 → 5789 [PSH, ACK] Seq=10401841 Ack=1 Win=65536 Len=30264 + +we're sending data to the container, up to the edge of the window + + 441 0.033059 192.168.122.100 → 169.254.1.2 60 TCP 5789 → 56602 [ACK] Seq=1 Ack=10401841 Win=83968 Len=0 + +and the container acknowledges it + + 442 0.033091 169.254.1.2 → 192.168.122.100 53716 TCP 56602 → 5789 [PSH, ACK] Seq=10432105 Ack=1 Win=65536 Len=53656 + +we send more data, all we possibly can, in window + + 443 0.033126 192.168.122.100 → 169.254.1.2 60 TCP [TCP ZeroWindow] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=0 Len=0 + +and the container shrinks the window due to the issue introduced +by kernel commit e2142825c120 ("net: tcp: send zero-window ACK when no +memory"). With a previous patch from this series, we rewind the +sequence, meaning that we assign conn->seq_to_tap from +conn->seq_ack_from_tap, so that we'll retransmit this segment, by +reading again from the socket, and increasing conn->seq_to_tap +once more. + +However: + + 444 0.033144 169.254.1.2 → 192.168.122.100 60 TCP 56602 → 5789 [FIN, PSH, ACK] Seq=10485761 Ack=1 Win=65536 Len=0 + +we eventually get a zero-length read from the socket and we miss the +fact that conn->seq_to_tap != conn->seq_ack_from_tap, so we send a +FIN flag with the most recent sequence. The kernel insists: + + 445 0.033147 192.168.122.100 → 169.254.1.2 60 TCP [TCP ZeroWindow] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=0 Len=0 + +with its buggy zero-window update, but: + + 446 0.033152 192.168.122.100 → 169.254.1.2 60 TCP [TCP Window Update] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=69120 Len=0 + 447 0.033202 192.168.122.100 → 169.254.1.2 60 TCP [TCP Window Update] 5789 → 56602 [ACK] Seq=1 Ack=10432105 Win=142848 Len=0 + +we don't reset the TAP_FIN_SENT flag anymore, and don't resend +the FIN segment (nor data), as we already rewound the sequence +earlier. + +To solve this, hold off the FIN segment until we get a zero-length +read from the socket *and* we know that there's no unacknowledged +pending data, also without vhost-user, in tcp_buf_data_from_sock(). + +Reported-by: Paul Holzinger +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +Tested-by: Paul Holzinger +Reviewed-by: Jon Maloy +(cherry picked from commit 8d2f8c4d0fb58d6b2011e614bc7d7ff9dab406b3) +--- + tcp_buf.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/tcp_buf.c b/tcp_buf.c +index 0530563..028f8ba 100644 +--- a/tcp_buf.c ++++ b/tcp_buf.c +@@ -369,7 +369,10 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) + } + + if (!len) { +- if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { ++ if (already_sent) { ++ conn_flag(c, conn, STALLED); ++ } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == ++ SOCK_FIN_RCVD) { + int ret = tcp_buf_send_flag(c, conn, FIN | ACK); + if (ret) { + tcp_rst(c, conn); +-- +2.47.1 + diff --git a/0010-tcp-Fix-ACK-sequence-on-FIN-to-tap.patch b/0010-tcp-Fix-ACK-sequence-on-FIN-to-tap.patch new file mode 100644 index 0000000..4df6342 --- /dev/null +++ b/0010-tcp-Fix-ACK-sequence-on-FIN-to-tap.patch @@ -0,0 +1,78 @@ +From 836afc99e59cfd1f6934464a81baaa95d0d78b31 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Tue, 30 Sep 2025 22:26:02 +0200 +Subject: [PATCH 10/13] tcp: Fix ACK sequence on FIN to tap + +If we reach end-of-file on a socket (or get EPOLLRDHUP / EPOLLHUP) and +send a FIN segment to the guest / container acknowledging a sequence +number that's behind what we received so far, we won't have any +further trigger to send an updated ACK segment, as we are now +switching the epoll socket monitoring to edge-triggered mode. + +To avoid this situation, in tcp_update_seqack_wnd(), we set the next +acknowledgement sequence to the current observed sequence, regardless +of what was acknowledged socket-side. + +However, we don't necessarily call tcp_update_seqack_wnd() before +sending the FIN segment, which might potentially lead to a situation, +not observed in practice, where we unnecessarily cause a +retransmission at some point after our FIN segment. + +Avoid that by setting the ACK sequence to whatever we received from +the container / guest, before sending a FIN segment and switching to +EPOLLET. + +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +(cherry picked from commit b3217aa5aec10704774c7142ef07daec5e698415) +--- + tcp_buf.c | 14 +++++++++++++- + tcp_vu.c | 7 ++++++- + 2 files changed, 19 insertions(+), 2 deletions(-) + +diff --git a/tcp_buf.c b/tcp_buf.c +index 028f8ba..9527eed 100644 +--- a/tcp_buf.c ++++ b/tcp_buf.c +@@ -373,7 +373,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) + conn_flag(c, conn, STALLED); + } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == + SOCK_FIN_RCVD) { +- int ret = tcp_buf_send_flag(c, conn, FIN | ACK); ++ int ret; ++ ++ /* On TAP_FIN_SENT, we won't get further data events ++ * from the socket, and this might be the last ACK ++ * segment we send to the tap, so update its sequence to ++ * include everything we received until now. ++ * ++ * See also the special handling on CONN_IS_CLOSING() in ++ * tcp_update_seqack_wnd(). ++ */ ++ conn->seq_ack_to_tap = conn->seq_from_tap; ++ ++ ret = tcp_buf_send_flag(c, conn, FIN | ACK); + if (ret) { + tcp_rst(c, conn); + return ret; +diff --git a/tcp_vu.c b/tcp_vu.c +index 57587cc..7a721a2 100644 +--- a/tcp_vu.c ++++ b/tcp_vu.c +@@ -415,7 +415,12 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) + conn_flag(c, conn, STALLED); + } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == + SOCK_FIN_RCVD) { +- int ret = tcp_vu_send_flag(c, conn, FIN | ACK); ++ int ret; ++ ++ /* See tcp_buf_data_from_sock() */ ++ conn->seq_ack_to_tap = conn->seq_from_tap; ++ ++ ret = tcp_vu_send_flag(c, conn, FIN | ACK); + if (ret) { + tcp_rst(c, conn); + return ret; +-- +2.47.1 + diff --git a/0011-tcp-Completely-ignore-data-segment-in-CLOSE-WAIT-sta.patch b/0011-tcp-Completely-ignore-data-segment-in-CLOSE-WAIT-sta.patch new file mode 100644 index 0000000..d219f43 --- /dev/null +++ b/0011-tcp-Completely-ignore-data-segment-in-CLOSE-WAIT-sta.patch @@ -0,0 +1,79 @@ +From 188bcb00a0c0bdc1f468e71fd9a42345098c09d5 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Thu, 2 Oct 2025 01:13:27 +0200 +Subject: [PATCH 11/13] tcp: Completely ignore data segment in CLOSE-WAIT + state, log a message +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +According to RFC 9293 we should ignore data (note: not data segments) +in CLOSE-WAIT state (indicated by TAP_FIN_RCVD), see 3.10.7.4 +"Other states": + + [...] + + Seventh, process the segment text: + + [...] + + CLOSE-WAIT STATE + + This should not occur since a FIN has been received from the remote + side. Ignore the segment text. + +and we almost do that, except that we would look at the data length +to decide whether it's a request for fast re-transmission, so fix +that, and while at it, log a message, so that cases such as the +following one are more apparent in debug logs: + +28692 0.009758 88.198.0.164 → 93.235.151.95 54 TCP 55414 → 47080 [FIN, ACK] Seq=121441 Ack=141 Win=65536 Len=0 + +we should ignore this FIN flag, because we didn't accept data up +to this sequence (see next segment), but we don't do it, so, here: + +28693 0.000036 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [ACK] Seq=141 Ack=90722 Win=32128 Len=0 +28694 0.034597 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [FIN, ACK] Seq=141 Ack=90722 Win=121216 Len=0 +28695 0.000019 88.198.0.164 → 93.235.151.95 54 TCP 55414 → 47080 [ACK] Seq=121442 Ack=142 Win=65536 Len=0 +28696 0.162968 88.198.0.164 → 93.235.151.95 30773 TCP [TCP Retransmission] 55414 → 47080 [FIN, PSH, ACK] Seq=90722 Ack=142 Win=65536 Len=30719 [TCP segment of a reassembled PDU] + +we are erroneously in CLOSE-WAIT (TAP_FIN_RCVD) state, and this +segment would look pretty strange there. + +This specific case is fixed by the next patch, so it should never +happen again. + +Link: https://archives.passt.top/passt-dev/20250910115726.432bbb8d@elisabeth/ +Link: https://bugs.passt.top/show_bug.cgi?id=126 +Suggested-by: David Gibson +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +(cherry picked from commit 8efa80b51f9f52082954aa26719b36a9ec367567) +--- + tcp.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/tcp.c b/tcp.c +index 70798de..ff071ee 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -2102,9 +2102,15 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, + + /* Established connections not accepting data from tap */ + if (conn->events & TAP_FIN_RCVD) { ++ size_t dlen; + bool retr; + +- retr = th->ack && !tcp_packet_data_len(th, len) && !th->fin && ++ if ((dlen = tcp_packet_data_len(th, len))) { ++ flow_dbg(conn, "data segment in CLOSE-WAIT (%zu B)", ++ dlen); ++ } ++ ++ retr = th->ack && !th->fin && + ntohl(th->ack_seq) == conn->seq_ack_from_tap && + ntohs(th->window) == conn->wnd_from_tap; + +-- +2.47.1 + diff --git a/0012-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch b/0012-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch new file mode 100644 index 0000000..d1f017d --- /dev/null +++ b/0012-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch @@ -0,0 +1,76 @@ +From 64ddc157615eadc7e566486746d7e8bf9c86c844 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Thu, 2 Oct 2025 00:41:54 +0200 +Subject: [PATCH 12/13] tcp: Don't consider FIN flags with mismatching sequence +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +If a guest or container sends us a FIN segment but its sequence number +doesn't match the highest sequence of data we *accepted* (not +necessarily the highest sequence we received), that is, +conn->seq_from_tap, plus any data we're accepting in the current +batch, we should discard the flag (not necessarily the segment), +because there's still data we need to receive (again) before the end +of the stream. + +If we consider those FIN flags as such, we'll end up in the +situation described below. + +Here, 192.168.10.102 is a HTTP server in a Podman container, and +192.168.10.44 is a client fetching approximately 121 KB of data from +it: + + 82 2.026811 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [FIN, ACK] Seq=121441 Ack=143 Win=65536 Len=0 + +the server is done sending + + 83 2.026898 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [ACK] Seq=143 Ack=114394 Win=216192 Len=0 + +pasta (client) acknowledges a previous sequence, because of +a short sendmsg() + + 84 2.027324 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [FIN, ACK] Seq=143 Ack=114394 Win=216192 Len=0 + +pasta (client) sends FIN, ACK as the client has no more data to +send (a single GET request), while still acknowledging a previous +sequence, because the retransmission didn't happen yet + + 85 2.027349 192.168.10.102 → 192.168.10.44 54 TCP 55414 → 44992 [ACK] Seq=121442 Ack=144 Win=65536 Len=0 + +the server acknowledges the FIN, ACK + + 86 2.224125 192.168.10.102 → 192.168.10.44 4150 TCP [TCP Retransmission] 55414 → 44992 [ACK] Seq=114394 Ack=144 Win=65536 Len=4096 [TCP segment of a reassembled PDU] + +and finally a retransmission comes, but as we wrongly switched to +the CLOSE-WAIT state, + + 87 2.224202 192.168.10.44 → 192.168.10.102 54 TCP 44992 → 55414 [RST] Seq=144 Win=0 Len=0 + +we consider frame #86 as an acknowledgement for the FIN segment we +sent, and close the connection, while we still had to re-receive +(and finally send) the missing data segment, instead. + +Link: https://github.com/containers/podman/issues/27179 +Signed-off-by: Stefano Brivio +(cherry picked from commit b145441913eef6f8885b6b84531e944ff593790c) +--- + tcp.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tcp.c b/tcp.c +index ff071ee..e6af26a 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -1752,7 +1752,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, + } + } + +- if (th->fin) ++ if (th->fin && seq == seq_from_tap) + fin = 1; + + if (!len) +-- +2.47.1 + diff --git a/0013-tcp-On-partial-send-incomplete-sendmsg-request-a-ret.patch b/0013-tcp-On-partial-send-incomplete-sendmsg-request-a-ret.patch new file mode 100644 index 0000000..1569fac --- /dev/null +++ b/0013-tcp-On-partial-send-incomplete-sendmsg-request-a-ret.patch @@ -0,0 +1,59 @@ +From abd48f62fe0f1034a27a43ae5eb4e0e5f419b411 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio +Date: Thu, 2 Oct 2025 00:52:53 +0200 +Subject: [PATCH 13/13] tcp: On partial send (incomplete sendmsg()), request a + retransmission right away +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +...possibly with an updated window value. As we're discarding the +remaining data, we'll need to receive it again. If we don't request +a retransmission immediately, we'll see an apparent gap in the +sequence, and request a retransmission on the next data batch or +segment, but we're just wasting time like that. In packets: + +28686 0.000007 88.198.0.164 → 93.235.151.95 16118 TCP 55414 → 47080 [ACK] Seq=80501 Ack=141 Win=65536 Len=16064 [TCP segment of a reassembled PDU] +28687 0.000012 88.198.0.164 → 93.235.151.95 16118 TCP [TCP Window Full] 55414 → 47080 [ACK] Seq=96565 Ack=141 Win=65536 Len=16064 [TCP segment of a reassembled PDU] + +on this second data segment, we have a short sendmsg(), and + +28688 0.000026 93.235.151.95 → 88.198.0.164 54 TCP 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0 + +consequently acknowledge it, without requesting a retransmission, + +28689 0.000006 88.198.0.164 → 93.235.151.95 8866 HTTP HTTP/1.1 200 ok (text/css) + +so the server proceeds sending a bit more, but + +28690 0.000016 93.235.151.95 → 88.198.0.164 54 TCP [TCP Dup ACK 28688#1] 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0 +28691 0.000000 93.235.151.95 → 88.198.0.164 54 TCP [TCP Dup ACK 28688#2] 47080 → 55414 [ACK] Seq=141 Ack=90721 Win=32128 Len=0 + +we'll throw that data (from frame #28689) away, and finally request +a retransmission as we spotted the gap now. + +Request a retransmission as soon as we know we'll need it, instead. + +Signed-off-by: Stefano Brivio +Reviewed-by: David Gibson +(cherry picked from commit a94783951d0d759912dcae0c6289c61fe6a86fc4) +--- + tcp.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tcp.c b/tcp.c +index e6af26a..9afc7f7 100644 +--- a/tcp.c ++++ b/tcp.c +@@ -1855,7 +1855,7 @@ eintr: + } + + out: +- if (keep != -1) { ++ if (keep != -1 || partial_send) { + /* We use an 8-bit approximation here: the associated risk is + * that we skip a duplicate ACK on 8-bit sequence number + * collision. Fast retransmit is a SHOULD in RFC 5681, 3.2. +-- +2.47.1 + diff --git a/passt.spec b/passt.spec index d62d70f..c9ee35e 100644 --- a/passt.spec +++ b/passt.spec @@ -12,7 +12,7 @@ Name: passt Version: 0^20250512.g8ec1341 -Release: 2%{?dist} +Release: 3%{?dist} Summary: User-mode networking daemons for virtual machines and namespaces License: GPL-2.0-or-later AND BSD-3-Clause Group: System Environment/Daemons @@ -20,6 +20,18 @@ URL: https://passt.top/ Source: https://passt.top/passt/snapshot/passt-%{git_hash}.tar.xz Patch1: 0001-treewide-By-default-don-t-quit-source-after-migratio.patch +Patch2: 0002-tcp-FIN-flags-have-to-be-retransmitted-as-well.patch +Patch3: 0003-tcp-Factor-sequence-rewind-for-retransmissions-into-.patch +Patch4: 0004-tcp-Rewind-sequence-when-guest-shrinks-window-to-zer.patch +Patch5: 0005-tcp-Fix-closing-logic-for-half-closed-connections.patch +Patch6: 0006-tcp-Don-t-try-to-transmit-right-after-the-peer-shran.patch +Patch7: 0007-tcp-Cast-operands-of-sequence-comparison-macros-to-u.patch +Patch8: 0008-tcp-Fast-re-transmit-if-half-closed-make-TAP_FIN_RCV.patch +Patch9: 0009-tcp-Don-t-send-FIN-segment-to-guest-yet-if-we-have-p.patch +Patch10: 0010-tcp-Fix-ACK-sequence-on-FIN-to-tap.patch +Patch11: 0011-tcp-Completely-ignore-data-segment-in-CLOSE-WAIT-sta.patch +Patch12: 0012-tcp-Don-t-consider-FIN-flags-with-mismatching-sequen.patch +Patch13: 0013-tcp-On-partial-send-incomplete-sendmsg-request-a-ret.patch BuildRequires: gcc, make, git, checkpolicy, selinux-policy-devel Requires: (%{name}-selinux = %{version}-%{release} if selinux-policy-%{selinuxtype}) @@ -132,6 +144,9 @@ fi %{_datadir}/selinux/packages/%{selinuxtype}/passt-repair.pp %changelog +* Thu Oct 23 2025 Stefano Brivio - 0^20250512.g8ec1341-3 +- Resolves: RHEL-123425 RHEL-123683 + * Tue Jul 29 2025 Stefano Brivio - 0^20250512.g8ec1341-2 - Resolves: RHEL-106425