diff --git a/kvm-block-Collapse-padded-I-O-vecs-exceeding-IOV_MAX.patch b/kvm-block-Collapse-padded-I-O-vecs-exceeding-IOV_MAX.patch new file mode 100644 index 0000000..94f19c6 --- /dev/null +++ b/kvm-block-Collapse-padded-I-O-vecs-exceeding-IOV_MAX.patch @@ -0,0 +1,359 @@ +From 1f7520baa6f0bf02ccba2ebfe7d1d5bf6520f95a Mon Sep 17 00:00:00 2001 +From: Hanna Czenczek +Date: Tue, 11 Apr 2023 19:34:16 +0200 +Subject: [PATCH 2/5] block: Collapse padded I/O vecs exceeding IOV_MAX + +RH-Author: Hanna Czenczek +RH-MergeRequest: 291: block: Split padded I/O vectors exceeding IOV_MAX +RH-Bugzilla: 2141964 +RH-Acked-by: Emanuele Giuseppe Esposito +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [2/5] 1d86ce8398e4ab66e308a686f9855c963e52b0a9 + +When processing vectored guest requests that are not aligned to the +storage request alignment, we pad them by adding head and/or tail +buffers for a read-modify-write cycle. + +The guest can submit I/O vectors up to IOV_MAX (1024) in length, but +with this padding, the vector can exceed that limit. As of +4c002cef0e9abe7135d7916c51abce47f7fc1ee2 ("util/iov: make +qemu_iovec_init_extended() honest"), we refuse to pad vectors beyond the +limit, instead returning an error to the guest. + +To the guest, this appears as a random I/O error. We should not return +an I/O error to the guest when it issued a perfectly valid request. + +Before 4c002cef0e9abe7135d7916c51abce47f7fc1ee2, we just made the vector +longer than IOV_MAX, which generally seems to work (because the guest +assumes a smaller alignment than we really have, file-posix's +raw_co_prw() will generally see bdrv_qiov_is_aligned() return false, and +so emulate the request, so that the IOV_MAX does not matter). However, +that does not seem exactly great. + +I see two ways to fix this problem: +1. We split such long requests into two requests. +2. We join some elements of the vector into new buffers to make it + shorter. + +I am wary of (1), because it seems like it may have unintended side +effects. + +(2) on the other hand seems relatively simple to implement, with +hopefully few side effects, so this patch does that. + +To do this, the use of qemu_iovec_init_extended() in bdrv_pad_request() +is effectively replaced by the new function bdrv_create_padded_qiov(), +which not only wraps the request IOV with padding head/tail, but also +ensures that the resulting vector will not have more than IOV_MAX +elements. Putting that functionality into qemu_iovec_init_extended() is +infeasible because it requires allocating a bounce buffer; doing so +would require many more parameters (buffer alignment, how to initialize +the buffer, and out parameters like the buffer, its length, and the +original elements), which is not reasonable. + +Conversely, it is not difficult to move qemu_iovec_init_extended()'s +functionality into bdrv_create_padded_qiov() by using public +qemu_iovec_* functions, so that is what this patch does. + +Because bdrv_pad_request() was the only "serious" user of +qemu_iovec_init_extended(), the next patch will remove the latter +function, so the functionality is not implemented twice. + +Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2141964 +Signed-off-by: Hanna Czenczek +Message-Id: <20230411173418.19549-3-hreitz@redhat.com> +Reviewed-by: Vladimir Sementsov-Ogievskiy +(cherry picked from commit 18743311b829cafc1737a5f20bc3248d5f91ee2a) + +Conflicts: + block/io.c: Downstream bdrv_pad_request() has no @flags + parameter. + +Signed-off-by: Hanna Czenczek +--- + block/io.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 151 insertions(+), 15 deletions(-) + +diff --git a/block/io.c b/block/io.c +index c3e7301613..0fe8f0dd40 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -1624,6 +1624,14 @@ out: + * @merge_reads is true for small requests, + * if @buf_len == @head + bytes + @tail. In this case it is possible that both + * head and tail exist but @buf_len == align and @tail_buf == @buf. ++ * ++ * @write is true for write requests, false for read requests. ++ * ++ * If padding makes the vector too long (exceeding IOV_MAX), then we need to ++ * merge existing vector elements into a single one. @collapse_bounce_buf acts ++ * as the bounce buffer in such cases. @pre_collapse_qiov has the pre-collapse ++ * I/O vector elements so for read requests, the data can be copied back after ++ * the read is done. + */ + typedef struct BdrvRequestPadding { + uint8_t *buf; +@@ -1632,11 +1640,17 @@ typedef struct BdrvRequestPadding { + size_t head; + size_t tail; + bool merge_reads; ++ bool write; + QEMUIOVector local_qiov; ++ ++ uint8_t *collapse_bounce_buf; ++ size_t collapse_len; ++ QEMUIOVector pre_collapse_qiov; + } BdrvRequestPadding; + + static bool bdrv_init_padding(BlockDriverState *bs, + int64_t offset, int64_t bytes, ++ bool write, + BdrvRequestPadding *pad) + { + int64_t align = bs->bl.request_alignment; +@@ -1668,6 +1682,8 @@ static bool bdrv_init_padding(BlockDriverState *bs, + pad->tail_buf = pad->buf + pad->buf_len - align; + } + ++ pad->write = write; ++ + return true; + } + +@@ -1733,8 +1749,23 @@ zero_mem: + return 0; + } + +-static void bdrv_padding_destroy(BdrvRequestPadding *pad) ++/** ++ * Free *pad's associated buffers, and perform any necessary finalization steps. ++ */ ++static void bdrv_padding_finalize(BdrvRequestPadding *pad) + { ++ if (pad->collapse_bounce_buf) { ++ if (!pad->write) { ++ /* ++ * If padding required elements in the vector to be collapsed into a ++ * bounce buffer, copy the bounce buffer content back ++ */ ++ qemu_iovec_from_buf(&pad->pre_collapse_qiov, 0, ++ pad->collapse_bounce_buf, pad->collapse_len); ++ } ++ qemu_vfree(pad->collapse_bounce_buf); ++ qemu_iovec_destroy(&pad->pre_collapse_qiov); ++ } + if (pad->buf) { + qemu_vfree(pad->buf); + qemu_iovec_destroy(&pad->local_qiov); +@@ -1742,6 +1773,101 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad) + memset(pad, 0, sizeof(*pad)); + } + ++/* ++ * Create pad->local_qiov by wrapping @iov in the padding head and tail, while ++ * ensuring that the resulting vector will not exceed IOV_MAX elements. ++ * ++ * To ensure this, when necessary, the first two or three elements of @iov are ++ * merged into pad->collapse_bounce_buf and replaced by a reference to that ++ * bounce buffer in pad->local_qiov. ++ * ++ * After performing a read request, the data from the bounce buffer must be ++ * copied back into pad->pre_collapse_qiov (e.g. by bdrv_padding_finalize()). ++ */ ++static int bdrv_create_padded_qiov(BlockDriverState *bs, ++ BdrvRequestPadding *pad, ++ struct iovec *iov, int niov, ++ size_t iov_offset, size_t bytes) ++{ ++ int padded_niov, surplus_count, collapse_count; ++ ++ /* Assert this invariant */ ++ assert(niov <= IOV_MAX); ++ ++ /* ++ * Cannot pad if resulting length would exceed SIZE_MAX. Returning an error ++ * to the guest is not ideal, but there is little else we can do. At least ++ * this will practically never happen on 64-bit systems. ++ */ ++ if (SIZE_MAX - pad->head < bytes || ++ SIZE_MAX - pad->head - bytes < pad->tail) ++ { ++ return -EINVAL; ++ } ++ ++ /* Length of the resulting IOV if we just concatenated everything */ ++ padded_niov = !!pad->head + niov + !!pad->tail; ++ ++ qemu_iovec_init(&pad->local_qiov, MIN(padded_niov, IOV_MAX)); ++ ++ if (pad->head) { ++ qemu_iovec_add(&pad->local_qiov, pad->buf, pad->head); ++ } ++ ++ /* ++ * If padded_niov > IOV_MAX, we cannot just concatenate everything. ++ * Instead, merge the first two or three elements of @iov to reduce the ++ * number of vector elements as necessary. ++ */ ++ if (padded_niov > IOV_MAX) { ++ /* ++ * Only head and tail can have lead to the number of entries exceeding ++ * IOV_MAX, so we can exceed it by the head and tail at most. We need ++ * to reduce the number of elements by `surplus_count`, so we merge that ++ * many elements plus one into one element. ++ */ ++ surplus_count = padded_niov - IOV_MAX; ++ assert(surplus_count <= !!pad->head + !!pad->tail); ++ collapse_count = surplus_count + 1; ++ ++ /* ++ * Move the elements to collapse into `pad->pre_collapse_qiov`, then ++ * advance `iov` (and associated variables) by those elements. ++ */ ++ qemu_iovec_init(&pad->pre_collapse_qiov, collapse_count); ++ qemu_iovec_concat_iov(&pad->pre_collapse_qiov, iov, ++ collapse_count, iov_offset, SIZE_MAX); ++ iov += collapse_count; ++ iov_offset = 0; ++ niov -= collapse_count; ++ bytes -= pad->pre_collapse_qiov.size; ++ ++ /* ++ * Construct the bounce buffer to match the length of the to-collapse ++ * vector elements, and for write requests, initialize it with the data ++ * from those elements. Then add it to `pad->local_qiov`. ++ */ ++ pad->collapse_len = pad->pre_collapse_qiov.size; ++ pad->collapse_bounce_buf = qemu_blockalign(bs, pad->collapse_len); ++ if (pad->write) { ++ qemu_iovec_to_buf(&pad->pre_collapse_qiov, 0, ++ pad->collapse_bounce_buf, pad->collapse_len); ++ } ++ qemu_iovec_add(&pad->local_qiov, ++ pad->collapse_bounce_buf, pad->collapse_len); ++ } ++ ++ qemu_iovec_concat_iov(&pad->local_qiov, iov, niov, iov_offset, bytes); ++ ++ if (pad->tail) { ++ qemu_iovec_add(&pad->local_qiov, ++ pad->buf + pad->buf_len - pad->tail, pad->tail); ++ } ++ ++ assert(pad->local_qiov.niov == MIN(padded_niov, IOV_MAX)); ++ return 0; ++} ++ + /* + * bdrv_pad_request + * +@@ -1749,6 +1875,8 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad) + * read of padding, bdrv_padding_rmw_read() should be called separately if + * needed. + * ++ * @write is true for write requests, false for read requests. ++ * + * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out: + * - on function start they represent original request + * - on failure or when padding is not needed they are unchanged +@@ -1757,25 +1885,33 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad) + static int bdrv_pad_request(BlockDriverState *bs, + QEMUIOVector **qiov, size_t *qiov_offset, + int64_t *offset, int64_t *bytes, ++ bool write, + BdrvRequestPadding *pad, bool *padded) + { + int ret; ++ struct iovec *sliced_iov; ++ int sliced_niov; ++ size_t sliced_head, sliced_tail; + + bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort); + +- if (!bdrv_init_padding(bs, *offset, *bytes, pad)) { ++ if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) { + if (padded) { + *padded = false; + } + return 0; + } + +- ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head, +- *qiov, *qiov_offset, *bytes, +- pad->buf + pad->buf_len - pad->tail, +- pad->tail); ++ sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes, ++ &sliced_head, &sliced_tail, ++ &sliced_niov); ++ ++ /* Guaranteed by bdrv_check_qiov_request() */ ++ assert(*bytes <= SIZE_MAX); ++ ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov, ++ sliced_head, *bytes); + if (ret < 0) { +- bdrv_padding_destroy(pad); ++ bdrv_padding_finalize(pad); + return ret; + } + *bytes += pad->head + pad->tail; +@@ -1836,8 +1972,8 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, + flags |= BDRV_REQ_COPY_ON_READ; + } + +- ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, +- NULL); ++ ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, false, ++ &pad, NULL); + if (ret < 0) { + goto fail; + } +@@ -1847,7 +1983,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, + bs->bl.request_alignment, + qiov, qiov_offset, flags); + tracked_request_end(&req); +- bdrv_padding_destroy(&pad); ++ bdrv_padding_finalize(&pad); + + fail: + bdrv_dec_in_flight(bs); +@@ -2167,7 +2303,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, + bool padding; + BdrvRequestPadding pad; + +- padding = bdrv_init_padding(bs, offset, bytes, &pad); ++ padding = bdrv_init_padding(bs, offset, bytes, true, &pad); + if (padding) { + bdrv_make_request_serialising(req, align); + +@@ -2214,7 +2350,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, + } + + out: +- bdrv_padding_destroy(&pad); ++ bdrv_padding_finalize(&pad); + + return ret; + } +@@ -2280,8 +2416,8 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, + * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do + * alignment only if there is no ZERO flag. + */ +- ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, +- &padded); ++ ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, true, ++ &pad, &padded); + if (ret < 0) { + return ret; + } +@@ -2310,7 +2446,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, + ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, + qiov, qiov_offset, flags); + +- bdrv_padding_destroy(&pad); ++ bdrv_padding_finalize(&pad); + + out: + tracked_request_end(&req); +-- +2.39.3 + diff --git a/kvm-block-Fix-pad_request-s-request-restriction.patch b/kvm-block-Fix-pad_request-s-request-restriction.patch new file mode 100644 index 0000000..ce4eb0b --- /dev/null +++ b/kvm-block-Fix-pad_request-s-request-restriction.patch @@ -0,0 +1,75 @@ +From b9866279996ee065cb524bf30bc70e22efbab303 Mon Sep 17 00:00:00 2001 +From: Hanna Czenczek +Date: Fri, 14 Jul 2023 10:59:38 +0200 +Subject: [PATCH 5/5] block: Fix pad_request's request restriction + +RH-Author: Hanna Czenczek +RH-MergeRequest: 291: block: Split padded I/O vectors exceeding IOV_MAX +RH-Bugzilla: 2141964 +RH-Acked-by: Emanuele Giuseppe Esposito +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [5/5] f9188bd089d6c67185ea1accde20d491a2ed3193 + +bdrv_pad_request() relies on requests' lengths not to exceed SIZE_MAX, +which bdrv_check_qiov_request() does not guarantee. + +bdrv_check_request32() however will guarantee this, and both of +bdrv_pad_request()'s callers (bdrv_co_preadv_part() and +bdrv_co_pwritev_part()) already run it before calling +bdrv_pad_request(). Therefore, bdrv_pad_request() can safely call +bdrv_check_request32() without expecting error, too. + +In effect, this patch will not change guest-visible behavior. It is a +clean-up to tighten a condition to match what is guaranteed by our +callers, and which exists purely to show clearly why the subsequent +assertion (`assert(*bytes <= SIZE_MAX)`) is always true. + +Note there is a difference between the interfaces of +bdrv_check_qiov_request() and bdrv_check_request32(): The former takes +an errp, the latter does not, so we can no longer just pass +&error_abort. Instead, we need to check the returned value. While we +do expect success (because the callers have already run this function), +an assert(ret == 0) is not much simpler than just to return an error if +it occurs, so let us handle errors by returning them up the stack now. + +Reported-by: Peter Maydell +Signed-off-by: Hanna Czenczek +Message-id: 20230714085938.202730-1-hreitz@redhat.com +Fixes: 18743311b829cafc1737a5f20bc3248d5f91ee2a + ("block: Collapse padded I/O vecs exceeding IOV_MAX") +Signed-off-by: Hanna Czenczek +Signed-off-by: Stefan Hajnoczi +Signed-off-by: Hanna Czenczek +--- + block/io.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/block/io.c b/block/io.c +index 0fe8f0dd40..8ae57728a6 100644 +--- a/block/io.c ++++ b/block/io.c +@@ -1893,7 +1893,11 @@ static int bdrv_pad_request(BlockDriverState *bs, + int sliced_niov; + size_t sliced_head, sliced_tail; + +- bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort); ++ /* Should have been checked by the caller already */ ++ ret = bdrv_check_request32(*offset, *bytes, *qiov, *qiov_offset); ++ if (ret < 0) { ++ return ret; ++ } + + if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) { + if (padded) { +@@ -1906,7 +1910,7 @@ static int bdrv_pad_request(BlockDriverState *bs, + &sliced_head, &sliced_tail, + &sliced_niov); + +- /* Guaranteed by bdrv_check_qiov_request() */ ++ /* Guaranteed by bdrv_check_request32() */ + assert(*bytes <= SIZE_MAX); + ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov, + sliced_head, *bytes); +-- +2.39.3 + diff --git a/kvm-iotests-iov-padding-New-test.patch b/kvm-iotests-iov-padding-New-test.patch new file mode 100644 index 0000000..aa3db62 --- /dev/null +++ b/kvm-iotests-iov-padding-New-test.patch @@ -0,0 +1,187 @@ +From 084e211448f40c3e9d9b1907f6c98dca9f998bc3 Mon Sep 17 00:00:00 2001 +From: Hanna Czenczek +Date: Tue, 11 Apr 2023 19:34:18 +0200 +Subject: [PATCH 4/5] iotests/iov-padding: New test + +RH-Author: Hanna Czenczek +RH-MergeRequest: 291: block: Split padded I/O vectors exceeding IOV_MAX +RH-Bugzilla: 2141964 +RH-Acked-by: Emanuele Giuseppe Esposito +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [4/5] a80be9c26ebd5503745989cd6823cb4814264258 + +Test that even vectored IO requests with 1024 vector elements that are +not aligned to the device's request alignment will succeed. + +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Hanna Czenczek +Message-Id: <20230411173418.19549-5-hreitz@redhat.com> +(cherry picked from commit d7e1905e3f54ff9512db4c7a946a8603b62b108d) +Signed-off-by: Hanna Czenczek +--- + tests/qemu-iotests/tests/iov-padding | 85 ++++++++++++++++++++++++ + tests/qemu-iotests/tests/iov-padding.out | 59 ++++++++++++++++ + 2 files changed, 144 insertions(+) + create mode 100755 tests/qemu-iotests/tests/iov-padding + create mode 100644 tests/qemu-iotests/tests/iov-padding.out + +diff --git a/tests/qemu-iotests/tests/iov-padding b/tests/qemu-iotests/tests/iov-padding +new file mode 100755 +index 0000000000..b9604900c7 +--- /dev/null ++++ b/tests/qemu-iotests/tests/iov-padding +@@ -0,0 +1,85 @@ ++#!/usr/bin/env bash ++# group: rw quick ++# ++# Check the interaction of request padding (to fit alignment restrictions) with ++# vectored I/O from the guest ++# ++# Copyright Red Hat ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 2 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++# ++ ++seq=$(basename $0) ++echo "QA output created by $seq" ++ ++status=1 # failure is the default! ++ ++_cleanup() ++{ ++ _cleanup_test_img ++} ++trap "_cleanup; exit \$status" 0 1 2 3 15 ++ ++# get standard environment, filters and checks ++cd .. ++. ./common.rc ++. ./common.filter ++ ++_supported_fmt raw ++_supported_proto file ++ ++_make_test_img 1M ++ ++IMGSPEC="driver=blkdebug,align=4096,image.driver=file,image.filename=$TEST_IMG" ++ ++# Four combinations: ++# - Offset 4096, length 1023 * 512 + 512: Fully aligned to 4k ++# - Offset 4096, length 1023 * 512 + 4096: Head is aligned, tail is not ++# - Offset 512, length 1023 * 512 + 512: Neither head nor tail are aligned ++# - Offset 512, length 1023 * 512 + 4096: Tail is aligned, head is not ++for start_offset in 4096 512; do ++ for last_element_length in 512 4096; do ++ length=$((1023 * 512 + $last_element_length)) ++ ++ echo ++ echo "== performing 1024-element vectored requests to image (offset: $start_offset; length: $length) ==" ++ ++ # Fill with data for testing ++ $QEMU_IO -c 'write -P 1 0 1M' "$TEST_IMG" | _filter_qemu_io ++ ++ # 1023 512-byte buffers, and then one with length $last_element_length ++ cmd_params="-P 2 $start_offset $(yes 512 | head -n 1023 | tr '\n' ' ') $last_element_length" ++ QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \ ++ -c "writev $cmd_params" \ ++ --image-opts \ ++ "$IMGSPEC" \ ++ | _filter_qemu_io ++ ++ # Read all patterns -- read the part we just wrote with writev twice, ++ # once "normally", and once with a readv, so we see that that works, too ++ QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \ ++ -c "read -P 1 0 $start_offset" \ ++ -c "read -P 2 $start_offset $length" \ ++ -c "readv $cmd_params" \ ++ -c "read -P 1 $((start_offset + length)) $((1024 * 1024 - length - start_offset))" \ ++ --image-opts \ ++ "$IMGSPEC" \ ++ | _filter_qemu_io ++ done ++done ++ ++# success, all done ++echo "*** done" ++rm -f $seq.full ++status=0 +diff --git a/tests/qemu-iotests/tests/iov-padding.out b/tests/qemu-iotests/tests/iov-padding.out +new file mode 100644 +index 0000000000..e07a91fac7 +--- /dev/null ++++ b/tests/qemu-iotests/tests/iov-padding.out +@@ -0,0 +1,59 @@ ++QA output created by iov-padding ++Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 ++ ++== performing 1024-element vectored requests to image (offset: 4096; length: 524288) == ++wrote 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++wrote 524288/524288 bytes at offset 4096 ++512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 4096/4096 bytes at offset 0 ++4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 524288/524288 bytes at offset 4096 ++512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 524288/524288 bytes at offset 4096 ++512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 520192/520192 bytes at offset 528384 ++508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== performing 1024-element vectored requests to image (offset: 4096; length: 527872) == ++wrote 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++wrote 527872/527872 bytes at offset 4096 ++515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 4096/4096 bytes at offset 0 ++4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 527872/527872 bytes at offset 4096 ++515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 527872/527872 bytes at offset 4096 ++515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 516608/516608 bytes at offset 531968 ++504.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== performing 1024-element vectored requests to image (offset: 512; length: 524288) == ++wrote 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++wrote 524288/524288 bytes at offset 512 ++512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 524288/524288 bytes at offset 512 ++512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 524288/524288 bytes at offset 512 ++512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 523776/523776 bytes at offset 524800 ++511.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++ ++== performing 1024-element vectored requests to image (offset: 512; length: 527872) == ++wrote 1048576/1048576 bytes at offset 0 ++1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++wrote 527872/527872 bytes at offset 512 ++515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 512/512 bytes at offset 0 ++512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 527872/527872 bytes at offset 512 ++515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 527872/527872 bytes at offset 512 ++515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++read 520192/520192 bytes at offset 528384 ++508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) ++*** done +-- +2.39.3 + diff --git a/kvm-util-iov-Make-qiov_slice-public.patch b/kvm-util-iov-Make-qiov_slice-public.patch new file mode 100644 index 0000000..807707a --- /dev/null +++ b/kvm-util-iov-Make-qiov_slice-public.patch @@ -0,0 +1,98 @@ +From 884e6dfecc8b0f155015f0a25888300d8e1707f8 Mon Sep 17 00:00:00 2001 +From: Hanna Czenczek +Date: Tue, 11 Apr 2023 19:34:15 +0200 +Subject: [PATCH 1/5] util/iov: Make qiov_slice() public + +RH-Author: Hanna Czenczek +RH-MergeRequest: 291: block: Split padded I/O vectors exceeding IOV_MAX +RH-Bugzilla: 2141964 +RH-Acked-by: Emanuele Giuseppe Esposito +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [1/5] 7f082982e49bacbcc21ca24e471b4399e64321a9 + +We want to inline qemu_iovec_init_extended() in block/io.c for padding +requests, and having access to qiov_slice() is useful for this. As a +public function, it is renamed to qemu_iovec_slice(). + +(We will need to count the number of I/O vector elements of a slice +there, and then later process this slice. Without qiov_slice(), we +would need to call qemu_iovec_subvec_niov(), and all further +IOV-processing functions may need to skip prefixing elements to +accomodate for a qiov_offset. Because qemu_iovec_subvec_niov() +internally calls qiov_slice(), we can just have the block/io.c code call +qiov_slice() itself, thus get the number of elements, and also create an +iovec array with the superfluous prefixing elements stripped, so the +following processing functions no longer need to skip them.) + +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Hanna Czenczek +Message-Id: <20230411173418.19549-2-hreitz@redhat.com> +(cherry picked from commit 3d06cea8256d54a6b0238934c31012f7f17100f5) +Signed-off-by: Hanna Czenczek +--- + include/qemu/iov.h | 3 +++ + util/iov.c | 14 +++++++------- + 2 files changed, 10 insertions(+), 7 deletions(-) + +diff --git a/include/qemu/iov.h b/include/qemu/iov.h +index 9330746680..46fadfb27a 100644 +--- a/include/qemu/iov.h ++++ b/include/qemu/iov.h +@@ -229,6 +229,9 @@ int qemu_iovec_init_extended( + void *tail_buf, size_t tail_len); + void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source, + size_t offset, size_t len); ++struct iovec *qemu_iovec_slice(QEMUIOVector *qiov, ++ size_t offset, size_t len, ++ size_t *head, size_t *tail, int *niov); + int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len); + void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len); + void qemu_iovec_concat(QEMUIOVector *dst, +diff --git a/util/iov.c b/util/iov.c +index 58c7b3eeee..3ccb530b16 100644 +--- a/util/iov.c ++++ b/util/iov.c +@@ -373,15 +373,15 @@ static struct iovec *iov_skip_offset(struct iovec *iov, size_t offset, + } + + /* +- * qiov_slice ++ * qemu_iovec_slice + * + * Find subarray of iovec's, containing requested range. @head would + * be offset in first iov (returned by the function), @tail would be + * count of extra bytes in last iovec (returned iov + @niov - 1). + */ +-static struct iovec *qiov_slice(QEMUIOVector *qiov, +- size_t offset, size_t len, +- size_t *head, size_t *tail, int *niov) ++struct iovec *qemu_iovec_slice(QEMUIOVector *qiov, ++ size_t offset, size_t len, ++ size_t *head, size_t *tail, int *niov) + { + struct iovec *iov, *end_iov; + +@@ -406,7 +406,7 @@ int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len) + size_t head, tail; + int niov; + +- qiov_slice(qiov, offset, len, &head, &tail, &niov); ++ qemu_iovec_slice(qiov, offset, len, &head, &tail, &niov); + + return niov; + } +@@ -434,8 +434,8 @@ int qemu_iovec_init_extended( + } + + if (mid_len) { +- mid_iov = qiov_slice(mid_qiov, mid_offset, mid_len, +- &mid_head, &mid_tail, &mid_niov); ++ mid_iov = qemu_iovec_slice(mid_qiov, mid_offset, mid_len, ++ &mid_head, &mid_tail, &mid_niov); + } + + total_niov = !!head_len + mid_niov + !!tail_len; +-- +2.39.3 + diff --git a/kvm-util-iov-Remove-qemu_iovec_init_extended.patch b/kvm-util-iov-Remove-qemu_iovec_init_extended.patch new file mode 100644 index 0000000..c49c3ac --- /dev/null +++ b/kvm-util-iov-Remove-qemu_iovec_init_extended.patch @@ -0,0 +1,157 @@ +From cc31f7eb1c362dc308a163b7364c96ed098a793a Mon Sep 17 00:00:00 2001 +From: Hanna Czenczek +Date: Tue, 11 Apr 2023 19:34:17 +0200 +Subject: [PATCH 3/5] util/iov: Remove qemu_iovec_init_extended() + +RH-Author: Hanna Czenczek +RH-MergeRequest: 291: block: Split padded I/O vectors exceeding IOV_MAX +RH-Bugzilla: 2141964 +RH-Acked-by: Emanuele Giuseppe Esposito +RH-Acked-by: Stefan Hajnoczi +RH-Commit: [3/5] 19c8307ef1289f1991199d1d1f6ab6c89a4b59ce + +bdrv_pad_request() was the main user of qemu_iovec_init_extended(). +HEAD^ has removed that use, so we can remove qemu_iovec_init_extended() +now. + +The only remaining user is qemu_iovec_init_slice(), which can easily +inline the small part it really needs. + +Note that qemu_iovec_init_extended() offered a memcpy() optimization to +initialize the new I/O vector. qemu_iovec_concat_iov(), which is used +to replace its functionality, does not, but calls qemu_iovec_add() for +every single element. If we decide this optimization was important, we +will need to re-implement it in qemu_iovec_concat_iov(), which might +also benefit its pre-existing users. + +Reviewed-by: Eric Blake +Reviewed-by: Vladimir Sementsov-Ogievskiy +Signed-off-by: Hanna Czenczek +Message-Id: <20230411173418.19549-4-hreitz@redhat.com> +(cherry picked from commit cc63f6f6fa1aaa4b6405dd69432c693e9c8d18ca) +Signed-off-by: Hanna Czenczek +--- + include/qemu/iov.h | 5 --- + util/iov.c | 79 +++++++--------------------------------------- + 2 files changed, 11 insertions(+), 73 deletions(-) + +diff --git a/include/qemu/iov.h b/include/qemu/iov.h +index 46fadfb27a..63a1c01965 100644 +--- a/include/qemu/iov.h ++++ b/include/qemu/iov.h +@@ -222,11 +222,6 @@ static inline void *qemu_iovec_buf(QEMUIOVector *qiov) + + void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint); + void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov); +-int qemu_iovec_init_extended( +- QEMUIOVector *qiov, +- void *head_buf, size_t head_len, +- QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len, +- void *tail_buf, size_t tail_len); + void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source, + size_t offset, size_t len); + struct iovec *qemu_iovec_slice(QEMUIOVector *qiov, +diff --git a/util/iov.c b/util/iov.c +index 3ccb530b16..af3ccc2546 100644 +--- a/util/iov.c ++++ b/util/iov.c +@@ -411,70 +411,6 @@ int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len) + return niov; + } + +-/* +- * Compile new iovec, combining @head_buf buffer, sub-qiov of @mid_qiov, +- * and @tail_buf buffer into new qiov. +- */ +-int qemu_iovec_init_extended( +- QEMUIOVector *qiov, +- void *head_buf, size_t head_len, +- QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len, +- void *tail_buf, size_t tail_len) +-{ +- size_t mid_head, mid_tail; +- int total_niov, mid_niov = 0; +- struct iovec *p, *mid_iov = NULL; +- +- assert(mid_qiov->niov <= IOV_MAX); +- +- if (SIZE_MAX - head_len < mid_len || +- SIZE_MAX - head_len - mid_len < tail_len) +- { +- return -EINVAL; +- } +- +- if (mid_len) { +- mid_iov = qemu_iovec_slice(mid_qiov, mid_offset, mid_len, +- &mid_head, &mid_tail, &mid_niov); +- } +- +- total_niov = !!head_len + mid_niov + !!tail_len; +- if (total_niov > IOV_MAX) { +- return -EINVAL; +- } +- +- if (total_niov == 1) { +- qemu_iovec_init_buf(qiov, NULL, 0); +- p = &qiov->local_iov; +- } else { +- qiov->niov = qiov->nalloc = total_niov; +- qiov->size = head_len + mid_len + tail_len; +- p = qiov->iov = g_new(struct iovec, qiov->niov); +- } +- +- if (head_len) { +- p->iov_base = head_buf; +- p->iov_len = head_len; +- p++; +- } +- +- assert(!mid_niov == !mid_len); +- if (mid_niov) { +- memcpy(p, mid_iov, mid_niov * sizeof(*p)); +- p[0].iov_base = (uint8_t *)p[0].iov_base + mid_head; +- p[0].iov_len -= mid_head; +- p[mid_niov - 1].iov_len -= mid_tail; +- p += mid_niov; +- } +- +- if (tail_len) { +- p->iov_base = tail_buf; +- p->iov_len = tail_len; +- } +- +- return 0; +-} +- + /* + * Check if the contents of subrange of qiov data is all zeroes. + */ +@@ -506,14 +442,21 @@ bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes) + void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source, + size_t offset, size_t len) + { +- int ret; ++ struct iovec *slice_iov; ++ int slice_niov; ++ size_t slice_head, slice_tail; + + assert(source->size >= len); + assert(source->size - len >= offset); + +- /* We shrink the request, so we can't overflow neither size_t nor MAX_IOV */ +- ret = qemu_iovec_init_extended(qiov, NULL, 0, source, offset, len, NULL, 0); +- assert(ret == 0); ++ slice_iov = qemu_iovec_slice(source, offset, len, ++ &slice_head, &slice_tail, &slice_niov); ++ if (slice_niov == 1) { ++ qemu_iovec_init_buf(qiov, slice_iov[0].iov_base + slice_head, len); ++ } else { ++ qemu_iovec_init(qiov, slice_niov); ++ qemu_iovec_concat_iov(qiov, slice_iov, slice_niov, slice_head, len); ++ } + } + + void qemu_iovec_destroy(QEMUIOVector *qiov) +-- +2.39.3 + diff --git a/qemu-kvm.spec b/qemu-kvm.spec index 091ed4d..6483966 100644 --- a/qemu-kvm.spec +++ b/qemu-kvm.spec @@ -83,7 +83,7 @@ Obsoletes: %1-rhev <= %{epoch}:%{version}-%{release} Summary: QEMU is a machine emulator and virtualizer Name: qemu-kvm Version: 6.2.0 -Release: 36%{?rcrel}%{?dist} +Release: 37%{?rcrel}%{?dist} # Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped Epoch: 15 License: GPLv2 and GPLv2+ and CC-BY @@ -759,6 +759,16 @@ Patch298: kvm-target-i386-add-support-for-FLUSH_L1D-feature.patch Patch299: kvm-target-i386-add-support-for-FB_CLEAR-feature.patch # For bz#2169733 - Qemu on destination host crashed if migrate with postcopy and multifd enabled Patch300: kvm-migration-Disable-postcopy-multifd-migration.patch +# For bz#2141964 - Guest hit EXT4-fs error on host 4K disk when repeatedly hot-plug/unplug running IO disk +Patch301: kvm-util-iov-Make-qiov_slice-public.patch +# For bz#2141964 - Guest hit EXT4-fs error on host 4K disk when repeatedly hot-plug/unplug running IO disk +Patch302: kvm-block-Collapse-padded-I-O-vecs-exceeding-IOV_MAX.patch +# For bz#2141964 - Guest hit EXT4-fs error on host 4K disk when repeatedly hot-plug/unplug running IO disk +Patch303: kvm-util-iov-Remove-qemu_iovec_init_extended.patch +# For bz#2141964 - Guest hit EXT4-fs error on host 4K disk when repeatedly hot-plug/unplug running IO disk +Patch304: kvm-iotests-iov-padding-New-test.patch +# For bz#2141964 - Guest hit EXT4-fs error on host 4K disk when repeatedly hot-plug/unplug running IO disk +Patch305: kvm-block-Fix-pad_request-s-request-restriction.patch BuildRequires: wget BuildRequires: rpm-build @@ -1928,6 +1938,15 @@ sh %{_sysconfdir}/sysconfig/modules/kvm.modules &> /dev/null || : %changelog +* Tue Jul 25 2023 Miroslav Rezanina - 6.2.0-37 +- kvm-util-iov-Make-qiov_slice-public.patch [bz#2141964] +- kvm-block-Collapse-padded-I-O-vecs-exceeding-IOV_MAX.patch [bz#2141964] +- kvm-util-iov-Remove-qemu_iovec_init_extended.patch [bz#2141964] +- kvm-iotests-iov-padding-New-test.patch [bz#2141964] +- kvm-block-Fix-pad_request-s-request-restriction.patch [bz#2141964] +- Resolves: bz#2141964 + (Guest hit EXT4-fs error on host 4K disk when repeatedly hot-plug/unplug running IO disk) + * Thu Jun 29 2023 Jon Maloy - 6.2.0-36 - kvm-memory-prevent-dma-reentracy-issues.patch [bz#1999236] - kvm-async-Add-an-optional-reentrancy-guard-to-the-BH-API.patch [bz#1999236]