From a4cd805786230ea8034ee0fb9672b99a02b1b4d3 Mon Sep 17 00:00:00 2001 From: eabdullin Date: Mon, 15 Sep 2025 12:25:50 +0000 Subject: [PATCH] import CS nbdkit-1.38.5-10.el9 --- ...preserve-errno-to-log_verror-functio.patch | 2 +- ...readlocal_-set-get-_error-to-._errno.patch | 2 +- ...uce-threadlocal_-set-get-_last_error.patch | 2 +- ...read-local-copy-of-the-last-call-to-.patch | 2 +- ...end-the-last-error-to-the-NBD-client.patch | 2 +- SOURCES/0006-vddk-Include-stdbool.h.patch | 2 +- ...dk-Cache-the-disk-size-in-the-handle.patch | 2 +- ...ents-Mark-some-local-variables-const.patch | 2 +- ...Exit-the-function-if-we-hit-req_one-.patch | 2 +- ...Avoid-reading-partial-chunk-beyond-t.patch | 2 +- ...ng-if-sync_file_range-posix_fadvise-.patch | 41 ++ ...e_range-fails-to-start-don-t-add-win.patch | 82 ++++ ...-Add-tests-of-file-plugin-cache-none.patch | 254 +++++++++++++ ...ore-generic-tests-of-file-cache-none.patch | 238 ++++++++++++ ...-Hard-error-if-sync_file_range-fails.patch | 101 +++++ ...size-of-the-lock-around-write-evicti.patch | 90 +++++ ...plicit-assumption-about-eviction-win.patch | 29 ++ ...urn-flush-into-a-controlpath-message.patch | 33 ++ ...file-Fix-minor-typo-in-debug-message.patch | 26 ++ ...debugging-when-D-file.zero-1-is-used.patch | 36 ++ ...le-Fix-comment-style-in-a-few-places.patch | 43 +++ ...Fix-do_fallocate-debugging-on-Alpine.patch | 60 +++ ...n_zeroout-to-h-can_blkzeroout-to-ref.patch | 65 ++++ ...nt-implicit-order-that-we-will-try-z.patch | 38 ++ ...BLKDISCARD-method-if-may_trim-is-set.patch | 122 ++++++ ...h-of-extents-when-using-D-vddk.exten.patch | 30 ++ ...tents-Mark-this-filter-as-deprecated.patch | 38 ++ ...e-extent-functions-to-nbdkit-common..patch | 77 ++++ ...mand-type-in-command-completed-messa.patch | 29 ++ ...eadonly-flag-from-the-.open-call-in-.patch | 41 ++ ...-Move-minimum-version-of-VDDK-to-6.7.patch | 250 +++++++++++++ ...ditionally-test-QueryAllocatedBlocks.patch | 78 ++++ ...the-extents-for-readonly-connections.patch | 349 ++++++++++++++++++ ...lename-or-equivalent-in-the-file-han.patch | 241 ++++++++++++ ...ename-or-equivalent-to-error-message.patch | 143 +++++++ ...e-Add-offset-count-to-error-messages.patch | 132 +++++++ ...s-instead-of-Unicode-s-for-microseco.patch | 34 ++ ...-stats-Line-up-the-columns-correctly.patch | 26 ++ ...d-the-byte-count-of-each-QueryAlloca.patch | 61 +++ ...ct-elapsed-time-for-ReadAsync-and-Wr.patch | 178 +++++++++ ...y-one-for-maximum-block_status-lengt.patch | 170 +++++++++ ...-bit-overflow-in-.extents-CVE-2025-4.patch | 163 ++++++++ SOURCES/copy-patches.sh | 2 +- SPECS/nbdkit.spec | 61 ++- 44 files changed, 3368 insertions(+), 13 deletions(-) create mode 100644 SOURCES/0011-file-Add-debugging-if-sync_file_range-posix_fadvise-.patch create mode 100644 SOURCES/0012-file-If-sync_file_range-fails-to-start-don-t-add-win.patch create mode 100644 SOURCES/0013-tests-Add-tests-of-file-plugin-cache-none.patch create mode 100644 SOURCES/0014-tests-Add-more-generic-tests-of-file-cache-none.patch create mode 100644 SOURCES/0015-file-Hard-error-if-sync_file_range-fails.patch create mode 100644 SOURCES/0016-file-Reduce-the-size-of-the-lock-around-write-evicti.patch create mode 100644 SOURCES/0017-file-Document-implicit-assumption-about-eviction-win.patch create mode 100644 SOURCES/0018-server-Turn-flush-into-a-controlpath-message.patch create mode 100644 SOURCES/0019-file-Fix-minor-typo-in-debug-message.patch create mode 100644 SOURCES/0020-file-Add-more-debugging-when-D-file.zero-1-is-used.patch create mode 100644 SOURCES/0021-file-Fix-comment-style-in-a-few-places.patch create mode 100644 SOURCES/0022-file-Fix-do_fallocate-debugging-on-Alpine.patch create mode 100644 SOURCES/0023-file-Rename-h-can_zeroout-to-h-can_blkzeroout-to-ref.patch create mode 100644 SOURCES/0024-file-zero-Document-implicit-order-that-we-will-try-z.patch create mode 100644 SOURCES/0025-file-zero-Use-BLKDISCARD-method-if-may_trim-is-set.patch create mode 100644 SOURCES/0026-vddk-Debug-length-of-extents-when-using-D-vddk.exten.patch create mode 100644 SOURCES/0027-cacheextents-Mark-this-filter-as-deprecated.patch create mode 100644 SOURCES/0028-include-Move-some-extent-functions-to-nbdkit-common..patch create mode 100644 SOURCES/0029-vddk-Display-command-type-in-command-completed-messa.patch create mode 100644 SOURCES/0030-vddk-Cache-the-readonly-flag-from-the-.open-call-in-.patch create mode 100644 SOURCES/0031-vddk-Move-minimum-version-of-VDDK-to-6.7.patch create mode 100644 SOURCES/0032-vddk-Unconditionally-test-QueryAllocatedBlocks.patch create mode 100644 SOURCES/0033-vddk-Pre-cache-the-extents-for-readonly-connections.patch create mode 100644 SOURCES/0034-file-Save-the-filename-or-equivalent-in-the-file-han.patch create mode 100644 SOURCES/0035-file-Add-the-filename-or-equivalent-to-error-message.patch create mode 100644 SOURCES/0036-file-Add-offset-count-to-error-messages.patch create mode 100644 SOURCES/0037-vddk-stats-Use-us-instead-of-Unicode-s-for-microseco.patch create mode 100644 SOURCES/0038-vddk-stats-Line-up-the-columns-correctly.patch create mode 100644 SOURCES/0039-vddk-stats-Record-the-byte-count-of-each-QueryAlloca.patch create mode 100644 SOURCES/0040-vddk-stats-Collect-elapsed-time-for-ReadAsync-and-Wr.patch create mode 100644 SOURCES/0041-server-Fix-off-by-one-for-maximum-block_status-lengt.patch create mode 100644 SOURCES/0042-blocksize-Fix-32-bit-overflow-in-.extents-CVE-2025-4.patch diff --git a/SOURCES/0001-server-log-Move-preserve-errno-to-log_verror-functio.patch b/SOURCES/0001-server-log-Move-preserve-errno-to-log_verror-functio.patch index 6c3892c..c238183 100644 --- a/SOURCES/0001-server-log-Move-preserve-errno-to-log_verror-functio.patch +++ b/SOURCES/0001-server-log-Move-preserve-errno-to-log_verror-functio.patch @@ -147,5 +147,5 @@ index 464e4f9a..9c1f667a 100644 /* Note: preserves the previous value of errno. */ -- -2.43.0 +2.47.1 diff --git a/SOURCES/0002-server-Rename-threadlocal_-set-get-_error-to-._errno.patch b/SOURCES/0002-server-Rename-threadlocal_-set-get-_error-to-._errno.patch index 800fed0..fdb9695 100644 --- a/SOURCES/0002-server-Rename-threadlocal_-set-get-_error-to-._errno.patch +++ b/SOURCES/0002-server-Rename-threadlocal_-set-get-_error-to-._errno.patch @@ -173,5 +173,5 @@ index 088fe55a..9bb656bc 100644 int err = errno; struct threadlocal *threadlocal = pthread_getspecific (threadlocal_key); -- -2.43.0 +2.47.1 diff --git a/SOURCES/0003-server-Introduce-threadlocal_-set-get-_last_error.patch b/SOURCES/0003-server-Introduce-threadlocal_-set-get-_last_error.patch index 2d09ae9..3608b77 100644 --- a/SOURCES/0003-server-Introduce-threadlocal_-set-get-_last_error.patch +++ b/SOURCES/0003-server-Introduce-threadlocal_-set-get-_last_error.patch @@ -91,5 +91,5 @@ index 9bb656bc..74a3c4e5 100644 * size is increased to ‘size’ bytes if required. * -- -2.43.0 +2.47.1 diff --git a/SOURCES/0004-server-Take-a-thread-local-copy-of-the-last-call-to-.patch b/SOURCES/0004-server-Take-a-thread-local-copy-of-the-last-call-to-.patch index 76a5d6d..593b818 100644 --- a/SOURCES/0004-server-Take-a-thread-local-copy-of-the-last-call-to-.patch +++ b/SOURCES/0004-server-Take-a-thread-local-copy-of-the-last-call-to-.patch @@ -91,5 +91,5 @@ index 677da05c..d428bfc8 100644 case NBD_CMD_READ: if (backend_pread (c, buf, count, offset, 0, &err) == -1) -- -2.43.0 +2.47.1 diff --git a/SOURCES/0005-server-Send-the-last-error-to-the-NBD-client.patch b/SOURCES/0005-server-Send-the-last-error-to-the-NBD-client.patch index 0646328..24d4cef 100644 --- a/SOURCES/0005-server-Send-the-last-error-to-the-NBD-client.patch +++ b/SOURCES/0005-server-Send-the-last-error-to-the-NBD-client.patch @@ -173,5 +173,5 @@ index 00000000..fc720606 + +grep "Go Away" $out -- -2.43.0 +2.47.1 diff --git a/SOURCES/0006-vddk-Include-stdbool.h.patch b/SOURCES/0006-vddk-Include-stdbool.h.patch index 7b7bd5d..829d183 100644 --- a/SOURCES/0006-vddk-Include-stdbool.h.patch +++ b/SOURCES/0006-vddk-Include-stdbool.h.patch @@ -24,5 +24,5 @@ index 467d00ca..5982fcea 100644 #include -- -2.43.0 +2.47.1 diff --git a/SOURCES/0007-vddk-Cache-the-disk-size-in-the-handle.patch b/SOURCES/0007-vddk-Cache-the-disk-size-in-the-handle.patch index d7b20cf..9b3f1e2 100644 --- a/SOURCES/0007-vddk-Cache-the-disk-size-in-the-handle.patch +++ b/SOURCES/0007-vddk-Cache-the-disk-size-in-the-handle.patch @@ -55,5 +55,5 @@ index fb0c79a8..1d1069cc 100644 /* reexec.c */ -- -2.43.0 +2.47.1 diff --git a/SOURCES/0008-vddk-do_extents-Mark-some-local-variables-const.patch b/SOURCES/0008-vddk-do_extents-Mark-some-local-variables-const.patch index a299ded..826d9c1 100644 --- a/SOURCES/0008-vddk-do_extents-Mark-some-local-variables-const.patch +++ b/SOURCES/0008-vddk-do_extents-Mark-some-local-variables-const.patch @@ -30,5 +30,5 @@ index 5982fcea..bc015d16 100644 uint64_t position, end, start_sector; -- -2.43.0 +2.47.1 diff --git a/SOURCES/0009-vddk-do_extents-Exit-the-function-if-we-hit-req_one-.patch b/SOURCES/0009-vddk-do_extents-Exit-the-function-if-we-hit-req_one-.patch index 2776dc1..34f187c 100644 --- a/SOURCES/0009-vddk-do_extents-Exit-the-function-if-we-hit-req_one-.patch +++ b/SOURCES/0009-vddk-do_extents-Exit-the-function-if-we-hit-req_one-.patch @@ -27,5 +27,5 @@ index bc015d16..112111e3 100644 return 0; -- -2.43.0 +2.47.1 diff --git a/SOURCES/0010-vddk-do_extents-Avoid-reading-partial-chunk-beyond-t.patch b/SOURCES/0010-vddk-do_extents-Avoid-reading-partial-chunk-beyond-t.patch index 3d6fa18..89ba958 100644 --- a/SOURCES/0010-vddk-do_extents-Avoid-reading-partial-chunk-beyond-t.patch +++ b/SOURCES/0010-vddk-do_extents-Avoid-reading-partial-chunk-beyond-t.patch @@ -198,5 +198,5 @@ index 00000000..28fccd6c + 196608 65536 3 hole,zero + 262144 1536 0 data" -- -2.43.0 +2.47.1 diff --git a/SOURCES/0011-file-Add-debugging-if-sync_file_range-posix_fadvise-.patch b/SOURCES/0011-file-Add-debugging-if-sync_file_range-posix_fadvise-.patch new file mode 100644 index 0000000..3c24bb9 --- /dev/null +++ b/SOURCES/0011-file-Add-debugging-if-sync_file_range-posix_fadvise-.patch @@ -0,0 +1,41 @@ +From 881113e29b45975742ca11a4d0539ed2eb40b717 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Fri, 28 Mar 2025 20:32:46 +0000 +Subject: [PATCH] file: Add debugging if sync_file_range / posix_fadvise fails + +These are advisory hints, but add some debugging so we know if these +calls are failing. The errors are ignored so there is no behaviour +change. + +Thanks: Eric Sandeen +(cherry picked from commit feb22cfc85b0750a4e4bfbc5fe56636883599feb) +--- + plugins/file/file.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 2bad6480..6d0e77a1 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -123,11 +123,13 @@ evict_writes (int fd, uint64_t offset, size_t len) + + /* Evict the oldest window from the page cache. */ + if (window[0].len > 0) { +- sync_file_range (window[0].fd, window[0].offset, window[0].len, +- SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| +- SYNC_FILE_RANGE_WAIT_AFTER); +- posix_fadvise (window[0].fd, window[0].offset, window[0].len, +- POSIX_FADV_DONTNEED); ++ if (sync_file_range (window[0].fd, window[0].offset, window[0].len, ++ SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| ++ SYNC_FILE_RANGE_WAIT_AFTER) == -1) ++ nbdkit_debug ("sync_file_range: wait: (ignored): %m"); ++ if (posix_fadvise (window[0].fd, window[0].offset, window[0].len, ++ POSIX_FADV_DONTNEED) == -1) ++ nbdkit_debug ("posix_fadvise: POSIX_FADV_DONTNEED: (ignored): %m"); + } + + /* Move the Nth window to N-1. */ +-- +2.47.1 + diff --git a/SOURCES/0012-file-If-sync_file_range-fails-to-start-don-t-add-win.patch b/SOURCES/0012-file-If-sync_file_range-fails-to-start-don-t-add-win.patch new file mode 100644 index 0000000..f4a0c53 --- /dev/null +++ b/SOURCES/0012-file-If-sync_file_range-fails-to-start-don-t-add-win.patch @@ -0,0 +1,82 @@ +From 205fb5f8ebd72a37fb624e7d551ead313da2437b Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Fri, 28 Mar 2025 20:35:54 +0000 +Subject: [PATCH] file: If sync_file_range fails to start, don't add window to + list + +When starting the process of writing out the newest range, we +previously didn't check if sync_file_range succeeded or failed. But +if it failed there's no reason to add it to the list of windows. Also +debug any failures. + +This shouldn't change any semantics as far as we're aware. + +Thanks: Eric Sandeen +(cherry picked from commit d35f0636373e6a58c8f3fcfcf505af248e27c574) +--- + plugins/file/file.c | 48 ++++++++++++++++++++++++++------------------- + 1 file changed, 28 insertions(+), 20 deletions(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 6d0e77a1..638c960d 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -121,27 +121,35 @@ evict_writes (int fd, uint64_t offset, size_t len) + { + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); + +- /* Evict the oldest window from the page cache. */ +- if (window[0].len > 0) { +- if (sync_file_range (window[0].fd, window[0].offset, window[0].len, +- SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| +- SYNC_FILE_RANGE_WAIT_AFTER) == -1) +- nbdkit_debug ("sync_file_range: wait: (ignored): %m"); +- if (posix_fadvise (window[0].fd, window[0].offset, window[0].len, +- POSIX_FADV_DONTNEED) == -1) +- nbdkit_debug ("posix_fadvise: POSIX_FADV_DONTNEED: (ignored): %m"); +- } +- +- /* Move the Nth window to N-1. */ +- memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1)); +- +- /* Set up the current window and tell Linux to start writing it out +- * to disk (asynchronously). ++ /* Tell Linux to start writing the current range out to disk ++ * (asynchronously). + */ +- sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE); +- window[NR_WINDOWS-1].fd = fd; +- window[NR_WINDOWS-1].offset = offset; +- window[NR_WINDOWS-1].len = len; ++ if (sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE) == -1) { ++ /* sync_file_range to start the write failed */ ++ nbdkit_debug ("sync_file_range: write: (ignored): %m"); ++ } ++ else { ++ /* sync_file_range to start the write succeeded, so ++ * evict the oldest window from the page cache. ++ */ ++ if (window[0].len > 0) { ++ if (sync_file_range (window[0].fd, window[0].offset, window[0].len, ++ SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| ++ SYNC_FILE_RANGE_WAIT_AFTER) == -1) ++ nbdkit_debug ("sync_file_range: wait: (ignored): %m"); ++ if (posix_fadvise (window[0].fd, window[0].offset, window[0].len, ++ POSIX_FADV_DONTNEED) == -1) ++ nbdkit_debug ("posix_fadvise: POSIX_FADV_DONTNEED: (ignored): %m"); ++ } ++ ++ /* Move the Nth window to N-1. */ ++ memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1)); ++ ++ /* Add the range to the newest end of the list of windows. */ ++ window[NR_WINDOWS-1].fd = fd; ++ window[NR_WINDOWS-1].offset = offset; ++ window[NR_WINDOWS-1].len = len; ++ } + } + + /* When we close the handle we must remove any windows which are still +-- +2.47.1 + diff --git a/SOURCES/0013-tests-Add-tests-of-file-plugin-cache-none.patch b/SOURCES/0013-tests-Add-tests-of-file-plugin-cache-none.patch new file mode 100644 index 0000000..d27e26a --- /dev/null +++ b/SOURCES/0013-tests-Add-tests-of-file-plugin-cache-none.patch @@ -0,0 +1,254 @@ +From e9c972bc03d42d6fe7f11cd076a5500d39976a61 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Sat, 29 Mar 2025 10:01:05 +0000 +Subject: [PATCH] tests: Add tests of file plugin cache=none + +These are based on the tests which were done in the commit message +when the feature was first added in commit aa5a2183a6, but cleaned up +and modernised a little. + +It is likely that these tests would fail under memory pressure (since +they test that the page cache is used vs not used). Making them more +robust left as an exercise for the future. + +(cherry picked from commit 6017ba21aeeb3d7ad85925e78dba85a005194dee) +--- + README.md | 1 + + tests/Makefile.am | 4 ++ + tests/test-file-cache-none-read.sh | 90 ++++++++++++++++++++++++++++ + tests/test-file-cache-none-write.sh | 92 +++++++++++++++++++++++++++++ + 4 files changed, 187 insertions(+) + create mode 100755 tests/test-file-cache-none-read.sh + create mode 100755 tests/test-file-cache-none-write.sh + +diff --git a/README.md b/README.md +index fbe403da..86ac6c57 100644 +--- a/README.md ++++ b/README.md +@@ -174,6 +174,7 @@ To test for memory leaks (`make check-valgrind`): + + For non-essential enhancements to the test suite: + ++* cachedel, cachestats (from https://github.com/Feh/nocache) + * expect + * fdisk, sfdisk (from util-linux) + * flake8 +diff --git a/tests/Makefile.am b/tests/Makefile.am +index 0aa36846..4340cc38 100644 +--- a/tests/Makefile.am ++++ b/tests/Makefile.am +@@ -846,6 +846,8 @@ TESTS += \ + test-file-extents.sh \ + test-file-dir.sh \ + test-file-dirfd.sh \ ++ test-file-cache-none-read.sh \ ++ test-file-cache-none-write.sh \ + $(NULL) + EXTRA_DIST += \ + test-file.sh \ +@@ -855,6 +857,8 @@ EXTRA_DIST += \ + test-file-extents.sh \ + test-file-dir.sh \ + test-file-dirfd.sh \ ++ test-file-cache-none-read.sh \ ++ test-file-cache-none-write.sh \ + $(NULL) + LIBGUESTFS_TESTS += test-file-block + +diff --git a/tests/test-file-cache-none-read.sh b/tests/test-file-cache-none-read.sh +new file mode 100755 +index 00000000..c9831b43 +--- /dev/null ++++ b/tests/test-file-cache-none-read.sh +@@ -0,0 +1,90 @@ ++#!/usr/bin/env bash ++# nbdkit ++# Copyright Red Hat ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions are ++# met: ++# ++# * Redistributions of source code must retain the above copyright ++# notice, this list of conditions and the following disclaimer. ++# ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in the ++# documentation and/or other materials provided with the distribution. ++# ++# * Neither the name of Red Hat nor the names of its contributors may be ++# used to endorse or promote products derived from this software without ++# specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND ++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ++# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A ++# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR ++# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF ++# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT ++# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++# SUCH DAMAGE. ++ ++# Test the file plugin, reading with cache=none. ++ ++source ./functions.sh ++set -e ++set -x ++ ++# Makes no sense to run this test under valgrind. ++skip_if_valgrind ++ ++requires_plugin file ++requires_run ++requires_nbdcopy ++requires $TRUNCATE --version ++requires test -r /dev/urandom ++requires dd --version ++requires $SED --version ++ ++# Requires the cachestats tool from https://github.com/Feh/nocache ++# It doesn't support --version or --help, so use 'type' instead. ++requires type cachestats ++requires type cachedel ++ ++inp=file-cache-none-read.in ++stats1=file-cache-none-read.s1 ++stats2=file-cache-none-read.s2 ++rm -f $inp $stats1 $stats2 ++cleanup_fn rm -f $inp $stats1 $stats2 ++ ++# Create a large random file as input. ++dd if=/dev/urandom of=$inp bs=1024k count=1024 ++ ++# Drop the input file from the page cache and read it out with nbdkit. ++# We expect to see the input file mostly or completely in cache after. ++cachedel $inp ++nbdkit file $inp --run 'nbdcopy "$uri" null:' ++cachestats $inp > $stats1 ++cat $stats1 ++ ++# The same, with cache=none. ++# We expect to see the input file not cached after. ++cachedel $inp ++nbdkit file $inp --run 'nbdcopy "$uri" null:' cache=none ++cachestats $inp > $stats2 ++cat $stats2 ++ ++# The output of cachestats looks like this: ++# pages in cache: 262144/262144 (100.0%) [filesize=1048576.0K, pagesize=4K] ++# We want to check that % pages in cache using cache=none is much ++# lower than the default case. ++pic1="$($SED 's,pages in cache: [0-9/]* (\([0-9]*\)\.[0-9]*%).*,\1,' \ ++ < $stats1)" ++pic2="$($SED 's,pages in cache: [0-9/]* (\([0-9]*\)\.[0-9]*%).*,\1,' \ ++ < $stats2)" ++ ++# Test before is > 10% ++test "$pic1" -gt 10 ++# Test after is < 10% ++test "$pic2" -lt 10 +diff --git a/tests/test-file-cache-none-write.sh b/tests/test-file-cache-none-write.sh +new file mode 100755 +index 00000000..2041a5cd +--- /dev/null ++++ b/tests/test-file-cache-none-write.sh +@@ -0,0 +1,92 @@ ++#!/usr/bin/env bash ++# nbdkit ++# Copyright Red Hat ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions are ++# met: ++# ++# * Redistributions of source code must retain the above copyright ++# notice, this list of conditions and the following disclaimer. ++# ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in the ++# documentation and/or other materials provided with the distribution. ++# ++# * Neither the name of Red Hat nor the names of its contributors may be ++# used to endorse or promote products derived from this software without ++# specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND ++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ++# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A ++# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR ++# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF ++# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT ++# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++# SUCH DAMAGE. ++ ++# Test the file plugin, writing with cache=none. ++ ++source ./functions.sh ++set -e ++set -x ++ ++# Makes no sense to run this test under valgrind. ++skip_if_valgrind ++ ++requires_plugin file ++requires_run ++requires_nbdcopy ++requires $TRUNCATE --version ++requires test -r /dev/urandom ++requires dd --version ++requires $SED --version ++ ++# Requires the cachestats tool from https://github.com/Feh/nocache ++# It doesn't support --version or --help, so use 'type' instead. ++requires type cachestats ++ ++inp=file-cache-none-write.in ++out=file-cache-none-write.out ++stats1=file-cache-none-write.s1 ++stats2=file-cache-none-write.s2 ++rm -f $inp $out $stats1 $stats2 ++cleanup_fn rm -f $inp $out $stats1 $stats2 ++ ++# Create a large random file as input. ++dd if=/dev/urandom of=$inp bs=1024k count=1024 ++ ++# Copy to output using cache=default and collect the stats. ++# We expect to see the output file mostly or completely in cache after. ++rm -f $out; truncate -r $inp $out ++export inp ++nbdkit file $out --run 'nbdcopy $inp "$uri"' cache=default ++cachestats $out > $stats1 ++cat $stats1 ++ ++# The same, with cache=none. ++# We expect to see the output file not cached after. ++rm -f $out; truncate -r $inp $out ++export inp ++nbdkit file $out --run 'nbdcopy $inp "$uri"' cache=none ++cachestats $out > $stats2 ++cat $stats2 ++ ++# The output of cachestats looks like this: ++# pages in cache: 262144/262144 (100.0%) [filesize=1048576.0K, pagesize=4K] ++# We want to check that % pages in cache using cache=none is much ++# lower than the default case. ++pic1="$($SED 's,pages in cache: [0-9/]* (\([0-9]*\)\.[0-9]*%).*,\1,' \ ++ < $stats1)" ++pic2="$($SED 's,pages in cache: [0-9/]* (\([0-9]*\)\.[0-9]*%).*,\1,' \ ++ < $stats2)" ++ ++# Test before is > 10% ++test "$pic1" -gt 10 ++# Test after is < 10% ++test "$pic2" -lt 10 +-- +2.47.1 + diff --git a/SOURCES/0014-tests-Add-more-generic-tests-of-file-cache-none.patch b/SOURCES/0014-tests-Add-more-generic-tests-of-file-cache-none.patch new file mode 100644 index 0000000..0f2038e --- /dev/null +++ b/SOURCES/0014-tests-Add-more-generic-tests-of-file-cache-none.patch @@ -0,0 +1,238 @@ +From bdab63138758e287b292140e17f2d0e3b4c40bbe Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Tue, 1 Apr 2025 12:45:43 +0100 +Subject: [PATCH] tests: Add more generic tests of file cache=none + +The tests added in commit 6017ba21ae require cachestats which means +they are almost always skipped in reality. Also we're interested in +whether the windowed eviction could ever corrupt data. Write a +simpler pair of tests for data integrity. + +Updates: commit 6017ba21aeeb3d7ad85925e78dba85a005194dee +(cherry picked from commit 7b45f73c0668e56a9188249eeefc2f67aeb50af3) +--- + tests/Makefile.am | 12 +++-- + tests/test-file-cache-none-read-consistent.sh | 53 ++++++++++++++++++ + ...=> test-file-cache-none-read-effective.sh} | 9 ++-- + .../test-file-cache-none-write-consistent.sh | 54 +++++++++++++++++++ + ...> test-file-cache-none-write-effective.sh} | 11 ++-- + 5 files changed, 126 insertions(+), 13 deletions(-) + create mode 100755 tests/test-file-cache-none-read-consistent.sh + rename tests/{test-file-cache-none-read.sh => test-file-cache-none-read-effective.sh} (93%) + create mode 100755 tests/test-file-cache-none-write-consistent.sh + rename tests/{test-file-cache-none-write.sh => test-file-cache-none-write-effective.sh} (92%) + +diff --git a/tests/Makefile.am b/tests/Makefile.am +index 4340cc38..eed96d28 100644 +--- a/tests/Makefile.am ++++ b/tests/Makefile.am +@@ -846,8 +846,10 @@ TESTS += \ + test-file-extents.sh \ + test-file-dir.sh \ + test-file-dirfd.sh \ +- test-file-cache-none-read.sh \ +- test-file-cache-none-write.sh \ ++ test-file-cache-none-read-consistent.sh \ ++ test-file-cache-none-read-effective.sh \ ++ test-file-cache-none-write-consistent.sh \ ++ test-file-cache-none-write-effective.sh \ + $(NULL) + EXTRA_DIST += \ + test-file.sh \ +@@ -857,8 +859,10 @@ EXTRA_DIST += \ + test-file-extents.sh \ + test-file-dir.sh \ + test-file-dirfd.sh \ +- test-file-cache-none-read.sh \ +- test-file-cache-none-write.sh \ ++ test-file-cache-none-read-consistent.sh \ ++ test-file-cache-none-read-effective.sh \ ++ test-file-cache-none-write-consistent.sh \ ++ test-file-cache-none-write-effective.sh \ + $(NULL) + LIBGUESTFS_TESTS += test-file-block + +diff --git a/tests/test-file-cache-none-read-consistent.sh b/tests/test-file-cache-none-read-consistent.sh +new file mode 100755 +index 00000000..5f5794ee +--- /dev/null ++++ b/tests/test-file-cache-none-read-consistent.sh +@@ -0,0 +1,53 @@ ++#!/usr/bin/env bash ++# nbdkit ++# Copyright Red Hat ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions are ++# met: ++# ++# * Redistributions of source code must retain the above copyright ++# notice, this list of conditions and the following disclaimer. ++# ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in the ++# documentation and/or other materials provided with the distribution. ++# ++# * Neither the name of Red Hat nor the names of its contributors may be ++# used to endorse or promote products derived from this software without ++# specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND ++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ++# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A ++# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR ++# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF ++# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT ++# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++# SUCH DAMAGE. ++ ++# Test the file plugin, reading with cache=none. ++# ++# Unlike test-file-cache-none-read-effective.sh, this test doesn't ++# require cachestats. It's more about testing integrity of reading. ++ ++source ./functions.sh ++set -e ++set -x ++ ++requires_plugin file ++requires_run ++requires_nbdcopy ++requires test -f disk ++requires md5sum --version ++ ++out=file-cache-none-read-consistent.out ++cleanup_fn rm -f $out ++ ++export out ++nbdkit file disk cache=none --run 'nbdcopy "$uri" "$out"' ++test "$(md5sum < disk)" = "$(md5sum < $out)" +diff --git a/tests/test-file-cache-none-read.sh b/tests/test-file-cache-none-read-effective.sh +similarity index 93% +rename from tests/test-file-cache-none-read.sh +rename to tests/test-file-cache-none-read-effective.sh +index c9831b43..efead224 100755 +--- a/tests/test-file-cache-none-read.sh ++++ b/tests/test-file-cache-none-read-effective.sh +@@ -30,7 +30,8 @@ + # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + # SUCH DAMAGE. + +-# Test the file plugin, reading with cache=none. ++# Test the file plugin, reading with cache=none, is effective at ++# reducing page cache usage. + + source ./functions.sh + set -e +@@ -52,9 +53,9 @@ requires $SED --version + requires type cachestats + requires type cachedel + +-inp=file-cache-none-read.in +-stats1=file-cache-none-read.s1 +-stats2=file-cache-none-read.s2 ++inp=file-cache-none-read-effective.in ++stats1=file-cache-none-read-effective.s1 ++stats2=file-cache-none-read-effective.s2 + rm -f $inp $stats1 $stats2 + cleanup_fn rm -f $inp $stats1 $stats2 + +diff --git a/tests/test-file-cache-none-write-consistent.sh b/tests/test-file-cache-none-write-consistent.sh +new file mode 100755 +index 00000000..749805f1 +--- /dev/null ++++ b/tests/test-file-cache-none-write-consistent.sh +@@ -0,0 +1,54 @@ ++#!/usr/bin/env bash ++# nbdkit ++# Copyright Red Hat ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions are ++# met: ++# ++# * Redistributions of source code must retain the above copyright ++# notice, this list of conditions and the following disclaimer. ++# ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in the ++# documentation and/or other materials provided with the distribution. ++# ++# * Neither the name of Red Hat nor the names of its contributors may be ++# used to endorse or promote products derived from this software without ++# specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND ++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ++# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A ++# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR ++# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF ++# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT ++# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++# SUCH DAMAGE. ++ ++# Test the file plugin, writing with cache=none. ++# ++# Unlike test-file-cache-none-write-effective.sh, this test doesn't ++# require cachestats. It's more about testing integrity of writing. ++ ++source ./functions.sh ++set -e ++set -x ++ ++requires_plugin file ++requires_run ++requires_nbdcopy ++requires test -f disk ++requires md5sum --version ++requires $TRUNCATE --version ++ ++out=file-cache-none-write-consistent.out ++cleanup_fn rm -f $out ++ ++rm -f $out; $TRUNCATE -r disk $out ++nbdkit file $out cache=none --run 'nbdcopy disk "$uri"' ++test "$(md5sum < disk)" = "$(md5sum < $out)" +diff --git a/tests/test-file-cache-none-write.sh b/tests/test-file-cache-none-write-effective.sh +similarity index 92% +rename from tests/test-file-cache-none-write.sh +rename to tests/test-file-cache-none-write-effective.sh +index 2041a5cd..e0159242 100755 +--- a/tests/test-file-cache-none-write.sh ++++ b/tests/test-file-cache-none-write-effective.sh +@@ -30,7 +30,8 @@ + # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + # SUCH DAMAGE. + +-# Test the file plugin, writing with cache=none. ++# Test the file plugin, writing with cache=none, is effective at ++# reducing page cache usage. + + source ./functions.sh + set -e +@@ -51,10 +52,10 @@ requires $SED --version + # It doesn't support --version or --help, so use 'type' instead. + requires type cachestats + +-inp=file-cache-none-write.in +-out=file-cache-none-write.out +-stats1=file-cache-none-write.s1 +-stats2=file-cache-none-write.s2 ++inp=file-cache-none-write-effective.in ++out=file-cache-none-write-effective.out ++stats1=file-cache-none-write-effective.s1 ++stats2=file-cache-none-write-effective.s2 + rm -f $inp $out $stats1 $stats2 + cleanup_fn rm -f $inp $out $stats1 $stats2 + +-- +2.47.1 + diff --git a/SOURCES/0015-file-Hard-error-if-sync_file_range-fails.patch b/SOURCES/0015-file-Hard-error-if-sync_file_range-fails.patch new file mode 100644 index 0000000..7b993a5 --- /dev/null +++ b/SOURCES/0015-file-Hard-error-if-sync_file_range-fails.patch @@ -0,0 +1,101 @@ +From 2772bc3e81cb0c396bde186a2581d010b8d19e5d Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Tue, 1 Apr 2025 11:07:51 +0100 +Subject: [PATCH] file: Hard error if sync_file_range fails + +After discussion with Dave Chinner, it appears that I/O errors can be +returned by sync_file_range, ie. it is not merely a hint to start the +eviction. Therefore return a hard error if any of the sync_file_range +calls fails. + +Thanks: Dave Chinner +Updates: commit d35f0636373e6a58c8f3fcfcf505af248e27c574 +(cherry picked from commit d2ed6a839c3ed50b417f45dc13e5b811ecdc4e74) +--- + plugins/file/file.c | 53 ++++++++++++++++++++++++--------------------- + 1 file changed, 28 insertions(+), 25 deletions(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 638c960d..f195bcf3 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -116,7 +116,7 @@ struct write_window { + static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER; + static struct write_window window[NR_WINDOWS]; + +-static void ++static int + evict_writes (int fd, uint64_t offset, size_t len) + { + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); +@@ -125,31 +125,32 @@ evict_writes (int fd, uint64_t offset, size_t len) + * (asynchronously). + */ + if (sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE) == -1) { +- /* sync_file_range to start the write failed */ +- nbdkit_debug ("sync_file_range: write: (ignored): %m"); ++ nbdkit_error ("sync_file_range: cache=none: starting eviction: %m"); ++ return -1; + } +- else { +- /* sync_file_range to start the write succeeded, so +- * evict the oldest window from the page cache. +- */ +- if (window[0].len > 0) { +- if (sync_file_range (window[0].fd, window[0].offset, window[0].len, +- SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| +- SYNC_FILE_RANGE_WAIT_AFTER) == -1) +- nbdkit_debug ("sync_file_range: wait: (ignored): %m"); +- if (posix_fadvise (window[0].fd, window[0].offset, window[0].len, +- POSIX_FADV_DONTNEED) == -1) +- nbdkit_debug ("posix_fadvise: POSIX_FADV_DONTNEED: (ignored): %m"); ++ ++ /* Evict the oldest window from the page cache. */ ++ if (window[0].len > 0) { ++ if (sync_file_range (window[0].fd, window[0].offset, window[0].len, ++ SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| ++ SYNC_FILE_RANGE_WAIT_AFTER) == -1) { ++ nbdkit_error ("sync_file_range: cache=none: evicting oldest window: %m"); ++ return -1; + } +- +- /* Move the Nth window to N-1. */ +- memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1)); +- +- /* Add the range to the newest end of the list of windows. */ +- window[NR_WINDOWS-1].fd = fd; +- window[NR_WINDOWS-1].offset = offset; +- window[NR_WINDOWS-1].len = len; ++ if (posix_fadvise (window[0].fd, window[0].offset, window[0].len, ++ POSIX_FADV_DONTNEED) == -1) ++ nbdkit_debug ("posix_fadvise: POSIX_FADV_DONTNEED: (ignored): %m"); + } ++ ++ /* Move the Nth window to N-1. */ ++ memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1)); ++ ++ /* Add the range to the newest end of the list of windows. */ ++ window[NR_WINDOWS-1].fd = fd; ++ window[NR_WINDOWS-1].offset = offset; ++ window[NR_WINDOWS-1].len = len; ++ ++ return 0; + } + + /* When we close the handle we must remove any windows which are still +@@ -855,8 +856,10 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, + return -1; + + #if EVICT_WRITES +- if (cache_mode == cache_none) +- evict_writes (h->fd, orig_offset, orig_count); ++ if (cache_mode == cache_none) { ++ if (evict_writes (h->fd, orig_offset, orig_count) == -1) ++ return -1; ++ } + #endif + + return 0; +-- +2.47.1 + diff --git a/SOURCES/0016-file-Reduce-the-size-of-the-lock-around-write-evicti.patch b/SOURCES/0016-file-Reduce-the-size-of-the-lock-around-write-evicti.patch new file mode 100644 index 0000000..09abced --- /dev/null +++ b/SOURCES/0016-file-Reduce-the-size-of-the-lock-around-write-evicti.patch @@ -0,0 +1,90 @@ +From 056a975578b42e8cbfd783b8b919c4ef02e40019 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Tue, 1 Apr 2025 11:56:22 +0100 +Subject: [PATCH] file: Reduce the size of the lock around write eviction + +Previously we held window_lock during the synchronous eviction of the +oldest window. This potentially serializes all writes in the slow +write case. We don't need to hold the lock here. + +(cherry picked from commit d3d2bc45bb59a30669a3d926435cf57e99feb3a2) +--- + plugins/file/file.c | 52 +++++++++++++++++++++++++++------------------ + 1 file changed, 31 insertions(+), 21 deletions(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index f195bcf3..e68c24ee 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -119,37 +119,47 @@ static struct write_window window[NR_WINDOWS]; + static int + evict_writes (int fd, uint64_t offset, size_t len) + { +- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); +- +- /* Tell Linux to start writing the current range out to disk +- * (asynchronously). +- */ +- if (sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE) == -1) { +- nbdkit_error ("sync_file_range: cache=none: starting eviction: %m"); +- return -1; ++ struct write_window oldest = { 0 }; ++ ++ { ++ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); ++ ++ /* Save oldest window[0] for eviction below, and move all windows ++ * down one. Set the newest slot to empty. ++ */ ++ oldest = window[0]; ++ memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1)); ++ window[NR_WINDOWS-1].len = 0; ++ ++ /* Tell Linux to start writing the current range out to disk ++ * (asynchronously). ++ */ ++ if (sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE) == -1) { ++ nbdkit_error ("sync_file_range: cache=none: starting eviction: %m"); ++ return -1; ++ } ++ ++ /* Add the range to the newest end of the list of windows. */ ++ window[NR_WINDOWS-1].fd = fd; ++ window[NR_WINDOWS-1].offset = offset; ++ window[NR_WINDOWS-1].len = len; + } ++ /* Release lock here. */ + +- /* Evict the oldest window from the page cache. */ +- if (window[0].len > 0) { +- if (sync_file_range (window[0].fd, window[0].offset, window[0].len, +- SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| ++ /* Evict the oldest window from the page cache (synchronously). */ ++ if (oldest.len > 0) { ++ if (sync_file_range (oldest.fd, oldest.offset, oldest.len, ++ SYNC_FILE_RANGE_WAIT_BEFORE | ++ SYNC_FILE_RANGE_WRITE | + SYNC_FILE_RANGE_WAIT_AFTER) == -1) { + nbdkit_error ("sync_file_range: cache=none: evicting oldest window: %m"); + return -1; + } +- if (posix_fadvise (window[0].fd, window[0].offset, window[0].len, ++ if (posix_fadvise (oldest.fd, oldest.offset, oldest.len, + POSIX_FADV_DONTNEED) == -1) + nbdkit_debug ("posix_fadvise: POSIX_FADV_DONTNEED: (ignored): %m"); + } + +- /* Move the Nth window to N-1. */ +- memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1)); +- +- /* Add the range to the newest end of the list of windows. */ +- window[NR_WINDOWS-1].fd = fd; +- window[NR_WINDOWS-1].offset = offset; +- window[NR_WINDOWS-1].len = len; +- + return 0; + } + +-- +2.47.1 + diff --git a/SOURCES/0017-file-Document-implicit-assumption-about-eviction-win.patch b/SOURCES/0017-file-Document-implicit-assumption-about-eviction-win.patch new file mode 100644 index 0000000..fcb1bff --- /dev/null +++ b/SOURCES/0017-file-Document-implicit-assumption-about-eviction-win.patch @@ -0,0 +1,29 @@ +From 5f5f74fa0afae78365eb431a589a6fa4938b9287 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Tue, 1 Apr 2025 16:51:24 +0100 +Subject: [PATCH] file: Document implicit assumption about eviction windows + +Add a comment that slots in the window list are only valid when len > 0. +This was true before but only implicit. + +(cherry picked from commit 2022ac667f3c1de81b692f0128dd83a7c4999e38) +--- + plugins/file/file.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index e68c24ee..6bcc5537 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -110,7 +110,7 @@ static enum { cache_default, cache_none } cache_mode = cache_default; + struct write_window { + int fd; + uint64_t offset; +- size_t len; ++ size_t len; /* window slot only valid if len > 0 */ + }; + + static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER; +-- +2.47.1 + diff --git a/SOURCES/0018-server-Turn-flush-into-a-controlpath-message.patch b/SOURCES/0018-server-Turn-flush-into-a-controlpath-message.patch new file mode 100644 index 0000000..dea7aaf --- /dev/null +++ b/SOURCES/0018-server-Turn-flush-into-a-controlpath-message.patch @@ -0,0 +1,33 @@ +From 23a2ec7a574dda51a5b4bd2ddef3dcc2b2c8b8f2 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Sat, 5 Apr 2025 09:02:03 +0100 +Subject: [PATCH] server: Turn flush into a controlpath message + +The server debug flags -D nbdkit.backend.controlpath= and +-D nbdkit.backend.datapath= control the verbosity of messages. + +'flush' was categorized as a datapath message, but it's more arguably +a controlpath message, and anyway it is rare and useful to see in +virt-v2v output even when datapath messages are suppressed. + +(cherry picked from commit 079c8a91bf5161614916470dcb1f52bee8bfb397) +--- + server/backend.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/server/backend.c b/server/backend.c +index 6232b69d..1f1bcfce 100644 +--- a/server/backend.c ++++ b/server/backend.c +@@ -693,7 +693,7 @@ backend_flush (struct context *c, + assert (c->handle && (c->state & HANDLE_CONNECTED)); + assert (c->can_flush == 1); + assert (flags == 0); +- datapath_debug ("%s: flush", b->name); ++ controlpath_debug ("%s: flush", b->name); + + r = b->flush (c, flags, err); + if (r == -1) +-- +2.47.1 + diff --git a/SOURCES/0019-file-Fix-minor-typo-in-debug-message.patch b/SOURCES/0019-file-Fix-minor-typo-in-debug-message.patch new file mode 100644 index 0000000..a48e4e2 --- /dev/null +++ b/SOURCES/0019-file-Fix-minor-typo-in-debug-message.patch @@ -0,0 +1,26 @@ +From a79bf57c8ec805516e8dbe7995aa2bd46b83ade3 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Thu, 1 May 2025 10:03:06 +0100 +Subject: [PATCH] file: Fix minor typo in debug message + +(cherry picked from commit a75db5636b94c9184f8eb02fd51182d935df64a6) +--- + plugins/file/file.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 6bcc5537..71b349ac 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -924,7 +924,7 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count); + if (r == 0) { + if (file_debug_zero) +- nbdkit_debug ("h->can_zero-range: " ++ nbdkit_debug ("h->can_zero_range: " + "zero succeeded using fallocate"); + goto out; + } +-- +2.47.1 + diff --git a/SOURCES/0020-file-Add-more-debugging-when-D-file.zero-1-is-used.patch b/SOURCES/0020-file-Add-more-debugging-when-D-file.zero-1-is-used.patch new file mode 100644 index 0000000..ae4945f --- /dev/null +++ b/SOURCES/0020-file-Add-more-debugging-when-D-file.zero-1-is-used.patch @@ -0,0 +1,36 @@ +From 1cb341e75c1a17553b69ea8d9889662e6d09ae78 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Thu, 1 May 2025 10:21:23 +0100 +Subject: [PATCH] file: Add more debugging when -D file.zero=1 is used + +(cherry picked from commit ecf6b15fa84a02b74ea969f06552c82ee418b9b4) +--- + plugins/file/file.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 71b349ac..378f6988 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -879,7 +879,17 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, + static int + do_fallocate (int fd, int mode_, off_t offset, off_t len) + { +- int r = fallocate (fd, mode_, offset, len); ++ int r; ++ ++ r = fallocate (fd, mode_, offset, len); ++ ++ if (file_debug_zero) ++ nbdkit_debug ("fallocate ([%s%s ], %" PRIu64 ", %" PRIu64") => %d (%d)", ++ mode_ & FALLOC_FL_PUNCH_HOLE ? " FALLOC_FL_PUNCH_HOLE" : "", ++ mode_ & FALLOC_FL_ZERO_RANGE ? " FALLOC_FL_ZERO_RANGE" : "", ++ (uint64_t) offset, (uint64_t) len, r, ++ r == -1 ? errno : 0); ++ + if (r == -1 && errno == ENODEV) { + /* kernel 3.10 fails with ENODEV for block device. Kernel >= 4.9 fails + with EOPNOTSUPP in this case. Normalize errno to simplify callers. */ +-- +2.47.1 + diff --git a/SOURCES/0021-file-Fix-comment-style-in-a-few-places.patch b/SOURCES/0021-file-Fix-comment-style-in-a-few-places.patch new file mode 100644 index 0000000..693ebfc --- /dev/null +++ b/SOURCES/0021-file-Fix-comment-style-in-a-few-places.patch @@ -0,0 +1,43 @@ +From 664e447d858a21304610db3023cc728db0c974bd Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Thu, 1 May 2025 10:32:17 +0100 +Subject: [PATCH] file: Fix comment style in a few places + +No actual change here. + +(cherry picked from commit 0df4142c4be2b059c4d17aae0ec71f16ffc9ba35) +--- + plugins/file/file.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 378f6988..01ad1ef2 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -892,7 +892,8 @@ do_fallocate (int fd, int mode_, off_t offset, off_t len) + + if (r == -1 && errno == ENODEV) { + /* kernel 3.10 fails with ENODEV for block device. Kernel >= 4.9 fails +- with EOPNOTSUPP in this case. Normalize errno to simplify callers. */ ++ * with EOPNOTSUPP in this case. Normalize errno to simplify callers. ++ */ + errno = EOPNOTSUPP; + } + return r; +@@ -949,9 +950,10 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + #endif + + #ifdef FALLOC_FL_PUNCH_HOLE +- /* If we can punch hole but may not trim, we can combine punching hole and +- * fallocate to zero a range. This is expected to be more efficient than +- * writing zeroes manually. */ ++ /* If we can punch hole but may not trim, we can combine punching ++ * hole and fallocate to zero a range. This is expected to be more ++ * efficient than writing zeroes manually. ++ */ + if (h->can_punch_hole && h->can_fallocate) { + int r; + +-- +2.47.1 + diff --git a/SOURCES/0022-file-Fix-do_fallocate-debugging-on-Alpine.patch b/SOURCES/0022-file-Fix-do_fallocate-debugging-on-Alpine.patch new file mode 100644 index 0000000..93b0951 --- /dev/null +++ b/SOURCES/0022-file-Fix-do_fallocate-debugging-on-Alpine.patch @@ -0,0 +1,60 @@ +From 4c02ff62f40497335da185cc4b45c2ba43fb609b Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Thu, 1 May 2025 10:59:17 +0100 +Subject: [PATCH] file: Fix do_fallocate debugging on Alpine + +Alpine has some weird/old kernel that doesn't support +FALLOC_FL_ZERO_RANGE but does support FALLOC_FL_PUNCH_HOLE, so the +debugging I added in commit ecf6b15fa8 failed to compile with: + + file.c: In function 'do_fallocate': + file.c:958:27: error: 'FALLOC_FL_ZERO_RANGE' undeclared (first use in this function) + 958 | mode_ & FALLOC_FL_ZERO_RANGE ? " FALLOC_FL_ZERO_RANGE" : "", + | ^~~~~~~~~~~~~~~~~~~~ + file.c:958:27: note: each undeclared identifier is reported only once for each function it appears in + make[3]: *** [Makefile:666: nbdkit_file_plugin_la-file.lo] Error 1 + +Fixes: commit ecf6b15fa84a02b74ea969f06552c82ee418b9b4 +(cherry picked from commit 419a347054f81c53706637feddbc5008beab77d3) +--- + plugins/file/file.c | 15 +++++++++++++-- + 1 file changed, 13 insertions(+), 2 deletions(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 01ad1ef2..32c5e2b7 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -875,7 +875,7 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, + return 0; + } + +-#if defined (FALLOC_FL_PUNCH_HOLE) || defined (FALLOC_FL_ZERO_RANGE) ++#if defined(FALLOC_FL_PUNCH_HOLE) || defined(FALLOC_FL_ZERO_RANGE) + static int + do_fallocate (int fd, int mode_, off_t offset, off_t len) + { +@@ -884,9 +884,20 @@ do_fallocate (int fd, int mode_, off_t offset, off_t len) + r = fallocate (fd, mode_, offset, len); + + if (file_debug_zero) +- nbdkit_debug ("fallocate ([%s%s ], %" PRIu64 ", %" PRIu64") => %d (%d)", ++ nbdkit_debug ("fallocate ([" ++#if defined(FALLOC_FL_PUNCH_HOLE) ++ "%s" ++#endif ++#if defined(FALLOC_FL_ZERO_RANGE) ++ "%s" ++#endif ++ " ], %" PRIu64 ", %" PRIu64") => %d (%d)", ++#if defined(FALLOC_FL_PUNCH_HOLE) + mode_ & FALLOC_FL_PUNCH_HOLE ? " FALLOC_FL_PUNCH_HOLE" : "", ++#endif ++#if defined(FALLOC_FL_ZERO_RANGE) + mode_ & FALLOC_FL_ZERO_RANGE ? " FALLOC_FL_ZERO_RANGE" : "", ++#endif + (uint64_t) offset, (uint64_t) len, r, + r == -1 ? errno : 0); + +-- +2.47.1 + diff --git a/SOURCES/0023-file-Rename-h-can_zeroout-to-h-can_blkzeroout-to-ref.patch b/SOURCES/0023-file-Rename-h-can_zeroout-to-h-can_blkzeroout-to-ref.patch new file mode 100644 index 0000000..41f4f57 --- /dev/null +++ b/SOURCES/0023-file-Rename-h-can_zeroout-to-h-can_blkzeroout-to-ref.patch @@ -0,0 +1,65 @@ +From bc4598f3d2d1ef2f4ebdf5b365ed08eff14d5654 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Thu, 1 May 2025 10:26:41 +0100 +Subject: [PATCH] file: Rename h->can_zeroout to h->can_blkzeroout to reflect + ioctl + +Since we're calling the blockdev-specific BLKZEROOUT ioctl when this +flag is set, rename the flag. + +(cherry picked from commit fba20ce06c2f0e7c4be7e52e8e1934933851dfbc) +--- + plugins/file/file.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 32c5e2b7..70805bd7 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -497,7 +497,7 @@ struct handle { + bool can_punch_hole; + bool can_zero_range; + bool can_fallocate; +- bool can_zeroout; ++ bool can_blkzeroout; + }; + + /* Common code for opening a file by name, used by mode_filename and +@@ -703,7 +703,7 @@ file_open (int readonly) + #endif + + h->can_fallocate = true; +- h->can_zeroout = h->is_block_device; ++ h->can_blkzeroout = h->is_block_device; + + return h; + } +@@ -998,14 +998,14 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + + #ifdef BLKZEROOUT + /* For aligned range and block device, we can use BLKZEROOUT. */ +- if (h->can_zeroout && IS_ALIGNED (offset | count, h->sector_size)) { ++ if (h->can_blkzeroout && IS_ALIGNED (offset | count, h->sector_size)) { + int r; + uint64_t range[2] = {offset, count}; + + r = ioctl (h->fd, BLKZEROOUT, &range); + if (r == 0) { + if (file_debug_zero) +- nbdkit_debug ("h->can_zeroout && IS_ALIGNED: " ++ nbdkit_debug ("h->can_blkzeroout && IS_ALIGNED: " + "zero succeeded using BLKZEROOUT"); + goto out; + } +@@ -1015,7 +1015,7 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + return -1; + } + +- h->can_zeroout = false; ++ h->can_blkzeroout = false; + } + #endif + +-- +2.47.1 + diff --git a/SOURCES/0024-file-zero-Document-implicit-order-that-we-will-try-z.patch b/SOURCES/0024-file-zero-Document-implicit-order-that-we-will-try-z.patch new file mode 100644 index 0000000..c0cda6f --- /dev/null +++ b/SOURCES/0024-file-zero-Document-implicit-order-that-we-will-try-z.patch @@ -0,0 +1,38 @@ +From c1984ddcc6497c4446d1bf0e8828d1259852eb74 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Thu, 1 May 2025 10:30:41 +0100 +Subject: [PATCH] file: zero: Document implicit order that we will try zeroing + methods + +There's no substantive change here. I just pulled out the test (flags +& NBDKIT_FLAG_MAY_TRIM) into a boolean variable, and documented that +we (will) try zero-with-trim methods first. + +(cherry picked from commit 61fc023f235b17f8a19302885d1613dd0a7a3793) +--- + plugins/file/file.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 70805bd7..3b82e02d 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -916,9 +916,14 @@ static int + file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + { + struct handle *h __attribute__ ((unused)) = handle; ++ const bool may_trim __attribute__ ((unused)) = flags & NBDKIT_FLAG_MAY_TRIM; + ++ /* These alternate zeroing methods are ordered. Methods which can ++ * trim (if may_trim is set) are tried first. Methods which can ++ * only zero are tried last. ++ */ + #ifdef FALLOC_FL_PUNCH_HOLE +- if (h->can_punch_hole && (flags & NBDKIT_FLAG_MAY_TRIM)) { ++ if (may_trim && h->can_punch_hole) { + int r; + + r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, +-- +2.47.1 + diff --git a/SOURCES/0025-file-zero-Use-BLKDISCARD-method-if-may_trim-is-set.patch b/SOURCES/0025-file-zero-Use-BLKDISCARD-method-if-may_trim-is-set.patch new file mode 100644 index 0000000..fdb39b3 --- /dev/null +++ b/SOURCES/0025-file-zero-Use-BLKDISCARD-method-if-may_trim-is-set.patch @@ -0,0 +1,122 @@ +From 396e8a97835155a620cabbcf1aabaaa1fa4a08f1 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Thu, 1 May 2025 10:36:23 +0100 +Subject: [PATCH] file: zero: Use BLKDISCARD method if may_trim is set + +If we're allowed to trim and we're writing to a block device, +previously we hit the case fallocate(FALLOC_FL_ZERO_RANGE) first. +This succeeds in Linux, zeroing (not trimming) the range. + +However it would be better to trim in this case. Linux supports +ioctl(BLKDISCARD) on block devices, so try this method first. + +Fixes: https://issues.redhat.com/browse/RHEL-89353 +Reported-by: Germano Veit Michel +Thanks: Eric Blake +(cherry picked from commit 7a9ecda24906c64d9f8c7238a96cb3f686e894eb) +--- + plugins/file/file.c | 50 +++++++++++++++++++++++++++++ + plugins/file/nbdkit-file-plugin.pod | 5 +++ + 2 files changed, 55 insertions(+) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 3b82e02d..b4dec3c5 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -397,6 +397,9 @@ file_dump_plugin (void) + #ifdef BLKSSZGET + printf ("file_blksszget=yes\n"); + #endif ++#ifdef BLKDISCARD ++ printf ("file_blkdiscard=yes\n"); ++#endif + #ifdef BLKZEROOUT + printf ("file_blkzeroout=yes\n"); + #endif +@@ -497,6 +500,7 @@ struct handle { + bool can_punch_hole; + bool can_zero_range; + bool can_fallocate; ++ bool can_blkdiscard; + bool can_blkzeroout; + }; + +@@ -704,6 +708,7 @@ file_open (int readonly) + + h->can_fallocate = true; + h->can_blkzeroout = h->is_block_device; ++ h->can_blkdiscard = h->is_block_device; + + return h; + } +@@ -944,6 +949,51 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + #endif + ++#if defined(BLKDISCARD) && defined(FALLOC_FL_ZERO_RANGE) ++ /* For aligned range and block device, we can use BLKDISCARD to ++ * trim. However BLKDISCARD doesn't necessarily zero (eg for local ++ * disk) so we have to zero first and then discard. ++ * ++ * In future all Linux block devices may understand ++ * FALLOC_FL_PUNCH_HOLE which means this case would no longer be ++ * necessary, since the case above will handle it. ++ */ ++ if (may_trim && h->can_blkdiscard && h->can_zero_range && ++ IS_ALIGNED (offset | count, h->sector_size)) { ++ int r; ++ uint64_t range[2] = {offset, count}; ++ ++ r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count); ++ if (r == 0) { ++ /* We could use FALLOC_FL_PUNCH_HOLE here instead, but currently ++ * thin LVs do not support it (XXX 2025-04). ++ */ ++ r = ioctl (h->fd, BLKDISCARD, &range); ++ if (r == 0) { ++ if (file_debug_zero) ++ nbdkit_debug ("h->can_blkdiscard && may_trim && IS_ALIGNED: " ++ "zero succeeded using BLKDISCARD"); ++ goto out; ++ } ++ ++ if (!is_enotsup (errno)) { ++ nbdkit_error ("zero: %m"); ++ return -1; ++ } ++ ++ h->can_blkdiscard = false; ++ } ++ else { ++ if (!is_enotsup (errno)) { ++ nbdkit_error ("zero: %m"); ++ return -1; ++ } ++ ++ h->can_fallocate = false; ++ } ++ } ++#endif ++ + #ifdef FALLOC_FL_ZERO_RANGE + if (h->can_zero_range) { + int r; +diff --git a/plugins/file/nbdkit-file-plugin.pod b/plugins/file/nbdkit-file-plugin.pod +index a50bef2d..0e260b7f 100644 +--- a/plugins/file/nbdkit-file-plugin.pod ++++ b/plugins/file/nbdkit-file-plugin.pod +@@ -227,6 +227,11 @@ future. + If both set, the plugin may be able to efficiently zero ranges of + block devices, where the driver and block device itself supports this. + ++=item C ++ ++If set, the plugin may be able to efficiently trim ranges of block ++devices, where the driver and block device itself supports this. ++ + =item C + + If set, the plugin can read file extents. +-- +2.47.1 + diff --git a/SOURCES/0026-vddk-Debug-length-of-extents-when-using-D-vddk.exten.patch b/SOURCES/0026-vddk-Debug-length-of-extents-when-using-D-vddk.exten.patch new file mode 100644 index 0000000..bd4abdf --- /dev/null +++ b/SOURCES/0026-vddk-Debug-length-of-extents-when-using-D-vddk.exten.patch @@ -0,0 +1,30 @@ +From 2b4f411ebf4fca9c084e8fc74ed2c53debfb3614 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Thu, 29 May 2025 09:15:33 +0000 +Subject: [PATCH] vddk: Debug length of extents when using -D vddk.extents=1 + +(cherry picked from commit a53746d326e08fae9ec1ea782df740abb48d0114) +--- + plugins/vddk/worker.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/plugins/vddk/worker.c b/plugins/vddk/worker.c +index 8a91250a..c61c4d16 100644 +--- a/plugins/vddk/worker.c ++++ b/plugins/vddk/worker.c +@@ -375,9 +375,10 @@ add_extent (struct nbdkit_extents *extents, + return 0; + + if (vddk_debug_extents) +- nbdkit_debug ("adding extent type %s at [%" PRIu64 "...%" PRIu64 "]", ++ nbdkit_debug ("adding extent type %s at [%" PRIu64 "...%" PRIu64 "] " ++ "(length %" PRIu64 ")", + is_hole ? "hole" : "allocated data", +- *position, next_position-1); ++ *position, next_position-1, length); + if (nbdkit_add_extent (extents, *position, length, type) == -1) + return -1; + +-- +2.47.1 + diff --git a/SOURCES/0027-cacheextents-Mark-this-filter-as-deprecated.patch b/SOURCES/0027-cacheextents-Mark-this-filter-as-deprecated.patch new file mode 100644 index 0000000..195bb53 --- /dev/null +++ b/SOURCES/0027-cacheextents-Mark-this-filter-as-deprecated.patch @@ -0,0 +1,38 @@ +From 70df77c1abd92fccf0c5594f613f3e375c71c2b8 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Sat, 31 May 2025 08:12:32 +0100 +Subject: [PATCH] cacheextents: Mark this filter as deprecated + +We will remove it in nbdkit 1.46. + +Its usage was removed from virt-v2v because we found that it did not +do anything: +https://github.com/libguestfs/virt-v2v/commit/48c4ce8e6cf6f1c390a48245ef0f99233f80cfe8 + +(cherry picked from commit 9717c47999fb2f56c3569cf1cdd4d0c160f866c1) +--- + filters/cacheextents/nbdkit-cacheextents-filter.pod | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/filters/cacheextents/nbdkit-cacheextents-filter.pod b/filters/cacheextents/nbdkit-cacheextents-filter.pod +index a2b2aa51..ebf0fbc5 100644 +--- a/filters/cacheextents/nbdkit-cacheextents-filter.pod ++++ b/filters/cacheextents/nbdkit-cacheextents-filter.pod +@@ -6,6 +6,14 @@ nbdkit-cacheextents-filter - cache extents + + nbdkit --filter=cacheextents plugin + ++=head1 DEPRECATED ++ ++B 1.43.10> and ++will be removed in S>. There is no direct replacement, ++but as the filter only worked for a narrow and unusual range of access ++patterns it is likely that it has no effect and you can just stop ++using it. ++ + =head1 DESCRIPTION + + C is a filter that caches the result of last +-- +2.47.1 + diff --git a/SOURCES/0028-include-Move-some-extent-functions-to-nbdkit-common..patch b/SOURCES/0028-include-Move-some-extent-functions-to-nbdkit-common..patch new file mode 100644 index 0000000..bc7c095 --- /dev/null +++ b/SOURCES/0028-include-Move-some-extent-functions-to-nbdkit-common..patch @@ -0,0 +1,77 @@ +From ea41c6517d0093e60b3acbc7a310758ba75211fc Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Sat, 31 May 2025 08:04:41 +0100 +Subject: [PATCH] include: Move some extent functions to nbdkit-common.h + +No existing plugins use the extent functions nbdkit_extents_new, +nbdkit_extents_free, etc, since they are mainly useful for filters so +they can build and manipulate new lists of extents. Nevertheless +there is nothing that prevents them from being used in plugins, so +move those functions to the common header (so they appear for users of +) + +There are a couple of helper functions which are really +filter-specific so leave those in nbdkit-filter.h. + +(cherry picked from commit 03792d04f270f2cffc589dd9703c9de9c3d5a65e) +--- + include/nbdkit-common.h | 15 +++++++++++++++ + include/nbdkit-filter.h | 15 +-------------- + 2 files changed, 16 insertions(+), 14 deletions(-) + +diff --git a/include/nbdkit-common.h b/include/nbdkit-common.h +index d0a5c854..9d7409e6 100644 +--- a/include/nbdkit-common.h ++++ b/include/nbdkit-common.h +@@ -152,7 +152,22 @@ NBDKIT_EXTERN_DECL (const char *, nbdkit_vprintf_intern, + (const char *msg, va_list args) + ATTRIBUTE_FORMAT_PRINTF (1, 0)); + ++/* Extent functions. */ ++struct nbdkit_extent { ++ uint64_t offset; ++ uint64_t length; ++ uint32_t type; ++}; ++ + struct nbdkit_extents; ++ ++NBDKIT_EXTERN_DECL (struct nbdkit_extents *, nbdkit_extents_new, ++ (uint64_t start, uint64_t end)); ++NBDKIT_EXTERN_DECL (void, nbdkit_extents_free, (struct nbdkit_extents *)); ++NBDKIT_EXTERN_DECL (size_t, nbdkit_extents_count, ++ (const struct nbdkit_extents *)); ++NBDKIT_EXTERN_DECL (struct nbdkit_extent, nbdkit_get_extent, ++ (const struct nbdkit_extents *, size_t)); + NBDKIT_EXTERN_DECL (int, nbdkit_add_extent, + (struct nbdkit_extents *, + uint64_t offset, uint64_t length, uint32_t type)); +diff --git a/include/nbdkit-filter.h b/include/nbdkit-filter.h +index a4ac09d4..31520bf5 100644 +--- a/include/nbdkit-filter.h ++++ b/include/nbdkit-filter.h +@@ -121,20 +121,7 @@ struct nbdkit_next_ops { + */ + }; + +-/* Extent functions. */ +-struct nbdkit_extent { +- uint64_t offset; +- uint64_t length; +- uint32_t type; +-}; +- +-NBDKIT_EXTERN_DECL (struct nbdkit_extents *, nbdkit_extents_new, +- (uint64_t start, uint64_t end)); +-NBDKIT_EXTERN_DECL (void, nbdkit_extents_free, (struct nbdkit_extents *)); +-NBDKIT_EXTERN_DECL (size_t, nbdkit_extents_count, +- (const struct nbdkit_extents *)); +-NBDKIT_EXTERN_DECL (struct nbdkit_extent, nbdkit_get_extent, +- (const struct nbdkit_extents *, size_t)); ++/* Extent helpers. */ + NBDKIT_EXTERN_DECL (struct nbdkit_extents *, nbdkit_extents_full, + (nbdkit_next *next, + uint32_t count, uint64_t offset, +-- +2.47.1 + diff --git a/SOURCES/0029-vddk-Display-command-type-in-command-completed-messa.patch b/SOURCES/0029-vddk-Display-command-type-in-command-completed-messa.patch new file mode 100644 index 0000000..4845e99 --- /dev/null +++ b/SOURCES/0029-vddk-Display-command-type-in-command-completed-messa.patch @@ -0,0 +1,29 @@ +From dde9c60307ca0cefcc8109dc1ef71dee8144d931 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Fri, 30 May 2025 13:29:28 +0100 +Subject: [PATCH] vddk: Display command type in command completed message + +Useful extra debugging. + +(cherry picked from commit 81d4d74fecf3c071e144a8ba016f43ba1de1b093) +--- + plugins/vddk/worker.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/plugins/vddk/worker.c b/plugins/vddk/worker.c +index c61c4d16..3bf1d5c2 100644 +--- a/plugins/vddk/worker.c ++++ b/plugins/vddk/worker.c +@@ -116,7 +116,8 @@ complete_command (void *vp, VixError result) + struct command *cmd = vp; + + if (vddk_debug_datapath) +- nbdkit_debug ("command %" PRIu64 " completed", cmd->id); ++ nbdkit_debug ("command %" PRIu64 " (%s) completed", ++ cmd->id, command_type_string (cmd->type)); + + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&cmd->mutex); + +-- +2.47.1 + diff --git a/SOURCES/0030-vddk-Cache-the-readonly-flag-from-the-.open-call-in-.patch b/SOURCES/0030-vddk-Cache-the-readonly-flag-from-the-.open-call-in-.patch new file mode 100644 index 0000000..4224993 --- /dev/null +++ b/SOURCES/0030-vddk-Cache-the-readonly-flag-from-the-.open-call-in-.patch @@ -0,0 +1,41 @@ +From 6bfb320ff03dd89d1b9b584516b7e20830d2cdb5 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Fri, 30 May 2025 13:32:00 +0100 +Subject: [PATCH] vddk: Cache the readonly flag from the .open call in the + handle + +(cherry picked from commit 0d953ea644f44259edb19c97e3c7863794c0ca1c) +--- + plugins/vddk/vddk.c | 1 + + plugins/vddk/vddk.h | 3 +++ + 2 files changed, 4 insertions(+) + +diff --git a/plugins/vddk/vddk.c b/plugins/vddk/vddk.c +index 2a787453..468971f4 100644 +--- a/plugins/vddk/vddk.c ++++ b/plugins/vddk/vddk.c +@@ -699,6 +699,7 @@ vddk_open (int readonly) + nbdkit_error ("calloc: %m"); + return NULL; + } ++ h->readonly = readonly; + h->commands = (command_queue) empty_vector; + pthread_mutex_init (&h->commands_lock, NULL); + pthread_cond_init (&h->commands_cond, NULL); +diff --git a/plugins/vddk/vddk.h b/plugins/vddk/vddk.h +index 1d1069cc..3586c5da 100644 +--- a/plugins/vddk/vddk.h ++++ b/plugins/vddk/vddk.h +@@ -166,6 +166,9 @@ struct vddk_handle { + pthread_cond_t commands_cond; /* condition (queue size 0 -> 1) */ + uint64_t id; /* next command ID */ + ++ /* Cached readonly flag from the open call. */ ++ int readonly; ++ + /* Cached disk size in bytes (set in get_size()). */ + uint64_t size; + }; +-- +2.47.1 + diff --git a/SOURCES/0031-vddk-Move-minimum-version-of-VDDK-to-6.7.patch b/SOURCES/0031-vddk-Move-minimum-version-of-VDDK-to-6.7.patch new file mode 100644 index 0000000..1e98e90 --- /dev/null +++ b/SOURCES/0031-vddk-Move-minimum-version-of-VDDK-to-6.7.patch @@ -0,0 +1,250 @@ +From ddd507afe350e5c4470c49e4e714914032c535c8 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Fri, 30 May 2025 13:49:10 +0100 +Subject: [PATCH] vddk: Move minimum version of VDDK to 6.7 + +We can now assume that QueryAllocatedBlocks exists, as well as a +couple of other functions, simplifying the code. + +This was released in 2018 (7 years ago) so there's been plenty of time +to upgrade. + +(cherry picked from commit 7c90116664b1b1a3c3756d6a79d6d483bc6062dd) +--- + plugins/vddk/nbdkit-vddk-plugin.pod | 8 ++--- + plugins/vddk/vddk-stubs.h | 32 +++++++++++--------- + plugins/vddk/vddk.c | 43 ++++++--------------------- + plugins/vddk/worker.c | 14 ++------- + tests/dummy-vddk.c | 45 +++++++++++++++++++++++++++-- + 5 files changed, 76 insertions(+), 66 deletions(-) + +diff --git a/plugins/vddk/nbdkit-vddk-plugin.pod b/plugins/vddk/nbdkit-vddk-plugin.pod +index 19f25423..e937942c 100644 +--- a/plugins/vddk/nbdkit-vddk-plugin.pod ++++ b/plugins/vddk/nbdkit-vddk-plugin.pod +@@ -396,15 +396,13 @@ is I recommended. + + =over 4 + +-=item VDDK E 6.5 (released Nov 2016) ++=item VDDK E 6.7 (released Apr 2018) + + This is the minimum version of VDDK supported. Older versions will + not work. + +-=item VDDK 6.7 (released Apr 2018) +- +-This is the first version that supported the +-C API. This is required to provide ++This is also the first version that supported the ++C API. This is used to provide + sparseness (extent) information over NBD. + + =item VDDK 8.0.2.1 (released Feb 2024) +diff --git a/plugins/vddk/vddk-stubs.h b/plugins/vddk/vddk-stubs.h +index 8620d11a..658c11f1 100644 +--- a/plugins/vddk/vddk-stubs.h ++++ b/plugins/vddk/vddk-stubs.h +@@ -129,16 +129,22 @@ STUB (VixDiskLib_Wait, + VixError, + (VixDiskLibHandle handle)); + +-/* Added in VDDK 6.7, these will be NULL for earlier versions: */ +-OPTIONAL_STUB (VixDiskLib_QueryAllocatedBlocks, +- VixError, +- (VixDiskLibHandle diskHandle, +- uint64_t start_sector, uint64_t nr_sectors, +- uint64_t chunk_size, +- VixDiskLibBlockList **block_list)); +-OPTIONAL_STUB (VixDiskLib_FreeBlockList, +- VixError, +- (VixDiskLibBlockList *block_list)); +-OPTIONAL_STUB (VixDiskLib_AllocateConnectParams, +- VixDiskLibConnectParams *, +- (void)); ++/* Added in VDDK 6.7. */ ++STUB (VixDiskLib_QueryAllocatedBlocks, ++ VixError, ++ (VixDiskLibHandle diskHandle, ++ uint64_t start_sector, uint64_t nr_sectors, ++ uint64_t chunk_size, ++ VixDiskLibBlockList **block_list)); ++STUB (VixDiskLib_FreeBlockList, ++ VixError, ++ (VixDiskLibBlockList *block_list)); ++STUB (VixDiskLib_AllocateConnectParams, ++ VixDiskLibConnectParams *, ++ (void)); ++ ++/* OPTIONAL_STUB can be used to add new APIs which are not supported ++ * by older versions of VDDK, and to make them NULL if not present. ++ * However VDDK >= 6.7 has everything we need for now so we are no ++ * longer using this macro. ++ */ +diff --git a/plugins/vddk/vddk.c b/plugins/vddk/vddk.c +index 468971f4..9d49203c 100644 +--- a/plugins/vddk/vddk.c ++++ b/plugins/vddk/vddk.c +@@ -652,37 +652,6 @@ vddk_dump_plugin (void) + /* Lock protecting open/close calls - see above. */ + static pthread_mutex_t open_close_lock = PTHREAD_MUTEX_INITIALIZER; + +-static inline VixDiskLibConnectParams * +-allocate_connect_params (void) +-{ +- VixDiskLibConnectParams *ret; +- +- if (VixDiskLib_AllocateConnectParams != NULL) { +- VDDK_CALL_START (VixDiskLib_AllocateConnectParams, "") +- ret = VixDiskLib_AllocateConnectParams (); +- VDDK_CALL_END (VixDiskLib_AllocateConnectParams, 0); +- } +- else +- ret = calloc (1, sizeof (VixDiskLibConnectParams)); +- +- return ret; +-} +- +-static inline void +-free_connect_params (VixDiskLibConnectParams *params) +-{ +- /* Only use FreeConnectParams if AllocateConnectParams was +- * originally called. Otherwise use free. +- */ +- if (VixDiskLib_AllocateConnectParams != NULL) { +- VDDK_CALL_START (VixDiskLib_FreeConnectParams, "params") +- VixDiskLib_FreeConnectParams (params); +- VDDK_CALL_END (VixDiskLib_FreeConnectParams, 0); +- } +- else +- free (params); +-} +- + /* Create the per-connection handle. */ + static void * + vddk_open (int readonly) +@@ -704,7 +673,9 @@ vddk_open (int readonly) + pthread_mutex_init (&h->commands_lock, NULL); + pthread_cond_init (&h->commands_cond, NULL); + +- h->params = allocate_connect_params (); ++ VDDK_CALL_START (VixDiskLib_AllocateConnectParams, "") ++ h->params = VixDiskLib_AllocateConnectParams (); ++ VDDK_CALL_END (VixDiskLib_AllocateConnectParams, 0); + if (h->params == NULL) { + nbdkit_error ("allocate VixDiskLibConnectParams: %m"); + goto err0; +@@ -837,7 +808,9 @@ vddk_open (int readonly) + VixDiskLib_Disconnect (h->connection); + VDDK_CALL_END (VixDiskLib_Disconnect, 0); + err1: +- free_connect_params (h->params); ++ VDDK_CALL_START (VixDiskLib_FreeConnectParams, "params") ++ VixDiskLib_FreeConnectParams (h->params); ++ VDDK_CALL_END (VixDiskLib_FreeConnectParams, 0); + err0: + pthread_mutex_destroy (&h->commands_lock); + pthread_cond_destroy (&h->commands_cond); +@@ -863,7 +836,9 @@ vddk_close (void *handle) + VixDiskLib_Disconnect (h->connection); + VDDK_CALL_END (VixDiskLib_Disconnect, 0); + +- free_connect_params (h->params); ++ VDDK_CALL_START (VixDiskLib_FreeConnectParams, "params") ++ VixDiskLib_FreeConnectParams (h->params); ++ VDDK_CALL_END (VixDiskLib_FreeConnectParams, 0); + pthread_mutex_destroy (&h->commands_lock); + pthread_cond_destroy (&h->commands_cond); + command_queue_reset (&h->commands); +diff --git a/plugins/vddk/worker.c b/plugins/vddk/worker.c +index 3bf1d5c2..076b2bd7 100644 +--- a/plugins/vddk/worker.c ++++ b/plugins/vddk/worker.c +@@ -309,23 +309,13 @@ do_can_extents (struct command *cmd, struct vddk_handle *h) + VixError err; + VixDiskLibBlockList *block_list; + +- /* This call was added in VDDK 6.7. In earlier versions the +- * function pointer will be NULL and we cannot query extents. +- */ +- if (VixDiskLib_QueryAllocatedBlocks == NULL) { +- nbdkit_debug ("can_extents: VixDiskLib_QueryAllocatedBlocks == NULL, " +- "probably this is VDDK < 6.7"); +- return 0; +- } +- + /* Suppress errors around this call. See: + * https://bugzilla.redhat.com/show_bug.cgi?id=1709211#c7 + */ + error_suppression = 1; + +- /* However even when the call is available it rarely works well so +- * the best thing we can do here is to try the call and if it's +- * non-functional return false. ++ /* Try the QueryAllocatedBlocks call and if it's non-functional ++ * return false. + */ + VDDK_CALL_START (VixDiskLib_QueryAllocatedBlocks, + "handle, 0, %d sectors, %d sectors", +diff --git a/tests/dummy-vddk.c b/tests/dummy-vddk.c +index 8b45ea6b..5247173b 100644 +--- a/tests/dummy-vddk.c ++++ b/tests/dummy-vddk.c +@@ -110,11 +110,52 @@ VixDiskLib_FreeErrorText (char *text) + free (text); + } + ++NBDKIT_DLL_PUBLIC VixError ++VixDiskLib_QueryAllocatedBlocks (VixDiskLibHandle diskHandle, ++ uint64_t start_sector, uint64_t nr_sectors, ++ uint64_t chunk_size, ++ VixDiskLibBlockList **block_list) ++{ ++ VixDiskLibBlockList *ret; ++ ++ /* This is safe because ret->blocks is a 1-sized array and we only ++ * use 1 entry here. ++ */ ++ ret = calloc (1, sizeof *ret); ++ if (ret == NULL) ++ abort (); ++ ++ /* Pretend it's all allocated. */ ++ ret->numBlocks = 1; ++ ret->blocks[0].offset = start_sector; ++ ret->blocks[0].length = nr_sectors; ++ ++ *block_list = ret; ++ return VIX_OK; ++} ++ ++NBDKIT_DLL_PUBLIC VixError ++VixDiskLib_FreeBlockList (VixDiskLibBlockList *block_list) ++{ ++ free (block_list); ++ return VIX_OK; ++} ++ ++NBDKIT_DLL_PUBLIC VixDiskLibConnectParams * ++VixDiskLib_AllocateConnectParams (void) ++{ ++ VixDiskLibConnectParams *ret; ++ ++ ret = calloc (1, sizeof *ret); ++ if (ret == NULL) ++ abort (); ++ return ret; ++} ++ + NBDKIT_DLL_PUBLIC void + VixDiskLib_FreeConnectParams (VixDiskLibConnectParams *params) + { +- /* never called since we don't define optional AllocateConnectParams */ +- abort (); ++ free (params); + } + + NBDKIT_DLL_PUBLIC VixError +-- +2.47.1 + diff --git a/SOURCES/0032-vddk-Unconditionally-test-QueryAllocatedBlocks.patch b/SOURCES/0032-vddk-Unconditionally-test-QueryAllocatedBlocks.patch new file mode 100644 index 0000000..b07755c --- /dev/null +++ b/SOURCES/0032-vddk-Unconditionally-test-QueryAllocatedBlocks.patch @@ -0,0 +1,78 @@ +From 8d5ab5acd2d111f6cf02dbce4ad034b49b86bc35 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Fri, 30 May 2025 13:59:28 +0100 +Subject: [PATCH] vddk: Unconditionally test QueryAllocatedBlocks + +Except in rare cases like a suddenly dropped connection, we always +call vddk_can_extents and therefore do this test. We might as well do +it unconditionally when the worker thread starts up. + +This simplifies following commits where we may do more work based on +this flag when the worker thread starts up. + +(cherry picked from commit 787db3e8ecfd81b683f541065daee75665ab47e0) +--- + plugins/vddk/worker.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +diff --git a/plugins/vddk/worker.c b/plugins/vddk/worker.c +index 076b2bd7..6efcc6f6 100644 +--- a/plugins/vddk/worker.c ++++ b/plugins/vddk/worker.c +@@ -303,8 +303,13 @@ do_flush (struct command *cmd, struct vddk_handle *h) + return 0; + } + +-static int +-do_can_extents (struct command *cmd, struct vddk_handle *h) ++/* Try the QueryAllocatedBlocks call and if it's non-functional return ++ * false. At some point in future, perhaps when we move to baseline ++ * VDDK >= 7, we can just assume it works and remove this test ++ * entirely. ++ */ ++static bool ++test_can_extents (struct vddk_handle *h) + { + VixError err; + VixDiskLibBlockList *block_list; +@@ -314,9 +319,6 @@ do_can_extents (struct command *cmd, struct vddk_handle *h) + */ + error_suppression = 1; + +- /* Try the QueryAllocatedBlocks call and if it's non-functional +- * return false. +- */ + VDDK_CALL_START (VixDiskLib_QueryAllocatedBlocks, + "handle, 0, %d sectors, %d sectors", + VIXDISKLIB_MIN_CHUNK_SIZE, VIXDISKLIB_MIN_CHUNK_SIZE) +@@ -502,6 +504,10 @@ vddk_worker_thread (void *handle) + { + struct vddk_handle *h = handle; + bool stop = false; ++ bool can_extents; ++ ++ /* Test if QueryAllocatedBlocks will work. */ ++ can_extents = test_can_extents (h); + + while (!stop) { + struct command *cmd; +@@ -544,12 +550,13 @@ vddk_worker_thread (void *handle) + break; + + case CAN_EXTENTS: +- r = do_can_extents (cmd, h); +- if (r >= 0) +- *(int *)cmd->ptr = r; ++ *(int *)cmd->ptr = can_extents; ++ r = 0; + break; + + case EXTENTS: ++ /* If we returned false above, we should never be called here. */ ++ assert (can_extents); + r = do_extents (cmd, h); + break; + +-- +2.47.1 + diff --git a/SOURCES/0033-vddk-Pre-cache-the-extents-for-readonly-connections.patch b/SOURCES/0033-vddk-Pre-cache-the-extents-for-readonly-connections.patch new file mode 100644 index 0000000..a6e1ff9 --- /dev/null +++ b/SOURCES/0033-vddk-Pre-cache-the-extents-for-readonly-connections.patch @@ -0,0 +1,349 @@ +From 9bcda0ca197a20db8675253957fee954f362e689 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Fri, 30 May 2025 15:06:07 +0100 +Subject: [PATCH] vddk: Pre-cache the extents for readonly connections + +As explained in detail in the code comment, QueryAllocatedBlocks has +very poor performance. We can partially work around this by +pre-caching extents when the first NBD_BLOCK_STATUS request is made. +We only do this for readonly connections, since it would be very +complex to do it for writable connections where the extent information +could change under us. And we only do it on the first +NBD_BLOCK_STATUS request, so we know that the caller is interested in +extents. + +Benchmarking +------------ + +This improves performance, dramatically in some cases: + + Size Used% [ni] [nm] [nc] [qi] [qm] [qc] [dd] + +before 64G 16% 17 21 354 6 62 180 +after " " 18 13 59 6 7 66 57 + nc=178 MBytes/s dd=1150 MBytes/sec + +before 128G 5.5% 17 29 646 6 50 151 +after " " 17 14 68 6 8 71 + nc=106 MBytes/s + +before 128G 45% 17 30 1073 6 52 578 +after " " 17 14 457 6 8 506 143 + nc=128 MBytes/s dd=409 MBytes/sec + + (all times in seconds) + +[ni] nbdinfo $uri + Note: Makes two connections, unlike qemu-img info. + +[nm] nbdinfo --map --totals $uri + Note: Slower than it ought to be, needs investigation. + +[nc] nbdcopy -p $uri null: + +[qi] qemu-img info $uri + +[qm] qemu-img map $uri + +[qc] qemu-img convert -p -n -f raw $uri \ + -O raw 'json:{"file.driver":"null-co","file.size":"1E"}' + Note: Requests the map up front, which is why it performs better + than nbdcopy on the "before" version since reads are not being + serialized by concurrent calls to QueryAllocatedBlocks. + +[dd] dd if=*-flat.vmdk of=/dev/null bs=16777216 + Note: This command was run on the ESXi server where the storage + is assumed to be local. ESXi has very limited tools available + (eg. no "fio" etc). Also the cp command is from Busybox and is + notably slower than dd. To get the accurate copying rate I + assumed that this command copies all data on disk to /dev/null, + skipping reading holes where thin provisioned. IOW using the "ls + -s" output as the number of blocks read. + +It should be noted that the after/[nc] numbers are not very stable. +In the last test where [nc] = 457, I see a deviation of as much as 10% +either side over multiple runs. + +The network used in the test is 25 Gbps and clearly we are nowhere +near able to reach that. A more likely upper limit is the speed of +reading from the disks ([dd]). There is also a large gap between our +performance and that number. VMware is thought to impose a +per-connection limit of around 1 Gbps on NFC connections, and there +are other limitations +(https://knowledge.broadcom.com/external/article/307001/nfc-performance-is-slow-resulting-in-mig.html). + +Tuning NFC makes no observable difference +----------------------------------------- + +Further tuning of NFC is possible +(https://techdocs.broadcom.com/us/en/vmware-cis/vsphere/vsphere-supervisor/7-0/best-practices-for-nbd-transport.html). + +Using compression (nbdkit vddk plugin 'compression' option) is +possible but in my test it makes things much slower. This is using +the first VM from the tests above: + + [nc] + + (no compression) 59 (178 MBytes/sec) + compression=zlib 323 ( 33 MBytes/sec) + +VMware documentation also suggests using a configuration file +containing the entries below (the configuration file is placed +somewhere on the client, and referenced using the +config=/path/to/config.ini parameter): + + vixDiskLib.nfcAio.Session.BufSizeIn64KB=32 + vixDiskLib.nfcAio.Session.BufCount=16 + +This made no difference for me, at least when testing a single +conversion. Separate tests done by the MTV team suggest it may +improve performance if you are converting multiple disks / VMs in +parallel +(https://docs.redhat.com/en/documentation/migration_toolkit_for_virtualization/2.7/html/installing_and_using_the_migration_toolkit_for_virtualization/mtv-performance-recommendation_mtv#mtv-aio-buffer-key-findings_mtv) + +Some VMware documentation also suggests: + + config.nfc.useSSL=false + +but this also made no difference. + +Some VMware documentation suggests using unbuffered I/O +(unbuffered=true) but in my test this caused a large slow down. + +Continue to disable multi-conn +------------------------------ + +We have recommended against using multi-conn with the VDDK plugin, +because we observed some slow down. This commit makes no difference +to this advice. The same amount of slow down is still observed. (In +virt-v2v we use --filter=multi-conn multi-conn-mode=disable to ensure +it is never used.) + +(cherry picked from commit 5a882e74cae3dbaa09bf3b942a02f9947b12f6e5) +--- + plugins/vddk/vddk.c | 2 + + plugins/vddk/vddk.h | 3 + + plugins/vddk/worker.c | 166 +++++++++++++++++++++++++++++++++++++++++- + 3 files changed, 170 insertions(+), 1 deletion(-) + +diff --git a/plugins/vddk/vddk.c b/plugins/vddk/vddk.c +index 9d49203c..bbf0af31 100644 +--- a/plugins/vddk/vddk.c ++++ b/plugins/vddk/vddk.c +@@ -829,6 +829,8 @@ vddk_close (void *handle) + send_command_and_wait (h, &stop_cmd); + pthread_join (h->thread, NULL); + ++ nbdkit_extents_free (h->extents); ++ + VDDK_CALL_START (VixDiskLib_Close, "handle") + VixDiskLib_Close (h->handle); + VDDK_CALL_END (VixDiskLib_Close, 0); +diff --git a/plugins/vddk/vddk.h b/plugins/vddk/vddk.h +index 3586c5da..461fb528 100644 +--- a/plugins/vddk/vddk.h ++++ b/plugins/vddk/vddk.h +@@ -171,6 +171,9 @@ struct vddk_handle { + + /* Cached disk size in bytes (set in get_size()). */ + uint64_t size; ++ ++ /* Cached extents for readonly disks. */ ++ struct nbdkit_extents *extents; + }; + + /* reexec.c */ +diff --git a/plugins/vddk/worker.c b/plugins/vddk/worker.c +index 6efcc6f6..3925fb91 100644 +--- a/plugins/vddk/worker.c ++++ b/plugins/vddk/worker.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include + +@@ -380,7 +381,7 @@ add_extent (struct nbdkit_extents *extents, + } + + static int +-do_extents (struct command *cmd, struct vddk_handle *h) ++get_extents_slow (struct command *cmd, struct vddk_handle *h) + { + const uint32_t count = cmd->count; + const uint64_t offset = cmd->offset; +@@ -496,6 +497,169 @@ do_extents (struct command *cmd, struct vddk_handle *h) + return 0; + } + ++static int ++pre_cache_extents (struct vddk_handle *h) ++{ ++ struct nbdkit_extents *extents; ++ uint64_t start_sector = 0; ++ uint64_t nr_chunks_remaining = ++ h->size / VIXDISKLIB_MIN_CHUNK_SIZE / VIXDISKLIB_SECTOR_SIZE; ++ uint64_t position = 0; ++ ++ extents = nbdkit_extents_new (0, h->size); ++ if (extents == NULL) ++ return -1; ++ ++ /* Scan through the disk reading whole "chunks" (32 GB), the most ++ * efficient way to use QueryAllocatedBlocks. ++ */ ++ while (nr_chunks_remaining > 0) { ++ VixError err; ++ uint32_t i; ++ uint64_t nr_chunks, nr_sectors; ++ VixDiskLibBlockList *block_list; ++ ++ assert (IS_ALIGNED (start_sector, VIXDISKLIB_MIN_CHUNK_SIZE)); ++ ++ nr_chunks = MIN (nr_chunks_remaining, VIXDISKLIB_MAX_CHUNK_NUMBER); ++ nr_sectors = nr_chunks * VIXDISKLIB_MIN_CHUNK_SIZE; ++ ++ VDDK_CALL_START (VixDiskLib_QueryAllocatedBlocks, ++ "handle, %" PRIu64 " sectors, %" PRIu64 " sectors, " ++ "%d sectors", ++ start_sector, nr_sectors, VIXDISKLIB_MIN_CHUNK_SIZE) ++ err = VixDiskLib_QueryAllocatedBlocks (h->handle, ++ start_sector, nr_sectors, ++ VIXDISKLIB_MIN_CHUNK_SIZE, ++ &block_list); ++ VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0); ++ if (err != VIX_OK) { ++ VDDK_ERROR (err, "VixDiskLib_QueryAllocatedBlocks"); ++ nbdkit_extents_free (extents); ++ return -1; ++ } ++ ++ for (i = 0; i < block_list->numBlocks; ++i) { ++ uint64_t blk_offset, blk_length; ++ ++ blk_offset = block_list->blocks[i].offset * VIXDISKLIB_SECTOR_SIZE; ++ blk_length = block_list->blocks[i].length * VIXDISKLIB_SECTOR_SIZE; ++ ++ /* The query returns allocated blocks. We must insert holes ++ * between the blocks as necessary. ++ */ ++ if ((position < blk_offset && ++ add_extent (extents, &position, blk_offset, true) == -1) || ++ (add_extent (extents, ++ &position, blk_offset + blk_length, false) == -1)) { ++ VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list") ++ VixDiskLib_FreeBlockList (block_list); ++ VDDK_CALL_END (VixDiskLib_FreeBlockList, 0); ++ nbdkit_extents_free (extents); ++ return -1; ++ } ++ } ++ VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list") ++ VixDiskLib_FreeBlockList (block_list); ++ VDDK_CALL_END (VixDiskLib_FreeBlockList, 0); ++ ++ /* There's an implicit hole after the returned list of blocks, ++ * up to the end of the QueryAllocatedBlocks request. ++ */ ++ if (add_extent (extents, ++ &position, ++ (start_sector + nr_sectors) * VIXDISKLIB_SECTOR_SIZE, ++ true) == -1) { ++ nbdkit_extents_free (extents); ++ return -1; ++ } ++ ++ start_sector += nr_sectors; ++ nr_chunks_remaining -= nr_chunks; ++ } ++ ++ /* Add the allocated unaligned bit at the end. */ ++ if (position < h->size) { ++ if (add_extent (extents, &position, h->size, false) == -1) { ++ nbdkit_extents_free (extents); ++ return -1; ++ } ++ } ++ ++ /* Save the pre-cached extents in the handle. */ ++ h->extents = extents; ++ return 0; ++} ++ ++static int ++get_extents_from_cache (struct command *cmd, struct vddk_handle *h) ++{ ++ struct nbdkit_extents *rextents = cmd->ptr; ++ struct nbdkit_extent e; ++ size_t i; ++ ++ /* We can just copy from the pre-cached extents in the handle which ++ * cover the entire disk, into the returned extents, because ++ * nbdkit_add_extent does the right thing. ++ */ ++ for (i = 0; i < nbdkit_extents_count (h->extents); ++i) { ++ e = nbdkit_get_extent (h->extents, i); ++ if (nbdkit_add_extent (rextents, e.offset, e.length, e.type) == -1) ++ return -1; ++ } ++ ++ return 0; ++} ++ ++/* Handle extents. ++ * ++ * Oh QueryAllocatedBlocks, how much I hate you. The API has two ++ * enormous problems: (a) It's slow, taking about 1 second per ++ * invocation regardless of how much or little data you request. (b) ++ * It serialises all other requests to the disk, like concurrent ++ * reads. ++ * ++ * NBD / nbdkit doesn't help much either by having a 4GB - 1 byte ++ * limit on the size of extent requests. This means that for each 4GB ++ * of disk, we will need to run QueryAllocatedBlocks twice. For a 1TB ++ * virtual disk, about 500 seconds would be used directly in the API ++ * calls, and much more time is lost because of serialization. ++ * ++ * To work around these problems, in the readonly case (used by ++ * virt-v2v), when the first NBD_BLOCK_STATUS request is received, we ++ * will read over the whole disk and cache the extents. We will read ++ * in 32 GB chunks (the maximum possible for the underlying ++ * QueryAllocatedBlocks API). For a 1TB disk this will take ~ 30 ++ * seconds, but avoids all the overheads above. The cached extents ++ * are stored in the handle, and subsequent NBD_BLOCK_STATUS will use ++ * the cache only. ++ * ++ * For writable disks we can't easily do any caching so don't attempt ++ * it. ++ */ ++static int ++do_extents (struct command *cmd, struct vddk_handle *h) ++{ ++ if (h->readonly && !h->extents) { ++ time_t start_t, end_t; ++ ++ time (&start_t); ++ nbdkit_debug ("vddk: pre-caching extents"); ++ ++ if (pre_cache_extents (h) == -1) ++ return -1; ++ ++ time (&end_t); ++ nbdkit_debug ("vddk: finished pre-caching extents in %d second(s)", ++ (int) (end_t - start_t)); ++ } ++ ++ if (h->extents) ++ return get_extents_from_cache (cmd, h); ++ else ++ return get_extents_slow (cmd, h); ++} ++ + /* Background worker thread, one per connection, which is where the + * VDDK commands are issued. + */ +-- +2.47.1 + diff --git a/SOURCES/0034-file-Save-the-filename-or-equivalent-in-the-file-han.patch b/SOURCES/0034-file-Save-the-filename-or-equivalent-in-the-file-han.patch new file mode 100644 index 0000000..211cd92 --- /dev/null +++ b/SOURCES/0034-file-Save-the-filename-or-equivalent-in-the-file-han.patch @@ -0,0 +1,241 @@ +From 0d7f579afc815049a6a35e1ecdd34f1857b2784b Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Wed, 4 Jun 2025 21:00:47 +0100 +Subject: [PATCH] file: Save the filename or equivalent in the file handle + +This will allow us to improve error messages. + +To make this change easier by reducing duplication, I also implemented +a common error path in the file_open function. + +(cherry picked from commit 83ad77d28d598e4b85809e7225e754e64314f802) +--- + plugins/file/file.c | 108 +++++++++++++++++++++++++------------------- + 1 file changed, 62 insertions(+), 46 deletions(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index b4dec3c5..f96d8a8c 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -493,6 +493,7 @@ file_list_exports (int readonly, int default_only, + /* The per-connection handle. */ + struct handle { + int fd; ++ char *name; /* Filename or equivalent. */ + struct stat statbuf; + bool is_block_device; + int sector_size; +@@ -548,7 +549,6 @@ static void * + file_open (int readonly) + { + struct handle *h; +- const char *file; + + h = malloc (sizeof *h); + if (h == NULL) { +@@ -557,36 +557,44 @@ file_open (int readonly) + } + h->can_write = !readonly; + h->fd = -1; ++ h->name = NULL; + + switch (mode) { + case mode_filename: +- file = filename; +- if (open_file_by_name (h, readonly, -1, file) == -1) { +- free (h); +- return NULL; ++ h->name = strdup (filename); ++ if (h->name == NULL) { ++ nbdkit_error ("strdup: %m"); ++ goto err; + } ++ if (open_file_by_name (h, readonly, -1, h->name) == -1) ++ goto err; + break; + + case mode_directory: { + int dfd; ++ const char *export_name = nbdkit_export_name (); + +- file = nbdkit_export_name (); +- if (strchr (file, '/')) { +- nbdkit_error ("exportname cannot contain /"); +- free (h); ++ if (export_name == NULL) ++ goto err; ++ ++ h->name = strdup (export_name); ++ if (h->name == NULL) { ++ nbdkit_error ("strdup: %m"); ++ goto err; ++ } ++ if (strchr (h->name, '/')) { ++ nbdkit_error ("%s: exportname cannot contain /", h->name); + errno = EINVAL; +- return NULL; ++ goto err; + } + dfd = open (directory, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + if (dfd == -1) { + nbdkit_error ("open %s: %m", directory); +- free (h); +- return NULL; ++ goto err; + } +- if (open_file_by_name (h, readonly, dfd, file) == -1) { +- free (h); ++ if (open_file_by_name (h, readonly, dfd, h->name) == -1) { + close (dfd); +- return NULL; ++ goto err; + } + close (dfd); + break; +@@ -595,14 +603,14 @@ file_open (int readonly) + case mode_fd: { + int r; + +- /* This is needed for error messages. */ +- file = ""; +- ++ if (asprintf (&h->name, "fd=%d", filedesc) == -1) { ++ nbdkit_error ("asprintf: %m"); ++ goto err; ++ } + h->fd = dup (filedesc); + if (h->fd == -1) { +- nbdkit_error ("dup fd=%d: %m", filedesc); +- free (h); +- return NULL; ++ nbdkit_error ("dup: %s: %m", h->name); ++ goto err; + } + + /* If the file descriptor is readonly then we should not advertise +@@ -610,10 +618,8 @@ file_open (int readonly) + */ + r = fcntl (h->fd, F_GETFL); + if (r == -1) { +- nbdkit_error ("fcntl: F_GETFL: %m"); +- close (h->fd); +- free (h); +- return NULL; ++ nbdkit_error ("fcntl: %s: F_GETFL: %m", h->name); ++ goto err; + } + r &= O_ACCMODE; + if (r == O_RDONLY) +@@ -627,25 +633,30 @@ file_open (int readonly) + + case mode_dirfd: { + int dfd; ++ const char *export_name = nbdkit_export_name (); + +- file = nbdkit_export_name (); +- if (strchr (file, '/')) { +- nbdkit_error ("exportname cannot contain /"); +- free (h); ++ if (export_name == NULL) ++ goto err; ++ ++ h->name = strdup (export_name); ++ if (h->name == NULL) { ++ nbdkit_error ("strdup: %m"); ++ goto err; ++ } ++ if (strchr (h->name, '/')) { ++ nbdkit_error ("%s: exportname cannot contain /", h->name); + errno = EINVAL; +- return NULL; ++ goto err; + } + /* We don't fork, so no need to worry about FD_CLOEXEC on the directory */ + dfd = dup (filedesc); + if (dfd == -1) { +- nbdkit_error ("dup dirfd=%d: %m", filedesc); +- free (h); +- return NULL; ++ nbdkit_error ("dup: dirfd=%d: %m", filedesc); ++ goto err; + } +- if (open_file_by_name (h, readonly, dfd, file) == -1) { +- free (h); ++ if (open_file_by_name (h, readonly, dfd, h->name) == -1) { + close (dfd); +- return NULL; ++ goto err; + } + close (dfd); + break; +@@ -656,12 +667,11 @@ file_open (int readonly) + } + + assert (h->fd >= 0); ++ assert (h->name != NULL); + + if (fstat (h->fd, &h->statbuf) == -1) { +- nbdkit_error ("fstat: %s: %m", file); +- close (h->fd); +- free (h); +- return NULL; ++ nbdkit_error ("fstat: %s: %m", h->name); ++ goto err; + } + + if (fadvise_mode != -1) { +@@ -669,7 +679,7 @@ file_open (int readonly) + #ifdef HAVE_POSIX_FADVISE + int r = posix_fadvise (h->fd, 0, 0, fadvise_mode); + if (r == -1) +- nbdkit_debug ("posix_fadvise: %s: %m (ignored)", file); ++ nbdkit_debug ("posix_fadvise: %s: %m (ignored)", h->name); + #else + nbdkit_debug ("fadvise is not supported"); + #endif +@@ -680,17 +690,15 @@ file_open (int readonly) + else if (S_ISREG (h->statbuf.st_mode)) + h->is_block_device = false; + else { +- nbdkit_error ("file is not regular or block device: %s", file); +- close (h->fd); +- free (h); +- return NULL; ++ nbdkit_error ("file is not regular or block device: %s", h->name); ++ goto err; + } + + h->sector_size = 4096; /* Start with safe guess */ + #ifdef BLKSSZGET + if (h->is_block_device) { + if (ioctl (h->fd, BLKSSZGET, &h->sector_size) == -1) +- nbdkit_debug ("cannot get sector size: %s: %m", file); ++ nbdkit_debug ("cannot get sector size: %s: %m", h->name); + } + #endif + +@@ -711,6 +719,13 @@ file_open (int readonly) + h->can_blkdiscard = h->is_block_device; + + return h; ++ ++ err: ++ if (h->fd >= 0) ++ close (h->fd); ++ free (h->name); ++ free (h); ++ return NULL; + } + + /* Free up the per-connection handle. */ +@@ -723,6 +738,7 @@ file_close (void *handle) + remove_fd_from_window (h->fd); + #endif + close (h->fd); ++ free (h->name); + free (h); + } + +-- +2.47.1 + diff --git a/SOURCES/0035-file-Add-the-filename-or-equivalent-to-error-message.patch b/SOURCES/0035-file-Add-the-filename-or-equivalent-to-error-message.patch new file mode 100644 index 0000000..46cf972 --- /dev/null +++ b/SOURCES/0035-file-Add-the-filename-or-equivalent-to-error-message.patch @@ -0,0 +1,143 @@ +From 6e5d6e5da19acfa8ba2c9fcf81651d0894f83dda Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Wed, 4 Jun 2025 21:03:58 +0100 +Subject: [PATCH] file: Add the filename (or equivalent) to error messages + +It helps to have this information when there's an error. It +particularly helps in two cases: (1) You don't have access to the +nbdkit command line. (2) The filename is derived from an export name +passed by the client. + +(cherry picked from commit 7273bd829fc0d8e492786f8908d8ddc3d5399e06) +--- + plugins/file/file.c | 28 ++++++++++++++-------------- + 1 file changed, 14 insertions(+), 14 deletions(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index f96d8a8c..9ba61187 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -754,7 +754,7 @@ file_get_size (void *handle) + + r = device_size (h->fd, &h->statbuf); + if (r == -1) { +- nbdkit_error ("device_size: %m"); ++ nbdkit_error ("device_size: %s: %m", h->name); + return -1; + } + return r; +@@ -818,7 +818,7 @@ file_flush (void *handle, uint32_t flags) + struct handle *h = handle; + + if (fdatasync (h->fd) == -1) { +- nbdkit_error ("fdatasync: %m"); ++ nbdkit_error ("fdatasync: %s: %m", h->name); + return -1; + } + +@@ -839,11 +839,11 @@ file_pread (void *handle, void *buf, uint32_t count, uint64_t offset, + while (count > 0) { + ssize_t r = pread (h->fd, buf, count, offset); + if (r == -1) { +- nbdkit_error ("pread: %m"); ++ nbdkit_error ("pread: %s: %m", h->name); + return -1; + } + if (r == 0) { +- nbdkit_error ("pread: unexpected end of file"); ++ nbdkit_error ("pread: %s: unexpected end of file", h->name); + return -1; + } + buf += r; +@@ -875,7 +875,7 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, + while (count > 0) { + ssize_t r = pwrite (h->fd, buf, count, offset); + if (r == -1) { +- nbdkit_error ("pwrite: %m"); ++ nbdkit_error ("pwrite: %s: %m", h->name); + return -1; + } + buf += r; +@@ -957,7 +957,7 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %m"); ++ nbdkit_error ("zero: %s: %m", h->name); + return -1; + } + +@@ -993,7 +993,7 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %m"); ++ nbdkit_error ("zero: %s: %m", h->name); + return -1; + } + +@@ -1001,7 +1001,7 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + else { + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %m"); ++ nbdkit_error ("zero: %s: %m", h->name); + return -1; + } + +@@ -1023,7 +1023,7 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %m"); ++ nbdkit_error ("zero: %s: %m", h->name); + return -1; + } + +@@ -1051,14 +1051,14 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %m"); ++ nbdkit_error ("zero: %s: %m", h->name); + return -1; + } + + h->can_fallocate = false; + } else { + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %m"); ++ nbdkit_error ("zero: %s: %m", h->name); + return -1; + } + +@@ -1082,7 +1082,7 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + + if (errno != ENOTTY) { +- nbdkit_error ("zero: %m"); ++ nbdkit_error ("zero: %s: %m", h->name); + return -1; + } + +@@ -1120,7 +1120,7 @@ file_trim (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + /* Trim is advisory; we don't care if it fails for anything other + * than EIO or EPERM. */ + if (errno == EPERM || errno == EIO) { +- nbdkit_error ("fallocate: %m"); ++ nbdkit_error ("fallocate: %s: %m", h->name); + return -1; + } + +@@ -1241,7 +1241,7 @@ file_cache (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + r = posix_fadvise (h->fd, offset, count, POSIX_FADV_WILLNEED); + if (r) { + errno = r; +- nbdkit_error ("posix_fadvise: %m"); ++ nbdkit_error ("posix_fadvise: %s: %m", h->name); + return -1; + } + return 0; +-- +2.47.1 + diff --git a/SOURCES/0036-file-Add-offset-count-to-error-messages.patch b/SOURCES/0036-file-Add-offset-count-to-error-messages.patch new file mode 100644 index 0000000..665780e --- /dev/null +++ b/SOURCES/0036-file-Add-offset-count-to-error-messages.patch @@ -0,0 +1,132 @@ +From 45f740e3b4a707e5d2db739b16fb33f4d7b466cf Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Wed, 4 Jun 2025 21:15:26 +0100 +Subject: [PATCH] file: Add offset/count to error messages + +Although the same information is available in debug logs, if verbose +messages (-v) are not used then the information is lost. As it might +be useful occasionally, ensure it is captured in errors. + +(cherry picked from commit 253574354252f4a77e2df0769b721e76f1777651) +--- + plugins/file/file.c | 35 ++++++++++++++++++++++++----------- + 1 file changed, 24 insertions(+), 11 deletions(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 9ba61187..ade6f5ff 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -839,7 +839,8 @@ file_pread (void *handle, void *buf, uint32_t count, uint64_t offset, + while (count > 0) { + ssize_t r = pread (h->fd, buf, count, offset); + if (r == -1) { +- nbdkit_error ("pread: %s: %m", h->name); ++ nbdkit_error ("pread: %s: offset=%" PRIu64 ", count=%" PRIu32 ": %m", ++ h->name, offset, count); + return -1; + } + if (r == 0) { +@@ -875,7 +876,8 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, + while (count > 0) { + ssize_t r = pwrite (h->fd, buf, count, offset); + if (r == -1) { +- nbdkit_error ("pwrite: %s: %m", h->name); ++ nbdkit_error ("pwrite: %s: offset=%" PRIu64 ", count=%" PRIu32 ": %m", ++ h->name, offset, count); + return -1; + } + buf += r; +@@ -957,7 +959,8 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %s: %m", h->name); ++ nbdkit_error ("zero: %s: offset=%" PRIu64 ", count=%" PRIu32 ": %m", ++ h->name, offset, count); + return -1; + } + +@@ -993,7 +996,8 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %s: %m", h->name); ++ nbdkit_error ("zero: %s: offset=%" PRIu64 ", count=%" PRIu32 ": %m", ++ h->name, offset, count); + return -1; + } + +@@ -1001,7 +1005,8 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + else { + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %s: %m", h->name); ++ nbdkit_error ("zero: %s: offset=%" PRIu64 ", count=%" PRIu32 ": %m", ++ h->name, offset, count); + return -1; + } + +@@ -1023,7 +1028,8 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %s: %m", h->name); ++ nbdkit_error ("zero: %s: offset=%" PRIu64 ", count=%" PRIu32 ": %m", ++ h->name, offset, count); + return -1; + } + +@@ -1051,14 +1057,16 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %s: %m", h->name); ++ nbdkit_error ("zero: %s: offset=%" PRIu64 ", count=%" PRIu32 ": %m", ++ h->name, offset, count); + return -1; + } + + h->can_fallocate = false; + } else { + if (!is_enotsup (errno)) { +- nbdkit_error ("zero: %s: %m", h->name); ++ nbdkit_error ("zero: %s: offset=%" PRIu64 ", count=%" PRIu32 ":%m", ++ h->name, offset, count); + return -1; + } + +@@ -1082,7 +1090,8 @@ file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + } + + if (errno != ENOTTY) { +- nbdkit_error ("zero: %s: %m", h->name); ++ nbdkit_error ("zero: %s: offset=%" PRIu64 ", count=%" PRIu32 ": %m", ++ h->name, offset, count); + return -1; + } + +@@ -1120,7 +1129,9 @@ file_trim (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + /* Trim is advisory; we don't care if it fails for anything other + * than EIO or EPERM. */ + if (errno == EPERM || errno == EIO) { +- nbdkit_error ("fallocate: %s: %m", h->name); ++ nbdkit_error ("fallocate: %s: offset=%" PRIu64 ", count=%" PRIu32 ":" ++ " %m", ++ h->name, offset, count); + return -1; + } + +@@ -1241,7 +1252,9 @@ file_cache (void *handle, uint32_t count, uint64_t offset, uint32_t flags) + r = posix_fadvise (h->fd, offset, count, POSIX_FADV_WILLNEED); + if (r) { + errno = r; +- nbdkit_error ("posix_fadvise: %s: %m", h->name); ++ nbdkit_error ("posix_fadvise: %s: offset=%" PRIu64 ", count=%" PRIu32 ":" ++ " %m", ++ h->name, offset, count); + return -1; + } + return 0; +-- +2.47.1 + diff --git a/SOURCES/0037-vddk-stats-Use-us-instead-of-Unicode-s-for-microseco.patch b/SOURCES/0037-vddk-stats-Use-us-instead-of-Unicode-s-for-microseco.patch new file mode 100644 index 0000000..695ede4 --- /dev/null +++ b/SOURCES/0037-vddk-stats-Use-us-instead-of-Unicode-s-for-microseco.patch @@ -0,0 +1,34 @@ +From bac77fb3da2aa6353b5b5a71a7b86bc83289bfc6 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Sun, 8 Jun 2025 10:44:32 +0100 +Subject: [PATCH] =?UTF-8?q?vddk:=20stats:=20Use=20"us"=20instead=20of=20(U?= + =?UTF-8?q?nicode)=20"=C2=B5s"=20for=20microseconds?= +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +nbdkit_debug uses C-style escaping for non-ASCII characters in debug +strings, so in logs what we actually see is "\xc2\xb5s" which messes +up the columns. + +(cherry picked from commit 1f09bb4abefe8f3f052e8c0b6b34d314887b3c32) +--- + plugins/vddk/stats.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/plugins/vddk/stats.c b/plugins/vddk/stats.c +index 7f63333a..59cfee5a 100644 +--- a/plugins/vddk/stats.c ++++ b/plugins/vddk/stats.c +@@ -98,7 +98,7 @@ display_stats (void) + + nbdkit_debug ("VDDK function stats (-D vddk.stats=1):"); + nbdkit_debug ("%-24s %15s %5s %15s", +- "VixDiskLib_...", "µs", "calls", "bytes"); ++ "VixDiskLib_...", "us", "calls", "bytes"); + for (i = 0; i < stats.len; ++i) { + if (stats.ptr[i].usecs) { + if (stats.ptr[i].bytes > 0) +-- +2.47.1 + diff --git a/SOURCES/0038-vddk-stats-Line-up-the-columns-correctly.patch b/SOURCES/0038-vddk-stats-Line-up-the-columns-correctly.patch new file mode 100644 index 0000000..33de0a8 --- /dev/null +++ b/SOURCES/0038-vddk-stats-Line-up-the-columns-correctly.patch @@ -0,0 +1,26 @@ +From c39e0da834bf7fdd23a9be0d391c2666596988be Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Sun, 8 Jun 2025 11:04:41 +0100 +Subject: [PATCH] vddk: stats: Line up the columns correctly + +(cherry picked from commit 7da09b07148cc12c3214b18bc96c65ed45625dde) +--- + plugins/vddk/stats.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/plugins/vddk/stats.c b/plugins/vddk/stats.c +index 59cfee5a..b551fc5a 100644 +--- a/plugins/vddk/stats.c ++++ b/plugins/vddk/stats.c +@@ -97,7 +97,7 @@ display_stats (void) + qsort (stats.ptr, stats.len, sizeof stats.ptr[0], stat_compare); + + nbdkit_debug ("VDDK function stats (-D vddk.stats=1):"); +- nbdkit_debug ("%-24s %15s %5s %15s", ++ nbdkit_debug ("%-24s %15s %5s %15s", + "VixDiskLib_...", "us", "calls", "bytes"); + for (i = 0; i < stats.len; ++i) { + if (stats.ptr[i].usecs) { +-- +2.47.1 + diff --git a/SOURCES/0039-vddk-stats-Record-the-byte-count-of-each-QueryAlloca.patch b/SOURCES/0039-vddk-stats-Record-the-byte-count-of-each-QueryAlloca.patch new file mode 100644 index 0000000..1f83d86 --- /dev/null +++ b/SOURCES/0039-vddk-stats-Record-the-byte-count-of-each-QueryAlloca.patch @@ -0,0 +1,61 @@ +From c721bf99917a3f33454ebdf683fa450f6d996202 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Sun, 8 Jun 2025 10:40:26 +0100 +Subject: [PATCH] vddk: stats: Record the byte count of each + QueryAllocatedBlocks call + +In -D vddk.stats=1 output, keep track of the size of each call to +QueryAllocatedBlocks, and display that at the end. + +After this change, we see the number of bytes scanned in the "bytes" +column (previously this column was empty for this call): + +nbdkit: debug: VixDiskLib_... us calls bytes +nbdkit: debug: Open 27051497 2 +nbdkit: debug: GetInfo 12538428 4 +nbdkit: debug: ConnectEx 6944819 2 +nbdkit: debug: QueryAllocatedBlocks 4563503 3 21474967552 +nbdkit: debug: Close 1440271 2 +nbdkit: debug: Exit 1001407 1 +(cherry picked from commit 2db1ede27bb529b36b0075eab337f0c585d1a7ec) +--- + plugins/vddk/worker.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/plugins/vddk/worker.c b/plugins/vddk/worker.c +index 3925fb91..feb8d96f 100644 +--- a/plugins/vddk/worker.c ++++ b/plugins/vddk/worker.c +@@ -327,7 +327,8 @@ test_can_extents (struct vddk_handle *h) + 0, VIXDISKLIB_MIN_CHUNK_SIZE, + VIXDISKLIB_MIN_CHUNK_SIZE, + &block_list); +- VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0); ++ VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, ++ VIXDISKLIB_MIN_CHUNK_SIZE * VIXDISKLIB_SECTOR_SIZE); + error_suppression = 0; + if (err == VIX_OK) { + VDDK_CALL_START (VixDiskLib_FreeBlockList, "block_list") +@@ -435,7 +436,8 @@ get_extents_slow (struct command *cmd, struct vddk_handle *h) + start_sector, nr_sectors, + VIXDISKLIB_MIN_CHUNK_SIZE, + &block_list); +- VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0); ++ VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, ++ nr_sectors * VIXDISKLIB_SECTOR_SIZE); + if (err != VIX_OK) { + VDDK_ERROR (err, "VixDiskLib_QueryAllocatedBlocks"); + return -1; +@@ -532,7 +534,8 @@ pre_cache_extents (struct vddk_handle *h) + start_sector, nr_sectors, + VIXDISKLIB_MIN_CHUNK_SIZE, + &block_list); +- VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, 0); ++ VDDK_CALL_END (VixDiskLib_QueryAllocatedBlocks, ++ nr_sectors * VIXDISKLIB_SECTOR_SIZE); + if (err != VIX_OK) { + VDDK_ERROR (err, "VixDiskLib_QueryAllocatedBlocks"); + nbdkit_extents_free (extents); +-- +2.47.1 + diff --git a/SOURCES/0040-vddk-stats-Collect-elapsed-time-for-ReadAsync-and-Wr.patch b/SOURCES/0040-vddk-stats-Collect-elapsed-time-for-ReadAsync-and-Wr.patch new file mode 100644 index 0000000..92af692 --- /dev/null +++ b/SOURCES/0040-vddk-stats-Collect-elapsed-time-for-ReadAsync-and-Wr.patch @@ -0,0 +1,178 @@ +From 3bc36ccfafeea14e3df60e44e36731d27ba44585 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Sun, 8 Jun 2025 10:59:57 +0100 +Subject: [PATCH] vddk: stats: Collect elapsed time for ReadAsync and + WriteAsync + +A major existing problem with -D vddk.stats=1 is that it did not +accurately account for the time taken in these asynchronous calls. +This change records the elapsed time. (It's not possible to collect +the actual time spent "working on" these calls, we just add up the +total elapsed time, which is fine.) + +In the remote case, here's how the output changes from before this +change: + +nbdkit: debug: VixDiskLib_... us calls bytes +nbdkit: debug: ReadAsync 18 1 8192 + +to after this change: + +nbdkit: debug: VixDiskLib_... us calls bytes +nbdkit: debug: ReadAsync 243745 1 8192 + +An interesting thing about this is that in the local case this +actually reduces the apparent elapsed time, I think because VDDK calls +complete_command directly from the Read/WriteAsync function. It only +increases the apparent elapsed time in the remote case. + +(cherry picked from commit a224d7fd8d839287c63cb4a063569283dd974b48) +--- + plugins/vddk/nbdkit-vddk-plugin.pod | 3 +-- + plugins/vddk/vddk.h | 37 ++++++++++++++++++++++------- + plugins/vddk/worker.c | 13 ++++++++-- + 3 files changed, 41 insertions(+), 12 deletions(-) + +diff --git a/plugins/vddk/nbdkit-vddk-plugin.pod b/plugins/vddk/nbdkit-vddk-plugin.pod +index e937942c..1e376140 100644 +--- a/plugins/vddk/nbdkit-vddk-plugin.pod ++++ b/plugins/vddk/nbdkit-vddk-plugin.pod +@@ -715,8 +715,7 @@ Same as above, but for writing and flushing writes. + =item C + + Same as above, but for asynchronous read and write calls introduced in +-nbdkit 1.30. Unfortunately at the moment the amount of time spent in +-these calls is not accounted for correctly. ++S. + + =item C + +diff --git a/plugins/vddk/vddk.h b/plugins/vddk/vddk.h +index 461fb528..5ff02649 100644 +--- a/plugins/vddk/vddk.h ++++ b/plugins/vddk/vddk.h +@@ -39,6 +39,7 @@ + + #include + ++#include "cleanup.h" + #include "isaligned.h" + #include "tvdiff.h" + #include "vector.h" +@@ -91,7 +92,7 @@ extern int vddk_debug_stats; + */ + #define VDDK_CALL_START(fn, fs, ...) \ + do { \ +- struct timeval start_t, end_t; \ ++ struct timeval start_t; \ + /* GCC can optimize this away at compile time: */ \ + const bool datapath = \ + strcmp (#fn, "VixDiskLib_Read") == 0 || \ +@@ -105,13 +106,13 @@ extern int vddk_debug_stats; + do + #define VDDK_CALL_END(fn, bytes_) \ + while (0); \ +- if (vddk_debug_stats) { \ +- gettimeofday (&end_t, NULL); \ +- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&stats_lock); \ +- stats_##fn.usecs += tvdiff_usec (&start_t, &end_t); \ +- stats_##fn.calls++; \ +- stats_##fn.bytes += bytes_; \ +- } \ ++ update_stats (&start_t, bytes_, &stats_##fn); \ ++ } while (0) ++/* Same as VDDK_CALL_END, but for {Read,Write}Async, where we will ++ * update the stats ourself. ++ */ ++#define VDDK_CALL_END_ASYNC() \ ++ while (0); \ + } while (0) + + /* Print VDDK errors. */ +@@ -141,6 +142,7 @@ struct command { + uint64_t id; /* serial number */ + + /* These fields are used by the internal implementation. */ ++ struct timeval start_t; /* start time */ + pthread_mutex_t mutex; /* completion mutex */ + pthread_cond_t cond; /* completion condition */ + enum { SUBMITTED, SUCCEEDED, FAILED } status; +@@ -197,6 +199,25 @@ extern pthread_mutex_t stats_lock; + #undef OPTIONAL_STUB + extern void display_stats (void); + ++static inline void ++update_stats (const struct timeval *start_t, uint64_t bytes, ++ struct vddk_stat *stat) ++{ ++ if (vddk_debug_stats) { ++ struct timeval end_t; ++ int64_t usecs; ++ ++ gettimeofday (&end_t, NULL); ++ usecs = tvdiff_usec (start_t, &end_t); ++ ++ /* Keep this critical section as small as possible. */ ++ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&stats_lock); ++ stat->usecs += usecs; ++ stat->calls++; ++ stat->bytes += bytes; ++ } ++} ++ + /* utils.c */ + extern void trim (char *str); + +diff --git a/plugins/vddk/worker.c b/plugins/vddk/worker.c +index feb8d96f..9d3a5940 100644 +--- a/plugins/vddk/worker.c ++++ b/plugins/vddk/worker.c +@@ -120,6 +120,11 @@ complete_command (void *vp, VixError result) + nbdkit_debug ("command %" PRIu64 " (%s) completed", + cmd->id, command_type_string (cmd->type)); + ++ /* Update the stats for this asynchronous call. */ ++ update_stats (&cmd->start_t, cmd->count, ++ cmd->type == READ ? &stats_VixDiskLib_ReadAsync : ++ &stats_VixDiskLib_WriteAsync); ++ + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&cmd->mutex); + + if (result == VIX_OK) { +@@ -219,13 +224,15 @@ do_read (struct command *cmd, struct vddk_handle *h) + offset /= VIXDISKLIB_SECTOR_SIZE; + count /= VIXDISKLIB_SECTOR_SIZE; + ++ gettimeofday (&cmd->start_t, NULL); ++ + VDDK_CALL_START (VixDiskLib_ReadAsync, + "handle, %" PRIu64 " sectors, " + "%" PRIu32 " sectors, buffer, callback, %" PRIu64, + offset, count, cmd->id) + err = VixDiskLib_ReadAsync (h->handle, offset, count, buf, + complete_command, cmd); +- VDDK_CALL_END (VixDiskLib_ReadAsync, count * VIXDISKLIB_SECTOR_SIZE); ++ VDDK_CALL_END_ASYNC (); + if (err != VIX_ASYNC) { + VDDK_ERROR (err, "VixDiskLib_ReadAsync"); + return -1; +@@ -256,13 +263,15 @@ do_write (struct command *cmd, struct vddk_handle *h) + offset /= VIXDISKLIB_SECTOR_SIZE; + count /= VIXDISKLIB_SECTOR_SIZE; + ++ gettimeofday (&cmd->start_t, NULL); ++ + VDDK_CALL_START (VixDiskLib_WriteAsync, + "handle, %" PRIu64 " sectors, " + "%" PRIu32 " sectors, buffer, callback, %" PRIu64, + offset, count, cmd->id) + err = VixDiskLib_WriteAsync (h->handle, offset, count, buf, + complete_command, cmd); +- VDDK_CALL_END (VixDiskLib_WriteAsync, count * VIXDISKLIB_SECTOR_SIZE); ++ VDDK_CALL_END_ASYNC (); + if (err != VIX_ASYNC) { + VDDK_ERROR (err, "VixDiskLib_WriteAsync"); + return -1; +-- +2.47.1 + diff --git a/SOURCES/0041-server-Fix-off-by-one-for-maximum-block_status-lengt.patch b/SOURCES/0041-server-Fix-off-by-one-for-maximum-block_status-lengt.patch new file mode 100644 index 0000000..7e187fe --- /dev/null +++ b/SOURCES/0041-server-Fix-off-by-one-for-maximum-block_status-lengt.patch @@ -0,0 +1,170 @@ +From 4d7da0f8b01f70ceddfd3d10345bf08284591938 Mon Sep 17 00:00:00 2001 +From: Eric Blake +Date: Tue, 22 Apr 2025 17:01:12 -0500 +Subject: [PATCH] server: Fix off-by-one for maximum block_status length + [CVE-2025-47711] + +There has been an off-by-one bug in the code for .extents since the +introduction of that callback. Remember, internally the code allows +plugins to report on extents with 64-bit lengths, but the protocol +only supports 32-bit block status calls (nbdkit will need to create +plugin version 3 before it can support NBD's newer 64-bit block +status). As such, the server loop intentionally truncates a plugin's +large extent to 2**32-1 bytes. But in the process of checking whether +the loop should exit early, or if any additional extents should be +reported to the client, the server used 'pos > offset+count' instead +of >=, which is one byte too far. If the client has requested exactly +2**32-1 bytes, and the plugin's first extent has that same length, the +code erroneously proceeds on to the plugin's second extent. Worse, if +the plugin's first extent has 2**32 bytes or more, it was truncated to +2**31-1 bytes, but not completely handled, and the failure to exit the +loop early means that the server then fails the assertion: + +nbdkit: ../../server/protocol.c:505: extents_to_block_descriptors: +Assertion `e.length <= length' failed. + +The single-byte fix addresses both symptoms, while the added test +demonstrates both when run on older nbdkit (the protocol violation +when the plugin returns 2**32-1 bytes in the first extent, and the +assertion failure when the plugin returns 2**32 or more bytes in the +first extent). + +The problem can only be triggered by a client request for 2**32-1 +bytes; anything smaller is immune. The problem also does not occur +for plugins that do not return extents information beyond the client's +request, or if the first extent is smaller than the client's request. + +The ability to cause the server to die from an assertion failure can +be used as a denial of service attack against other clients. +Mitigations: if you require the use of TLS, then you can ensure that +you only have trusted clients that won't trigger a block status call +of length 2**32-1 bytes. Also, you can use "--filter=blocksize-policy +blocksize-minimum=512" to reject block status attempts from clients +that are not sector-aligned. + +Fixes: 26455d45 ('server: protocol: Implement Block Status "base:allocation".', v1.11.10) +Reported-by: Nikolay Ivanets +Signed-off-by: Eric Blake +Message-ID: <20250423211953.GR1450@redhat.com> +Reviewed-by: Richard W.M. Jones +(cherry picked from commit e6f96bd1b77c0cc927ce6aeff650b52238304f39) +--- + server/protocol.c | 2 +- + tests/Makefile.am | 2 ++ + tests/test-eval-extents.sh | 71 ++++++++++++++++++++++++++++++++++++++ + 3 files changed, 74 insertions(+), 1 deletion(-) + create mode 100755 tests/test-eval-extents.sh + +diff --git a/server/protocol.c b/server/protocol.c +index d428bfc8..b4b1c162 100644 +--- a/server/protocol.c ++++ b/server/protocol.c +@@ -499,7 +499,7 @@ extents_to_block_descriptors (struct nbdkit_extents *extents, + (*nr_blocks)++; + + pos += length; +- if (pos > offset + count) /* this must be the last block */ ++ if (pos >= offset + count) /* this must be the last block */ + break; + + /* If we reach here then we must have consumed this whole +diff --git a/tests/Makefile.am b/tests/Makefile.am +index eed96d28..9f9885b4 100644 +--- a/tests/Makefile.am ++++ b/tests/Makefile.am +@@ -824,6 +824,7 @@ TESTS += \ + test-eval.sh \ + test-eval-file.sh \ + test-eval-exports.sh \ ++ test-eval-extents.sh \ + test-eval-cache.sh \ + test-eval-dump-plugin.sh \ + test-eval-disconnect.sh \ +@@ -832,6 +833,7 @@ EXTRA_DIST += \ + test-eval.sh \ + test-eval-file.sh \ + test-eval-exports.sh \ ++ test-eval-extents.sh \ + test-eval-cache.sh \ + test-eval-dump-plugin.sh \ + test-eval-disconnect.sh \ +diff --git a/tests/test-eval-extents.sh b/tests/test-eval-extents.sh +new file mode 100755 +index 00000000..92b503e6 +--- /dev/null ++++ b/tests/test-eval-extents.sh +@@ -0,0 +1,71 @@ ++#!/usr/bin/env bash ++# nbdkit ++# Copyright Red Hat ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions are ++# met: ++# ++# * Redistributions of source code must retain the above copyright ++# notice, this list of conditions and the following disclaimer. ++# ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in the ++# documentation and/or other materials provided with the distribution. ++# ++# * Neither the name of Red Hat nor the names of its contributors may be ++# used to endorse or promote products derived from this software without ++# specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND ++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ++# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A ++# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR ++# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF ++# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT ++# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++# SUCH DAMAGE. ++ ++source ./functions.sh ++set -e ++set -x ++ ++requires_run ++requires_plugin eval ++requires_nbdsh_uri ++requires nbdsh --base-allocation --version ++ ++files="eval-extents.out" ++rm -f $files ++cleanup_fn rm -f $files ++ ++# Trigger an off-by-one bug introduced in v1.11.10 and fixed in v1.43.7 ++export script=' ++def f(context, offset, extents, status): ++ print(extents) ++ ++# First, probe where the server should return 2 extents. ++h.block_status(2**32-1, 2, f) ++ ++# Next, probe where the server has exactly 2**32-1 bytes in its first extent. ++h.block_status(2**32-1, 1, f) ++ ++# Now, probe where the first extent has to be truncated. ++h.block_status(2**32-1, 0, f) ++' ++nbdkit eval \ ++ get_size='echo 5G' \ ++ pread='dd if=/dev/zero count=$3 iflag=count_bytes' \ ++ extents='echo 0 4G 1; echo 4G 1G 2' \ ++ --run 'nbdsh --base-allocation --uri "$uri" -c "$script"' \ ++ > eval-extents.out ++cat eval-extents.out ++diff -u - eval-extents.out < +Date: Tue, 22 Apr 2025 19:53:39 -0500 +Subject: [PATCH] blocksize: Fix 32-bit overflow in .extents [CVE-2025-47712] + +If the original request is larger than 2**32 - minblock, then we were +calling nbdkit_extents_aligned() with a count that rounded up then +overflowed to 0 instead of the intended 4G because of overflowing a +32-bit type, which in turn causes an assertion failure: + +nbdkit: ../../server/backend.c:814: backend_extents: Assertion `backend_valid_range (c, offset, count)' failed. + +The fix is to force the rounding to be in a 64-bit type from the +get-go. + +The ability for a well-behaved client to cause the server to die from +an assertion failure can be used as a denial of service attack against +other clients. Mitigations: if you requrire the use of TLS, then you +can ensure that you only have trusted clients that won't trigger a +block status call that large. Also, the problem only occurs when +using the blocksize filter, although setting the filter's maxlen +parameter to a smaller value than its default of 2**32-1 does not +help. + +Fixes: 2680be00 ('blocksize: Fix .extents when plugin changes type within minblock', v1.21.16) +Signed-off-by: Eric Blake +Message-ID: <20250423210917.1784789-3-eblake@redhat.com> +Reviewed-by: Richard W.M. Jones +(cherry picked from commit a486f88d1eea653ea88b0bf8804c4825dab25ec7) +--- + filters/blocksize/blocksize.c | 5 +- + tests/Makefile.am | 2 + + tests/test-blocksize-extents-overflow.sh | 83 ++++++++++++++++++++++++ + 3 files changed, 88 insertions(+), 2 deletions(-) + create mode 100755 tests/test-blocksize-extents-overflow.sh + +diff --git a/filters/blocksize/blocksize.c b/filters/blocksize/blocksize.c +index 09195cea..e5c8b744 100644 +--- a/filters/blocksize/blocksize.c ++++ b/filters/blocksize/blocksize.c +@@ -482,8 +482,9 @@ blocksize_extents (nbdkit_next *next, + return -1; + } + +- if (nbdkit_extents_aligned (next, MIN (ROUND_UP (count, h->minblock), +- h->maxlen), ++ if (nbdkit_extents_aligned (next, ++ MIN (ROUND_UP ((uint64_t) count, h->minblock), ++ h->maxlen), + ROUND_DOWN (offset, h->minblock), flags, + h->minblock, extents2, err) == -1) + return -1; +diff --git a/tests/Makefile.am b/tests/Makefile.am +index 9f9885b4..428b65e2 100644 +--- a/tests/Makefile.am ++++ b/tests/Makefile.am +@@ -1592,12 +1592,14 @@ test_layers_filter3_la_LIBADD = $(IMPORT_LIBRARY_ON_WINDOWS) + TESTS += \ + test-blocksize.sh \ + test-blocksize-extents.sh \ ++ test-blocksize-extents-overflow.sh \ + test-blocksize-default.sh \ + test-blocksize-sharding.sh \ + $(NULL) + EXTRA_DIST += \ + test-blocksize.sh \ + test-blocksize-extents.sh \ ++ test-blocksize-extents-overflow.sh \ + test-blocksize-default.sh \ + test-blocksize-sharding.sh \ + $(NULL) +diff --git a/tests/test-blocksize-extents-overflow.sh b/tests/test-blocksize-extents-overflow.sh +new file mode 100755 +index 00000000..844c3999 +--- /dev/null ++++ b/tests/test-blocksize-extents-overflow.sh +@@ -0,0 +1,83 @@ ++#!/usr/bin/env bash ++# nbdkit ++# Copyright Red Hat ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions are ++# met: ++# ++# * Redistributions of source code must retain the above copyright ++# notice, this list of conditions and the following disclaimer. ++# ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in the ++# documentation and/or other materials provided with the distribution. ++# ++# * Neither the name of Red Hat nor the names of its contributors may be ++# used to endorse or promote products derived from this software without ++# specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND ++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, ++# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A ++# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR ++# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF ++# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT ++# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++# SUCH DAMAGE. ++ ++# Demonstrate a fix for a bug where blocksize overflowed 32 bits ++ ++source ./functions.sh ++set -e ++set -x ++ ++requires_run ++requires_plugin eval ++requires_nbdsh_uri ++requires nbdsh --base-allocation --version ++ ++# Script a sparse server that requires 512-byte aligned requests. ++exts=' ++if test $(( ($3|$4) & 511 )) != 0; then ++ echo "EINVAL request unaligned" 2>&1 ++ exit 1 ++fi ++echo 0 5G 0 ++' ++ ++# We also need an nbdsh script to parse all extents, coalescing adjacent ++# types for simplicity. ++# FIXME: Once nbdkit plugin version 3 allows 64-bit block extents, run ++# this test twice, once for each bit size (32-bit needs 2 extents, 64-bit ++# will get the same result with only 1 extent). ++export script=' ++size = h.get_size() ++offs = 0 ++entries = [] ++def f(metacontext, offset, e, err): ++ global entries ++ global offs ++ assert offs == offset ++ for length, flags in zip(*[iter(e)] * 2): ++ if entries and flags == entries[-1][1]: ++ entries[-1] = (entries[-1][0] + length, flags) ++ else: ++ entries.append((length, flags)) ++ offs = offs + length ++ ++# Test a loop over the entire device ++while offs < size: ++ len = min(size - offs, 2**32-1) ++ h.block_status(len, offs, f) ++assert entries == [(5 * 2**30, 0)] ++' ++ ++# Now run everything ++nbdkit --filter=blocksize eval minblock=512 \ ++ get_size='echo 5G' pread='exit 1' extents="$exts" \ ++ --run 'nbdsh --base-allocation -u "$uri" -c "$script"' +-- +2.47.1 + diff --git a/SOURCES/copy-patches.sh b/SOURCES/copy-patches.sh index 38b87a1..cad5231 100755 --- a/SOURCES/copy-patches.sh +++ b/SOURCES/copy-patches.sh @@ -6,7 +6,7 @@ set -e # directory. Use it like this: # ./copy-patches.sh -rhel_version=9.6 +rhel_version=9.7 # Check we're in the right directory. if [ ! -f nbdkit.spec ]; then diff --git a/SPECS/nbdkit.spec b/SPECS/nbdkit.spec index 2da1292..1873250 100644 --- a/SPECS/nbdkit.spec +++ b/SPECS/nbdkit.spec @@ -56,7 +56,7 @@ Name: nbdkit Version: 1.38.5 -Release: 2%{?dist} +Release: 10%{?dist} Summary: NBD server License: BSD-3-Clause @@ -78,7 +78,7 @@ Source2: libguestfs.keyring Source3: copy-patches.sh # Patches come from the upstream repository: -# https://gitlab.com/nbdkit/nbdkit/-/commits/rhel-9.6/ +# https://gitlab.com/nbdkit/nbdkit/-/commits/rhel-9.7/ # Patches. Patch0001: 0001-server-log-Move-preserve-errno-to-log_verror-functio.patch @@ -91,6 +91,38 @@ Patch0007: 0007-vddk-Cache-the-disk-size-in-the-handle.patch Patch0008: 0008-vddk-do_extents-Mark-some-local-variables-const.patch Patch0009: 0009-vddk-do_extents-Exit-the-function-if-we-hit-req_one-.patch Patch0010: 0010-vddk-do_extents-Avoid-reading-partial-chunk-beyond-t.patch +Patch0011: 0011-file-Add-debugging-if-sync_file_range-posix_fadvise-.patch +Patch0012: 0012-file-If-sync_file_range-fails-to-start-don-t-add-win.patch +Patch0013: 0013-tests-Add-tests-of-file-plugin-cache-none.patch +Patch0014: 0014-tests-Add-more-generic-tests-of-file-cache-none.patch +Patch0015: 0015-file-Hard-error-if-sync_file_range-fails.patch +Patch0016: 0016-file-Reduce-the-size-of-the-lock-around-write-evicti.patch +Patch0017: 0017-file-Document-implicit-assumption-about-eviction-win.patch +Patch0018: 0018-server-Turn-flush-into-a-controlpath-message.patch +Patch0019: 0019-file-Fix-minor-typo-in-debug-message.patch +Patch0020: 0020-file-Add-more-debugging-when-D-file.zero-1-is-used.patch +Patch0021: 0021-file-Fix-comment-style-in-a-few-places.patch +Patch0022: 0022-file-Fix-do_fallocate-debugging-on-Alpine.patch +Patch0023: 0023-file-Rename-h-can_zeroout-to-h-can_blkzeroout-to-ref.patch +Patch0024: 0024-file-zero-Document-implicit-order-that-we-will-try-z.patch +Patch0025: 0025-file-zero-Use-BLKDISCARD-method-if-may_trim-is-set.patch +Patch0026: 0026-vddk-Debug-length-of-extents-when-using-D-vddk.exten.patch +Patch0027: 0027-cacheextents-Mark-this-filter-as-deprecated.patch +Patch0028: 0028-include-Move-some-extent-functions-to-nbdkit-common..patch +Patch0029: 0029-vddk-Display-command-type-in-command-completed-messa.patch +Patch0030: 0030-vddk-Cache-the-readonly-flag-from-the-.open-call-in-.patch +Patch0031: 0031-vddk-Move-minimum-version-of-VDDK-to-6.7.patch +Patch0032: 0032-vddk-Unconditionally-test-QueryAllocatedBlocks.patch +Patch0033: 0033-vddk-Pre-cache-the-extents-for-readonly-connections.patch +Patch0034: 0034-file-Save-the-filename-or-equivalent-in-the-file-han.patch +Patch0035: 0035-file-Add-the-filename-or-equivalent-to-error-message.patch +Patch0036: 0036-file-Add-offset-count-to-error-messages.patch +Patch0037: 0037-vddk-stats-Use-us-instead-of-Unicode-s-for-microseco.patch +Patch0038: 0038-vddk-stats-Line-up-the-columns-correctly.patch +Patch0039: 0039-vddk-stats-Record-the-byte-count-of-each-QueryAlloca.patch +Patch0040: 0040-vddk-stats-Collect-elapsed-time-for-ReadAsync-and-Wr.patch +Patch0041: 0041-server-Fix-off-by-one-for-maximum-block_status-lengt.patch +Patch0042: 0042-blocksize-Fix-32-bit-overflow-in-.extents-CVE-2025-4.patch # For automatic RPM Provides generation. # See: https://rpm-software-management.github.io/rpm/manual/dependency_generators.html @@ -1509,6 +1541,31 @@ fi %changelog +* Mon Jun 09 2025 Richard W.M. Jones - 1.38.5-10 +- CVE-2025-47711 denial of service attack by client sending maximum size block + status +- CVE-2025-47712 denial of service attack by client sending large unaligned + size block status + resolves: RHEL-95814 + +* Sun Jun 08 2025 Richard W.M. Jones - 1.38.5-9 +- vddk: Improve statistics + related: RHEL-94823 + +* Thu Jun 05 2025 Richard W.M. Jones - 1.38.5-8 +- Log filename, offset and count in nbdkit-file-plugin error messages + resolves: RHEL-95363 + +* Mon Jun 02 2025 Richard W.M. Jones - 1.38.5-7 +- vddk: Pre-cache the extents for readonly connections + resolves: RHEL-94823 + +* Thu May 01 2025 Richard W.M. Jones - 1.38.5-6 +- Add extra system call checking and debugging to nbdkit-file-plugin + resolves: RHEL-85510 +- Allow nbdkit-file-plugin to zero and trim block devices + resolves: RHEL-89353 + * Mon Jan 06 2025 Richard W.M. Jones - 1.38.5-2 - vddk: Avoid reading partial chunk beyond the end of the disk resolves: RHEL-71694