From da9411d61e4c5f345bef4a7a28de024fde5b7efd Mon Sep 17 00:00:00 2001 From: "Richard W.M. Jones" Date: Wed, 8 Dec 2021 11:25:06 +0000 Subject: [PATCH] file: Fix implementation of cache=none for writes Backport the following upstream commit: https://gitlab.com/nbdkit/nbdkit/-/commit/a956e2e75d6c88eeefecd967505667c9f176e3af related: rhbz#2011709 --- ...lementation-of-cache-none-for-writes.patch | 181 ++++++++++++++++++ nbdkit.spec | 6 +- 2 files changed, 185 insertions(+), 2 deletions(-) create mode 100644 0029-file-Fix-implementation-of-cache-none-for-writes.patch diff --git a/0029-file-Fix-implementation-of-cache-none-for-writes.patch b/0029-file-Fix-implementation-of-cache-none-for-writes.patch new file mode 100644 index 0000000..7ed3e22 --- /dev/null +++ b/0029-file-Fix-implementation-of-cache-none-for-writes.patch @@ -0,0 +1,181 @@ +From 86b2baa7cd4a32246d447cbcedaaef3a15212ea4 Mon Sep 17 00:00:00 2001 +From: "Richard W.M. Jones" +Date: Tue, 7 Dec 2021 21:08:26 +0000 +Subject: [PATCH] file: Fix implementation of cache=none for writes + +When testing virt-v2v we found that cache=none had very pessimal +performance in its current implementation when writing. See: + + https://github.com/libguestfs/virt-v2v/commit/ac59d3b2310511b1537d408b675b19ec9a5d384e + +However we know of a much better implementation - the one in nbdcopy. +This commit copies that implementation (for writes only). + +A simple test is to do: + + $ ./nbdkit file out.img cache=none --run 'nbdcopy fedora-33.img $uri' + +and then check the cache usage of the output file, which should be +around 0% (using https://github.com/Feh/nocache): + + $ cachestats out.img + pages in cache: 409/1572864 (0.0%) [filesize=6291456.0K, pagesize=4K] + +For modular virt-v2v doing a local disk to local disk conversion: + + - before this change, without cache=none + virt-v2v took 93.7 seconds, 19.1% pages cached in output file + + - before this change, enabling cache=none + virt-v2v took 125.4 seconds, 0.0% pages cached in output file + ^^^ this is the bad case which caused the investigation + + - after this change, without cache=none + virt-v2v took 93.2 seconds, 19.1% pages cached in output file + + - after this change, enabling cache=none + virt-v2v took 97.9 seconds, 0.1% pages cached in output file + +I tried to adjust NR_WINDOWS to find an optimum. Increasing it made +no difference in performance but predictably caused a slight increase +in cached pages. Reducing it slowed performance slightly. So I +conclude that 8 is about right, but it probably depends on the +hardware. + +(cherry picked from commit a956e2e75d6c88eeefecd967505667c9f176e3af) +--- + plugins/file/file.c | 79 +++++++++++++++++++++++++---- + plugins/file/nbdkit-file-plugin.pod | 3 ++ + 2 files changed, 72 insertions(+), 10 deletions(-) + +diff --git a/plugins/file/file.c b/plugins/file/file.c +index 35270a24..caf24b2c 100644 +--- a/plugins/file/file.c ++++ b/plugins/file/file.c +@@ -85,6 +85,69 @@ static int fadvise_mode = + /* cache mode */ + static enum { cache_default, cache_none } cache_mode = cache_default; + ++/* Define EVICT_WRITES if we are going to evict the page cache ++ * (cache=none) after writing. This is only known to work on Linux. ++ */ ++#ifdef __linux__ ++#define EVICT_WRITES 1 ++#endif ++ ++#ifdef EVICT_WRITES ++/* Queue writes so they will be evicted from the cache. See ++ * libnbd.git copy/file-ops.c for the rationale behind this. ++ */ ++#define NR_WINDOWS 8 ++ ++struct write_window { ++ int fd; ++ uint64_t offset; ++ size_t len; ++}; ++ ++static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER; ++static struct write_window window[NR_WINDOWS]; ++ ++static void ++evict_writes (int fd, uint64_t offset, size_t len) ++{ ++ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); ++ ++ /* Evict the oldest window from the page cache. */ ++ if (window[0].len > 0) { ++ sync_file_range (window[0].fd, window[0].offset, window[0].len, ++ SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| ++ SYNC_FILE_RANGE_WAIT_AFTER); ++ posix_fadvise (window[0].fd, window[0].offset, window[0].len, ++ POSIX_FADV_DONTNEED); ++ } ++ ++ /* Move the Nth window to N-1. */ ++ memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1)); ++ ++ /* Set up the current window and tell Linux to start writing it out ++ * to disk (asynchronously). ++ */ ++ sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE); ++ window[NR_WINDOWS-1].fd = fd; ++ window[NR_WINDOWS-1].offset = offset; ++ window[NR_WINDOWS-1].len = len; ++} ++ ++/* When we close the handle we must remove any windows which are still ++ * associated. They missed the boat, oh well :-( ++ */ ++static void ++remove_fd_from_window (int fd) ++{ ++ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); ++ size_t i; ++ ++ for (i = 0; i < NR_WINDOWS; ++i) ++ if (window[i].len > 0 && window[i].fd == fd) ++ window[i].len = 0; ++} ++#endif /* EVICT_WRITES */ ++ + /* Any callbacks using lseek must be protected by this lock. */ + static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER; + +@@ -431,6 +494,9 @@ file_close (void *handle) + { + struct handle *h = handle; + ++#ifdef EVICT_WRITES ++ remove_fd_from_window (h->fd); ++#endif + close (h->fd); + free (h); + } +@@ -583,15 +649,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, + { + struct handle *h = handle; + +-#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED) ++#if EVICT_WRITES + uint32_t orig_count = count; + uint64_t orig_offset = offset; +- +- /* If cache=none we want to force pages we have just written to the +- * file to be flushed to disk so we can immediately evict them from +- * the page cache. +- */ +- if (cache_mode == cache_none) flags |= NBDKIT_FLAG_FUA; + #endif + + while (count > 0) { +@@ -608,10 +668,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, + if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1) + return -1; + +-#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED) +- /* On Linux this will evict the pages we just wrote from the page cache. */ ++#if EVICT_WRITES + if (cache_mode == cache_none) +- posix_fadvise (h->fd, orig_offset, orig_count, POSIX_FADV_DONTNEED); ++ evict_writes (h->fd, orig_offset, orig_count); + #endif + + return 0; +diff --git a/plugins/file/nbdkit-file-plugin.pod b/plugins/file/nbdkit-file-plugin.pod +index 0ac0ee53..f8f0e198 100644 +--- a/plugins/file/nbdkit-file-plugin.pod ++++ b/plugins/file/nbdkit-file-plugin.pod +@@ -117,6 +117,9 @@ cache: + + nbdkit file disk.img fadvise=sequential cache=none + ++Only use fadvise=sequential if reading, and the reads are mainly ++sequential. ++ + =head2 Files on tmpfs + + If you want to expose a file that resides on a file system known to +-- +2.31.1 + diff --git a/nbdkit.spec b/nbdkit.spec index 6a06800..b1ade79 100644 --- a/nbdkit.spec +++ b/nbdkit.spec @@ -51,7 +51,7 @@ ExclusiveArch: x86_64 Name: nbdkit Version: 1.28.3 -Release: 1%{?dist} +Release: 2%{?dist} Summary: NBD server License: BSD @@ -104,6 +104,7 @@ Patch0025: 0025-vddk-Drop-obsolete-documentation-related-to-thread-m.patch Patch0026: 0026-Revert-podwrapper.pl.in-Use-short-commit-date.patch Patch0027: 0027-Fix-podwrapper.pl.in-Use-short-commit-date.patch Patch0028: 0028-scripts-Add-simple-script-for-automating-VDDK-disk-c.patch +Patch0029: 0029-file-Fix-implementation-of-cache-none-for-writes.patch BuildRequires: make %if 0%{patches_touch_autotools} @@ -1217,7 +1218,7 @@ export LIBGUESTFS_TRACE=1 %changelog -* Tue Dec 07 2021 Richard W.M. Jones - 1.28.3-1 +* Wed Dec 08 2021 Richard W.M. Jones - 1.28.3-2 - Rebase to new stable branch version 1.28.3 resolves: rhbz#2011709 - Move nbdkit-null-plugin to nbdkit-server package @@ -1228,6 +1229,7 @@ export LIBGUESTFS_TRACE=1 - Switch to xorriso (instead of genisoimage) - Distribute README.VDDK in nbdkit-vddk-plugin subpackage - Fix nbdkit-cow-filter cow-block-size=4096 +- file: Fix implementation of cache=none for writes resolves: rhbz#2029751 * Thu Aug 19 2021 Richard W.M. Jones - 1.26.5-1