From 5cb4adb94a6ff4325205fea3512c037c91579263 Mon Sep 17 00:00:00 2001 From: "Richard W.M. Jones" Date: Tue, 7 Dec 2021 21:08:26 +0000 Subject: [PATCH] file: Fix implementation of cache=none for writes When testing virt-v2v we found that cache=none had very pessimal performance in its current implementation when writing. See: https://github.com/libguestfs/virt-v2v/commit/ac59d3b2310511b1537d408b675b19ec9a5d384e However we know of a much better implementation - the one in nbdcopy. This commit copies that implementation (for writes only). A simple test is to do: $ ./nbdkit file out.img cache=none --run 'nbdcopy fedora-33.img $uri' and then check the cache usage of the output file, which should be around 0% (using https://github.com/Feh/nocache): $ cachestats out.img pages in cache: 409/1572864 (0.0%) [filesize=6291456.0K, pagesize=4K] For modular virt-v2v doing a local disk to local disk conversion: - before this change, without cache=none virt-v2v took 93.7 seconds, 19.1% pages cached in output file - before this change, enabling cache=none virt-v2v took 125.4 seconds, 0.0% pages cached in output file ^^^ this is the bad case which caused the investigation - after this change, without cache=none virt-v2v took 93.2 seconds, 19.1% pages cached in output file - after this change, enabling cache=none virt-v2v took 97.9 seconds, 0.1% pages cached in output file I tried to adjust NR_WINDOWS to find an optimum. Increasing it made no difference in performance but predictably caused a slight increase in cached pages. Reducing it slowed performance slightly. So I conclude that 8 is about right, but it probably depends on the hardware. (cherry picked from commit a956e2e75d6c88eeefecd967505667c9f176e3af) --- plugins/file/file.c | 79 +++++++++++++++++++++++++---- plugins/file/nbdkit-file-plugin.pod | 3 ++ 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/plugins/file/file.c b/plugins/file/file.c index 35270a24..caf24b2c 100644 --- a/plugins/file/file.c +++ b/plugins/file/file.c @@ -85,6 +85,69 @@ static int fadvise_mode = /* cache mode */ static enum { cache_default, cache_none } cache_mode = cache_default; +/* Define EVICT_WRITES if we are going to evict the page cache + * (cache=none) after writing. This is only known to work on Linux. + */ +#ifdef __linux__ +#define EVICT_WRITES 1 +#endif + +#ifdef EVICT_WRITES +/* Queue writes so they will be evicted from the cache. See + * libnbd.git copy/file-ops.c for the rationale behind this. + */ +#define NR_WINDOWS 8 + +struct write_window { + int fd; + uint64_t offset; + size_t len; +}; + +static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER; +static struct write_window window[NR_WINDOWS]; + +static void +evict_writes (int fd, uint64_t offset, size_t len) +{ + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); + + /* Evict the oldest window from the page cache. */ + if (window[0].len > 0) { + sync_file_range (window[0].fd, window[0].offset, window[0].len, + SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| + SYNC_FILE_RANGE_WAIT_AFTER); + posix_fadvise (window[0].fd, window[0].offset, window[0].len, + POSIX_FADV_DONTNEED); + } + + /* Move the Nth window to N-1. */ + memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1)); + + /* Set up the current window and tell Linux to start writing it out + * to disk (asynchronously). + */ + sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE); + window[NR_WINDOWS-1].fd = fd; + window[NR_WINDOWS-1].offset = offset; + window[NR_WINDOWS-1].len = len; +} + +/* When we close the handle we must remove any windows which are still + * associated. They missed the boat, oh well :-( + */ +static void +remove_fd_from_window (int fd) +{ + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); + size_t i; + + for (i = 0; i < NR_WINDOWS; ++i) + if (window[i].len > 0 && window[i].fd == fd) + window[i].len = 0; +} +#endif /* EVICT_WRITES */ + /* Any callbacks using lseek must be protected by this lock. */ static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER; @@ -431,6 +494,9 @@ file_close (void *handle) { struct handle *h = handle; +#ifdef EVICT_WRITES + remove_fd_from_window (h->fd); +#endif close (h->fd); free (h); } @@ -583,15 +649,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, { struct handle *h = handle; -#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED) +#if EVICT_WRITES uint32_t orig_count = count; uint64_t orig_offset = offset; - - /* If cache=none we want to force pages we have just written to the - * file to be flushed to disk so we can immediately evict them from - * the page cache. - */ - if (cache_mode == cache_none) flags |= NBDKIT_FLAG_FUA; #endif while (count > 0) { @@ -608,10 +668,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1) return -1; -#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED) - /* On Linux this will evict the pages we just wrote from the page cache. */ +#if EVICT_WRITES if (cache_mode == cache_none) - posix_fadvise (h->fd, orig_offset, orig_count, POSIX_FADV_DONTNEED); + evict_writes (h->fd, orig_offset, orig_count); #endif return 0; diff --git a/plugins/file/nbdkit-file-plugin.pod b/plugins/file/nbdkit-file-plugin.pod index 0ac0ee53..f8f0e198 100644 --- a/plugins/file/nbdkit-file-plugin.pod +++ b/plugins/file/nbdkit-file-plugin.pod @@ -117,6 +117,9 @@ cache: nbdkit file disk.img fadvise=sequential cache=none +Only use fadvise=sequential if reading, and the reads are mainly +sequential. + =head2 Files on tmpfs If you want to expose a file that resides on a file system known to -- 2.31.1