file: Fix implementation of cache=none for writes

Backport the following upstream commit:
a956e2e75d

related: rhbz#2011709
This commit is contained in:
Richard W.M. Jones 2021-12-08 11:25:06 +00:00
parent d2ce66ec1f
commit da9411d61e
2 changed files with 185 additions and 2 deletions

View File

@ -0,0 +1,181 @@
From 86b2baa7cd4a32246d447cbcedaaef3a15212ea4 Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones@redhat.com>
Date: Tue, 7 Dec 2021 21:08:26 +0000
Subject: [PATCH] file: Fix implementation of cache=none for writes
When testing virt-v2v we found that cache=none had very pessimal
performance in its current implementation when writing. See:
https://github.com/libguestfs/virt-v2v/commit/ac59d3b2310511b1537d408b675b19ec9a5d384e
However we know of a much better implementation - the one in nbdcopy.
This commit copies that implementation (for writes only).
A simple test is to do:
$ ./nbdkit file out.img cache=none --run 'nbdcopy fedora-33.img $uri'
and then check the cache usage of the output file, which should be
around 0% (using https://github.com/Feh/nocache):
$ cachestats out.img
pages in cache: 409/1572864 (0.0%) [filesize=6291456.0K, pagesize=4K]
For modular virt-v2v doing a local disk to local disk conversion:
- before this change, without cache=none
virt-v2v took 93.7 seconds, 19.1% pages cached in output file
- before this change, enabling cache=none
virt-v2v took 125.4 seconds, 0.0% pages cached in output file
^^^ this is the bad case which caused the investigation
- after this change, without cache=none
virt-v2v took 93.2 seconds, 19.1% pages cached in output file
- after this change, enabling cache=none
virt-v2v took 97.9 seconds, 0.1% pages cached in output file
I tried to adjust NR_WINDOWS to find an optimum. Increasing it made
no difference in performance but predictably caused a slight increase
in cached pages. Reducing it slowed performance slightly. So I
conclude that 8 is about right, but it probably depends on the
hardware.
(cherry picked from commit a956e2e75d6c88eeefecd967505667c9f176e3af)
---
plugins/file/file.c | 79 +++++++++++++++++++++++++----
plugins/file/nbdkit-file-plugin.pod | 3 ++
2 files changed, 72 insertions(+), 10 deletions(-)
diff --git a/plugins/file/file.c b/plugins/file/file.c
index 35270a24..caf24b2c 100644
--- a/plugins/file/file.c
+++ b/plugins/file/file.c
@@ -85,6 +85,69 @@ static int fadvise_mode =
/* cache mode */
static enum { cache_default, cache_none } cache_mode = cache_default;
+/* Define EVICT_WRITES if we are going to evict the page cache
+ * (cache=none) after writing. This is only known to work on Linux.
+ */
+#ifdef __linux__
+#define EVICT_WRITES 1
+#endif
+
+#ifdef EVICT_WRITES
+/* Queue writes so they will be evicted from the cache. See
+ * libnbd.git copy/file-ops.c for the rationale behind this.
+ */
+#define NR_WINDOWS 8
+
+struct write_window {
+ int fd;
+ uint64_t offset;
+ size_t len;
+};
+
+static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER;
+static struct write_window window[NR_WINDOWS];
+
+static void
+evict_writes (int fd, uint64_t offset, size_t len)
+{
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock);
+
+ /* Evict the oldest window from the page cache. */
+ if (window[0].len > 0) {
+ sync_file_range (window[0].fd, window[0].offset, window[0].len,
+ SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|
+ SYNC_FILE_RANGE_WAIT_AFTER);
+ posix_fadvise (window[0].fd, window[0].offset, window[0].len,
+ POSIX_FADV_DONTNEED);
+ }
+
+ /* Move the Nth window to N-1. */
+ memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1));
+
+ /* Set up the current window and tell Linux to start writing it out
+ * to disk (asynchronously).
+ */
+ sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE);
+ window[NR_WINDOWS-1].fd = fd;
+ window[NR_WINDOWS-1].offset = offset;
+ window[NR_WINDOWS-1].len = len;
+}
+
+/* When we close the handle we must remove any windows which are still
+ * associated. They missed the boat, oh well :-(
+ */
+static void
+remove_fd_from_window (int fd)
+{
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock);
+ size_t i;
+
+ for (i = 0; i < NR_WINDOWS; ++i)
+ if (window[i].len > 0 && window[i].fd == fd)
+ window[i].len = 0;
+}
+#endif /* EVICT_WRITES */
+
/* Any callbacks using lseek must be protected by this lock. */
static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -431,6 +494,9 @@ file_close (void *handle)
{
struct handle *h = handle;
+#ifdef EVICT_WRITES
+ remove_fd_from_window (h->fd);
+#endif
close (h->fd);
free (h);
}
@@ -583,15 +649,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset,
{
struct handle *h = handle;
-#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED)
+#if EVICT_WRITES
uint32_t orig_count = count;
uint64_t orig_offset = offset;
-
- /* If cache=none we want to force pages we have just written to the
- * file to be flushed to disk so we can immediately evict them from
- * the page cache.
- */
- if (cache_mode == cache_none) flags |= NBDKIT_FLAG_FUA;
#endif
while (count > 0) {
@@ -608,10 +668,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset,
if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1)
return -1;
-#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED)
- /* On Linux this will evict the pages we just wrote from the page cache. */
+#if EVICT_WRITES
if (cache_mode == cache_none)
- posix_fadvise (h->fd, orig_offset, orig_count, POSIX_FADV_DONTNEED);
+ evict_writes (h->fd, orig_offset, orig_count);
#endif
return 0;
diff --git a/plugins/file/nbdkit-file-plugin.pod b/plugins/file/nbdkit-file-plugin.pod
index 0ac0ee53..f8f0e198 100644
--- a/plugins/file/nbdkit-file-plugin.pod
+++ b/plugins/file/nbdkit-file-plugin.pod
@@ -117,6 +117,9 @@ cache:
nbdkit file disk.img fadvise=sequential cache=none
+Only use fadvise=sequential if reading, and the reads are mainly
+sequential.
+
=head2 Files on tmpfs
If you want to expose a file that resides on a file system known to
--
2.31.1

View File

@ -51,7 +51,7 @@ ExclusiveArch: x86_64
Name: nbdkit Name: nbdkit
Version: 1.28.3 Version: 1.28.3
Release: 1%{?dist} Release: 2%{?dist}
Summary: NBD server Summary: NBD server
License: BSD License: BSD
@ -104,6 +104,7 @@ Patch0025: 0025-vddk-Drop-obsolete-documentation-related-to-thread-m.patch
Patch0026: 0026-Revert-podwrapper.pl.in-Use-short-commit-date.patch Patch0026: 0026-Revert-podwrapper.pl.in-Use-short-commit-date.patch
Patch0027: 0027-Fix-podwrapper.pl.in-Use-short-commit-date.patch Patch0027: 0027-Fix-podwrapper.pl.in-Use-short-commit-date.patch
Patch0028: 0028-scripts-Add-simple-script-for-automating-VDDK-disk-c.patch Patch0028: 0028-scripts-Add-simple-script-for-automating-VDDK-disk-c.patch
Patch0029: 0029-file-Fix-implementation-of-cache-none-for-writes.patch
BuildRequires: make BuildRequires: make
%if 0%{patches_touch_autotools} %if 0%{patches_touch_autotools}
@ -1217,7 +1218,7 @@ export LIBGUESTFS_TRACE=1
%changelog %changelog
* Tue Dec 07 2021 Richard W.M. Jones <rjones@redhat.com> - 1.28.3-1 * Wed Dec 08 2021 Richard W.M. Jones <rjones@redhat.com> - 1.28.3-2
- Rebase to new stable branch version 1.28.3 - Rebase to new stable branch version 1.28.3
resolves: rhbz#2011709 resolves: rhbz#2011709
- Move nbdkit-null-plugin to nbdkit-server package - Move nbdkit-null-plugin to nbdkit-server package
@ -1228,6 +1229,7 @@ export LIBGUESTFS_TRACE=1
- Switch to xorriso (instead of genisoimage) - Switch to xorriso (instead of genisoimage)
- Distribute README.VDDK in nbdkit-vddk-plugin subpackage - Distribute README.VDDK in nbdkit-vddk-plugin subpackage
- Fix nbdkit-cow-filter cow-block-size=4096 - Fix nbdkit-cow-filter cow-block-size=4096
- file: Fix implementation of cache=none for writes
resolves: rhbz#2029751 resolves: rhbz#2029751
* Thu Aug 19 2021 Richard W.M. Jones <rjones@redhat.com> - 1.26.5-1 * Thu Aug 19 2021 Richard W.M. Jones <rjones@redhat.com> - 1.26.5-1