7f6c9b5ea1
resolves: rhbz#1950632 — Add Python .cleanup() method.
Fix data corruption in zero and trim on unaligned tail — resolves: rhbz#1990134
(401 lines, 13 KiB)
From f6d831cd517851157e822ee96de5894e0b37c22d Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones@redhat.com>
Date: Mon, 26 Jul 2021 13:55:21 +0100
Subject: [PATCH] cache, cow: Add blk_read_multiple function

Currently the cache and cow filters break up large requests into many
single block-sized requests to the underlying plugin.  For some
plugins (eg. curl) this is very inefficient and causes huge
slow-downs.

For example I tested nbdkit + curl vs nbdkit + cache + curl against a
slow, remote VMware server.  A simple run of virt-inspector was at
least 6-7 times slower with the cache filter.  (It was so slow that I
didn't actually let it run to completion - I am estimating the
slowdown multiple using interim debug messages).

Implement a new blk_read_multiple function in the cache filter.  It
does not break up "runs" of blocks which all have the same cache
state.  The cache .pread method uses the new function to read the
block-aligned part of the request.

(cherry picked from commit ab661ccef5b3369fa22c33d0289baddc251b73bf)
---
 filters/cache/blk.c   | 83 ++++++++++++++++++++++++++++++++-----------
 filters/cache/blk.h   |  6 ++++
 filters/cache/cache.c | 21 +++++------
 filters/cow/blk.c     | 63 +++++++++++++++++++++++---------
 filters/cow/blk.h     |  6 ++++
 filters/cow/cow.c     | 21 +++++------
 6 files changed, 138 insertions(+), 62 deletions(-)

diff --git a/filters/cache/blk.c b/filters/cache/blk.c
index f52f30e3..f85ada35 100644
--- a/filters/cache/blk.c
+++ b/filters/cache/blk.c
@@ -44,6 +44,7 @@
 #include <string.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <errno.h>
 
 #ifdef HAVE_SYS_STATVFS_H
@@ -193,26 +194,40 @@ blk_set_size (uint64_t new_size)
   return 0;
 }
 
-int
-blk_read (nbdkit_next *next,
-          uint64_t blknum, uint8_t *block, int *err)
+static int
+_blk_read_multiple (nbdkit_next *next,
+                    uint64_t blknum, uint64_t nrblocks,
+                    uint8_t *block, int *err)
 {
   off_t offset = blknum * blksize;
-  enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
+  bool not_cached =
+    bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
+  uint64_t b, runblocks;
 
-  reclaim (fd, &bm);
+  assert (nrblocks > 0);
 
   if (cache_debug_verbose)
-    nbdkit_debug ("cache: blk_read block %" PRIu64
+    nbdkit_debug ("cache: blk_read_multiple block %" PRIu64
                   " (offset %" PRIu64 ") is %s",
                   blknum, (uint64_t) offset,
-                  state == BLOCK_NOT_CACHED ? "not cached" :
-                  state == BLOCK_CLEAN ? "clean" :
-                  state == BLOCK_DIRTY ? "dirty" :
-                  "unknown");
+                  not_cached ? "not cached" : "cached");
 
-  if (state == BLOCK_NOT_CACHED) { /* Read underlying plugin. */
-    unsigned n = blksize, tail = 0;
+  /* Find out how many of the following blocks form a "run" with the
+   * same cached/not-cached state.  We can process that many blocks in
+   * one go.
+   */
+  for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
+    bool s =
+      bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
+    if (not_cached != s)
+      break;
+  }
+
+  if (not_cached) {             /* Read underlying plugin. */
+    unsigned n, tail = 0;
+
+    assert (blksize * runblocks <= UINT_MAX);
+    n = blksize * runblocks;
 
     if (offset + n > size) {
       tail = offset + n - size;
@@ -228,32 +243,60 @@ blk_read (nbdkit_next *next,
      */
     memset (block + n, 0, tail);
 
-    /* If cache-on-read, copy the block to the cache. */
+    /* If cache-on-read, copy the blocks to the cache. */
     if (cache_on_read) {
       if (cache_debug_verbose)
         nbdkit_debug ("cache: cache-on-read block %" PRIu64
                       " (offset %" PRIu64 ")",
                       blknum, (uint64_t) offset);
 
-      if (pwrite (fd, block, blksize, offset) == -1) {
+      if (pwrite (fd, block, blksize * runblocks, offset) == -1) {
         *err = errno;
         nbdkit_error ("pwrite: %m");
         return -1;
       }
-      bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
-      lru_set_recently_accessed (blknum);
+      for (b = 0; b < runblocks; ++b) {
+        bitmap_set_blk (&bm, blknum + b, BLOCK_CLEAN);
+        lru_set_recently_accessed (blknum + b);
+      }
     }
-    return 0;
   }
   else {                        /* Read cache. */
-    if (pread (fd, block, blksize, offset) == -1) {
+    if (pread (fd, block, blksize * runblocks, offset) == -1) {
       *err = errno;
       nbdkit_error ("pread: %m");
       return -1;
     }
-    lru_set_recently_accessed (blknum);
-    return 0;
+    for (b = 0; b < runblocks; ++b)
+      lru_set_recently_accessed (blknum + b);
   }
+
+  /* If all done, return. */
+  if (runblocks == nrblocks)
+    return 0;
+
+  /* Recurse to read remaining blocks. */
+  return _blk_read_multiple (next,
+                             blknum + runblocks,
+                             nrblocks - runblocks,
+                             block + blksize * runblocks,
+                             err);
+}
+
+int
+blk_read_multiple (nbdkit_next *next,
+                   uint64_t blknum, uint64_t nrblocks,
+                   uint8_t *block, int *err)
+{
+  reclaim (fd, &bm);
+  return _blk_read_multiple (next, blknum, nrblocks, block, err);
+}
+
+int
+blk_read (nbdkit_next *next,
+          uint64_t blknum, uint8_t *block, int *err)
+{
+  return blk_read_multiple (next, blknum, 1, block, err);
 }
 
 int
diff --git a/filters/cache/blk.h b/filters/cache/blk.h
index 87c753e2..1ee33ed7 100644
--- a/filters/cache/blk.h
+++ b/filters/cache/blk.h
@@ -55,6 +55,12 @@ extern int blk_read (nbdkit_next *next,
                      uint64_t blknum, uint8_t *block, int *err)
   __attribute__((__nonnull__ (1, 3, 4)));
 
+/* As above, but read multiple blocks. */
+extern int blk_read_multiple (nbdkit_next *next,
+                              uint64_t blknum, uint64_t nrblocks,
+                              uint8_t *block, int *err)
+  __attribute__((__nonnull__ (1, 4, 5)));
+
 /* If a single block is not cached, copy it from the plugin. */
 extern int blk_cache (nbdkit_next *next,
                       uint64_t blknum, uint8_t *block, int *err)
diff --git a/filters/cache/cache.c b/filters/cache/cache.c
index 745f552d..14cc03f2 100644
--- a/filters/cache/cache.c
+++ b/filters/cache/cache.c
@@ -313,7 +313,7 @@ cache_pread (nbdkit_next *next,
              uint32_t flags, int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
-  uint64_t blknum, blkoffs;
+  uint64_t blknum, blkoffs, nrblocks;
   int r;
 
   assert (!flags);
@@ -348,22 +348,17 @@ cache_pread (nbdkit_next *next,
   }
 
   /* Aligned body */
-  /* XXX This breaks up large read requests into smaller ones, which
-   * is a problem for plugins which have a large, fixed per-request
-   * overhead (hello, curl).  We should try to keep large requests
-   * together as much as possible, but that requires us to be much
-   * smarter here.
-   */
-  while (count >= blksize) {
+  nrblocks = count / blksize;
+  if (nrblocks > 0) {
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
-    r = blk_read (next, blknum, buf, err);
+    r = blk_read_multiple (next, blknum, nrblocks, buf, err);
     if (r == -1)
       return -1;
 
-    buf += blksize;
-    count -= blksize;
-    offset += blksize;
-    blknum++;
+    buf += nrblocks * blksize;
+    count -= nrblocks * blksize;
+    offset += nrblocks * blksize;
+    blknum += nrblocks;
   }
 
   /* Unaligned tail */
diff --git a/filters/cow/blk.c b/filters/cow/blk.c
index 0f12d510..9e6c8879 100644
--- a/filters/cow/blk.c
+++ b/filters/cow/blk.c
@@ -79,6 +79,7 @@
 #include <inttypes.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <errno.h>
 #include <sys/types.h>
 
@@ -223,33 +224,48 @@ blk_status (uint64_t blknum, bool *present, bool *trimmed)
   *trimmed = state == BLOCK_TRIMMED;
 }
 
-/* These are the block operations.  They always read or write a single
- * whole block of size ‘blksize’.
+/* These are the block operations.  They always read or write whole
+ * blocks of size ‘blksize’.
  */
 int
-blk_read (nbdkit_next *next,
-          uint64_t blknum, uint8_t *block, int *err)
+blk_read_multiple (nbdkit_next *next,
+                   uint64_t blknum, uint64_t nrblocks,
+                   uint8_t *block, int *err)
 {
   off_t offset = blknum * BLKSIZE;
   enum bm_entry state;
+  uint64_t b, runblocks;
 
-  /* The state might be modified from another thread - for example
-   * another thread might write (BLOCK_NOT_ALLOCATED ->
-   * BLOCK_ALLOCATED) while we are reading from the plugin, returning
-   * the old data.  However a read issued after the write returns
-   * should always return the correct data.
+  /* Find out how many of the following blocks form a "run" with the
+   * same state.  We can process that many blocks in one go.
+   *
+   * About the locking: The state might be modified from another
+   * thread - for example another thread might write
+   * (BLOCK_NOT_ALLOCATED -> BLOCK_ALLOCATED) while we are reading
+   * from the plugin, returning the old data.  However a read issued
+   * after the write returns should always return the correct data.
    */
   {
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+
+    for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
+      enum bm_entry s = bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_ALLOCATED);
+      if (state != s)
+        break;
+    }
   }
 
   if (cow_debug_verbose)
-    nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
+    nbdkit_debug ("cow: blk_read_multiple block %" PRIu64
+                  " (offset %" PRIu64 ") is %s",
                   blknum, (uint64_t) offset, state_to_string (state));
 
   if (state == BLOCK_NOT_ALLOCATED) { /* Read underlying plugin. */
-    unsigned n = BLKSIZE, tail = 0;
+    unsigned n, tail = 0;
+
+    assert (BLKSIZE * runblocks <= UINT_MAX);
+    n = BLKSIZE * runblocks;
 
     if (offset + n > size) {
       tail = offset + n - size;
@@ -264,20 +280,35 @@ blk_read (nbdkit_next *next,
      * zeroing the tail.
      */
     memset (block + n, 0, tail);
-    return 0;
   }
   else if (state == BLOCK_ALLOCATED) { /* Read overlay. */
-    if (pread (fd, block, BLKSIZE, offset) == -1) {
+    if (pread (fd, block, BLKSIZE * runblocks, offset) == -1) {
       *err = errno;
       nbdkit_error ("pread: %m");
       return -1;
     }
-    return 0;
   }
   else /* state == BLOCK_TRIMMED */ {
-    memset (block, 0, BLKSIZE);
-    return 0;
+    memset (block, 0, BLKSIZE * runblocks);
   }
+
+  /* If all done, return. */
+  if (runblocks == nrblocks)
+    return 0;
+
+  /* Recurse to read remaining blocks. */
+  return blk_read_multiple (next,
+                            blknum + runblocks,
+                            nrblocks - runblocks,
+                            block + BLKSIZE * runblocks,
+                            err);
+}
+
+int
+blk_read (nbdkit_next *next,
+          uint64_t blknum, uint8_t *block, int *err)
+{
+  return blk_read_multiple (next, blknum, 1, block, err);
 }
 
 int
diff --git a/filters/cow/blk.h b/filters/cow/blk.h
index e6fd7417..b066c602 100644
--- a/filters/cow/blk.h
+++ b/filters/cow/blk.h
@@ -55,6 +55,12 @@ extern int blk_read (nbdkit_next *next,
                      uint64_t blknum, uint8_t *block, int *err)
   __attribute__((__nonnull__ (1, 3, 4)));
 
+/* Read multiple blocks from the overlay or plugin. */
+extern int blk_read_multiple (nbdkit_next *next,
+                              uint64_t blknum, uint64_t nrblocks,
+                              uint8_t *block, int *err)
+  __attribute__((__nonnull__ (1, 4, 5)));
+
 /* Cache mode for blocks not already in overlay */
 enum cache_mode {
   BLK_CACHE_IGNORE,            /* Do nothing */
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index 6cefee36..e939f23f 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -210,7 +210,7 @@ cow_pread (nbdkit_next *next,
            uint32_t flags, int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
-  uint64_t blknum, blkoffs;
+  uint64_t blknum, blkoffs, nrblocks;
   int r;
 
   if (!IS_ALIGNED (count | offset, BLKSIZE)) {
@@ -243,21 +243,16 @@ cow_pread (nbdkit_next *next,
   }
 
   /* Aligned body */
-  /* XXX This breaks up large read requests into smaller ones, which
-   * is a problem for plugins which have a large, fixed per-request
-   * overhead (hello, curl).  We should try to keep large requests
-   * together as much as possible, but that requires us to be much
-   * smarter here.
-   */
-  while (count >= BLKSIZE) {
-    r = blk_read (next, blknum, buf, err);
+  nrblocks = count / BLKSIZE;
+  if (nrblocks > 0) {
+    r = blk_read_multiple (next, blknum, nrblocks, buf, err);
     if (r == -1)
       return -1;
 
-    buf += BLKSIZE;
-    count -= BLKSIZE;
-    offset += BLKSIZE;
-    blknum++;
+    buf += nrblocks * BLKSIZE;
+    count -= nrblocks * BLKSIZE;
+    offset += nrblocks * BLKSIZE;
+    blknum += nrblocks;
   }
 
   /* Unaligned tail */
-- 
2.31.1