From b5dc8577c5c6d1205e2106b629fad327c3a409ea Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones@redhat.com>
Date: Mon, 26 Jul 2021 13:55:21 +0100
Subject: [PATCH] cache, cow: Add blk_read_multiple function

Currently the cache and cow filters break up large requests into many
single block-sized requests to the underlying plugin.  For some
plugins (eg. curl) this is very inefficient and causes huge
slow-downs.

For example I tested nbdkit + curl vs nbdkit + cache + curl against a
slow, remote VMware server.  A simple run of virt-inspector was at
least 6-7 times slower with the cache filter.  (It was so slow that I
didn't actually let it run to completion - I am estimating the
slowdown multiple using interim debug messages).

Implement a new blk_read_multiple function in the cache filter.  It
does not break up "runs" of blocks which all have the same cache
state.  The cache .pread method uses the new function to read the
block-aligned part of the request.

(cherry picked from commit ab661ccef5b3369fa22c33d0289baddc251b73bf)
---
 filters/cache/blk.c   | 83 ++++++++++++++++++++++++++++++++-----------
 filters/cache/blk.h   |  6 ++++
 filters/cache/cache.c | 21 +++++------
 filters/cow/blk.c     | 63 +++++++++++++++++++++++---------
 filters/cow/blk.h     |  6 ++++
 filters/cow/cow.c     | 21 +++++------
 6 files changed, 138 insertions(+), 62 deletions(-)

diff --git a/filters/cache/blk.c b/filters/cache/blk.c
index f52f30e3..f85ada35 100644
--- a/filters/cache/blk.c
+++ b/filters/cache/blk.c
@@ -44,6 +44,7 @@
 #include <string.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <errno.h>
 
 #ifdef HAVE_SYS_STATVFS_H
@@ -193,26 +194,40 @@ blk_set_size (uint64_t new_size)
   return 0;
 }
 
-int
-blk_read (nbdkit_next *next,
-          uint64_t blknum, uint8_t *block, int *err)
+static int
+_blk_read_multiple (nbdkit_next *next,
+                    uint64_t blknum, uint64_t nrblocks,
+                    uint8_t *block, int *err)
 {
   off_t offset = blknum * blksize;
-  enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
+  bool not_cached =
+    bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
+  uint64_t b, runblocks;
 
-  reclaim (fd, &bm);
+  assert (nrblocks > 0);
 
   if (cache_debug_verbose)
-    nbdkit_debug ("cache: blk_read block %" PRIu64
+    nbdkit_debug ("cache: blk_read_multiple block %" PRIu64
                   " (offset %" PRIu64 ") is %s",
                   blknum, (uint64_t) offset,
-                  state == BLOCK_NOT_CACHED ? "not cached" :
-                  state == BLOCK_CLEAN ? "clean" :
-                  state == BLOCK_DIRTY ? "dirty" :
-                  "unknown");
+                  not_cached ? "not cached" : "cached");
 
-  if (state == BLOCK_NOT_CACHED) { /* Read underlying plugin. */
-    unsigned n = blksize, tail = 0;
+  /* Find out how many of the following blocks form a "run" with the
+   * same cached/not-cached state.  We can process that many blocks in
+   * one go.
+   */
+  for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
+    bool s =
+      bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
+    if (not_cached != s)
+      break;
+  }
+
+  if (not_cached) {             /* Read underlying plugin. */
+    unsigned n, tail = 0;
+
+    assert (blksize * runblocks <= UINT_MAX);
+    n = blksize * runblocks;
 
     if (offset + n > size) {
       tail = offset + n - size;
@@ -228,32 +243,60 @@ blk_read (nbdkit_next *next,
      */
     memset (block + n, 0, tail);
 
-    /* If cache-on-read, copy the block to the cache. */
+    /* If cache-on-read, copy the blocks to the cache. */
     if (cache_on_read) {
       if (cache_debug_verbose)
         nbdkit_debug ("cache: cache-on-read block %" PRIu64
                       " (offset %" PRIu64 ")",
                       blknum, (uint64_t) offset);
 
-      if (pwrite (fd, block, blksize, offset) == -1) {
+      if (pwrite (fd, block, blksize * runblocks, offset) == -1) {
         *err = errno;
         nbdkit_error ("pwrite: %m");
         return -1;
       }
-      bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
-      lru_set_recently_accessed (blknum);
+      for (b = 0; b < runblocks; ++b) {
+        bitmap_set_blk (&bm, blknum + b, BLOCK_CLEAN);
+        lru_set_recently_accessed (blknum + b);
+      }
     }
-    return 0;
   }
   else {                        /* Read cache. */
-    if (pread (fd, block, blksize, offset) == -1) {
+    if (pread (fd, block, blksize * runblocks, offset) == -1) {
       *err = errno;
       nbdkit_error ("pread: %m");
       return -1;
     }
-    lru_set_recently_accessed (blknum);
-    return 0;
+    for (b = 0; b < runblocks; ++b)
+      lru_set_recently_accessed (blknum + b);
   }
+
+  /* If all done, return. */
+  if (runblocks == nrblocks)
+    return 0;
+
+  /* Recurse to read remaining blocks. */
+  return _blk_read_multiple (next,
+                             blknum + runblocks,
+                             nrblocks - runblocks,
+                             block + blksize * runblocks,
+                             err);
+}
+
+int
+blk_read_multiple (nbdkit_next *next,
+                   uint64_t blknum, uint64_t nrblocks,
+                   uint8_t *block, int *err)
+{
+  reclaim (fd, &bm);
+  return _blk_read_multiple (next, blknum, nrblocks, block, err);
+}
+
+int
+blk_read (nbdkit_next *next,
+          uint64_t blknum, uint8_t *block, int *err)
+{
+  return blk_read_multiple (next, blknum, 1, block, err);
 }
 
 int
diff --git a/filters/cache/blk.h b/filters/cache/blk.h
index 87c753e2..1ee33ed7 100644
--- a/filters/cache/blk.h
+++ b/filters/cache/blk.h
@@ -55,6 +55,12 @@ extern int blk_read (nbdkit_next *next,
                      uint64_t blknum, uint8_t *block, int *err)
   __attribute__((__nonnull__ (1, 3, 4)));
 
+/* As above, but read multiple blocks. */
+extern int blk_read_multiple (nbdkit_next *next,
+                              uint64_t blknum, uint64_t nrblocks,
+                              uint8_t *block, int *err)
+  __attribute__((__nonnull__ (1, 4, 5)));
+
 /* If a single block is not cached, copy it from the plugin. */
 extern int blk_cache (nbdkit_next *next,
                       uint64_t blknum, uint8_t *block, int *err)
diff --git a/filters/cache/cache.c b/filters/cache/cache.c
index 745f552d..14cc03f2 100644
--- a/filters/cache/cache.c
+++ b/filters/cache/cache.c
@@ -313,7 +313,7 @@ cache_pread (nbdkit_next *next,
              uint32_t flags, int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
-  uint64_t blknum, blkoffs;
+  uint64_t blknum, blkoffs, nrblocks;
   int r;
 
   assert (!flags);
@@ -348,22 +348,17 @@ cache_pread (nbdkit_next *next,
   }
 
   /* Aligned body */
-  /* XXX This breaks up large read requests into smaller ones, which
-   * is a problem for plugins which have a large, fixed per-request
-   * overhead (hello, curl).  We should try to keep large requests
-   * together as much as possible, but that requires us to be much
-   * smarter here.
-   */
-  while (count >= blksize) {
+  nrblocks = count / blksize;
+  if (nrblocks > 0) {
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
-    r = blk_read (next, blknum, buf, err);
+    r = blk_read_multiple (next, blknum, nrblocks, buf, err);
     if (r == -1)
       return -1;
 
-    buf += blksize;
-    count -= blksize;
-    offset += blksize;
-    blknum++;
+    buf += nrblocks * blksize;
+    count -= nrblocks * blksize;
+    offset += nrblocks * blksize;
+    blknum += nrblocks;
   }
 
   /* Unaligned tail */
diff --git a/filters/cow/blk.c b/filters/cow/blk.c
index b7c4d7f1..4ec8d1b8 100644
--- a/filters/cow/blk.c
+++ b/filters/cow/blk.c
@@ -79,6 +79,7 @@
 #include <inttypes.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <errno.h>
 #include <sys/types.h>
 
@@ -219,33 +220,48 @@ blk_status (uint64_t blknum, bool *present, bool *trimmed)
   *trimmed = state == BLOCK_TRIMMED;
 }
 
-/* These are the block operations.  They always read or write a single
- * whole block of size ‘blksize’.
+/* These are the block operations.  They always read or write whole
+ * blocks of size ‘blksize’.
  */
 int
-blk_read (nbdkit_next *next,
-          uint64_t blknum, uint8_t *block, int *err)
+blk_read_multiple (nbdkit_next *next,
+                   uint64_t blknum, uint64_t nrblocks,
+                   uint8_t *block, int *err)
 {
   off_t offset = blknum * BLKSIZE;
   enum bm_entry state;
+  uint64_t b, runblocks;
 
-  /* The state might be modified from another thread - for example
-   * another thread might write (BLOCK_NOT_ALLOCATED ->
-   * BLOCK_ALLOCATED) while we are reading from the plugin, returning
-   * the old data.  However a read issued after the write returns
-   * should always return the correct data.
+  /* Find out how many of the following blocks form a "run" with the
+   * same state.  We can process that many blocks in one go.
+   *
+   * About the locking: The state might be modified from another
+   * thread - for example another thread might write
+   * (BLOCK_NOT_ALLOCATED -> BLOCK_ALLOCATED) while we are reading
+   * from the plugin, returning the old data.  However a read issued
+   * after the write returns should always return the correct data.
    */
   {
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+
+    for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
+      enum bm_entry s = bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_ALLOCATED);
+      if (state != s)
+        break;
+    }
   }
 
   if (cow_debug_verbose)
-    nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
+    nbdkit_debug ("cow: blk_read_multiple block %" PRIu64
+                  " (offset %" PRIu64 ") is %s",
                   blknum, (uint64_t) offset, state_to_string (state));
 
   if (state == BLOCK_NOT_ALLOCATED) { /* Read underlying plugin. */
-    unsigned n = BLKSIZE, tail = 0;
+    unsigned n, tail = 0;
+
+    assert (BLKSIZE * runblocks <= UINT_MAX);
+    n = BLKSIZE * runblocks;
 
     if (offset + n > size) {
       tail = offset + n - size;
@@ -260,20 +276,35 @@ blk_read (nbdkit_next *next,
      * zeroing the tail.
      */
     memset (block + n, 0, tail);
-    return 0;
   }
   else if (state == BLOCK_ALLOCATED) { /* Read overlay. */
-    if (pread (fd, block, BLKSIZE, offset) == -1) {
+    if (pread (fd, block, BLKSIZE * runblocks, offset) == -1) {
       *err = errno;
       nbdkit_error ("pread: %m");
       return -1;
     }
-    return 0;
   }
   else /* state == BLOCK_TRIMMED */ {
-    memset (block, 0, BLKSIZE);
-    return 0;
+    memset (block, 0, BLKSIZE * runblocks);
   }
+
+  /* If all done, return. */
+  if (runblocks == nrblocks)
+    return 0;
+
+  /* Recurse to read remaining blocks. */
+  return blk_read_multiple (next,
+                            blknum + runblocks,
+                            nrblocks - runblocks,
+                            block + BLKSIZE * runblocks,
+                            err);
+}
+
+int
+blk_read (nbdkit_next *next,
+          uint64_t blknum, uint8_t *block, int *err)
+{
+  return blk_read_multiple (next, blknum, 1, block, err);
 }
 
 int
diff --git a/filters/cow/blk.h b/filters/cow/blk.h
index e6fd7417..b066c602 100644
--- a/filters/cow/blk.h
+++ b/filters/cow/blk.h
@@ -55,6 +55,12 @@ extern int blk_read (nbdkit_next *next,
                      uint64_t blknum, uint8_t *block, int *err)
   __attribute__((__nonnull__ (1, 3, 4)));
 
+/* Read multiple blocks from the overlay or plugin. */
+extern int blk_read_multiple (nbdkit_next *next,
+                              uint64_t blknum, uint64_t nrblocks,
+                              uint8_t *block, int *err)
+  __attribute__((__nonnull__ (1, 4, 5)));
+
 /* Cache mode for blocks not already in overlay */
 enum cache_mode {
   BLK_CACHE_IGNORE,             /* Do nothing */
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index f30b7505..78daca22 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -210,7 +210,7 @@ cow_pread (nbdkit_next *next,
            uint32_t flags, int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
-  uint64_t blknum, blkoffs;
+  uint64_t blknum, blkoffs, nrblocks;
   int r;
 
   if (!IS_ALIGNED (count | offset, BLKSIZE)) {
@@ -243,21 +243,16 @@ cow_pread (nbdkit_next *next,
   }
 
   /* Aligned body */
-  /* XXX This breaks up large read requests into smaller ones, which
-   * is a problem for plugins which have a large, fixed per-request
-   * overhead (hello, curl).  We should try to keep large requests
-   * together as much as possible, but that requires us to be much
-   * smarter here.
-   */
-  while (count >= BLKSIZE) {
-    r = blk_read (next, blknum, buf, err);
+  nrblocks = count / BLKSIZE;
+  if (nrblocks > 0) {
+    r = blk_read_multiple (next, blknum, nrblocks, buf, err);
     if (r == -1)
       return -1;
 
-    buf += BLKSIZE;
-    count -= BLKSIZE;
-    offset += BLKSIZE;
-    blknum++;
+    buf += nrblocks * BLKSIZE;
+    count -= nrblocks * BLKSIZE;
+    offset += nrblocks * BLKSIZE;
+    blknum += nrblocks;
   }
 
   /* Unaligned tail */
-- 
2.31.1