nbdkit/SOURCES/0003-cache-cow-Add-blk_read...

From b5dc8577c5c6d1205e2106b629fad327c3a409ea Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones@redhat.com>
Date: Mon, 26 Jul 2021 13:55:21 +0100
Subject: [PATCH] cache, cow: Add blk_read_multiple function

Currently the cache and cow filters break up large requests into many
single block-sized requests to the underlying plugin.  For some
plugins (eg. curl) this is very inefficient and causes huge
slow-downs.

For example I tested nbdkit + curl vs nbdkit + cache + curl against a
slow, remote VMware server.  A simple run of virt-inspector was at
least 6-7 times slower with the cache filter.  (It was so slow that I
didn't actually let it run to completion - I am estimating the
slowdown multiple using interim debug messages).

Implement a new blk_read_multiple function in the cache filter.  It
does not break up "runs" of blocks which all have the same cache
state.  The cache .pread method uses the new function to read the
block-aligned part of the request.

(cherry picked from commit ab661ccef5b3369fa22c33d0289baddc251b73bf)
---
 filters/cache/blk.c   | 83 ++++++++++++++++++++++++++++++++-----------
 filters/cache/blk.h   |  6 ++++
 filters/cache/cache.c | 21 +++++------
 filters/cow/blk.c     | 63 +++++++++++++++++++++++---------
 filters/cow/blk.h     |  6 ++++
 filters/cow/cow.c     | 21 +++++------
 6 files changed, 138 insertions(+), 62 deletions(-)

diff --git a/filters/cache/blk.c b/filters/cache/blk.c
index f52f30e3..f85ada35 100644
--- a/filters/cache/blk.c
+++ b/filters/cache/blk.c
@@ -44,6 +44,7 @@
 #include <string.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <errno.h>

 #ifdef HAVE_SYS_STATVFS_H
@@ -193,26 +194,40 @@ blk_set_size (uint64_t new_size)
   return 0;
 }

-int
-blk_read (nbdkit_next *next,
-          uint64_t blknum, uint8_t *block, int *err)
+static int
+_blk_read_multiple (nbdkit_next *next,
+                    uint64_t blknum, uint64_t nrblocks,
+                    uint8_t *block, int *err)
 {
   off_t offset = blknum * blksize;
-  enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
+  bool not_cached =
+    bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
+  uint64_t b, runblocks;

-  reclaim (fd, &bm);
+  assert (nrblocks > 0);

   if (cache_debug_verbose)
-    nbdkit_debug ("cache: blk_read block %" PRIu64
+    nbdkit_debug ("cache: blk_read_multiple block %" PRIu64
                   " (offset %" PRIu64 ") is %s",
                   blknum, (uint64_t) offset,
-                  state == BLOCK_NOT_CACHED ? "not cached" :
-                  state == BLOCK_CLEAN ? "clean" :
-                  state == BLOCK_DIRTY ? "dirty" :
-                  "unknown");
+                  not_cached ? "not cached" : "cached");

-  if (state == BLOCK_NOT_CACHED) { /* Read underlying plugin. */
-    unsigned n = blksize, tail = 0;
+  /* Find out how many of the following blocks form a "run" with the
+   * same cached/not-cached state.  We can process that many blocks in
+   * one go.
+   */
+  for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
+    bool s =
+      bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
+    if (not_cached != s)
+      break;
+  }
+
+  if (not_cached) {             /* Read underlying plugin. */
+    unsigned n, tail = 0;
+
+    assert (blksize * runblocks <= UINT_MAX);
+    n = blksize * runblocks;

     if (offset + n > size) {
       tail = offset + n - size;
@@ -228,32 +243,60 @@ blk_read (nbdkit_next *next,
      */
     memset (block + n, 0, tail);

-    /* If cache-on-read, copy the block to the cache. */
+    /* If cache-on-read, copy the blocks to the cache. */
     if (cache_on_read) {
       if (cache_debug_verbose)
         nbdkit_debug ("cache: cache-on-read block %" PRIu64
                       " (offset %" PRIu64 ")",
                       blknum, (uint64_t) offset);

-      if (pwrite (fd, block, blksize, offset) == -1) {
+      if (pwrite (fd, block, blksize * runblocks, offset) == -1) {
         *err = errno;
         nbdkit_error ("pwrite: %m");
         return -1;
       }
-      bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
-      lru_set_recently_accessed (blknum);
+      for (b = 0; b < runblocks; ++b) {
+        bitmap_set_blk (&bm, blknum + b, BLOCK_CLEAN);
+        lru_set_recently_accessed (blknum + b);
+      }
     }
-    return 0;
   }
   else {                        /* Read cache. */
-    if (pread (fd, block, blksize, offset) == -1) {
+    if (pread (fd, block, blksize * runblocks, offset) == -1) {
       *err = errno;
       nbdkit_error ("pread: %m");
       return -1;
     }
-    lru_set_recently_accessed (blknum);
-    return 0;
+    for (b = 0; b < runblocks; ++b)
+      lru_set_recently_accessed (blknum + b);
   }
+
+  /* If all done, return. */
+  if (runblocks == nrblocks)
+    return 0;
+
+  /* Recurse to read remaining blocks. */
+  return _blk_read_multiple (next,
+                             blknum + runblocks,
+                             nrblocks - runblocks,
+                             block + blksize * runblocks,
+                             err);
+}
+
+int
+blk_read_multiple (nbdkit_next *next,
+                   uint64_t blknum, uint64_t nrblocks,
+                   uint8_t *block, int *err)
+{
+  reclaim (fd, &bm);
+  return _blk_read_multiple (next, blknum, nrblocks, block, err);
+}
+
+int
+blk_read (nbdkit_next *next,
+          uint64_t blknum, uint8_t *block, int *err)
+{
+  return blk_read_multiple (next, blknum, 1, block, err);
 }

 int
diff --git a/filters/cache/blk.h b/filters/cache/blk.h
index 87c753e2..1ee33ed7 100644
--- a/filters/cache/blk.h
+++ b/filters/cache/blk.h
@@ -55,6 +55,12 @@ extern int blk_read (nbdkit_next *next,
                      uint64_t blknum, uint8_t *block, int *err)
   __attribute__((__nonnull__ (1, 3, 4)));

+/* As above, but read multiple blocks. */
+extern int blk_read_multiple (nbdkit_next *next,
+                              uint64_t blknum, uint64_t nrblocks,
+                              uint8_t *block, int *err)
+  __attribute__((__nonnull__ (1, 4, 5)));
+
 /* If a single block is not cached, copy it from the plugin. */
 extern int blk_cache (nbdkit_next *next,
                       uint64_t blknum, uint8_t *block, int *err)
diff --git a/filters/cache/cache.c b/filters/cache/cache.c
index 745f552d..14cc03f2 100644
--- a/filters/cache/cache.c
+++ b/filters/cache/cache.c
@@ -313,7 +313,7 @@ cache_pread (nbdkit_next *next,
              uint32_t flags, int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
-  uint64_t blknum, blkoffs;
+  uint64_t blknum, blkoffs, nrblocks;
   int r;

   assert (!flags);
@@ -348,22 +348,17 @@ cache_pread (nbdkit_next *next,
   }

   /* Aligned body */
-  /* XXX This breaks up large read requests into smaller ones, which
-   * is a problem for plugins which have a large, fixed per-request
-   * overhead (hello, curl).  We should try to keep large requests
-   * together as much as possible, but that requires us to be much
-   * smarter here.
-   */
-  while (count >= blksize) {
+  nrblocks = count / blksize;
+  if (nrblocks > 0) {
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
-    r = blk_read (next, blknum, buf, err);
+    r = blk_read_multiple (next, blknum, nrblocks, buf, err);
     if (r == -1)
       return -1;

-    buf += blksize;
-    count -= blksize;
-    offset += blksize;
-    blknum++;
+    buf += nrblocks * blksize;
+    count -= nrblocks * blksize;
+    offset += nrblocks * blksize;
+    blknum += nrblocks;
   }

   /* Unaligned tail */
diff --git a/filters/cow/blk.c b/filters/cow/blk.c
index b7c4d7f1..4ec8d1b8 100644
--- a/filters/cow/blk.c
+++ b/filters/cow/blk.c
@@ -79,6 +79,7 @@
 #include <inttypes.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <errno.h>
 #include <sys/types.h>

@@ -219,33 +220,48 @@ blk_status (uint64_t blknum, bool *present, bool *trimmed)
   *trimmed = state == BLOCK_TRIMMED;
 }

-/* These are the block operations.  They always read or write a single
- * whole block of size ‘blksize’.
+/* These are the block operations.  They always read or write whole
+ * blocks of size ‘blksize’.
  */
 int
-blk_read (nbdkit_next *next,
-          uint64_t blknum, uint8_t *block, int *err)
+blk_read_multiple (nbdkit_next *next,
+                   uint64_t blknum, uint64_t nrblocks,
+                   uint8_t *block, int *err)
 {
   off_t offset = blknum * BLKSIZE;
   enum bm_entry state;
+  uint64_t b, runblocks;

-  /* The state might be modified from another thread - for example
-   * another thread might write (BLOCK_NOT_ALLOCATED ->
-   * BLOCK_ALLOCATED) while we are reading from the plugin, returning
-   * the old data.  However a read issued after the write returns
-   * should always return the correct data.
+  /* Find out how many of the following blocks form a "run" with the
+   * same state.  We can process that many blocks in one go.
+   *
+   * About the locking: The state might be modified from another
+   * thread - for example another thread might write
+   * (BLOCK_NOT_ALLOCATED -> BLOCK_ALLOCATED) while we are reading
+   * from the plugin, returning the old data.  However a read issued
+   * after the write returns should always return the correct data.
    */
   {
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+
+    for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
+      enum bm_entry s = bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_ALLOCATED);
+      if (state != s)
+        break;
+    }
   }

   if (cow_debug_verbose)
-    nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
+    nbdkit_debug ("cow: blk_read_multiple block %" PRIu64
+                  " (offset %" PRIu64 ") is %s",
                   blknum, (uint64_t) offset, state_to_string (state));

   if (state == BLOCK_NOT_ALLOCATED) { /* Read underlying plugin. */
-    unsigned n = BLKSIZE, tail = 0;
+    unsigned n, tail = 0;
+
+    assert (BLKSIZE * runblocks <= UINT_MAX);
+    n = BLKSIZE * runblocks;

     if (offset + n > size) {
       tail = offset + n - size;
@@ -260,20 +276,35 @@ blk_read (nbdkit_next *next,
      * zeroing the tail.
      */
     memset (block + n, 0, tail);
-    return 0;
   }
   else if (state == BLOCK_ALLOCATED) { /* Read overlay. */
-    if (pread (fd, block, BLKSIZE, offset) == -1) {
+    if (pread (fd, block, BLKSIZE * runblocks, offset) == -1) {
       *err = errno;
       nbdkit_error ("pread: %m");
       return -1;
     }
-    return 0;
   }
   else /* state == BLOCK_TRIMMED */ {
-    memset (block, 0, BLKSIZE);
-    return 0;
+    memset (block, 0, BLKSIZE * runblocks);
   }
+
+  /* If all done, return. */
+  if (runblocks == nrblocks)
+    return 0;
+
+  /* Recurse to read remaining blocks. */
+  return blk_read_multiple (next,
+                            blknum + runblocks,
+                            nrblocks - runblocks,
+                            block + BLKSIZE * runblocks,
+                            err);
+}
+
+int
+blk_read (nbdkit_next *next,
+          uint64_t blknum, uint8_t *block, int *err)
+{
+  return blk_read_multiple (next, blknum, 1, block, err);
 }

 int
diff --git a/filters/cow/blk.h b/filters/cow/blk.h
index e6fd7417..b066c602 100644
--- a/filters/cow/blk.h
+++ b/filters/cow/blk.h
@@ -55,6 +55,12 @@ extern int blk_read (nbdkit_next *next,
                      uint64_t blknum, uint8_t *block, int *err)
   __attribute__((__nonnull__ (1, 3, 4)));

+/* Read multiple blocks from the overlay or plugin. */
+extern int blk_read_multiple (nbdkit_next *next,
+                              uint64_t blknum, uint64_t nrblocks,
+                              uint8_t *block, int *err)
+  __attribute__((__nonnull__ (1, 4, 5)));
+
 /* Cache mode for blocks not already in overlay */
 enum cache_mode {
   BLK_CACHE_IGNORE,      /* Do nothing */
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index f30b7505..78daca22 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -210,7 +210,7 @@ cow_pread (nbdkit_next *next,
            uint32_t flags, int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
-  uint64_t blknum, blkoffs;
+  uint64_t blknum, blkoffs, nrblocks;
   int r;

   if (!IS_ALIGNED (count | offset, BLKSIZE)) {
@@ -243,21 +243,16 @@ cow_pread (nbdkit_next *next,
   }

   /* Aligned body */
-  /* XXX This breaks up large read requests into smaller ones, which
-   * is a problem for plugins which have a large, fixed per-request
-   * overhead (hello, curl).  We should try to keep large requests
-   * together as much as possible, but that requires us to be much
-   * smarter here.
-   */
-  while (count >= BLKSIZE) {
-    r = blk_read (next, blknum, buf, err);
+  nrblocks = count / BLKSIZE;
+  if (nrblocks > 0) {
+    r = blk_read_multiple (next, blknum, nrblocks, buf, err);
     if (r == -1)
       return -1;

-    buf += BLKSIZE;
-    count -= BLKSIZE;
-    offset += BLKSIZE;
-    blknum++;
+    buf += nrblocks * BLKSIZE;
+    count -= nrblocks * BLKSIZE;
+    offset += nrblocks * BLKSIZE;
+    blknum += nrblocks;
   }

   /* Unaligned tail */
--
2.31.1