Rebase to new stable branch version 1.30.4

resolves: rhbz#2059289

Add support for ssh create remote file.
Backport new readahead filter from 1.32.
This commit is contained in:
Richard W.M. Jones 2022-04-26 10:19:35 +01:00
parent 3ebd7154dd
commit c54cb3f9a4
5 changed files with 1217 additions and 7 deletions

View File

@ -0,0 +1,292 @@
From 04cf7c2aa8aa8ddc446f1571d2f98661ba8a8ca7 Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones@redhat.com>
Date: Fri, 15 Apr 2022 12:08:37 +0100
Subject: [PATCH] ssh: Allow the remote file to be created
This adds new parameters, create=(true|false), create-size=SIZE and
create-mode=MODE to create and truncate the remote file.
Reviewed-by: Laszlo Ersek <lersek@redhat.com>
(cherry picked from commit 0793f30b1071753532362b2ebf9cb8156a88c3c3)
---
plugins/ssh/nbdkit-ssh-plugin.pod | 34 ++++++++-
plugins/ssh/ssh.c | 112 +++++++++++++++++++++++++++---
tests/test-ssh.sh | 13 +++-
3 files changed, 146 insertions(+), 13 deletions(-)
diff --git a/plugins/ssh/nbdkit-ssh-plugin.pod b/plugins/ssh/nbdkit-ssh-plugin.pod
index 3f401c15..2bc2c4a7 100644
--- a/plugins/ssh/nbdkit-ssh-plugin.pod
+++ b/plugins/ssh/nbdkit-ssh-plugin.pod
@@ -5,8 +5,10 @@ nbdkit-ssh-plugin - access disk images over the SSH protocol
=head1 SYNOPSIS
nbdkit ssh host=HOST [path=]PATH
- [compression=true] [config=CONFIG_FILE] [identity=FILENAME]
- [known-hosts=FILENAME] [password=PASSWORD|-|+FILENAME]
+ [compression=true] [config=CONFIG_FILE]
+ [create=true] [create-mode=MODE] [create-size=SIZE]
+ [identity=FILENAME] [known-hosts=FILENAME]
+ [password=PASSWORD|-|+FILENAME]
[port=PORT] [timeout=SECS] [user=USER]
[verify-remote-host=false]
@@ -62,6 +64,34 @@ The C<config> parameter is optional. If it is I<not> specified at all
then F<~/.ssh/config> and F</etc/ssh/ssh_config> are both read.
Missing or unreadable files are ignored.
+=item B<create=true>
+
+(nbdkit E<ge> 1.32)
+
+If set, the remote file will be created. The remote file is created
+on the first NBD connection to nbdkit, not when nbdkit starts up. If
+the file already exists, it will be replaced and any existing content
+lost.
+
+If using this option, you must use C<create-size>. C<create-mode> can
+be used to control the permissions of the new file.
+
+=item B<create-mode=>MODE
+
+(nbdkit E<ge> 1.32)
+
+If using C<create=true> specify the default permissions of the new
+remote file. You can use octal modes like C<create-mode=0777> or
+C<create-mode=0644>. The default is C<0600>, ie. only readable and
+writable by the remote user.
+
+=item B<create-size=>SIZE
+
+(nbdkit E<ge> 1.32)
+
+If using C<create=true>, specify the virtual size of the new disk.
+C<SIZE> can use modifiers like C<100M> etc.
+
=item B<host=>HOST
Specify the name or IP address of the remote host.
diff --git a/plugins/ssh/ssh.c b/plugins/ssh/ssh.c
index 77cfcf6c..ac9cc230 100644
--- a/plugins/ssh/ssh.c
+++ b/plugins/ssh/ssh.c
@@ -44,12 +44,15 @@
#include <fcntl.h>
#include <sys/stat.h>
+#include <pthread.h>
+
#include <libssh/libssh.h>
#include <libssh/sftp.h>
#include <libssh/callbacks.h>
#include <nbdkit-plugin.h>
+#include "cleanup.h"
#include "const-string-vector.h"
#include "minmax.h"
@@ -63,6 +66,9 @@ static const char *known_hosts = NULL;
static const_string_vector identities = empty_vector;
static uint32_t timeout = 0;
static bool compression = false;
+static bool create = false;
+static int64_t create_size = -1;
+static unsigned create_mode = S_IRUSR | S_IWUSR /* 0600 */;
/* config can be:
* NULL => parse options from default file
@@ -167,6 +173,27 @@ ssh_config (const char *key, const char *value)
return -1;
compression = r;
}
+ else if (strcmp (key, "create") == 0) {
+ r = nbdkit_parse_bool (value);
+ if (r == -1)
+ return -1;
+ create = r;
+ }
+ else if (strcmp (key, "create-size") == 0) {
+ create_size = nbdkit_parse_size (value);
+ if (create_size == -1)
+ return -1;
+ }
+ else if (strcmp (key, "create-mode") == 0) {
+ r = nbdkit_parse_unsigned (key, value, &create_mode);
+ if (r == -1)
+ return -1;
+ /* OpenSSH checks this too. */
+ if (create_mode > 0777) {
+ nbdkit_error ("create-mode must be <= 0777");
+ return -1;
+ }
+ }
else {
nbdkit_error ("unknown parameter '%s'", key);
@@ -186,6 +213,13 @@ ssh_config_complete (void)
return -1;
}
+ /* If create=true, create-size must be supplied. */
+ if (create && create_size == -1) {
+ nbdkit_error ("if using create=true, you must specify the size "
+ "of the new remote file using create-size=SIZE");
+ return -1;
+ }
+
return 0;
}
@@ -200,7 +234,10 @@ ssh_config_complete (void)
"identity=<FILENAME> Prepend private key (identity) file.\n" \
"timeout=SECS Set SSH connection timeout.\n" \
"verify-remote-host=false Ignore known_hosts.\n" \
- "compression=true Enable compression."
+ "compression=true Enable compression.\n" \
+ "create=true Create the remote file.\n" \
+ "create-mode=MODE Set the permissions of the remote file.\n" \
+ "create-size=SIZE Set the size of the remote file."
/* Since we must simulate atomic pread and pwrite using seek +
* read/write, calls on each handle must be serialized.
@@ -329,6 +366,65 @@ authenticate (struct ssh_handle *h)
return -1;
}
+/* This function opens or creates the remote file (depending on
+ * create=false|true). Parallel connections might call this function
+ * at the same time, and so we must hold a lock to ensure that the
+ * file is created at most once.
+ */
+static pthread_mutex_t create_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static sftp_file
+open_or_create_path (ssh_session session, sftp_session sftp, int readonly)
+{
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&create_lock);
+ int access_type;
+ int r;
+ sftp_file file;
+
+ access_type = readonly ? O_RDONLY : O_RDWR;
+ if (create) access_type |= O_CREAT | O_TRUNC;
+
+ file = sftp_open (sftp, path, access_type, S_IRWXU);
+ if (!file) {
+ nbdkit_error ("cannot %s file for %s: %s",
+ create ? "create" : "open",
+ readonly ? "reading" : "writing",
+ ssh_get_error (session));
+ return NULL;
+ }
+
+ if (create) {
+ /* There's no sftp_truncate call. However OpenSSH lets you call
+ * SSH_FXP_SETSTAT + SSH_FILEXFER_ATTR_SIZE which invokes
+ * truncate(2) on the server. Libssh doesn't provide a binding
+ * for SSH_FXP_FSETSTAT so we have to pass the session + path.
+ */
+ struct sftp_attributes_struct attrs = {
+ .flags = SSH_FILEXFER_ATTR_SIZE |
+ SSH_FILEXFER_ATTR_PERMISSIONS,
+ .size = create_size,
+ .permissions = create_mode,
+ };
+
+ r = sftp_setstat (sftp, path, &attrs);
+ if (r != SSH_OK) {
+ nbdkit_error ("setstat failed: %s", ssh_get_error (session));
+
+ /* Best-effort attempt to delete the remote file on failure. */
+ r = sftp_unlink (sftp, path);
+ if (r != SSH_OK)
+ nbdkit_debug ("unlink failed: %s", ssh_get_error (session));
+
+ return NULL;
+ }
+ }
+
+ /* On the next connection, don't create or truncate the file. */
+ create = false;
+
+ return file;
+}
+
/* Create the per-connection handle. */
static void *
ssh_open (int readonly)
@@ -337,7 +433,6 @@ ssh_open (int readonly)
const int set = 1;
size_t i;
int r;
- int access_type;
h = calloc (1, sizeof *h);
if (h == NULL) {
@@ -471,7 +566,7 @@ ssh_open (int readonly)
if (authenticate (h) == -1)
goto err;
- /* Open the SFTP connection and file. */
+ /* Open the SFTP connection. */
h->sftp = sftp_new (h->session);
if (!h->sftp) {
nbdkit_error ("failed to allocate sftp session: %s",
@@ -484,14 +579,11 @@ ssh_open (int readonly)
ssh_get_error (h->session));
goto err;
}
- access_type = readonly ? O_RDONLY : O_RDWR;
- h->file = sftp_open (h->sftp, path, access_type, S_IRWXU);
- if (!h->file) {
- nbdkit_error ("cannot open file for %s: %s",
- readonly ? "reading" : "writing",
- ssh_get_error (h->session));
+
+ /* Open or create the remote file. */
+ h->file = open_or_create_path (h->session, h->sftp, readonly);
+ if (!h->file)
goto err;
- }
nbdkit_debug ("opened libssh handle");
diff --git a/tests/test-ssh.sh b/tests/test-ssh.sh
index 6c0ce410..f04b4488 100755
--- a/tests/test-ssh.sh
+++ b/tests/test-ssh.sh
@@ -36,6 +36,7 @@ set -x
requires test -f disk
requires nbdcopy --version
+requires stat --version
# Check that ssh to localhost will work without any passwords or phrases.
#
@@ -48,7 +49,7 @@ then
exit 77
fi
-files="ssh.img"
+files="ssh.img ssh2.img"
rm -f $files
cleanup_fn rm -f $files
@@ -59,3 +60,13 @@ nbdkit -v -D ssh.log=2 -U - \
# The output should be identical.
cmp disk ssh.img
+
+# Copy local file 'ssh.img' to newly created "remote" 'ssh2.img'
+size="$(stat -c %s disk)"
+nbdkit -v -D ssh.log=2 -U - \
+ ssh host=localhost $PWD/ssh2.img \
+ create=true create-size=$size \
+ --run 'nbdcopy ssh.img "$uri"'
+
+# The output should be identical.
+cmp disk ssh2.img
--
2.31.1

View File

@ -0,0 +1,794 @@
From 160601397d307f70586f1bc70111141855ff68ea Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones@redhat.com>
Date: Sat, 16 Apr 2022 18:39:13 +0100
Subject: [PATCH] readahead: Rewrite this filter so it prefetches using .cache
The previous readahead filter did not work well and we have stopped
using it in virt-v2v. However the concept is a good one if we can
make it work. This commit completely rethinks the filter.
Now, in parallel with the ordinary pread command, we issue a prefetch
(ie. .cache) to the underlying plugin for the data immediately
following the pread. For example a simple sequence of operations:
t=1 pread (offset=0, count=65536)
t=2 pread (offset=65536, count=65536)
t=3 pread (offset=131072, count=65536)
would become:
t=1 pread (offset=0, count=65536) <--\ issued
cache (offset=65536, count=65536) <--/ in parallel
t=2 pread (offset=65536, count=65536)
cache (offset=131072, count=65536)
t=3 pread (offset=131072, count=65536)
cache (offset=196608, count=65536)
This requires that the underlying filter(s) and plugin chain can
actually do something with the .cache request. If this is not the
case then the filter does nothing (but it will print a warning). For
plugins which don't have native support for prefetching, it is
sufficient to insert nbdkit-cache-filter after this filter.
(nbdkit-cow-filter can also be used with cow-on-cache=true, but that
is more useful for advanced users who are already using the cow
filter).
The implementation creates a background thread per connection to issue
the parallel .cache requests. This is safer than the alternative (one
background thread in total) since we don't have to deal with the
problem of cache requests being issued with the wrong export name or
stale next pointer. The background thread is controlled by a queue of
commands, with the only possible commands being "cache" or "quit".
Because the background thread issues parallel requests on the same
connection, the underlying plugin must support the parallel thread
model, otherwise we would be violating the plugin's thread model. It
may be possible in future to open a new connection from the background
thread (but with the same exportname), which would lift this
restriction to at least serialize_requests. Because of the current
limitation, nbdkit-curl-plugin cannot use prefetch.
(cherry picked from commit 2ff548d66ad3eae87868402ec5b3319edd12090f)
---
TODO | 22 +-
filters/readahead/Makefile.am | 2 +
filters/readahead/bgthread.c | 76 ++++
filters/readahead/nbdkit-readahead-filter.pod | 70 +++-
filters/readahead/readahead.c | 338 +++++++++---------
filters/readahead/readahead.h | 60 ++++
tests/test-readahead.sh | 2 +-
7 files changed, 367 insertions(+), 203 deletions(-)
create mode 100644 filters/readahead/bgthread.c
create mode 100644 filters/readahead/readahead.h
diff --git a/TODO b/TODO
index 5ae21db5..4d2a9796 100644
--- a/TODO
+++ b/TODO
@@ -62,16 +62,6 @@ General ideas for improvements
continue to keep their non-standard handshake while utilizing nbdkit
to prototype new behaviors in serving the kernel.
-* Background thread for filters. Some filters (readahead, cache and
- proposed scan filter - see below) could be more effective if they
- were able to defer work to a background thread. We finally have
- nbdkit_next_context_open and friends for allowing a background
- thread to have access into the plugin, but still need to worry about
- thread-safety (how much must the filter do vs. nbdkit, to avoid
- calling into the plugin too many times at once) and cleanup
- (spawning the thread during .after_fork is viable, but cleaning it
- up during .unload is too late).
-
* "nbdkit.so": nbdkit as a loadable shared library. The aim of nbdkit
is to make it reusable from other programs (see nbdkit-captive(1)).
If it was a loadable shared library it would be even more reusable.
@@ -228,6 +218,8 @@ Suggestions for filters
* nbdkit-cache-filter should handle ENOSPC errors automatically by
reclaiming blocks from the cache
+* nbdkit-cache-filter could use a background thread for reclaiming.
+
* zstd filter was requested as a way to do what we currently do with
xz but saving many hours on compression (at the cost of hundreds of
MBs of extra data)
@@ -240,6 +232,16 @@ Suggestions for filters
could inject a flush after pausing. However this requires that
filter background threads have access to the plugin (see above).
+nbdkit-readahead-filter:
+
+* The filter should open a new connection to the plugin per background
+ thread so it is able to work with plugins that use the
+ serialize_requests thread model (like curl). At the moment it makes
+ requests on the same connection, so it requires plugins to use the
+ parallel thread model.
+
+* It should combine (or avoid) overlapping cache requests.
+
nbdkit-rate-filter:
* allow other kinds of traffic shaping such as VBR
diff --git a/filters/readahead/Makefile.am b/filters/readahead/Makefile.am
index ee5bb3fb..187993ae 100644
--- a/filters/readahead/Makefile.am
+++ b/filters/readahead/Makefile.am
@@ -37,6 +37,8 @@ filter_LTLIBRARIES = nbdkit-readahead-filter.la
nbdkit_readahead_filter_la_SOURCES = \
readahead.c \
+ readahead.h \
+ bgthread.c \
$(top_srcdir)/include/nbdkit-filter.h \
$(NULL)
diff --git a/filters/readahead/bgthread.c b/filters/readahead/bgthread.c
new file mode 100644
index 00000000..5894bb5f
--- /dev/null
+++ b/filters/readahead/bgthread.c
@@ -0,0 +1,76 @@
+/* nbdkit
+ * Copyright (C) 2019-2022 Red Hat Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of Red Hat nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <pthread.h>
+
+#include <nbdkit-filter.h>
+
+#include "readahead.h"
+
+#include "cleanup.h"
+
+void *
+readahead_thread (void *vp)
+{
+ struct bgthread_ctrl *ctrl = vp;
+
+ for (;;) {
+ struct command cmd;
+
+ /* Wait until we are sent at least one command. */
+ {
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&ctrl->lock);
+ while (ctrl->cmds.len == 0)
+ pthread_cond_wait (&ctrl->cond, &ctrl->lock);
+ cmd = ctrl->cmds.ptr[0];
+ command_queue_remove (&ctrl->cmds, 0);
+ }
+
+ switch (cmd.type) {
+ case CMD_QUIT:
+ /* Finish processing and exit the thread. */
+ return NULL;
+
+ case CMD_CACHE:
+ /* Issue .cache (readahead) to underlying plugin. We ignore any
+ * errors because there's no way to communicate that back to the
+ * client, and readahead is only advisory.
+ */
+ cmd.next->cache (cmd.next, cmd.count, cmd.offset, 0, NULL);
+ }
+ }
+}
diff --git a/filters/readahead/nbdkit-readahead-filter.pod b/filters/readahead/nbdkit-readahead-filter.pod
index c220d379..630e5924 100644
--- a/filters/readahead/nbdkit-readahead-filter.pod
+++ b/filters/readahead/nbdkit-readahead-filter.pod
@@ -1,28 +1,66 @@
=head1 NAME
-nbdkit-readahead-filter - prefetch data when reading sequentially
+nbdkit-readahead-filter - prefetch data ahead of sequential reads
=head1 SYNOPSIS
- nbdkit --filter=readahead plugin
+ nbdkit --filter=readahead PLUGIN
+
+ nbdkit --filter=readahead --filter=cache PLUGIN
+
+ nbdkit --filter=readahead --filter=cow PLUGIN cow-on-cache=true
=head1 DESCRIPTION
C<nbdkit-readahead-filter> is a filter that prefetches data when the
-client is reading sequentially.
+client is reading.
-A common use for this filter is to accelerate sequential copy
-operations (like S<C<qemu-img convert>>) when plugin requests have a
-high overhead (like L<nbdkit-curl-plugin(1)>). For example:
-
- nbdkit -U - --filter=readahead curl https://example.com/disk.img \
- --run 'qemu-img convert $nbd disk.img'
+When the client issues a read, this filter issues a parallel prefetch
+(C<.cache>) for subsequent data. Plugins which support this command
+will prefetch the data, making sequential reads faster. For plugins
+which do not support this command, you can inject
+L<nbdkit-cache-filter(1)> below (after) this filter, giving
+approximately the same effect. L<nbdkit-cow-filter(1)> can be used
+instead of nbdkit-cache-filter, if you add the C<cow-on-cache=true>
+option.
The filter uses a simple adaptive algorithm which accelerates
-sequential reads, but has a small penalty if the client does random
-reads. If the client mixes reads with writes or write-like operations
-(trimming, zeroing) then it will work but there can be a large
-performance penalty.
+sequential reads and requires no further configuration.
+
+=head2 Limitations
+
+In a number of significant cases this filter will do nothing. The
+filter will print a warning message if this happens.
+
+=over 4
+
+=item Thread model must be parallel
+
+For example L<nbdkit-curl-plugin(1)> only supports
+C<serialize_requests>, and so this filter cannot perform prefetches in
+parallel with the read requests.
+
+We may be able to lift this restriction in future.
+
+=item Underlying filters or plugin must support C<.cache> (prefetch)
+
+Very many plugins do not have the concept of prefetching and/or
+do not implement the C<.cache> callback, and so there is no
+way for this filter to issue prefetches.
+
+You can usually get around this by adding I<--filter=cache> after this
+filter as explained above. It may be necessary to limit the total
+size of the cache (see L<nbdkit-cache-filter(1)/CACHE MAXIMUM SIZE>).
+
+=item Clients and kernels may do readahead already
+
+It may be the case that NBD clients are already issuing
+C<NBD_CMD_CACHE> (NBD prefetch) commands. It may also be the case
+that your plugin is using local file functions where the kernel is
+doing readahead. In such cases this filter is not necessary and may
+be pessimal.
+
+=back
=head1 PARAMETERS
@@ -50,9 +88,9 @@ C<nbdkit-readahead-filter> first appeared in nbdkit 1.12.
L<nbdkit(1)>,
L<nbdkit-cache-filter(1)>,
-L<nbdkit-curl-plugin(1)>,
+L<nbdkit-cow-filter(1)>,
+L<nbdkit-file-plugin(1)>,
L<nbdkit-retry-filter(1)>,
-L<nbdkit-ssh-plugin(1)>,
L<nbdkit-torrent-plugin(1)>,
L<nbdkit-vddk-plugin(1)>,
L<nbdkit-filter(3)>,
@@ -64,4 +102,4 @@ Richard W.M. Jones
=head1 COPYRIGHT
-Copyright (C) 2019 Red Hat Inc.
+Copyright (C) 2019-2022 Red Hat Inc.
diff --git a/filters/readahead/readahead.c b/filters/readahead/readahead.c
index f5552d4c..1d7ae111 100644
--- a/filters/readahead/readahead.c
+++ b/filters/readahead/readahead.c
@@ -1,5 +1,5 @@
/* nbdkit
- * Copyright (C) 2019-2021 Red Hat Inc.
+ * Copyright (C) 2019-2022 Red Hat Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
@@ -34,232 +34,218 @@
#include <stdio.h>
#include <stdlib.h>
+#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
-
#include <pthread.h>
#include <nbdkit-filter.h>
+#include "readahead.h"
+
#include "cleanup.h"
#include "minmax.h"
-
-/* Copied from server/plugins.c. */
-#define MAX_REQUEST_SIZE (64 * 1024 * 1024)
+#include "vector.h"
/* These could be made configurable in future. */
-#define READAHEAD_MIN 65536
-#define READAHEAD_MAX MAX_REQUEST_SIZE
-
-/* This lock protects the global state. */
-static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
-
-/* The real size of the underlying plugin. */
-static uint64_t size;
+#define READAHEAD_MIN 32768
+#define READAHEAD_MAX (4*1024*1024)
/* Size of the readahead window. */
+static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t window = READAHEAD_MIN;
+static uint64_t last_offset = 0, last_readahead = 0;
-/* The single prefetch buffer shared by all threads, and its virtual
- * location in the virtual disk. The prefetch buffer grows
- * dynamically as required, but never shrinks.
+static int thread_model = -1; /* Thread model of the underlying plugin. */
+
+/* Per-connection data. */
+struct readahead_handle {
+ int can_cache; /* Can the underlying plugin cache? */
+ pthread_t thread; /* The background thread, one per connection. */
+ struct bgthread_ctrl ctrl;
+};
+
+/* We have various requirements of the underlying filter(s) + plugin:
+ * - They must support NBDKIT_CACHE_NATIVE (otherwise our requests
+ * would not do anything useful).
+ * - They must use the PARALLEL thread model (otherwise we could
+ * violate their thread model).
+ */
+static bool
+filter_working (struct readahead_handle *h)
+{
+ return
+ h->can_cache == NBDKIT_CACHE_NATIVE &&
+ thread_model == NBDKIT_THREAD_MODEL_PARALLEL;
+}
+
+static bool
+suggest_cache_filter (struct readahead_handle *h)
+{
+ return
+ h->can_cache != NBDKIT_CACHE_NATIVE &&
+ thread_model == NBDKIT_THREAD_MODEL_PARALLEL;
+}
+
+/* We need to hook into .get_ready() so we can read the final thread
+ * model (of the whole server).
*/
-static char *buffer = NULL;
-static size_t bufsize = 0;
-static uint64_t position;
-static uint32_t length = 0;
+static int
+readahead_get_ready (int final_thread_model)
+{
+ thread_model = final_thread_model;
+ return 0;
+}
+
+static int
+send_command_to_background_thread (struct bgthread_ctrl *ctrl,
+ const struct command cmd)
+{
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&ctrl->lock);
+ if (command_queue_append (&ctrl->cmds, cmd) == -1)
+ return -1;
+ /* Signal the thread if it could be sleeping on an empty queue. */
+ if (ctrl->cmds.len == 1)
+ pthread_cond_signal (&ctrl->cond);
+ return 0;
+}
+
+static void *
+readahead_open (nbdkit_next_open *next, nbdkit_context *nxdata,
+ int readonly, const char *exportname, int is_tls)
+{
+ struct readahead_handle *h;
+ int err;
+
+ if (next (nxdata, readonly, exportname) == -1)
+ return NULL;
+
+ h = malloc (sizeof *h);
+ if (h == NULL) {
+ nbdkit_error ("malloc: %m");
+ return NULL;
+ }
+
+ h->ctrl.cmds = (command_queue) empty_vector;
+ pthread_mutex_init (&h->ctrl.lock, NULL);
+ pthread_cond_init (&h->ctrl.cond, NULL);
+
+ /* Create the background thread. */
+ err = pthread_create (&h->thread, NULL, readahead_thread, &h->ctrl);
+ if (err != 0) {
+ errno = err;
+ nbdkit_error ("pthread_create: %m");
+ pthread_cond_destroy (&h->ctrl.cond);
+ pthread_mutex_destroy (&h->ctrl.lock);
+ free (h);
+ return NULL;
+ }
+
+ return h;
+}
static void
-readahead_unload (void)
+readahead_close (void *handle)
{
- free (buffer);
+ struct readahead_handle *h = handle;
+ const struct command quit_cmd = { .type = CMD_QUIT };
+
+ send_command_to_background_thread (&h->ctrl, quit_cmd);
+ pthread_join (h->thread, NULL);
+ pthread_cond_destroy (&h->ctrl.cond);
+ pthread_mutex_destroy (&h->ctrl.lock);
+ command_queue_reset (&h->ctrl.cmds);
+ free (h);
}
-static int64_t readahead_get_size (nbdkit_next *next, void *handle);
-
-/* In prepare, force a call to get_size which sets the size global. */
static int
-readahead_prepare (nbdkit_next *next, void *handle, int readonly)
+readahead_can_cache (nbdkit_next *next, void *handle)
{
- int64_t r;
+ struct readahead_handle *h = handle;
+ int r;
- r = readahead_get_size (next, handle);
- return r >= 0 ? 0 : -1;
-}
-
-/* Get the size. */
-static int64_t
-readahead_get_size (nbdkit_next *next, void *handle)
-{
- int64_t r;
-
- r = next->get_size (next);
+ /* Call next->can_cache to read the underlying 'can_cache'. */
+ r = next->can_cache (next);
if (r == -1)
return -1;
+ h->can_cache = r;
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
- size = r;
+ if (!filter_working (h)) {
+ nbdkit_error ("readahead: warning: underlying plugin does not support "
+ "NBD_CMD_CACHE or PARALLEL thread model, so the filter "
+ "won't do anything");
+ if (suggest_cache_filter (h))
+ nbdkit_error ("readahead: try adding --filter=cache "
+ "after this filter");
+ /* This is an error, but that's just to ensure that the warning
+ * above is seen. We don't need to return -1 here.
+ */
+ }
return r;
}
-/* Cache */
-static int
-readahead_can_cache (nbdkit_next *next, void *handle)
-{
- /* We are already operating as a cache regardless of the plugin's
- * underlying .can_cache, but it's easiest to just rely on nbdkit's
- * behavior of calling .pread for caching.
- */
- return NBDKIT_CACHE_EMULATE;
-}
-
/* Read data. */
-
-static int
-fill_readahead (nbdkit_next *next,
- uint32_t count, uint64_t offset, uint32_t flags, int *err)
-{
- position = offset;
-
- /* Read at least window bytes, but if count is larger read that.
- * Note that the count cannot be bigger than the buffer size.
- */
- length = MAX (count, window);
-
- /* Don't go beyond the end of the underlying file. */
- length = MIN (length, size - position);
-
- /* Grow the buffer if necessary. */
- if (bufsize < length) {
- char *new_buffer = realloc (buffer, length);
- if (new_buffer == NULL) {
- *err = errno;
- nbdkit_error ("realloc: %m");
- return -1;
- }
- buffer = new_buffer;
- bufsize = length;
- }
-
- if (next->pread (next, buffer, length, offset, flags, err) == -1) {
- length = 0; /* failed to fill the prefetch buffer */
- return -1;
- }
-
- return 0;
-}
-
static int
readahead_pread (nbdkit_next *next,
void *handle, void *buf, uint32_t count, uint64_t offset,
uint32_t flags, int *err)
{
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+ struct readahead_handle *h = handle;
- while (count > 0) {
- if (length == 0) {
- /* We don't have a prefetch buffer at all. This could be the
- * first request or reset after a miss.
- */
- window = READAHEAD_MIN;
- if (fill_readahead (next, count, offset, flags, err) == -1)
- return -1;
- }
+ /* If the underlying plugin doesn't support caching then skip that
+ * step completely. The filter will do nothing.
+ */
+ if (filter_working (h)) {
+ struct command ra_cmd = { .type = CMD_CACHE, .next = NULL };
+ int64_t size;
- /* Can we satisfy this request partly or entirely from the prefetch
- * buffer?
- */
- else if (position <= offset && offset < position + length) {
- uint32_t n = MIN (position - offset + length, count);
- memcpy (buf, &buffer[offset-position], n);
- buf += n;
- offset += n;
- count -= n;
- }
+ size = next->get_size (next);
+ if (size >= 0) {
+ ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock);
- /* Does the request start immediately after the prefetch buffer?
- * This is a “hit” allowing us to double the window size.
- */
- else if (offset == position + length) {
- window = MIN (window * 2, READAHEAD_MAX);
- if (fill_readahead (next, count, offset, flags, err) == -1)
- return -1;
+ /* Generate the asynchronous (background) cache command for
+ * the readahead window.
+ */
+ ra_cmd.offset = offset + count;
+ if (ra_cmd.offset < size) {
+ ra_cmd.count = MIN (window, size - ra_cmd.offset);
+ ra_cmd.next = next; /* If .next is non-NULL, we'll send it below. */
+ }
+
+ /* Should we change the window size?
+ * If the last readahead < current offset, double the window.
+ * If not, but we're still making forward progress, keep the window.
+ * If we're not making forward progress, reduce the window to minimum.
+ */
+ if (last_readahead < offset)
+ window = MIN (window * 2, READAHEAD_MAX);
+ else if (last_offset < offset)
+ /* leave window unchanged */ ;
+ else
+ window = READAHEAD_MIN;
+ last_offset = offset;
+ last_readahead = ra_cmd.offset + ra_cmd.count;
}
- /* Else it's a “miss”. Reset everything and start again. */
- else
- length = 0;
+ if (ra_cmd.next &&
+ send_command_to_background_thread (&h->ctrl, ra_cmd) == -1)
+ return -1;
}
- return 0;
-}
-
-/* Any writes or write-like operations kill the prefetch buffer.
- *
- * We could do better here, but for the current use case of this
- * filter it doesn't matter. XXX
- */
-
-static void
-kill_readahead (void)
-{
- ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
- window = READAHEAD_MIN;
- length = 0;
-}
-
-static int
-readahead_pwrite (nbdkit_next *next,
- void *handle,
- const void *buf, uint32_t count, uint64_t offset,
- uint32_t flags, int *err)
-{
- kill_readahead ();
- return next->pwrite (next, buf, count, offset, flags, err);
-}
-
-static int
-readahead_trim (nbdkit_next *next,
- void *handle,
- uint32_t count, uint64_t offset, uint32_t flags,
- int *err)
-{
- kill_readahead ();
- return next->trim (next, count, offset, flags, err);
-}
-
-static int
-readahead_zero (nbdkit_next *next,
- void *handle,
- uint32_t count, uint64_t offset, uint32_t flags,
- int *err)
-{
- kill_readahead ();
- return next->zero (next, count, offset, flags, err);
-}
-
-static int
-readahead_flush (nbdkit_next *next,
- void *handle, uint32_t flags, int *err)
-{
- kill_readahead ();
- return next->flush (next, flags, err);
+ /* Issue the synchronous read. */
+ return next->pread (next, buf, count, offset, flags, err);
}
static struct nbdkit_filter filter = {
.name = "readahead",
.longname = "nbdkit readahead filter",
- .unload = readahead_unload,
- .prepare = readahead_prepare,
- .get_size = readahead_get_size,
+ .get_ready = readahead_get_ready,
+ .open = readahead_open,
+ .close = readahead_close,
.can_cache = readahead_can_cache,
.pread = readahead_pread,
- .pwrite = readahead_pwrite,
- .trim = readahead_trim,
- .zero = readahead_zero,
- .flush = readahead_flush,
};
NBDKIT_REGISTER_FILTER(filter)
diff --git a/filters/readahead/readahead.h b/filters/readahead/readahead.h
new file mode 100644
index 00000000..a68204d5
--- /dev/null
+++ b/filters/readahead/readahead.h
@@ -0,0 +1,60 @@
+/* nbdkit
+ * Copyright (C) 2019-2022 Red Hat Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of Red Hat nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef NBDKIT_READAHEAD_H
+#define NBDKIT_READAHEAD_H
+
+#include <pthread.h>
+
+#include <nbdkit-filter.h>
+
+#include "vector.h"
+
+/* List of commands issued to the background thread. */
+struct command {
+ enum { CMD_QUIT, CMD_CACHE } type;
+ nbdkit_next *next;
+ uint64_t offset;
+ uint32_t count;
+};
+DEFINE_VECTOR_TYPE(command_queue, struct command);
+
+struct bgthread_ctrl {
+ command_queue cmds; /* Command queue. */
+ pthread_mutex_t lock; /* Lock for queue. */
+ pthread_cond_t cond; /* Condition queue size 0 -> 1. */
+};
+
+/* Start background thread (one per connection). */
+extern void *readahead_thread (void *vp);
+
+#endif /* NBDKIT_READAHEAD_H */
diff --git a/tests/test-readahead.sh b/tests/test-readahead.sh
index 7ec7f8e9..17126e5a 100755
--- a/tests/test-readahead.sh
+++ b/tests/test-readahead.sh
@@ -59,7 +59,7 @@ for i in range(0, 512*10, 512):
echo $((end_t - start_t))
}
-t1=$(test --filter=readahead)
+t1=$(test --filter=readahead --filter=cache)
t2=$(test)
# In the t1 case we should make only 1 request into the plugin,
--
2.31.1

View File

@ -0,0 +1,120 @@
From a9708db324eb1989680132813da20d5a4c5496a9 Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones@redhat.com>
Date: Thu, 21 Apr 2022 16:14:46 +0100
Subject: [PATCH] readahead: Fix test
The previous test turned out to be pretty bad at testing the new
filter. A specific problem is that the filter starts a background
thread which issues .cache requests, while on the main connection
.pread requests are being passed through. The test used
--filter=readahead --filter=cache with the cache filter only caching
on .cache requests (since cache-on-read defaults to false), so only
caching requests made by the background thread.
main thread
client ---- .pread ----- delay-filter -------> plugin
\
\ background thread
.cache --- cache-filter
Under very high load, the background thread could be starved. This
means no requests were being cached at all, and all requests were
passing through the delay filter. It would appear that readahead was
failing (which it was, in a way).
It's not very easy to fix this since readahead is best-effort, but we
can go back to using a simpler plugin that logs reads and caches and
check that they look valid.
Update: commit 2ff548d66ad3eae87868402ec5b3319edd12090f
(cherry picked from commit db1e3311727c6ecab3264a1811d33db1aa45a4d0)
---
tests/test-readahead.sh | 61 +++++++++++++++++++++++------------------
1 file changed, 35 insertions(+), 26 deletions(-)
diff --git a/tests/test-readahead.sh b/tests/test-readahead.sh
index 17126e5a..37f4a06f 100755
--- a/tests/test-readahead.sh
+++ b/tests/test-readahead.sh
@@ -30,43 +30,52 @@
# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
-# Is the readahead filter faster? Copy a blank disk with a custom
-# plugin that sleeps on every request. Because the readahead filter
-# should result in fewer requests it should run faster.
-
source ./functions.sh
set -e
set -x
-requires_filter delay
+requires_plugin sh
requires nbdsh --version
requires dd iflag=count_bytes </dev/null
-files="readahead.img"
+files="readahead.out"
rm -f $files
cleanup_fn rm -f $files
-test ()
-{
- start_t=$SECONDS
- nbdkit -fv -U - "$@" null size=1M --filter=delay rdelay=5 \
- --run 'nbdsh --uri "$uri" -c "
+nbdkit -fv -U - "$@" sh - \
+ --filter=readahead \
+ --run 'nbdsh --uri "$uri" -c "
for i in range(0, 512*10, 512):
h.pread(512, i)
-"'
+"' <<'EOF'
+case "$1" in
+ thread_model)
+ echo parallel
+ ;;
+ can_cache)
+ echo native
+ ;;
+ get_size)
+ echo 1M
+ ;;
+ cache)
+ echo "$@" >> readahead.out
+ ;;
+ pread)
+ echo "$@" >> readahead.out
+ dd if=/dev/zero count=$3 iflag=count_bytes
+ ;;
+ *)
+ exit 2
+ ;;
+esac
+EOF
- end_t=$SECONDS
- echo $((end_t - start_t))
-}
+cat readahead.out
-t1=$(test --filter=readahead --filter=cache)
-t2=$(test)
-
-# In the t1 case we should make only 1 request into the plugin,
-# resulting in around 1 sleep period (5 seconds). In the t2 case we
-# make 10 requests so sleep for around 50 seconds. t1 should be < t2
-# is every reasonable scenario.
-if [ $t1 -ge $t2 ]; then
- echo "$0: readahead filter took longer, should be shorter"
- exit 1
-fi
+# We should see the pread requests, and additional cache requests for
+# the 32K region following each pread request.
+for i in `seq 0 512 $((512*10 - 512))` ; do
+ grep "pread 512 $i" readahead.out
+ grep "cache 32768 $((i+512))" readahead.out
+done
--
2.31.1

View File

@ -46,13 +46,13 @@ ExclusiveArch: x86_64
%global verify_tarball_signature 1
# If there are patches which touch autotools files, set this to 1.
%global patches_touch_autotools %{nil}
%global patches_touch_autotools 1
# The source directory.
%global source_directory 1.30-stable
Name: nbdkit
Version: 1.30.2
Version: 1.30.4
Release: 1%{?dist}
Summary: NBD server
@ -78,7 +78,9 @@ Source3: copy-patches.sh
# https://gitlab.com/nbdkit/nbdkit/-/commits/rhel-9.1/
# Patches.
# (none)
Patch0001: 0001-ssh-Allow-the-remote-file-to-be-created.patch
Patch0002: 0002-readahead-Rewrite-this-filter-so-it-prefetches-using.patch
Patch0003: 0003-readahead-Fix-test.patch
# For automatic RPM Provides generation.
# See: https://rpm-software-management.github.io/rpm/manual/dependency_generators.html
@ -1178,8 +1180,8 @@ export LIBGUESTFS_TRACE=1
%changelog
* Mon Apr 04 2022 Richard W.M. Jones <rjones@redhat.com> - 1.30.2-1
- Rebase to new stable branch version 1.30.2
* Tue Apr 26 2022 Richard W.M. Jones <rjones@redhat.com> - 1.30.4-1
- Rebase to new stable branch version 1.30.4
resolves: rhbz#2059289
- Add automatic provides generator and subpackage nbdkit-srpm-macros
resolves: rhbz#2059291
@ -1188,6 +1190,8 @@ export LIBGUESTFS_TRACE=1
- vddk: Fix use of uninitialized memory when computing block size
resolves: rhbz#2066655
- Skip vsock tests unless the vsock_loopback module is loaded (2069558)
- Add support for ssh create remote file.
- Backport new readahead filter from 1.32.
* Mon Jan 24 2022 Richard W.M. Jones <rjones@redhat.com> - 1.28.5-1
- Rebase to new stable branch version 1.28.5

View File

@ -1,2 +1,2 @@
SHA512 (nbdkit-1.30.2.tar.gz) = 84da92005bbb59a860c9ba5a88082d859c9dd4f9db5c81d8760e9d6925e10d80d27de78be9391113bf715d16de7726594b5b803512cd8b59ce14ec135fc4b09d
SHA512 (nbdkit-1.30.2.tar.gz.sig) = c64ccdbc25f56626360aa812af330e33f26e6954415b93976f5a3d58808affdc94efa6e309066b23dbfd354192c4693a408b79aa583a6f5e84ab1c2ef406179e
SHA512 (nbdkit-1.30.4.tar.gz) = b2db6cde29b04fc73831a5f062b7b5d0a4ed2c3785ccefe81add4bc89cd62d5705b904922f53ea1fd4b0987c9aa2a0ef34fc58eefa804b37d424a44112fbf1b1
SHA512 (nbdkit-1.30.4.tar.gz.sig) = ed8a60214274d88f418656b1b50d9eb5b73aa0ff18e36446cf21a6fe2c07ca72311fd745c5d99f94cb3676dba2ee3a768ab08a05cf9d0d24d8156c43118d57d6