fio/0001-Revert-pmemblk-remove-...

723 lines
19 KiB
Diff

From 2383a1ec2ad9090259f6d200b92676acff37de3b Mon Sep 17 00:00:00 2001
From: Pavel Reichl <preichl@redhat.com>
Date: Fri, 9 Jun 2023 18:18:48 +0200
Subject: [PATCH] Revert "pmemblk: remove pmemblk engine"
This reverts commit 04c1cdc4c108c6537681ab7c50daaed6d2fb4c93.
Signed-off-by: Pavel Reichl <preichl@redhat.com>
---
HOWTO.rst | 5 +
Makefile | 5 +
ci/actions-install.sh | 1 +
configure | 41 ++++
engines/pmemblk.c | 449 ++++++++++++++++++++++++++++++++++++++++
examples/pmemblk.fio | 71 +++++++
fio.1 | 5 +
options.c | 6 +
os/windows/examples.wxs | 4 +
10 files changed, 587 insertions(+)
create mode 100644 engines/pmemblk.c
create mode 100644 examples/pmemblk.fio
diff --git a/HOWTO.rst b/HOWTO.rst
index 32fff5ecbde42cf894214f766f38130dba079760..4f003524f69e5cced1195fb0f7efcc2590648122 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -2147,6 +2147,11 @@ I/O engine
before overwriting. The `trimwrite` mode works well for this
constraint.
+ **pmemblk**
+ Read and write using filesystem DAX to a file on a filesystem
+ mounted with DAX on a persistent memory device through the PMDK
+ libpmemblk library.
+
**dev-dax**
Read and write using device DAX to a persistent memory device (e.g.,
/dev/dax0.0) through the PMDK libpmem library.
diff --git a/Makefile b/Makefile
index 6d7fd4e2bbbdeb196d22299d379bebb29172d538..89205ebf498f957ceedefdf1b0e565f08c85060f 100644
--- a/Makefile
+++ b/Makefile
@@ -208,6 +208,11 @@ ifdef CONFIG_MTD
SOURCE += oslib/libmtd.c
SOURCE += oslib/libmtd_legacy.c
endif
+ifdef CONFIG_PMEMBLK
+ pmemblk_SRCS = engines/pmemblk.c
+ pmemblk_LIBS = -lpmemblk
+ ENGINES += pmemblk
+endif
ifdef CONFIG_LINUX_DEVDAX
dev-dax_SRCS = engines/dev-dax.c
dev-dax_LIBS = -lpmem
diff --git a/ci/actions-install.sh b/ci/actions-install.sh
index 95241e78825a9939814a747daf486f866949e392..2f1a0cbaeef4b528a93813d18b94f9bc041bfa04 100755
--- a/ci/actions-install.sh
+++ b/ci/actions-install.sh
@@ -47,6 +47,7 @@ DPKGCFG
libnbd-dev
libpmem-dev
libpmem2-dev
+ libpmemblk-dev
libprotobuf-c-dev
librbd-dev
libtcmalloc-minimal4
diff --git a/configure b/configure
index 74416fd48bc73e35cd8fd5440b9733efd1d0adbb..f6b160c99d374034ba308b74a306e3eb1570be1e 100755
--- a/configure
+++ b/configure
@@ -163,6 +163,7 @@ show_help="no"
exit_val=0
gfio_check="no"
libhdfs="no"
+pmemblk="no"
devdax="no"
pmem="no"
cuda="no"
@@ -2260,6 +2261,43 @@ if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then
fi
fi
+##########################################
+# Check whether we have libpmemblk
+# libpmem is a prerequisite
+# Anything other than an explicit "yes" is normalized to "no" first.
+if test "$libpmemblk" != "yes" ; then
+ libpmemblk="no"
+fi
+# Probe only when libpmem was found: write a minimal program that calls
+# pmemblk_open() and hand it to compile_prog together with -lpmemblk.
+if test "$libpmem" = "yes"; then
+ cat > $TMPC << EOF
+#include <libpmemblk.h>
+int main(int argc, char **argv)
+{
+ PMEMblkpool *pbp;
+ pbp = pmemblk_open("", 0);
+ return 0;
+}
+EOF
+ if compile_prog "" "-lpmemblk" "libpmemblk"; then
+ libpmemblk="yes"
+ fi
+fi
+print_config "libpmemblk" "$libpmemblk"
+
+# Choose libpmem-based ioengines
+# Nothing is selected unless libpmem is present and disable_pmem is "no".
+# dev-dax needs only libpmem; pmem additionally needs libpmem1_5 and
+# pmemblk additionally needs libpmemblk.
+if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then
+ devdax="yes"
+ if test "$libpmem1_5" = "yes"; then
+ pmem="yes"
+ fi
+ if test "$libpmemblk" = "yes"; then
+ pmemblk="yes"
+ fi
+fi
+
+##########################################
+# Report whether pmemblk engine is enabled
+print_config "PMDK pmemblk engine" "$pmemblk"
+
##########################################
# Report whether dev-dax engine is enabled
print_config "PMDK dev-dax engine" "$devdax"
@@ -3188,6 +3226,9 @@ fi
if test "$mtd" = "yes" ; then
output_sym "CONFIG_MTD"
fi
+if test "$pmemblk" = "yes" ; then
+ output_sym "CONFIG_PMEMBLK"
+fi
if test "$devdax" = "yes" ; then
output_sym "CONFIG_LINUX_DEVDAX"
fi
diff --git a/engines/pmemblk.c b/engines/pmemblk.c
new file mode 100644
index 0000000000000000000000000000000000000000..849d8a15a0da59d07209c2475b78a1e4098c143a
--- /dev/null
+++ b/engines/pmemblk.c
@@ -0,0 +1,449 @@
+/*
+ * pmemblk: IO engine that uses PMDK libpmemblk to read and write data
+ *
+ * Copyright (C) 2016 Hewlett Packard Enterprise Development LP
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the Free
+ * Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * pmemblk engine
+ *
+ * IO engine that uses libpmemblk to read and write data
+ *
+ * To use:
+ * ioengine=pmemblk
+ *
+ * Other relevant settings:
+ * thread=1 REQUIRED
+ * iodepth=1
+ * direct=1
+ * unlink=1
+ * filename=/mnt/pmem0/fiotestfile,BSIZE,FSIZEMiB
+ *
+ * thread must be set to 1 for pmemblk as multiple processes cannot
+ * open the same block pool file.
+ *
+ * iodepth should be set to 1 as pmemblk is always synchronous.
+ * Use numjobs to scale up.
+ *
+ * direct=1 is implied as pmemblk is always direct. A warning message
+ * is printed if this is not specified.
+ *
+ * unlink=1 removes the block pool file after testing, and is optional.
+ *
+ * The pmem device must have a DAX-capable filesystem and be mounted
+ * with DAX enabled. filename must point to a file on that filesystem.
+ *
+ * Example:
+ * mkfs.xfs /dev/pmem0
+ * mkdir /mnt/pmem0
+ * mount -o dax /dev/pmem0 /mnt/pmem0
+ *
+ * When specifying the filename, if the block pool file does not already
+ * exist, then the pmemblk engine creates the pool file if you specify
+ * the block and file sizes. BSIZE is the block size in bytes.
+ * FSIZEMiB is the pool file size in MiB.
+ *
+ * See examples/pmemblk.fio for more.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <assert.h>
+#include <string.h>
+#include <libpmem.h>
+#include <libpmemblk.h>
+
+#include "../fio.h"
+
+/*
+ * libpmemblk
+ */
+/* Handle type: a pointer to one pool descriptor. */
+typedef struct fio_pmemblk_file *fio_pmemblk_file_t;
+
+/*
+ * Descriptor for one open libpmemblk pool. Descriptors are chained
+ * into the Cache list and handed out by pmb_open(), which counts the
+ * references it gives away in pmb_refcnt.
+ */
+struct fio_pmemblk_file {
+ fio_pmemblk_file_t pmb_next; /* next descriptor in the Cache list */
+ char *pmb_filename; /* pool path; freed by pmb_close() */
+ uint64_t pmb_refcnt; /* raised by pmb_open(), dropped by pmb_close() */
+ PMEMblkpool *pmb_pool; /* handle from pmemblk_open()/pmemblk_create() */
+ size_t pmb_bsize; /* block size as reported by pmemblk_bsize() */
+ size_t pmb_nblocks; /* block count as reported by pmemblk_nblock() */
+};
+
+/* Head of the singly linked list of open pool descriptors. */
+static fio_pmemblk_file_t Cache;
+
+/* Held by pmb_open() and pmb_close() while they touch Cache and refcounts. */
+static pthread_mutex_t CacheLock = PTHREAD_MUTEX_INITIALIZER;
+
+/* Flag bits accepted by pmb_open(). */
+#define PMB_CREATE (0x0001) /* should create file */
+
+/*
+ * Find the cached pool descriptor whose path equals filename.
+ *
+ * Returns the matching descriptor, or NULL when the path is not cached.
+ * No locking is done here; pmb_open() calls this with CacheLock held.
+ *
+ * File-local like the other cache helpers: nothing outside this file
+ * declares or uses it.
+ */
+static fio_pmemblk_file_t fio_pmemblk_cache_lookup(const char *filename)
+{
+	fio_pmemblk_file_t i;
+
+	for (i = Cache; i != NULL; i = i->pmb_next) {
+		if (!strcmp(filename, i->pmb_filename))
+			return i;
+	}
+
+	return NULL;
+}
+
+/* Link pmb in as the new head of the Cache list. */
+static void fio_pmemblk_cache_insert(fio_pmemblk_file_t pmb)
+{
+	fio_pmemblk_file_t old_head = Cache;
+
+	Cache = pmb;
+	pmb->pmb_next = old_head;
+}
+
+/*
+ * Unlink pmb from the Cache list and clear its next pointer.
+ * Nothing happens when pmb is not on the list.
+ */
+static void fio_pmemblk_cache_remove(fio_pmemblk_file_t pmb)
+{
+	fio_pmemblk_file_t *link;
+
+	/* walk the links themselves so the head needs no special case */
+	for (link = &Cache; *link != NULL; link = &(*link)->pmb_next) {
+		if (*link != pmb)
+			continue;
+
+		*link = pmb->pmb_next;
+		pmb->pmb_next = NULL;
+		return;
+	}
+}
+
+/*
+ * Split a pmemblk path specification into its parts.
+ *
+ * So that the block size and the gross pool file size can be chosen
+ * at the libpmemblk level, both may be appended to the file name:
+ *
+ *   path[,bsize,fsizemib]
+ *
+ * The fio option "filesize" is not used for this, because libpmemblk
+ * can only be given the gross file size, which differs from the net
+ * or usable size (which is probably what fio wants).
+ *
+ * On return *ppath is a newly allocated copy of the path without the
+ * size parameters, and *pbsize and *pfsize hold the block size and the
+ * file size. The user gives the file size in MiB; bytes are returned.
+ * When the two trailing fields are missing, or either one parses as
+ * zero, the whole specification is returned as the path and both
+ * sizes are zero.
+ *
+ * If the copy cannot be allocated, *ppath is set to NULL and the two
+ * size outputs are left untouched.
+ */
+static void pmb_parse_path(const char *pathspec, char **ppath, uint64_t *pbsize,
+			   uint64_t *pfsize)
+{
+	char *copy = strdup(pathspec);
+	char *fsize_sep;
+	char *bsize_sep;
+	uint64_t mib;
+	uint64_t blk;
+
+	*ppath = copy;
+	if (copy == NULL)
+		return;
+
+	/* default outcome: no size fields recognised */
+	*pbsize = 0;
+	*pfsize = 0;
+
+	/* last field: pool file size in MiB */
+	fsize_sep = strrchr(copy, ',');
+	if (fsize_sep == NULL)
+		return;
+	mib = strtoull(fsize_sep + 1, NULL, 10);
+	if (mib == 0)
+		return;
+
+	/* second-to-last field: block size in bytes */
+	*fsize_sep = '\0';
+	bsize_sep = strrchr(copy, ',');
+	blk = bsize_sep ? strtoull(bsize_sep + 1, NULL, 10) : 0;
+	if (blk == 0) {
+		/* not a size pair after all: put the comma back */
+		*fsize_sep = ',';
+		return;
+	}
+
+	*bsize_sep = '\0';
+	*pbsize = blk;
+	*pfsize = mib << 20;
+}
+
+/*
+ * Get a reference to the pool named by pathspec, opening it if needed.
+ *
+ * pathspec is split by pmb_parse_path(). A descriptor already cached
+ * for the resulting path is reused. Otherwise the pool is opened with
+ * pmemblk_open(); if that fails with ENOENT, flags contains PMB_CREATE
+ * and both sizes were given, it is created with pmemblk_create().
+ *
+ * Returns the descriptor with its reference count raised by one, or
+ * NULL on failure. Lookup, open and insert all run under CacheLock.
+ */
+static fio_pmemblk_file_t pmb_open(const char *pathspec, int flags)
+{
+ fio_pmemblk_file_t pmb;
+ char *path = NULL;
+ uint64_t bsize = 0;
+ uint64_t fsize = 0;
+
+ pmb_parse_path(pathspec, &path, &bsize, &fsize);
+ if (!path)
+ return NULL;
+
+ pthread_mutex_lock(&CacheLock);
+
+ pmb = fio_pmemblk_cache_lookup(path);
+ if (!pmb) {
+ pmb = malloc(sizeof(*pmb));
+ if (!pmb)
+ goto error;
+
+ /* try opening existing first, create it if needed */
+ pmb->pmb_pool = pmemblk_open(path, bsize);
+ if (!pmb->pmb_pool && (errno == ENOENT) &&
+ (flags & PMB_CREATE) && (0 < fsize) && (0 < bsize)) {
+ pmb->pmb_pool =
+ pmemblk_create(path, bsize, fsize, 0644);
+ }
+ if (!pmb->pmb_pool) {
+ log_err("pmemblk: unable to open pmemblk pool file %s (%s)\n",
+ path, strerror(errno));
+ goto error;
+ }
+
+ /* the descriptor takes ownership of path from here on */
+ pmb->pmb_filename = path;
+ pmb->pmb_next = NULL;
+ pmb->pmb_refcnt = 0;
+ pmb->pmb_bsize = pmemblk_bsize(pmb->pmb_pool);
+ pmb->pmb_nblocks = pmemblk_nblock(pmb->pmb_pool);
+
+ fio_pmemblk_cache_insert(pmb);
+ } else {
+ /* cached descriptor already has its own copy of the path */
+ free(path);
+ }
+
+ pmb->pmb_refcnt += 1;
+
+ pthread_mutex_unlock(&CacheLock);
+
+ return pmb;
+
+error:
+ /* reached with CacheLock held and pmb not yet inserted into Cache */
+ if (pmb) {
+ if (pmb->pmb_pool)
+ pmemblk_close(pmb->pmb_pool);
+ pmb->pmb_pool = NULL;
+ pmb->pmb_filename = NULL;
+ free(pmb);
+ }
+ if (path)
+ free(path);
+
+ pthread_mutex_unlock(&CacheLock);
+ return NULL;
+}
+
+/*
+ * Drop one reference to pmb.
+ *
+ * When the count reaches zero and keep is false, the pool is closed
+ * and the descriptor is taken off the Cache list and freed. With keep
+ * set the descriptor stays cached even at a zero count.
+ */
+static void pmb_close(fio_pmemblk_file_t pmb, const bool keep)
+{
+	bool last_ref;
+
+	pthread_mutex_lock(&CacheLock);
+
+	pmb->pmb_refcnt -= 1;
+	last_ref = (pmb->pmb_refcnt == 0);
+
+	if (last_ref && !keep) {
+		fio_pmemblk_cache_remove(pmb);
+		pmemblk_close(pmb->pmb_pool);
+		free(pmb->pmb_filename);
+		free(pmb);
+	}
+
+	pthread_mutex_unlock(&CacheLock);
+}
+
+/*
+ * Check the job options this engine depends on and work out the
+ * flags to pass to pmb_open().
+ *
+ * Returns 1 when the job does not use threads; the error is logged
+ * the first time only. Otherwise stores the flags in *pflags
+ * (PMB_CREATE when td->o.allow_create is set) and returns 0. An
+ * informational message is logged once if direct I/O was not asked for.
+ */
+static int pmb_get_flags(struct thread_data *td, uint64_t *pflags)
+{
+	static int warned_thread = 0;
+	static int warned_odirect = 0;
+
+	if (!td->o.use_thread) {
+		if (warned_thread)
+			return 1;
+
+		warned_thread = 1;
+		log_err("pmemblk: must set thread=1 for pmemblk engine\n");
+		return 1;
+	}
+
+	if (!warned_odirect && !td->o.odirect) {
+		warned_odirect = 1;
+		log_info("pmemblk: direct == 0, but pmemblk is always direct\n");
+	}
+
+	*pflags = td->o.allow_create ? PMB_CREATE : 0;
+	return 0;
+}
+
+/*
+ * open_file hook: attach the pool named by f->file_name to f as its
+ * engine data. Returns 0 on success and 1 when the job options are
+ * rejected by pmb_get_flags() or the pool cannot be opened.
+ */
+static int fio_pmemblk_open_file(struct thread_data *td, struct fio_file *f)
+{
+	fio_pmemblk_file_t pool;
+	uint64_t open_flags = 0;
+
+	if (pmb_get_flags(td, &open_flags) != 0)
+		return 1;
+
+	pool = pmb_open(f->file_name, open_flags);
+	if (pool == NULL)
+		return 1;
+
+	FILE_SET_ENG_DATA(f, pool);
+	return 0;
+}
+
+/*
+ * close_file hook: release the reference held through f's engine
+ * data, if there is one, and clear the engine data. Always returns 0.
+ */
+static int fio_pmemblk_close_file(struct thread_data fio_unused *td,
+				  struct fio_file *f)
+{
+	fio_pmemblk_file_t pool = FILE_ENG_DATA(f);
+
+	if (pool != NULL)
+		pmb_close(pool, false);
+
+	FILE_SET_ENG_DATA(f, NULL);
+	return 0;
+}
+
+/*
+ * get_file_size hook: record the size of the pool behind f.
+ *
+ * Nothing is done when the size is already known. If f has no pool
+ * attached, the pool is opened just for this query and the reference
+ * is dropped again with keep=true, which leaves the descriptor in the
+ * cache. The size stored in f->real_file_size is the pool block size
+ * times the number of blocks.
+ *
+ * Returns 0 on success and 1 when the job options are rejected or
+ * the pool cannot be opened.
+ */
+static int fio_pmemblk_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	fio_pmemblk_file_t pool = FILE_ENG_DATA(f);
+	uint64_t open_flags = 0;
+
+	if (fio_file_size_known(f))
+		return 0;
+
+	if (pool == NULL) {
+		if (pmb_get_flags(td, &open_flags) != 0)
+			return 1;
+
+		pool = pmb_open(f->file_name, open_flags);
+		if (pool == NULL)
+			return 1;
+	}
+
+	f->real_file_size = pool->pmb_bsize * pool->pmb_nblocks;
+	fio_file_set_size_known(f);
+
+	/* the reference was taken above only if f has no pool of its own */
+	if (FILE_ENG_DATA(f) == NULL)
+		pmb_close(pool, true);
+
+	return 0;
+}
+
+/*
+ * queue hook: carry out one I/O unit synchronously.
+ *
+ * Reads and writes are done one pool block at a time through
+ * pmemblk_read()/pmemblk_write(). The offset and the length must both
+ * be multiples of the pool block size and the range must end inside
+ * the pool; otherwise io_u->error is set to EINVAL. When a block
+ * transfer fails, io_u->error takes errno and io_u->resid is the
+ * number of bytes not transferred. The sync directions succeed
+ * without doing anything; any other direction yields EINVAL.
+ *
+ * Always returns FIO_Q_COMPLETED.
+ */
+static enum fio_q_status fio_pmemblk_queue(struct thread_data *td,
+					   struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	fio_pmemblk_file_t pmb = FILE_ENG_DATA(f);
+
+	unsigned long long off;
+	unsigned long len;
+	char *buf;	/* byte pointer: arithmetic on void * is a GNU extension */
+
+	fio_ro_check(td, io_u);
+
+	switch (io_u->ddir) {
+	case DDIR_READ:
+	case DDIR_WRITE:
+		off = io_u->offset;
+		len = io_u->xfer_buflen;
+
+		io_u->error = EINVAL;
+		if (off % pmb->pmb_bsize)
+			break;
+		if (len % pmb->pmb_bsize)
+			break;
+		if ((off + len) / pmb->pmb_bsize > pmb->pmb_nblocks)
+			break;
+
+		io_u->error = 0;
+		buf = io_u->xfer_buf;
+		/* from here on off and len count blocks, not bytes */
+		off /= pmb->pmb_bsize;
+		len /= pmb->pmb_bsize;
+		while (0 < len) {
+			if (io_u->ddir == DDIR_READ) {
+				if (0 != pmemblk_read(pmb->pmb_pool, buf, off)) {
+					io_u->error = errno;
+					break;
+				}
+			} else if (0 != pmemblk_write(pmb->pmb_pool, buf, off)) {
+				io_u->error = errno;
+				break;
+			}
+			buf += pmb->pmb_bsize;
+			off++;
+			len--;
+		}
+		/* back to a byte offset to work out what was not transferred */
+		off *= pmb->pmb_bsize;
+		io_u->resid = io_u->xfer_buflen - (off - io_u->offset);
+		break;
+	case DDIR_SYNC:
+	case DDIR_DATASYNC:
+	case DDIR_SYNC_FILE_RANGE:
+		/* we're always sync'd */
+		io_u->error = 0;
+		break;
+	default:
+		io_u->error = EINVAL;
+		break;
+	}
+
+	return FIO_Q_COMPLETED;
+}
+
+/*
+ * unlink_file hook: remove the pool file named by f->file_name.
+ *
+ * The engine needs its own unlink because the user may have appended
+ * the block and file sizes to the path name; the specification is
+ * parsed with pmb_parse_path() to get the file name actually used.
+ *
+ * Returns 0 on success or when the file does not exist, ENOMEM when
+ * the path copy cannot be allocated, and otherwise the errno set by
+ * unlink().
+ */
+static int fio_pmemblk_unlink_file(struct thread_data fio_unused *td,
+				   struct fio_file *f)
+{
+	char *path = NULL;
+	uint64_t bsize = 0;
+	uint64_t fsize = 0;
+	int ret = 0;
+
+	pmb_parse_path(f->file_name, &path, &bsize, &fsize);
+	if (!path)
+		return ENOMEM;	/* pmb_parse_path() fails only when strdup() does */
+
+	/*
+	 * A missing file is not reported: the goal is that the file is
+	 * gone, and another job using the same path may have removed it.
+	 * errno is captured before free() can disturb it.
+	 */
+	if (unlink(path) < 0 && errno != ENOENT)
+		ret = errno;
+
+	free(path);
+	return ret;
+}
+
+/*
+ * Engine descriptor for "pmemblk": wires the hooks defined in this
+ * file into fio. It is passed to register_ioengine() and
+ * unregister_ioengine() below.
+ */
+FIO_STATIC struct ioengine_ops ioengine = {
+ .name = "pmemblk",
+ .version = FIO_IOOPS_VERSION,
+ .queue = fio_pmemblk_queue,
+ .open_file = fio_pmemblk_open_file,
+ .close_file = fio_pmemblk_close_file,
+ .get_file_size = fio_pmemblk_get_file_size,
+ .unlink_file = fio_pmemblk_unlink_file, /* strips ",bsize,fsizemib" before unlinking */
+ .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL,
+};
+
+/*
+ * Hand the engine descriptor to fio.
+ * NOTE(review): fio_init presumably makes this run automatically at
+ * start-up; confirm against its definition in the fio headers.
+ */
+static void fio_init fio_pmemblk_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+/*
+ * Withdraw the engine descriptor from fio.
+ * NOTE(review): fio_exit presumably makes this run automatically at
+ * shutdown; confirm against its definition in the fio headers.
+ */
+static void fio_exit fio_pmemblk_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/examples/pmemblk.fio b/examples/pmemblk.fio
new file mode 100644
index 0000000000000000000000000000000000000000..59bb2a8a5acbf0e03d16a988f6ae0b9eb84575d2
--- /dev/null
+++ b/examples/pmemblk.fio
@@ -0,0 +1,71 @@
+[global]
+bs=1m
+ioengine=pmemblk
+norandommap
+time_based
+runtime=30
+group_reporting
+disable_lat=1
+disable_slat=1
+disable_clat=1
+clat_percentiles=0
+cpus_allowed_policy=split
+
+# For the pmemblk engine:
+#
+# IOs always complete immediately
+# IOs are always direct
+# Must use threads
+#
+iodepth=1
+direct=1
+thread
+numjobs=16
+#
+# Unlink can be used to remove the files when done, but if you are
+# using serial runs with stonewall, and you want the files to be created
+# only once and unlinked only at the very end, then put the unlink=1
+# in the last group. This is the method demonstrated here.
+#
+# Note that if you have a read-only group and if the files will be
+# newly created, then all of the data will read back as zero and the
+# read will be optimized, yielding performance that is different from
+# that of reading non-zero blocks (or unoptimized zero blocks).
+#
+unlink=0
+#
+# The pmemblk engine does IO to files in a DAX-mounted filesystem.
+# The filesystem should be created on an NVDIMM (e.g., /dev/pmem0)
+# and then mounted with the '-o dax' option. Note that the engine
+# accesses the underlying NVDIMM directly, bypassing the kernel block
+# layer, so the usual filesystem/disk performance monitoring tools such
+# as iostat will not provide useful data.
+#
+# Here we specify a test file on each of two NVDIMMs. The first
+# number after the file name is the block size in bytes (4096 bytes
+# in this example). The second number is the size of the file to
+# create in MiB (1 GiB in this example); note that the actual usable
+# space available to fio will be less than this as libpmemblk requires
+# some space for metadata.
+#
+# Currently, the minimum block size is 512 bytes and the minimum file
+# size is about 17 MiB (these are libpmemblk requirements).
+#
+# While both files in this example have the same block size and file
+# size, this is not required.
+#
+filename=/pmem0/fio-test,4096,1024
+#filename=/pmem1/fio-test,4096,1024
+
+[pmemblk-write]
+rw=randwrite
+stonewall
+
+[pmemblk-read]
+rw=randread
+stonewall
+#
+# We're done, so unlink the file:
+#
+unlink=1
+
diff --git a/fio.1 b/fio.1
index 80bf3371a3556406cbcb29323bb21f55e769b9f9..3dc7e062a063d54bcd79120a7d6dc1ffd934b80c 100644
--- a/fio.1
+++ b/fio.1
@@ -1960,6 +1960,11 @@ e.g., on NAND, writing sequentially to erase blocks and discarding
before overwriting. The \fBtrimwrite\fR mode works well for this
constraint.
.TP
+.B pmemblk
+Read and write using filesystem DAX to a file on a filesystem
+mounted with DAX on a persistent memory device through the PMDK
+libpmemblk library.
+.TP
.B dev\-dax
Read and write using device DAX to a persistent memory device (e.g.,
/dev/dax0.0) through the PMDK libpmem library.
diff --git a/options.c b/options.c
index 8193fb29fe2b1cdfab8e745b9522aeb507f5361e..6c58577d8dbfd0a8dfd2b77e11c9be57e4aaaf10 100644
--- a/options.c
+++ b/options.c
@@ -2125,6 +2125,12 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.help = "Hadoop Distributed Filesystem (HDFS) engine"
},
#endif
+#ifdef CONFIG_PMEMBLK
+ { .ival = "pmemblk",
+ .help = "PMDK libpmemblk based IO engine",
+ },
+
+#endif
#ifdef CONFIG_IME
{ .ival = "ime_psync",
.help = "DDN's IME synchronous IO engine",
diff --git a/os/windows/examples.wxs b/os/windows/examples.wxs
index d70c77133f5a9f24908fffde9ce5ad20dbad2562..9308ba8be829c62b88cb06470a068cc2aef3f7dc 100755
--- a/os/windows/examples.wxs
+++ b/os/windows/examples.wxs
@@ -125,6 +125,9 @@
<Component>
<File Source="..\..\examples\numa.fio" />
</Component>
+ <Component>
+ <File Source="..\..\examples\pmemblk.fio" />
+ </Component>
<Component>
<File Source="..\..\examples\poisson-rate-submission.fio" />
</Component>
@@ -209,6 +212,7 @@
<ComponentRef Id="netio_multicast.fio" />
<ComponentRef Id="null.fio" />
<ComponentRef Id="numa.fio" />
+ <ComponentRef Id="pmemblk.fio" />
<ComponentRef Id="poisson_rate_submission.fio" />
<ComponentRef Id="rados.fio"/>
<ComponentRef Id="rand_zones.fio" />
--
2.41.0