From 2383a1ec2ad9090259f6d200b92676acff37de3b Mon Sep 17 00:00:00 2001 From: Pavel Reichl Date: Fri, 9 Jun 2023 18:18:48 +0200 Subject: [PATCH] Revert "pmemblk: remove pmemblk engine" This reverts commit 04c1cdc4c108c6537681ab7c50daaed6d2fb4c93. Signed-off-by: Pavel Reichl --- HOWTO.rst | 5 + Makefile | 5 + ci/actions-install.sh | 1 + configure | 41 ++++ engines/pmemblk.c | 449 ++++++++++++++++++++++++++++++++++++++++ examples/pmemblk.fio | 71 +++++++ fio.1 | 5 + options.c | 6 + os/windows/examples.wxs | 4 + 10 files changed, 587 insertions(+) create mode 100644 engines/pmemblk.c create mode 100644 examples/pmemblk.fio diff --git a/HOWTO.rst b/HOWTO.rst index 32fff5ecbde42cf894214f766f38130dba079760..4f003524f69e5cced1195fb0f7efcc2590648122 100644 --- a/HOWTO.rst +++ b/HOWTO.rst @@ -2147,6 +2147,11 @@ I/O engine before overwriting. The `trimwrite` mode works well for this constraint. + **pmemblk** + Read and write using filesystem DAX to a file on a filesystem + mounted with DAX on a persistent memory device through the PMDK + libpmemblk library. + **dev-dax** Read and write using device DAX to a persistent memory device (e.g., /dev/dax0.0) through the PMDK libpmem library. 
diff --git a/Makefile b/Makefile index 6d7fd4e2bbbdeb196d22299d379bebb29172d538..89205ebf498f957ceedefdf1b0e565f08c85060f 100644 --- a/Makefile +++ b/Makefile @@ -208,6 +208,11 @@ ifdef CONFIG_MTD SOURCE += oslib/libmtd.c SOURCE += oslib/libmtd_legacy.c endif +ifdef CONFIG_PMEMBLK + pmemblk_SRCS = engines/pmemblk.c + pmemblk_LIBS = -lpmemblk + ENGINES += pmemblk +endif ifdef CONFIG_LINUX_DEVDAX dev-dax_SRCS = engines/dev-dax.c dev-dax_LIBS = -lpmem diff --git a/ci/actions-install.sh b/ci/actions-install.sh index 95241e78825a9939814a747daf486f866949e392..2f1a0cbaeef4b528a93813d18b94f9bc041bfa04 100755 --- a/ci/actions-install.sh +++ b/ci/actions-install.sh @@ -47,6 +47,7 @@ DPKGCFG libnbd-dev libpmem-dev libpmem2-dev + libpmemblk-dev libprotobuf-c-dev librbd-dev libtcmalloc-minimal4 diff --git a/configure b/configure index 74416fd48bc73e35cd8fd5440b9733efd1d0adbb..f6b160c99d374034ba308b74a306e3eb1570be1e 100755 --- a/configure +++ b/configure @@ -163,6 +163,7 @@ show_help="no" exit_val=0 gfio_check="no" libhdfs="no" +pmemblk="no" devdax="no" pmem="no" cuda="no" @@ -2260,6 +2261,43 @@ if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then fi fi +########################################## +# Check whether we have libpmemblk +# libpmem is a prerequisite +if test "$libpmemblk" != "yes" ; then + libpmemblk="no" +fi +if test "$libpmem" = "yes"; then + cat > $TMPC << EOF +#include <libpmemblk.h> +int main(int argc, char **argv) +{ + PMEMblkpool *pbp; + pbp = pmemblk_open("", 0); + return 0; +} +EOF + if compile_prog "" "-lpmemblk" "libpmemblk"; then + libpmemblk="yes" + fi +fi +print_config "libpmemblk" "$libpmemblk" + +# Choose libpmem-based ioengines +if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then + devdax="yes" + if test "$libpmem1_5" = "yes"; then + pmem="yes" + fi + if test "$libpmemblk" = "yes"; then + pmemblk="yes" + fi +fi + +########################################## +# Report whether pmemblk engine is enabled +print_config "PMDK pmemblk engine" 
"$pmemblk" + ########################################## # Report whether dev-dax engine is enabled print_config "PMDK dev-dax engine" "$devdax" @@ -3188,6 +3226,9 @@ fi if test "$mtd" = "yes" ; then output_sym "CONFIG_MTD" fi +if test "$pmemblk" = "yes" ; then + output_sym "CONFIG_PMEMBLK" +fi if test "$devdax" = "yes" ; then output_sym "CONFIG_LINUX_DEVDAX" fi diff --git a/engines/pmemblk.c b/engines/pmemblk.c new file mode 100644 index 0000000000000000000000000000000000000000..849d8a15a0da59d07209c2475b78a1e4098c143a --- /dev/null +++ b/engines/pmemblk.c @@ -0,0 +1,449 @@ +/* + * pmemblk: IO engine that uses PMDK libpmemblk to read and write data + * + * Copyright (C) 2016 Hewlett Packard Enterprise Development LP + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License, + * version 2 as published by the Free Software Foundation.. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the Free + * Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +/* + * pmemblk engine + * + * IO engine that uses libpmemblk to read and write data + * + * To use: + * ioengine=pmemblk + * + * Other relevant settings: + * thread=1 REQUIRED + * iodepth=1 + * direct=1 + * unlink=1 + * filename=/mnt/pmem0/fiotestfile,BSIZE,FSIZEMiB + * + * thread must be set to 1 for pmemblk as multiple processes cannot + * open the same block pool file. + * + * iodepth should be set to 1 as pmemblk is always synchronous. + * Use numjobs to scale up. + * + * direct=1 is implied as pmemblk is always direct. A warning message + * is printed if this is not specified. 
+ * + * unlink=1 removes the block pool file after testing, and is optional. + * + * The pmem device must have a DAX-capable filesystem and be mounted + * with DAX enabled. filename must point to a file on that filesystem. + * + * Example: + * mkfs.xfs /dev/pmem0 + * mkdir /mnt/pmem0 + * mount -o dax /dev/pmem0 /mnt/pmem0 + * + * When specifying the filename, if the block pool file does not already + * exist, then the pmemblk engine creates the pool file if you specify + * the block and file sizes. BSIZE is the block size in bytes. + * FSIZEMiB is the pool file size in MiB. + * + * See examples/pmemblk.fio for more. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/uio.h> +#include <errno.h> +#include <assert.h> +#include <string.h> +#include <libpmem.h> +#include <libpmemblk.h> + +#include "../fio.h" + +/* + * libpmemblk + */ +typedef struct fio_pmemblk_file *fio_pmemblk_file_t; + +struct fio_pmemblk_file { + fio_pmemblk_file_t pmb_next; + char *pmb_filename; + uint64_t pmb_refcnt; + PMEMblkpool *pmb_pool; + size_t pmb_bsize; + size_t pmb_nblocks; +}; + +static fio_pmemblk_file_t Cache; + +static pthread_mutex_t CacheLock = PTHREAD_MUTEX_INITIALIZER; + +#define PMB_CREATE (0x0001) /* should create file */ + +fio_pmemblk_file_t fio_pmemblk_cache_lookup(const char *filename) +{ + fio_pmemblk_file_t i; + + for (i = Cache; i != NULL; i = i->pmb_next) + if (!strcmp(filename, i->pmb_filename)) + return i; + + return NULL; +} + +static void fio_pmemblk_cache_insert(fio_pmemblk_file_t pmb) +{ + pmb->pmb_next = Cache; + Cache = pmb; +} + +static void fio_pmemblk_cache_remove(fio_pmemblk_file_t pmb) +{ + fio_pmemblk_file_t i; + + if (pmb == Cache) { + Cache = Cache->pmb_next; + pmb->pmb_next = NULL; + return; + } + + for (i = Cache; i != NULL; i = i->pmb_next) + if (pmb == i->pmb_next) { + i->pmb_next = i->pmb_next->pmb_next; + pmb->pmb_next = NULL; + return; + } +} + +/* + * to control block size and gross file size at the libpmemblk + * level, we allow the block size and file size to be appended + * to the file name: + * + * 
path[,bsize,fsizemib] + * + * note that we do not use the fio option "filesize" to dictate + * the file size because we can only give libpmemblk the gross + * file size, which is different from the net or usable file + * size (which is probably what fio wants). + * + * the final path without the parameters is returned in ppath. + * the block size and file size are returned in pbsize and pfsize. + * + * note that the user specifies the file size in MiB, but + * we return bytes from here. + */ +static void pmb_parse_path(const char *pathspec, char **ppath, uint64_t *pbsize, + uint64_t *pfsize) +{ + char *path; + char *s; + uint64_t bsize; + uint64_t fsizemib; + + path = strdup(pathspec); + if (!path) { + *ppath = NULL; + return; + } + + /* extract sizes, if given */ + s = strrchr(path, ','); + if (s && (fsizemib = strtoull(s + 1, NULL, 10))) { + *s = 0; + s = strrchr(path, ','); + if (s && (bsize = strtoull(s + 1, NULL, 10))) { + *s = 0; + *ppath = path; + *pbsize = bsize; + *pfsize = fsizemib << 20; + return; + } + } + + /* size specs not found */ + strcpy(path, pathspec); + *ppath = path; + *pbsize = 0; + *pfsize = 0; +} + +static fio_pmemblk_file_t pmb_open(const char *pathspec, int flags) +{ + fio_pmemblk_file_t pmb; + char *path = NULL; + uint64_t bsize = 0; + uint64_t fsize = 0; + + pmb_parse_path(pathspec, &path, &bsize, &fsize); + if (!path) + return NULL; + + pthread_mutex_lock(&CacheLock); + + pmb = fio_pmemblk_cache_lookup(path); + if (!pmb) { + pmb = malloc(sizeof(*pmb)); + if (!pmb) + goto error; + + /* try opening existing first, create it if needed */ + pmb->pmb_pool = pmemblk_open(path, bsize); + if (!pmb->pmb_pool && (errno == ENOENT) && + (flags & PMB_CREATE) && (0 < fsize) && (0 < bsize)) { + pmb->pmb_pool = + pmemblk_create(path, bsize, fsize, 0644); + } + if (!pmb->pmb_pool) { + log_err("pmemblk: unable to open pmemblk pool file %s (%s)\n", + path, strerror(errno)); + goto error; + } + + pmb->pmb_filename = path; + pmb->pmb_next = NULL; + 
pmb->pmb_refcnt = 0; + pmb->pmb_bsize = pmemblk_bsize(pmb->pmb_pool); + pmb->pmb_nblocks = pmemblk_nblock(pmb->pmb_pool); + + fio_pmemblk_cache_insert(pmb); + } else { + free(path); + } + + pmb->pmb_refcnt += 1; + + pthread_mutex_unlock(&CacheLock); + + return pmb; + +error: + if (pmb) { + if (pmb->pmb_pool) + pmemblk_close(pmb->pmb_pool); + pmb->pmb_pool = NULL; + pmb->pmb_filename = NULL; + free(pmb); + } + if (path) + free(path); + + pthread_mutex_unlock(&CacheLock); + return NULL; +} + +static void pmb_close(fio_pmemblk_file_t pmb, const bool keep) +{ + pthread_mutex_lock(&CacheLock); + + pmb->pmb_refcnt--; + + if (!keep && !pmb->pmb_refcnt) { + pmemblk_close(pmb->pmb_pool); + pmb->pmb_pool = NULL; + free(pmb->pmb_filename); + pmb->pmb_filename = NULL; + fio_pmemblk_cache_remove(pmb); + free(pmb); + } + + pthread_mutex_unlock(&CacheLock); +} + +static int pmb_get_flags(struct thread_data *td, uint64_t *pflags) +{ + static int thread_warned = 0; + static int odirect_warned = 0; + + uint64_t flags = 0; + + if (!td->o.use_thread) { + if (!thread_warned) { + thread_warned = 1; + log_err("pmemblk: must set thread=1 for pmemblk engine\n"); + } + return 1; + } + + if (!td->o.odirect && !odirect_warned) { + odirect_warned = 1; + log_info("pmemblk: direct == 0, but pmemblk is always direct\n"); + } + + if (td->o.allow_create) + flags |= PMB_CREATE; + + (*pflags) = flags; + return 0; +} + +static int fio_pmemblk_open_file(struct thread_data *td, struct fio_file *f) +{ + uint64_t flags = 0; + fio_pmemblk_file_t pmb; + + if (pmb_get_flags(td, &flags)) + return 1; + + pmb = pmb_open(f->file_name, flags); + if (!pmb) + return 1; + + FILE_SET_ENG_DATA(f, pmb); + return 0; +} + +static int fio_pmemblk_close_file(struct thread_data fio_unused *td, + struct fio_file *f) +{ + fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); + + if (pmb) + pmb_close(pmb, false); + + FILE_SET_ENG_DATA(f, NULL); + return 0; +} + +static int fio_pmemblk_get_file_size(struct thread_data *td, struct fio_file 
*f) +{ + uint64_t flags = 0; + fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); + + if (fio_file_size_known(f)) + return 0; + + if (!pmb) { + if (pmb_get_flags(td, &flags)) + return 1; + pmb = pmb_open(f->file_name, flags); + if (!pmb) + return 1; + } + + f->real_file_size = pmb->pmb_bsize * pmb->pmb_nblocks; + + fio_file_set_size_known(f); + + if (!FILE_ENG_DATA(f)) + pmb_close(pmb, true); + + return 0; +} + +static enum fio_q_status fio_pmemblk_queue(struct thread_data *td, + struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); + + unsigned long long off; + unsigned long len; + void *buf; + + fio_ro_check(td, io_u); + + switch (io_u->ddir) { + case DDIR_READ: + case DDIR_WRITE: + off = io_u->offset; + len = io_u->xfer_buflen; + + io_u->error = EINVAL; + if (off % pmb->pmb_bsize) + break; + if (len % pmb->pmb_bsize) + break; + if ((off + len) / pmb->pmb_bsize > pmb->pmb_nblocks) + break; + + io_u->error = 0; + buf = io_u->xfer_buf; + off /= pmb->pmb_bsize; + len /= pmb->pmb_bsize; + while (0 < len) { + if (io_u->ddir == DDIR_READ) { + if (0 != pmemblk_read(pmb->pmb_pool, buf, off)) { + io_u->error = errno; + break; + } + } else if (0 != pmemblk_write(pmb->pmb_pool, buf, off)) { + io_u->error = errno; + break; + } + buf += pmb->pmb_bsize; + off++; + len--; + } + off *= pmb->pmb_bsize; + len *= pmb->pmb_bsize; + io_u->resid = io_u->xfer_buflen - (off - io_u->offset); + break; + case DDIR_SYNC: + case DDIR_DATASYNC: + case DDIR_SYNC_FILE_RANGE: + /* we're always sync'd */ + io_u->error = 0; + break; + default: + io_u->error = EINVAL; + break; + } + + return FIO_Q_COMPLETED; +} + +static int fio_pmemblk_unlink_file(struct thread_data *td, struct fio_file *f) +{ + char *path = NULL; + uint64_t bsize = 0; + uint64_t fsize = 0; + + /* + * we need our own unlink in case the user has specified + * the block and file sizes in the path name. we parse + * the file_name to determine the file name we actually used. 
+ */ + + pmb_parse_path(f->file_name, &path, &bsize, &fsize); + if (!path) + return ENOENT; + + unlink(path); + free(path); + return 0; +} + +FIO_STATIC struct ioengine_ops ioengine = { + .name = "pmemblk", + .version = FIO_IOOPS_VERSION, + .queue = fio_pmemblk_queue, + .open_file = fio_pmemblk_open_file, + .close_file = fio_pmemblk_close_file, + .get_file_size = fio_pmemblk_get_file_size, + .unlink_file = fio_pmemblk_unlink_file, + .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL, +}; + +static void fio_init fio_pmemblk_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_pmemblk_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/examples/pmemblk.fio b/examples/pmemblk.fio new file mode 100644 index 0000000000000000000000000000000000000000..59bb2a8a5acbf0e03d16a988f6ae0b9eb84575d2 --- /dev/null +++ b/examples/pmemblk.fio @@ -0,0 +1,71 @@ +[global] +bs=1m +ioengine=pmemblk +norandommap +time_based +runtime=30 +group_reporting +disable_lat=1 +disable_slat=1 +disable_clat=1 +clat_percentiles=0 +cpus_allowed_policy=split + +# For the pmemblk engine: +# +# IOs always complete immediately +# IOs are always direct +# Must use threads +# +iodepth=1 +direct=1 +thread +numjobs=16 +# +# Unlink can be used to remove the files when done, but if you are +# using serial runs with stonewall, and you want the files to be created +# only once and unlinked only at the very end, then put the unlink=1 +# in the last group. This is the method demonstrated here. +# +# Note that if you have a read-only group and if the files will be +# newly created, then all of the data will read back as zero and the +# read will be optimized, yielding performance that is different from +# that of reading non-zero blocks (or unoptimized zero blocks). +# +unlink=0 +# +# The pmemblk engine does IO to files in a DAX-mounted filesystem. 
+# The filesystem should be created on an NVDIMM (e.g., /dev/pmem0) +# and then mounted with the '-o dax' option. Note that the engine +# accesses the underlying NVDIMM directly, bypassing the kernel block +# layer, so the usual filesystem/disk performance monitoring tools such +# as iostat will not provide useful data. +# +# Here we specify a test file on each of two NVDIMMs. The first +# number after the file name is the block size in bytes (4096 bytes +# in this example). The second number is the size of the file to +# create in MiB (1 GiB in this example); note that the actual usable +# space available to fio will be less than this as libpmemblk requires +# some space for metadata. +# +# Currently, the minimum block size is 512 bytes and the minimum file +# size is about 17 MiB (these are libpmemblk requirements). +# +# While both files in this example have the same block size and file +# size, this is not required. +# +filename=/pmem0/fio-test,4096,1024 +#filename=/pmem1/fio-test,4096,1024 + +[pmemblk-write] +rw=randwrite +stonewall + +[pmemblk-read] +rw=randread +stonewall +# +# We're done, so unlink the file: +# +unlink=1 + diff --git a/fio.1 b/fio.1 index 80bf3371a3556406cbcb29323bb21f55e769b9f9..3dc7e062a063d54bcd79120a7d6dc1ffd934b80c 100644 --- a/fio.1 +++ b/fio.1 @@ -1960,6 +1960,11 @@ e.g., on NAND, writing sequentially to erase blocks and discarding before overwriting. The \fBtrimwrite\fR mode works well for this constraint. .TP +.B pmemblk +Read and write using filesystem DAX to a file on a filesystem +mounted with DAX on a persistent memory device through the PMDK +libpmemblk library. +.TP .B dev\-dax Read and write using device DAX to a persistent memory device (e.g., /dev/dax0.0) through the PMDK libpmem library. 
diff --git a/options.c b/options.c index 8193fb29fe2b1cdfab8e745b9522aeb507f5361e..6c58577d8dbfd0a8dfd2b77e11c9be57e4aaaf10 100644 --- a/options.c +++ b/options.c @@ -2125,6 +2125,12 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .help = "Hadoop Distributed Filesystem (HDFS) engine" }, #endif +#ifdef CONFIG_PMEMBLK + { .ival = "pmemblk", + .help = "PMDK libpmemblk based IO engine", + }, + +#endif #ifdef CONFIG_IME { .ival = "ime_psync", .help = "DDN's IME synchronous IO engine", diff --git a/os/windows/examples.wxs b/os/windows/examples.wxs index d70c77133f5a9f24908fffde9ce5ad20dbad2562..9308ba8be829c62b88cb06470a068cc2aef3f7dc 100755 --- a/os/windows/examples.wxs +++ b/os/windows/examples.wxs @@ -125,6 +125,9 @@ + + + @@ -209,6 +212,7 @@ + -- 2.41.0