72c1204567
Now mdcheck service can't be run successfully. We need to put mdcheck in to the right place (/usr/share/mdadm/mdcheck) and it needs to remove the dependency of mdadm_env.sh which is fixed in patch 76c224c6c. And there is a history problem. It needed KillMode=none before, so it removed the upstream patch 52c67fcdd. Now this problem has been fixed, so we can do the backport more easilly now. We don't need to remove the upstream patch here again. It adds a rhel only feature too for transient failure. Resolves: rhbz#2159923, rhbz#2150865, rhbz#2124071, rhbz#2203859 Signed-off-by: Xiao Ni <xni@redhat.com>
347 lines
9.3 KiB
Diff
347 lines
9.3 KiB
Diff
From 577fd10486d8d1472a6b559066f344ac30a3a391 Mon Sep 17 00:00:00 2001
|
|
From: Logan Gunthorpe <logang@deltatee.com>
|
|
Date: Wed, 1 Mar 2023 13:41:33 -0700
|
|
Subject: [PATCH 103/125] mdadm: Add --write-zeros option for Create
|
|
|
|
Add the --write-zeros option for Create which will send a write zeros
|
|
request to all the disks before assembling the array. After zeroing
|
|
the array, the disks will be in a known clean state and the initial
|
|
sync may be skipped.
|
|
|
|
Writing zeroes is best used when there is a hardware offload method
|
|
to zero the data. But even still, zeroing can take several minutes on
|
|
a large device. Because of this, all disks are zeroed in parallel using
|
|
their own forked process and a message is printed to the user. The main
|
|
process will proceed only after all the zeroing processes have completed
|
|
successfully.
|
|
|
|
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
|
|
Acked-by: Kinga Tanska <kinga.tanska@linux.intel.com>
|
|
Reviewed-by: Xiao Ni <xni@redhat.com>
|
|
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
|
|
Acked-by: Coly Li <colyli@suse.de>
|
|
Signed-off-by: Jes Sorensen <jes@trained-monkey.org>
|
|
---
|
|
Create.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
|
|
ReadMe.c | 2 +
|
|
mdadm.c | 9 +++
|
|
mdadm.h | 5 ++
|
|
4 files changed, 190 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/Create.c b/Create.c
|
|
index 4acda30c..bbe9e13d 100644
|
|
--- a/Create.c
|
|
+++ b/Create.c
|
|
@@ -26,6 +26,10 @@
|
|
#include "md_u.h"
|
|
#include "md_p.h"
|
|
#include <ctype.h>
|
|
+#include <fcntl.h>
|
|
+#include <signal.h>
|
|
+#include <sys/signalfd.h>
|
|
+#include <sys/wait.h>
|
|
|
|
static int round_size_and_verify(unsigned long long *size, int chunk)
|
|
{
|
|
@@ -91,9 +95,149 @@ int default_layout(struct supertype *st, int level, int verbose)
|
|
return layout;
|
|
}
|
|
|
|
+static pid_t write_zeroes_fork(int fd, struct shape *s, struct supertype *st,
|
|
+ struct mddev_dev *dv)
|
|
+
|
|
+{
|
|
+ const unsigned long long req_size = 1 << 30;
|
|
+ unsigned long long offset_bytes, size_bytes, sz;
|
|
+ sigset_t sigset;
|
|
+ int ret = 0;
|
|
+ pid_t pid;
|
|
+
|
|
+ size_bytes = KIB_TO_BYTES(s->size);
|
|
+
|
|
+ /*
|
|
+ * If size_bytes is zero, this is a zoned raid array where
|
|
+ * each disk is of a different size and uses its full
|
|
+ * disk. Thus zero the entire disk.
|
|
+ */
|
|
+ if (!size_bytes && !get_dev_size(fd, dv->devname, &size_bytes))
|
|
+ return -1;
|
|
+
|
|
+ if (dv->data_offset != INVALID_SECTORS)
|
|
+ offset_bytes = SEC_TO_BYTES(dv->data_offset);
|
|
+ else
|
|
+ offset_bytes = SEC_TO_BYTES(st->data_offset);
|
|
+
|
|
+ pr_info("zeroing data from %lld to %lld on: %s\n",
|
|
+ offset_bytes, size_bytes, dv->devname);
|
|
+
|
|
+ pid = fork();
|
|
+ if (pid < 0) {
|
|
+ pr_err("Could not fork to zero disks: %s\n", strerror(errno));
|
|
+ return pid;
|
|
+ } else if (pid != 0) {
|
|
+ return pid;
|
|
+ }
|
|
+
|
|
+ sigemptyset(&sigset);
|
|
+ sigaddset(&sigset, SIGINT);
|
|
+ sigprocmask(SIG_UNBLOCK, &sigset, NULL);
|
|
+
|
|
+ while (size_bytes) {
|
|
+ /*
|
|
+ * Split requests to the kernel into 1GB chunks seeing the
|
|
+ * fallocate() call is not interruptible and blocking a
|
|
+ * ctrl-c for several minutes is not desirable.
|
|
+ *
|
|
+ * 1GB is chosen as a compromise: the user may still have
|
|
+ * to wait several seconds if they ctrl-c on devices that
|
|
+ * zero slowly, but will reduce the number of requests
|
|
+ * required and thus the overhead on devices that perform
|
|
+ * better.
|
|
+ */
|
|
+ sz = size_bytes;
|
|
+ if (sz >= req_size)
|
|
+ sz = req_size;
|
|
+
|
|
+ if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
|
|
+ offset_bytes, sz)) {
|
|
+ pr_err("zeroing %s failed: %s\n", dv->devname,
|
|
+ strerror(errno));
|
|
+ ret = 1;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ offset_bytes += sz;
|
|
+ size_bytes -= sz;
|
|
+ }
|
|
+
|
|
+ exit(ret);
|
|
+}
|
|
+
|
|
+static int wait_for_zero_forks(int *zero_pids, int count)
|
|
+{
|
|
+ int wstatus, ret = 0, i, sfd, wait_count = 0;
|
|
+ struct signalfd_siginfo fdsi;
|
|
+ bool interrupted = false;
|
|
+ sigset_t sigset;
|
|
+ ssize_t s;
|
|
+
|
|
+ for (i = 0; i < count; i++)
|
|
+ if (zero_pids[i])
|
|
+ wait_count++;
|
|
+ if (!wait_count)
|
|
+ return 0;
|
|
+
|
|
+ sigemptyset(&sigset);
|
|
+ sigaddset(&sigset, SIGINT);
|
|
+ sigaddset(&sigset, SIGCHLD);
|
|
+ sigprocmask(SIG_BLOCK, &sigset, NULL);
|
|
+
|
|
+ sfd = signalfd(-1, &sigset, 0);
|
|
+ if (sfd < 0) {
|
|
+ pr_err("Unable to create signalfd: %s\n", strerror(errno));
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ while (1) {
|
|
+ s = read(sfd, &fdsi, sizeof(fdsi));
|
|
+ if (s != sizeof(fdsi)) {
|
|
+ pr_err("Invalid signalfd read: %s\n", strerror(errno));
|
|
+ close(sfd);
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ if (fdsi.ssi_signo == SIGINT) {
|
|
+ printf("\n");
|
|
+ pr_info("Interrupting zeroing processes, please wait...\n");
|
|
+ interrupted = true;
|
|
+ } else if (fdsi.ssi_signo == SIGCHLD) {
|
|
+ if (!--wait_count)
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ close(sfd);
|
|
+
|
|
+ for (i = 0; i < count; i++) {
|
|
+ if (!zero_pids[i])
|
|
+ continue;
|
|
+
|
|
+ waitpid(zero_pids[i], &wstatus, 0);
|
|
+ zero_pids[i] = 0;
|
|
+ if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus))
|
|
+ ret = 1;
|
|
+ }
|
|
+
|
|
+ if (interrupted) {
|
|
+ pr_err("zeroing interrupted!\n");
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ if (ret)
|
|
+ pr_err("zeroing failed!\n");
|
|
+ else
|
|
+ pr_info("zeroing finished\n");
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
|
|
struct supertype *st, struct mddev_dev *dv,
|
|
- struct mdinfo *info, int have_container, int major_num)
|
|
+ struct mdinfo *info, int have_container, int major_num,
|
|
+ int *zero_pid)
|
|
{
|
|
dev_t rdev;
|
|
int fd;
|
|
@@ -148,6 +292,14 @@ static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
|
|
}
|
|
st->ss->getinfo_super(st, info, NULL);
|
|
|
|
+ if (fd >= 0 && s->write_zeroes) {
|
|
+ *zero_pid = write_zeroes_fork(fd, s, st, dv);
|
|
+ if (*zero_pid <= 0) {
|
|
+ ioctl(mdfd, STOP_ARRAY, NULL);
|
|
+ return 1;
|
|
+ }
|
|
+ }
|
|
+
|
|
if (have_container && c->verbose > 0)
|
|
pr_err("Using %s for device %d\n",
|
|
map_dev(info->disk.major, info->disk.minor, 0),
|
|
@@ -224,10 +376,23 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
|
|
{
|
|
struct mddev_dev *moved_disk = NULL;
|
|
int pass, raid_disk_num, dnum;
|
|
+ int zero_pids[total_slots];
|
|
struct mddev_dev *dv;
|
|
struct mdinfo *infos;
|
|
+ sigset_t sigset, orig_sigset;
|
|
int ret = 0;
|
|
|
|
+ /*
|
|
+ * Block SIGINT so the main thread will always wait for the
|
|
+ * zeroing processes when being interrupted. Otherwise the
|
|
+ * zeroing processes will finish their work in the background
|
|
+ * keeping the disk busy.
|
|
+ */
|
|
+ sigemptyset(&sigset);
|
|
+ sigaddset(&sigset, SIGINT);
|
|
+ sigprocmask(SIG_BLOCK, &sigset, &orig_sigset);
|
|
+ memset(zero_pids, 0, sizeof(zero_pids));
|
|
+
|
|
infos = xmalloc(sizeof(*infos) * total_slots);
|
|
enable_fds(total_slots);
|
|
for (pass = 1; pass <= 2; pass++) {
|
|
@@ -261,7 +426,7 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
|
|
|
|
ret = add_disk_to_super(mdfd, s, c, st, dv,
|
|
&infos[dnum], have_container,
|
|
- major_num);
|
|
+ major_num, &zero_pids[dnum]);
|
|
if (ret)
|
|
goto out;
|
|
|
|
@@ -287,6 +452,10 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
|
|
}
|
|
|
|
if (pass == 1) {
|
|
+ ret = wait_for_zero_forks(zero_pids, total_slots);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
ret = update_metadata(mdfd, s, st, map, info,
|
|
chosen_name);
|
|
if (ret)
|
|
@@ -295,7 +464,10 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
|
|
}
|
|
|
|
out:
|
|
+ if (ret)
|
|
+ wait_for_zero_forks(zero_pids, total_slots);
|
|
free(infos);
|
|
+ sigprocmask(SIG_SETMASK, &orig_sigset, NULL);
|
|
return ret;
|
|
}
|
|
|
|
diff --git a/ReadMe.c b/ReadMe.c
|
|
index bd8d50d2..db251ed2 100644
|
|
--- a/ReadMe.c
|
|
+++ b/ReadMe.c
|
|
@@ -138,6 +138,7 @@ struct option long_options[] = {
|
|
{"size", 1, 0, 'z'},
|
|
{"auto", 1, 0, Auto}, /* also for --assemble */
|
|
{"assume-clean",0,0, AssumeClean },
|
|
+ {"write-zeroes",0,0, WriteZeroes },
|
|
{"metadata", 1, 0, 'e'}, /* superblock format */
|
|
{"bitmap", 1, 0, Bitmap},
|
|
{"bitmap-chunk", 1, 0, BitmapChunk},
|
|
@@ -390,6 +391,7 @@ char Help_create[] =
|
|
" --write-journal= : Specify journal device for RAID-4/5/6 array\n"
|
|
" --consistency-policy= : Specify the policy that determines how the array\n"
|
|
" -k : maintains consistency in case of unexpected shutdown.\n"
|
|
+" --write-zeroes : Write zeroes to the disks before creating. This will bypass initial sync.\n"
|
|
"\n"
|
|
;
|
|
|
|
diff --git a/mdadm.c b/mdadm.c
|
|
index 57e8e6fa..4685ad6b 100644
|
|
--- a/mdadm.c
|
|
+++ b/mdadm.c
|
|
@@ -590,6 +590,10 @@ int main(int argc, char *argv[])
|
|
s.assume_clean = 1;
|
|
continue;
|
|
|
|
+ case O(CREATE, WriteZeroes):
|
|
+ s.write_zeroes = 1;
|
|
+ continue;
|
|
+
|
|
case O(GROW,'n'):
|
|
case O(CREATE,'n'):
|
|
case O(BUILD,'n'): /* number of raid disks */
|
|
@@ -1251,6 +1255,11 @@ int main(int argc, char *argv[])
|
|
}
|
|
}
|
|
|
|
+ if (s.write_zeroes && !s.assume_clean) {
|
|
+ pr_info("Disk zeroing requested, setting --assume-clean to skip resync\n");
|
|
+ s.assume_clean = 1;
|
|
+ }
|
|
+
|
|
if (!mode && devs_found) {
|
|
mode = MISC;
|
|
devmode = 'Q';
|
|
diff --git a/mdadm.h b/mdadm.h
|
|
index 4336be4d..b9127f9a 100644
|
|
--- a/mdadm.h
|
|
+++ b/mdadm.h
|
|
@@ -275,6 +275,9 @@ static inline void __put_unaligned32(__u32 val, void *p)
|
|
|
|
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
|
|
|
|
+#define KIB_TO_BYTES(x) ((x) << 10)
|
|
+#define SEC_TO_BYTES(x) ((x) << 9)
|
|
+
|
|
extern const char Name[];
|
|
|
|
struct md_bb_entry {
|
|
@@ -435,6 +438,7 @@ extern char Version[], Usage[], Help[], OptionHelp[],
|
|
*/
|
|
enum special_options {
|
|
AssumeClean = 300,
|
|
+ WriteZeroes,
|
|
BitmapChunk,
|
|
WriteBehind,
|
|
ReAdd,
|
|
@@ -640,6 +644,7 @@ struct shape {
|
|
int bitmap_chunk;
|
|
char *bitmap_file;
|
|
int assume_clean;
|
|
+ bool write_zeroes;
|
|
int write_behind;
|
|
unsigned long long size;
|
|
unsigned long long data_offset;
|
|
--
|
|
2.38.1
|
|
|