343 lines
11 KiB
Diff
343 lines
11 KiB
Diff
From 329dfc28debb58ffe7bd1967cea00fc583139aca Mon Sep 17 00:00:00 2001
|
|
From: NeilBrown <neilb@suse.de>
|
|
Date: Mon, 4 Nov 2019 14:27:49 +1100
|
|
Subject: [RHEL8.2 PATCH 54/61] Create: add support for RAID0 layouts.
|
|
|
|
Since Linux 5.4 a layout is needed for RAID0 arrays with
|
|
varying device sizes.
|
|
This patch makes the layout of an array visible (via --examine)
|
|
and sets the layout on newly created arrays.
|
|
--layout=dangerous
|
|
can be used to avoid setting a layout so that the array
|
|
can be used on older kernels.
|
|
|
|
Tested-by: dann frazier <dann.frazier@canonical.com>
|
|
Signed-off-by: NeilBrown <neilb@suse.de>
|
|
Signed-off-by: Jes Sorensen <jsorensen@fb.com>
|
|
---
|
|
Create.c | 11 +++++++++++
|
|
Detail.c | 5 +++++
|
|
maps.c | 12 ++++++++++++
|
|
md.4 | 14 ++++++++++++++
|
|
mdadm.8.in | 30 +++++++++++++++++++++++++++++-
|
|
mdadm.c | 8 ++++++++
|
|
mdadm.h | 8 +++++++-
|
|
super0.c | 6 ++++++
|
|
super1.c | 30 +++++++++++++++++++++++++++++-
|
|
9 files changed, 121 insertions(+), 3 deletions(-)
|
|
|
|
diff --git a/Create.c b/Create.c
|
|
index 292f92a..6f84e5b 100644
|
|
--- a/Create.c
|
|
+++ b/Create.c
|
|
@@ -51,6 +51,9 @@ static int default_layout(struct supertype *st, int level, int verbose)
|
|
default: /* no layout */
|
|
layout = 0;
|
|
break;
|
|
+ case 0:
|
|
+ layout = RAID0_ORIG_LAYOUT;
|
|
+ break;
|
|
case 10:
|
|
layout = 0x102; /* near=2, far=1 */
|
|
if (verbose > 0)
|
|
@@ -950,6 +953,11 @@ int Create(struct supertype *st, char *mddev,
|
|
if (rv) {
|
|
pr_err("ADD_NEW_DISK for %s failed: %s\n",
|
|
dv->devname, strerror(errno));
|
|
+ if (errno == EINVAL &&
|
|
+ info.array.level == 0) {
|
|
+ pr_err("Possibly your kernel doesn't support RAID0 layouts.\n");
|
|
+ pr_err("Either upgrade, or use --layout=dangerous\n");
|
|
+ }
|
|
goto abort_locked;
|
|
}
|
|
break;
|
|
@@ -1046,6 +1054,9 @@ int Create(struct supertype *st, char *mddev,
|
|
if (ioctl(mdfd, RUN_ARRAY, &param)) {
|
|
pr_err("RUN_ARRAY failed: %s\n",
|
|
strerror(errno));
|
|
+ if (errno == 524 /* ENOTSUPP (kernel-internal) */ &&
|
|
+ info.array.level == 0)
|
|
+ cont_err("Please use --layout=original or --layout=alternate\n");
|
|
if (info.array.chunk_size & (info.array.chunk_size-1)) {
|
|
cont_err("Problem may be that chunk size is not a power of 2\n");
|
|
}
|
|
diff --git a/Detail.c b/Detail.c
|
|
index 24fa462..832485f 100644
|
|
--- a/Detail.c
|
|
+++ b/Detail.c
|
|
@@ -525,6 +525,11 @@ int Detail(char *dev, struct context *c)
|
|
printf(" Layout : %s\n",
|
|
str ? str : "-unknown-");
|
|
}
|
|
+ if (array.level == 0 && array.layout) {
|
|
+ str = map_num(r0layout, array.layout);
|
|
+ printf(" Layout : %s\n",
|
|
+ str ? str : "-unknown-");
|
|
+ }
|
|
if (array.level == 6) {
|
|
str = map_num(r6layout, array.layout);
|
|
printf(" Layout : %s\n",
|
|
diff --git a/maps.c b/maps.c
|
|
index 49b7f2c..a4fd279 100644
|
|
--- a/maps.c
|
|
+++ b/maps.c
|
|
@@ -73,6 +73,18 @@ mapping_t r6layout[] = {
|
|
{ NULL, UnSet }
|
|
};
|
|
|
|
+/* raid0 layout is only needed because of a bug in 3.14 which changed
|
|
+ * the effective layout of raid0 arrays with varying device sizes.
|
|
+ */
|
|
+mapping_t r0layout[] = {
|
|
+ { "original", RAID0_ORIG_LAYOUT},
|
|
+ { "alternate", RAID0_ALT_MULTIZONE_LAYOUT},
|
|
+ { "1", 1}, /* aka ORIG */
|
|
+ { "2", 2}, /* aka ALT */
|
|
+ { "dangerous", 0},
|
|
+ { NULL, UnSet},
|
|
+};
|
|
+
|
|
mapping_t pers[] = {
|
|
{ "linear", LEVEL_LINEAR},
|
|
{ "raid0", 0},
|
|
diff --git a/md.4 b/md.4
|
|
index e86707a..6fe2755 100644
|
|
--- a/md.4
|
|
+++ b/md.4
|
|
@@ -193,6 +193,20 @@ smallest device has been exhausted, the RAID0 driver starts
|
|
collecting chunks into smaller stripes that only span the drives which
|
|
still have remaining space.
|
|
|
|
+A bug was introduced in Linux 3.14 which changed the layout of blocks in
|
|
+a RAID0 beyond the region that is striped over all devices. This bug
|
|
+does not affect an array with all devices the same size, but can affect
|
|
+other RAID0 arrays.
|
|
+
|
|
+Linux 5.4 (and some stable kernels to which the change was backported)
|
|
+will not normally assemble such an array as it cannot know which layout
|
|
+to use. There is a module parameter "raid0.default_layout" which can be
|
|
+set to "1" to force the kernel to use the pre-3.14 layout or to "2" to
|
|
+force it to use the 3.14-and-later layout. When creating a new RAID0
|
|
+array,
|
|
+.I mdadm
|
|
+will record the chosen layout in the metadata in a way that allows newer
|
|
+kernels to assemble the array without needing a module parameter.
|
|
|
|
.SS RAID1
|
|
|
|
diff --git a/mdadm.8.in b/mdadm.8.in
|
|
index 9aec9f4..fc9b6a6 100644
|
|
--- a/mdadm.8.in
|
|
+++ b/mdadm.8.in
|
|
@@ -593,6 +593,8 @@ to change the RAID level in some cases. See LEVEL CHANGES below.
|
|
This option configures the fine details of data layout for RAID5, RAID6,
|
|
and RAID10 arrays, and controls the failure modes for
|
|
.IR faulty .
|
|
+It can also be used for working around a kernel bug with RAID0, but generally
|
|
+doesn't need to be used explicitly.
|
|
|
|
The layout of the RAID5 parity block can be one of
|
|
.BR left\-asymmetric ,
|
|
@@ -652,7 +654,7 @@ option to set subsequent failure modes.
|
|
"clear" or "none" will remove any pending or periodic failure modes,
|
|
and "flush" will clear any persistent faults.
|
|
|
|
-Finally, the layout options for RAID10 are one of 'n', 'o' or 'f' followed
|
|
+The layout options for RAID10 are one of 'n', 'o' or 'f' followed
|
|
by a small number. The default is 'n2'. The supported options are:
|
|
|
|
.I 'n'
|
|
@@ -677,6 +679,32 @@ devices in the array. It does not need to divide evenly into that
|
|
number (e.g. it is perfectly legal to have an 'n2' layout for an array
|
|
with an odd number of devices).
|
|
|
|
+A bug introduced in Linux 3.14 means that RAID0 arrays
|
|
+.B "with devices of differing sizes"
|
|
+started using a different layout. This could lead to
|
|
+data corruption. Since Linux 5.4 (and various stable releases that received
|
|
+backports), the kernel will not accept such an array unless
|
|
+a layout is explicitly set. It can be set to
|
|
+.RB ' original '
|
|
+or
|
|
+.RB ' alternate '.
|
|
+When creating a new array,
|
|
+.I mdadm
|
|
+will select
|
|
+.RB ' original '
|
|
+by default, so the layout does not normally need to be set.
|
|
+An array created for either
|
|
+.RB ' original '
|
|
+or
|
|
+.RB ' alternate '
|
|
+will not be recognized by an (unpatched) kernel prior to 5.4. To create
|
|
+a RAID0 array with devices of differing sizes that can be used on an
|
|
+older kernel, you can set the layout to
|
|
+.RB ' dangerous '.
|
|
+This will use whichever layout the running kernel supports, so the data
|
|
+on the array may become corrupt when changing kernel from pre-3.14 to a
|
|
+later kernel.
|
|
+
|
|
When an array is converted between RAID5 and RAID6 an intermediate
|
|
RAID6 layout is used in which the second parity block (Q) is always on
|
|
the last device. To convert a RAID5 to RAID6 and leave it in this new
|
|
diff --git a/mdadm.c b/mdadm.c
|
|
index 1fb8086..e438f9c 100644
|
|
--- a/mdadm.c
|
|
+++ b/mdadm.c
|
|
@@ -550,6 +550,14 @@ int main(int argc, char *argv[])
|
|
pr_err("raid level must be given before layout.\n");
|
|
exit(2);
|
|
|
|
+ case 0:
|
|
+ s.layout = map_name(r0layout, optarg);
|
|
+ if (s.layout == UnSet) {
|
|
+ pr_err("layout %s not understood for raid0.\n",
|
|
+ optarg);
|
|
+ exit(2);
|
|
+ }
|
|
+ break;
|
|
case 5:
|
|
s.layout = map_name(r5layout, optarg);
|
|
if (s.layout == UnSet) {
|
|
diff --git a/mdadm.h b/mdadm.h
|
|
index 91f1338..9e98778 100644
|
|
--- a/mdadm.h
|
|
+++ b/mdadm.h
|
|
@@ -763,7 +763,8 @@ extern int restore_stripes(int *dest, unsigned long long *offsets,
|
|
|
|
extern char *map_num(mapping_t *map, int num);
|
|
extern int map_name(mapping_t *map, char *name);
|
|
-extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[];
|
|
+extern mapping_t r0layout[], r5layout[], r6layout[],
|
|
+ pers[], modes[], faultylayout[];
|
|
extern mapping_t consistency_policies[], sysfs_array_states[];
|
|
|
|
extern char *map_dev_preferred(int major, int minor, int create,
|
|
@@ -1758,6 +1759,11 @@ char *xstrdup(const char *str);
|
|
#define makedev(M,m) (((M)<<8) | (m))
|
|
#endif
|
|
|
|
+enum r0layout {
|
|
+ RAID0_ORIG_LAYOUT = 1,
|
|
+ RAID0_ALT_MULTIZONE_LAYOUT = 2,
|
|
+};
|
|
+
|
|
/* for raid4/5/6 */
|
|
#define ALGORITHM_LEFT_ASYMMETRIC 0
|
|
#define ALGORITHM_RIGHT_ASYMMETRIC 1
|
|
diff --git a/super0.c b/super0.c
|
|
index 6b7c0e3..6af140b 100644
|
|
--- a/super0.c
|
|
+++ b/super0.c
|
|
@@ -1291,6 +1291,12 @@ static int validate_geometry0(struct supertype *st, int level,
|
|
if (*chunk == UnSet)
|
|
*chunk = DEFAULT_CHUNK;
|
|
|
|
+ if (level == 0 && layout != UnSet) {
|
|
+ if (verbose)
|
|
+ pr_err("0.90 metadata does not support layouts for RAID0\n");
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
if (!subdev)
|
|
return 1;
|
|
|
|
diff --git a/super1.c b/super1.c
|
|
index 929466d..cedbb53 100644
|
|
--- a/super1.c
|
|
+++ b/super1.c
|
|
@@ -43,7 +43,7 @@ struct mdp_superblock_1 {
|
|
|
|
__u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/
|
|
__u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */
|
|
- __u32 layout; /* only for raid5 currently */
|
|
+ __u32 layout; /* used for raid5, raid6, raid10, and raid0 */
|
|
__u64 size; /* used size of component devices, in 512byte sectors */
|
|
|
|
__u32 chunksize; /* in 512byte sectors */
|
|
@@ -144,6 +144,7 @@ struct misc_dev_info {
|
|
#define MD_FEATURE_JOURNAL 512 /* support write journal */
|
|
#define MD_FEATURE_PPL 1024 /* support PPL */
|
|
#define MD_FEATURE_MUTLIPLE_PPLS 2048 /* support for multiple PPLs */
|
|
+#define MD_FEATURE_RAID0_LAYOUT 4096 /* layout is meaningful in RAID0 */
|
|
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|
|
|MD_FEATURE_RECOVERY_OFFSET \
|
|
|MD_FEATURE_RESHAPE_ACTIVE \
|
|
@@ -155,6 +156,7 @@ struct misc_dev_info {
|
|
|MD_FEATURE_JOURNAL \
|
|
|MD_FEATURE_PPL \
|
|
|MD_FEATURE_MULTIPLE_PPLS \
|
|
+ |MD_FEATURE_RAID0_LAYOUT \
|
|
)
|
|
|
|
static int role_from_sb(struct mdp_superblock_1 *sb)
|
|
@@ -498,6 +500,11 @@ static void examine_super1(struct supertype *st, char *homehost)
|
|
printf(" Events : %llu\n",
|
|
(unsigned long long)__le64_to_cpu(sb->events));
|
|
printf("\n");
|
|
+ if (__le32_to_cpu(sb->level) == 0 &&
|
|
+ (sb->feature_map & __cpu_to_le32(MD_FEATURE_RAID0_LAYOUT))) {
|
|
+ c = map_num(r0layout, __le32_to_cpu(sb->layout));
|
|
+ printf(" Layout : %s\n", c?c:"-unknown-");
|
|
+ }
|
|
if (__le32_to_cpu(sb->level) == 5) {
|
|
c = map_num(r5layout, __le32_to_cpu(sb->layout));
|
|
printf(" Layout : %s\n", c?c:"-unknown-");
|
|
@@ -1646,6 +1653,7 @@ struct devinfo {
|
|
int fd;
|
|
char *devname;
|
|
long long data_offset;
|
|
+ unsigned long long dev_size;
|
|
mdu_disk_info_t disk;
|
|
struct devinfo *next;
|
|
};
|
|
@@ -1687,6 +1695,7 @@ static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
|
|
di->devname = devname;
|
|
di->disk = *dk;
|
|
di->data_offset = data_offset;
|
|
+ get_dev_size(fd, NULL, &di->dev_size);
|
|
di->next = NULL;
|
|
*dip = di;
|
|
|
|
@@ -1888,10 +1897,25 @@ static int write_init_super1(struct supertype *st)
|
|
unsigned long long sb_offset;
|
|
unsigned long long data_offset;
|
|
long bm_offset;
|
|
+ int raid0_need_layout = 0;
|
|
|
|
for (di = st->info; di; di = di->next) {
|
|
if (di->disk.state & (1 << MD_DISK_JOURNAL))
|
|
sb->feature_map |= __cpu_to_le32(MD_FEATURE_JOURNAL);
|
|
+ if (sb->level == 0 && sb->layout != 0) {
|
|
+ struct devinfo *di2 = st->info;
|
|
+ unsigned long long s1, s2;
|
|
+ s1 = di->dev_size;
|
|
+ if (di->data_offset != INVALID_SECTORS)
|
|
+ s1 -= di->data_offset;
|
|
+ s1 /= __le32_to_cpu(sb->chunksize);
|
|
+ s2 = di2->dev_size;
|
|
+ if (di2->data_offset != INVALID_SECTORS)
|
|
+ s2 -= di2->data_offset;
|
|
+ s2 /= __le32_to_cpu(sb->chunksize);
|
|
+ if (s1 != s2)
|
|
+ raid0_need_layout = 1;
|
|
+ }
|
|
}
|
|
|
|
for (di = st->info; di; di = di->next) {
|
|
@@ -2039,6 +2063,10 @@ static int write_init_super1(struct supertype *st)
|
|
sb->bblog_offset = 0;
|
|
}
|
|
|
|
+ /* RAID0 needs a layout if devices aren't all the same size */
|
|
+ if (raid0_need_layout)
|
|
+ sb->feature_map |= __cpu_to_le32(MD_FEATURE_RAID0_LAYOUT);
|
|
+
|
|
sb->sb_csum = calc_sb_1_csum(sb);
|
|
rv = store_super1(st, di->fd);
|
|
|
|
--
|
|
2.7.5
|
|
|