mdadm/mdadm-3.1.2-decremental.patch

634 lines
20 KiB
Diff
Raw Normal View History

From 8c43c776715301ff020639801a8b1b4716fdf745 Mon Sep 17 00:00:00 2001
From: Doug Ledford <dledford@redhat.com>
Date: Mon, 5 Apr 2010 12:32:08 -0400
Subject: [PATCH 6/6] Initial implementation of incremental remove support
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
Incremental.c | 36 ++++++++++++++++++
Manage.c | 105 +++++++++++++++++++++++++++++++++++++--------------
ReadMe.c | 21 +++++++---
mdadm.8 | 23 ++++++++++-
mdadm.c | 13 ++++++
mdadm.h | 8 ++++
mdstat.c | 90 +++++++++++++++++++++++++++++++++++++++++++-
sysfs.c | 13 ++++--
udev-md-raid.rules | 16 +++++---
9 files changed, 274 insertions(+), 51 deletions(-)
diff --git a/Incremental.c b/Incremental.c
index 7ad648a..d32a8e5 100644
--- a/Incremental.c
+++ b/Incremental.c
@@ -843,3 +843,39 @@ int Incremental_container(struct supertype *st, char *devname, int verbose,
map_unlock(&map);
return 0;
}
+
+/*
+ * IncrementalRemove - Attempt to see if the passed in device belongs to any
+ * raid arrays, and if so first fail (if needed) and then remove the device.
+ *
+ * @devname - The device we want to remove
+ * @verbose - Verbosity level handed on to Manage_subdevs
+ * Returns 1 if no array in mdstat lists the device or the array cannot be
+ * opened, otherwise the result of the Manage_subdevs remove request.
+ *
+ * Special note: by the time we are called via udev the device special file
+ * is already gone, so we can't stat the device and don't have the rdev value
+ * the ioctls need.  Manage_subdevs therefore tries the ioctl method first and
+ * only falls back to the sysfs method (not guaranteed to work on every
+ * kernel version) when there is no valid device special file.  That keeps
+ * manual operation working on older kernels even if udev removal does not.
+ */
+int IncrementalRemove(char *devname, int verbose)
+{
+	char mddev[100] = "/dev/";	/* rest of the array is zero-filled */
+	int mdfd, rv;
+	struct mddev_dev_s devlist;
+
+	strncpy(mddev + 5, devname, sizeof(mddev) - 6);	/* keep final NUL */
+	if (mdstat_check_active(mddev + 5) ||
+	    (mdfd = open_mddev(mddev, 0)) < 0)
+		return 1;
+	memset(&devlist, 0, sizeof(devlist));
+	devlist.devname = devname;
+	devlist.disposition = 'f';	/* "fail if needed": result ignored */
+	Manage_subdevs(mddev, mdfd, &devlist, verbose);
+	devlist.disposition = 'r';
+	rv = Manage_subdevs(mddev, mdfd, &devlist, verbose);
+	close(mdfd);	/* do not leak the array descriptor */
+	return rv;
+}
diff --git a/Manage.c b/Manage.c
index f848d8b..6539eda 100644
--- a/Manage.c
+++ b/Manage.c
@@ -346,6 +346,9 @@ int Manage_subdevs(char *devname, int fd,
mdu_disk_info_t disc;
unsigned long long array_size;
mddev_dev_t dv, next = NULL;
+ struct mdinfo *mdi = NULL;
+ struct mdinfo *dev = NULL;
+ char sys_name[20] = "dev-";
struct stat stb;
int j, jnext = 0;
int tfd;
@@ -443,16 +446,43 @@ int Manage_subdevs(char *devname, int fd,
if (jnext == 0)
continue;
} else {
+ /*
+ * For fail/remove operations, allow the disk
+ * to be completely missing, use name matching
+ * to a device in our sysfs entries to
+ * suffice. For add we need a valid block device.
+ * Leave this loop one of three ways:
+ * 1) tfd < 0 and dev is set to our device
+ * 2) tfd >= 0 and dev is NULL
+ * 3) failed to find suitable device and return
+ */
j = 0;
tfd = dev_open(dv->devname, O_RDONLY);
- if (tfd < 0 && dv->disposition == 'r' &&
- lstat(dv->devname, &stb) == 0)
- /* Be happy, the lstat worked, that is
- * enough for --remove
- */
- ;
- else {
+ if (tfd < 0 && dv->disposition != 'a') {
+ strcpy(&sys_name[4],
+ strrchr(dv->devname, '/') + 1);
+ mdi = sysfs_read(fd, 0,
+ GET_DEVS | KEEP_GONE_DEVS);
+ if (!mdi) {
+ fprintf(stderr, Name ": can't open %s "
+ "and can't read sysfs info\n",
+ dv->devname);
+ return 1;
+ }
+ for (dev = mdi->devs; dev; dev = dev->next) {
+ if (strcmp(sys_name, dev->sys_name))
+ continue;
+ break;
+ }
+ if (!dev) {
+ fprintf(stderr, Name ": can't open %s "
+ "and %s not listed in sysfs\n",
+ dv->devname, sys_name);
+ sysfs_free(mdi);
+ return 1;
+ }
+ } else {
if (tfd < 0 || fstat(tfd, &stb) != 0) {
fprintf(stderr, Name ": cannot find %s: %s\n",
dv->devname, strerror(errno));
@@ -461,12 +491,12 @@ int Manage_subdevs(char *devname, int fd,
return 1;
}
close(tfd);
- }
- if ((stb.st_mode & S_IFMT) != S_IFBLK) {
- fprintf(stderr, Name ": %s is not a "
- "block device.\n",
- dv->devname);
- return 1;
+ if ((stb.st_mode & S_IFMT) != S_IFBLK) {
+ fprintf(stderr, Name ": %s is not a "
+ "block device.\n",
+ dv->devname);
+ return 1;
+ }
}
}
switch(dv->disposition){
@@ -790,26 +820,36 @@ int Manage_subdevs(char *devname, int fd,
return 1;
}
}
- /* FIXME check that it is a current member */
- err = ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev);
- if (err && errno == ENODEV) {
+ /* stb.st_rdev is only valid if we have a tfd that
+ * does not indicate an error on attempt to open
+ * the devname
+ */
+ if (tfd >= 0)
+ err = ioctl(fd, HOT_REMOVE_DISK,
+ (unsigned long)stb.st_rdev);
+ if (tfd < 0 || (err && errno == ENODEV)) {
/* Old kernels rejected this if no personality
* registered */
- struct mdinfo *sra = sysfs_read(fd, 0, GET_DEVS);
- struct mdinfo *dv = NULL;
- if (sra)
- dv = sra->devs;
- for ( ; dv ; dv=dv->next)
- if (dv->disk.major == major(stb.st_rdev) &&
- dv->disk.minor == minor(stb.st_rdev))
+ if (!mdi) {
+ strcpy(&sys_name[4],
+ strrchr(dv->devname, '/') + 1);
+ mdi = sysfs_read(fd, 0, GET_DEVS |
+ KEEP_GONE_DEVS);
+ if (mdi)
+ dev = mdi->devs;
+ for ( ; dev ; dev=dev->next) {
+ if (strcmp(sys_name, dev->sys_name))
+ continue;
break;
- if (dv)
- err = sysfs_set_str(sra, dv,
+ }
+ }
+ if (dev)
+ err = sysfs_set_str(mdi, dev,
"state", "remove");
else
err = -1;
- if (sra)
- sysfs_free(sra);
+ if (mdi)
+ sysfs_free(mdi);
}
if (err) {
fprintf(stderr, Name ": hot remove failed "
@@ -844,11 +884,18 @@ int Manage_subdevs(char *devname, int fd,
case 'f': /* set faulty */
/* FIXME check current member */
- if (ioctl(fd, SET_DISK_FAULTY, (unsigned long) stb.st_rdev)) {
+ if ((tfd >= 0 && ioctl(fd, SET_DISK_FAULTY,
+ (unsigned long) stb.st_rdev)) ||
+ (tfd < 0 && sysfs_set_str(mdi, dev, "state",
+ "faulty"))) {
fprintf(stderr, Name ": set device faulty failed for %s: %s\n",
dnprintable, strerror(errno));
+ if (mdi)
+ sysfs_free(mdi);
return 1;
- }
+ }
+ if (mdi)
+ sysfs_free(mdi);
if (verbose >= 0)
fprintf(stderr, Name ": set %s faulty in %s\n",
dnprintable, devname);
diff --git a/ReadMe.c b/ReadMe.c
index 9d5a211..fd216ec 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -86,11 +86,12 @@ char Version[] = Name " - v3.1.2 - 10th March 2010\n";
* At the time if writing, there is only minimal support.
*/
-char short_options[]="-ABCDEFGIQhVXWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:";
+char short_options[]=
+ "-ABCDEFGIQhVXWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:";
char short_bitmap_options[]=
- "-ABCDEFGIQhVXWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:";
+ "-ABCDEFGIQhVXWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:";
char short_bitmap_auto_options[]=
- "-ABCDEFGIQhVXWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:";
+ "-ABCDEFGIQhVXWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:";
struct option long_options[] = {
{"manage", 0, 0, '@'},
@@ -213,7 +214,7 @@ char Help[] =
" mdadm --grow options device\n"
" resize/reshape an active array\n"
" mdadm --incremental device\n"
-" add a device to an array as appropriate\n"
+" add/remove a device to/from an array as appropriate\n"
" mdadm --monitor options...\n"
" Monitor one or more array for significant changes.\n"
" mdadm device options...\n"
@@ -535,20 +536,26 @@ char Help_grow[] =
;
char Help_incr[] =
-"Usage: mdadm --incremental [-Rqrs] device\n"
+"Usage: mdadm --incremental [-Rqrsf] device\n"
"\n"
"This usage allows for incremental assembly of md arrays. Devices can be\n"
"added one at a time as they are discovered. Once an array has all expected\n"
"devices, it will be started.\n"
"\n"
-"Options that are valid with incremental assembly (-I --incremental) more are:\n"
-" --run -R : run arrays as soon as a minimal number of devices are\n"
+"Optionally, the process can be reversed by using the fail option.\n"
+"When fail mode is invoked, mdadm will see if the device belongs to an array\n"
+"and then both fail (if needed) and remove the device from that array.\n"
+"\n"
+"Options that are valid with incremental assembly (-I --incremental) are:\n"
+" --run -R : Run arrays as soon as a minimal number of devices are\n"
" : present rather than waiting for all expected.\n"
" --quiet -q : Don't print any information messages, just errors.\n"
" --rebuild -r : Rebuild the 'map' file that mdadm uses for tracking\n"
" : partial arrays.\n"
" --scan -s : Use with -R to start any arrays that have the minimal\n"
" : required number of devices, but are not yet started.\n"
+" --fail -f : First fail (if needed) and then remove device from\n"
+" : any array that it is a member of.\n"
;
char Help_config[] =
diff --git a/mdadm.8 b/mdadm.8
index 4edfc41..eaf9155 100644
--- a/mdadm.8
+++ b/mdadm.8
@@ -135,7 +135,11 @@ This provides a convenient interface to a
.I hot-plug
system. As each device is detected,
.I mdadm
-has a chance to include it in some array as appropriate.
+has a chance to include it in some array as appropriate. Optionally,
+if the
+.I \-\-fail
+flag is passed in, the device is removed from any active array
+instead of being added.
If a
.B CONTAINER
@@ -189,7 +193,7 @@ Change the size or shape of an active array.
.TP
.BR \-I ", " \-\-incremental
-Add a single device into an appropriate array, and possibly start the array.
+Add/remove a single device to/from an appropriate array, and possibly start the array.
.TP
.B \-\-auto-detect
@@ -1235,6 +1239,12 @@ in
.B mdadm.conf
as requiring an external bitmap, that bitmap will be attached first.
+.TP
+.BR \-\-fail ", " \-f
+This allows the hot-plug system to remove devices that have fully disappeared
+from the kernel. It will first fail and then remove the device from any
+array it belongs to.
+
.SH For Monitor mode:
.TP
.BR \-m ", " \-\-mail
@@ -2141,6 +2151,10 @@ Usage:
.I component-device
.HP 12
Usage:
+.B mdadm \-\-incremental \-\-fail
+.I component-device
+.HP 12
+Usage:
.B mdadm \-\-incremental \-\-rebuild
.HP 12
Usage:
@@ -2153,6 +2167,11 @@ passed to
.B "mdadm \-\-incremental"
to be conditionally added to an appropriate array.
+Conversely, it can also be used with the
+.B \-\-fail
+flag to do just the opposite and find whatever array a particular device
+is part of and remove the device from the array.
+
If the device passed is a
.B CONTAINER
device created by a previous call to
diff --git a/mdadm.c b/mdadm.c
index d5e34c0..cd6fd8f 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -124,6 +124,7 @@ int main(int argc, char *argv[])
ident.name[0] = 0;
ident.container = NULL;
ident.member = NULL;
+ ident.member_index = -1;
while ((option_index = -1) ,
(opt=getopt_long(argc, argv,
@@ -774,6 +775,9 @@ int main(int argc, char *argv[])
devmode = 'r';
continue;
case O(MANAGE,'f'): /* set faulty */
+ case O(INCREMENTAL,'f'): /* r for incremental is taken, use f
+ * even though we will both fail and
+ * remove the device */
devmode = 'f';
continue;
case O(INCREMENTAL,'R'):
@@ -1517,6 +1521,11 @@ int main(int argc, char *argv[])
": --incremental --scan meaningless without --run.\n");
break;
}
+ if (devmode == 'f') {
+ fprintf(stderr, Name
+ ": --incremental --scan --fail not supported.\n");
+ break;
+ }
rv = IncrementalScan(verbose);
}
if (!devlist) {
@@ -1533,6 +1542,10 @@ int main(int argc, char *argv[])
rv = 1;
break;
}
+ if (devmode == 'f') {
+ rv = IncrementalRemove(devlist->devname, verbose-quiet);
+ break;
+ }
rv = Incremental(devlist->devname, verbose-quiet, runstop,
ss, homehost, require_homehost, autof);
break;
diff --git a/mdadm.h b/mdadm.h
index d8ab85f..c113d0f 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -315,6 +315,7 @@ typedef struct mddev_ident_s {
* of some other entry.
*/
char *member; /* subarray within a container */
+ int member_index; /* subarray index within a container */
struct mddev_ident_s *next;
union {
@@ -355,6 +356,10 @@ struct mdstat_ent {
int raid_disks;
int chunk_size;
char * metadata_version;
+ struct dev_member {
+ char *name;
+ struct dev_member *next;
+ } *members;
struct mdstat_ent *next;
};
@@ -363,6 +368,7 @@ extern void free_mdstat(struct mdstat_ent *ms);
extern void mdstat_wait(int seconds);
extern void mdstat_wait_fd(int fd, const sigset_t *sigmask);
extern int mddev_busy(int devnum);
+extern int mdstat_check_active(char *devname);
struct map_ent {
struct map_ent *next;
@@ -404,6 +410,7 @@ enum sysfs_read_flags {
GET_STATE = (1 << 13),
GET_ERROR = (1 << 14),
SKIP_GONE_DEVS = (1 << 15),
+ KEEP_GONE_DEVS = (1 << 16),
};
/* If fd >= 0, get the array it is open on,
@@ -817,6 +824,7 @@ extern int Incremental_container(struct supertype *st, char *devname,
int trustworthy);
extern void RebuildMap(void);
extern int IncrementalScan(int verbose);
+extern int IncrementalRemove(char *devname, int verbose);
extern int CreateBitmap(char *filename, int force, char uuid[16],
unsigned long chunksize, unsigned long daemon_sleep,
diff --git a/mdstat.c b/mdstat.c
index 4a9f370..81d2212 100644
--- a/mdstat.c
+++ b/mdstat.c
@@ -83,6 +83,45 @@
#include <sys/select.h>
#include <ctype.h>
+static void free_member_devnames(struct dev_member **m)
+{
+	struct dev_member *cur, *nxt;
+
+	/* Walk the list, releasing each node's name and then the node. */
+	for (cur = *m; cur; cur = nxt) {
+		nxt = cur->next;
+		if (cur->name)
+			free(cur->name);
+		free(cur);
+	}
+	/* Leave the caller's head pointer empty so it cannot dangle. */
+	*m = NULL;
+}
+
+/* Prepend the name part of "name[N]..." to list *m; NULL means failure. */
+static struct dev_member *add_member_devname(struct dev_member **m, char *name)
+{
+	struct dev_member *new;
+	char *t;
+
+	if (!m || !name)
+		return NULL;
+	if ((t = strchr(name, '[')) == NULL)
+		return *m;	/* not a device: list unchanged, head returned */
+	new = malloc(sizeof(*new));
+	if (!new)
+		return NULL;
+	new->name = strndup(name, t - name);
+	if (!new->name) {
+		/* a node with a NULL name would crash later strcmp() users */
+		free(new);
+		return NULL;
+	}
+	new->next = *m;
+	*m = new;
+	return new;
+}
+
void free_mdstat(struct mdstat_ent *ms)
{
while (ms) {
@@ -91,6 +130,7 @@ void free_mdstat(struct mdstat_ent *ms)
if (ms->level) free(ms->level);
if (ms->pattern) free(ms->pattern);
if (ms->metadata_version) free(ms->metadata_version);
+ if (ms->members) free_member_devnames(&ms->members);
t = ms;
ms = ms->next;
free(t);
@@ -159,6 +199,7 @@ struct mdstat_ent *mdstat_read(int hold, int start)
ent->raid_disks = 0;
ent->chunk_size = 0;
ent->devcnt = 0;
+ ent->members = NULL;
ent->dev = strdup(line);
ent->devnum = devnum;
@@ -170,15 +211,23 @@ struct mdstat_ent *mdstat_read(int hold, int start)
ent->active = 1;
else if (strcmp(w, "inactive")==0)
ent->active = 0;
- else if (ent->active >=0 &&
+ else if (ent->active > 0 &&
ent->level == NULL &&
w[0] != '(' /*readonly*/) {
ent->level = strdup(w);
in_devs = 1;
} else if (in_devs && strcmp(w, "blocks")==0)
in_devs = 0;
- else if (in_devs) {
+ else if (in_devs || (ent->active == 0 && w[0] != '(' &&
+ w[l - 1] == ')')) {
+ if (isdigit(w[0]))
+ continue;
+ in_devs = 1;
ent->devcnt++;
+ if (!add_member_devname(&ent->members, w)) {
+ free_mdstat(ent);
+ break;
+ }
if (strncmp(w, "md", 2)==0) {
/* This has an md device as a component.
* If that device is already in the
@@ -310,3 +359,40 @@ int mddev_busy(int devnum)
free_mdstat(mdstat);
return me != NULL;
}
+
+/*
+ * Finds name of the active array holding this device
+ * @param[in] devname name of member device; must contain a '/'
+ * @param[out] devname name of array, copied over the input buffer
+ * @return found (0), or not found, failure (1)
+ *
+ * metadata_version can be NULL (free_mdstat checks for that too), so it is
+ * tested before strstr(); the mdstat list is freed on every return path.
+ */
+int mdstat_check_active(char *devname)
+{
+	struct mdstat_ent *mdstat, *ent;
+	struct dev_member *m;
+	char *name = devname ? strrchr(devname, '/') : NULL;
+	int rv = 1;
+
+	if (!name)
+		return 1;	/* nothing read yet, so nothing to free */
+	name++;			/* step past the '/' */
+	mdstat = mdstat_read(0, 0);
+	for (ent = mdstat; ent && rv; ent = ent->next) {
+		if (ent->active && ent->metadata_version &&
+		    (strstr(ent->metadata_version,"imsm") ||
+		     strstr(ent->metadata_version,"ddf")))
+			/* only return container matches, not subarrays */
+			continue;
+		for (m = ent->members; m && rv; m = m->next) {
+			if (!strcmp(m->name, name)) {
+				strcpy(devname, ent->dev);
+				rv = 0;	/* ends both loops */
+			}
+		}
+	}
+	free_mdstat(mdstat);
+	return rv;
+}
diff --git a/sysfs.c b/sysfs.c
index ebf9d8a..65dd848 100644
--- a/sysfs.c
+++ b/sysfs.c
@@ -273,13 +273,16 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
strcpy(dbase, "block/dev");
if (load_sys(fname, buf)) {
- free(dev);
- if (options & SKIP_GONE_DEVS)
+ if (options & SKIP_GONE_DEVS) {
+ free(dev);
continue;
- else
+ } else if (options & KEEP_GONE_DEVS) {
+ dev->disk.major = dev->disk.minor = -1;
+ } else
goto abort;
- }
- sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor);
+ } else
+ sscanf(buf, "%d:%d", &dev->disk.major,
+ &dev->disk.minor);
/* special case check for block devices that can go 'offline' */
if (options & SKIP_GONE_DEVS) {
diff --git a/udev-md-raid.rules b/udev-md-raid.rules
index c9a4f0e..aff14fa 100644
--- a/udev-md-raid.rules
+++ b/udev-md-raid.rules
@@ -1,17 +1,16 @@
# do not edit this file, it will be overwritten on update
SUBSYSTEM!="block", GOTO="md_end"
-ACTION!="add|change", GOTO="md_end"
+ACTION!="add|change|remove", GOTO="md_end"
+ACTION=="remove", GOTO="md_remove"
ACTION=="change", GOTO="md_no_incr"
-# import data from a raid member and activate it
-#ENV{ID_FS_TYPE}=="linux_raid_member", IMPORT{program}="/sbin/mdadm --examine --export $tempnode", RUN+="/sbin/mdadm --incremental $env{DEVNAME}"
-# import data from a raid set
+# we are adding a raid member, activate it
+ENV{ID_FS_TYPE}=="linux_raid_member", RUN+="/sbin/mdadm -I $env{DEVNAME}"
LABEL="md_no_incr"
KERNEL!="md*", GOTO="md_end"
-# partitions have no md/{array_state,metadata_version}, but should not
-# for that reason be ignored.
+# partitions have no md/{array_state,metadata_version}
ENV{DEVTYPE}=="partition", GOTO="md_ignore_state"
# container devices have a metadata version of e.g. 'external:ddf' and
@@ -32,7 +31,12 @@ ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNA
IMPORT{program}="/sbin/blkid -o udev -p $tempnode"
OPTIONS+="link_priority=100"
+OPTIONS+="watch"
ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}"
ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}"
+GOTO="md_end"
+
+LABEL="md_remove"
+ENV{ID_FS_TYPE}=="linux_raid_member", RUN+="/sbin/mdadm -If $env{DEVNAME}"
LABEL="md_end"
--
1.6.6.1