From 8c43c776715301ff020639801a8b1b4716fdf745 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Mon, 5 Apr 2010 12:32:08 -0400 Subject: [PATCH 6/6] Initial implementation of incremental remove support Signed-off-by: Doug Ledford --- Incremental.c | 36 ++++++++++++++++++ Manage.c | 105 +++++++++++++++++++++++++++++++++++++-------------- ReadMe.c | 21 +++++++--- mdadm.8 | 23 ++++++++++- mdadm.c | 13 ++++++ mdadm.h | 8 ++++ mdstat.c | 90 +++++++++++++++++++++++++++++++++++++++++++- sysfs.c | 13 ++++-- udev-md-raid.rules | 16 +++++--- 9 files changed, 274 insertions(+), 51 deletions(-) diff --git a/Incremental.c b/Incremental.c index 7ad648a..d32a8e5 100644 --- a/Incremental.c +++ b/Incremental.c @@ -843,3 +843,39 @@ int Incremental_container(struct supertype *st, char *devname, int verbose, map_unlock(&map); return 0; } + +/* + * IncrementalRemove - Attempt to see if the passed in device belongs to any + * raid arrays, and if so first fail (if needed) and then remove the device. + * + * @devname - The device we want to remove + * + * Special note: We would like to just use Manage_subdevs to fail/remove the + * device, but unfortunately, by the time we are called via udev, the device + * special file is already gone, and so we can't stat the device and so we + * don't have the right rdev value to use in the ioctls. So, we use the + * sysfs method of device removal instead, but since that's not guaranteed + * to work depending on the version of kernel we run on, try to use the + * ioctl method first and only fall back if we don't have a valid device + * special file. That way we can support operation manually on older kernels + * even if we won't be able to do this automatically via udev on older + * kernels. 
+ */ +int IncrementalRemove(char *devname, int verbose) +{ + char mddev[100] = "/dev/"; + int mdfd; + struct mddev_dev_s devlist; + + strncpy(mddev + 5, devname, sizeof(mddev) - 5); + if (mdstat_check_active(mddev + 5)) + return 1; + if ((mdfd = open_mddev(mddev, 0)) < 0) + return 1; + memset(&devlist, 0, sizeof(devlist)); + devlist.devname = devname; + devlist.disposition = 'f'; + Manage_subdevs(mddev, mdfd, &devlist, verbose); + devlist.disposition = 'r'; + return Manage_subdevs(mddev, mdfd, &devlist, verbose); +} diff --git a/Manage.c b/Manage.c index f848d8b..6539eda 100644 --- a/Manage.c +++ b/Manage.c @@ -346,6 +346,9 @@ int Manage_subdevs(char *devname, int fd, mdu_disk_info_t disc; unsigned long long array_size; mddev_dev_t dv, next = NULL; + struct mdinfo *mdi = NULL; + struct mdinfo *dev = NULL; + char sys_name[20] = "dev-"; struct stat stb; int j, jnext = 0; int tfd; @@ -443,16 +446,43 @@ int Manage_subdevs(char *devname, int fd, if (jnext == 0) continue; } else { + /* + * For fail/remove operations, allow the disk + * to be completely missing, use name matching + * to a device in our sysfs entries to + * suffice. For add we need a valid block device. 
+ * Leave this loop one of three ways: + * 1) tfd < 0 and dev is set to our device + * 2) tfd >= 0 and dev is NULL + * 3) failed to find suitable device and return + */ j = 0; tfd = dev_open(dv->devname, O_RDONLY); - if (tfd < 0 && dv->disposition == 'r' && - lstat(dv->devname, &stb) == 0) - /* Be happy, the lstat worked, that is - * enough for --remove - */ - ; - else { + if (tfd < 0 && dv->disposition != 'a') { + strcpy(&sys_name[4], + strrchr(dv->devname, '/') + 1); + mdi = sysfs_read(fd, 0, + GET_DEVS | KEEP_GONE_DEVS); + if (!mdi) { + fprintf(stderr, Name ": can't open %s " + "and can't read sysfs info\n", + dv->devname); + return 1; + } + for (dev = mdi->devs; dev; dev = dev->next) { + if (strcmp(sys_name, dev->sys_name)) + continue; + break; + } + if (!dev) { + fprintf(stderr, Name ": can't open %s " + "and %s not listed in sysfs\n", + dv->devname, sys_name); + sysfs_free(mdi); + return 1; + } + } else { if (tfd < 0 || fstat(tfd, &stb) != 0) { fprintf(stderr, Name ": cannot find %s: %s\n", dv->devname, strerror(errno)); @@ -461,12 +491,12 @@ int Manage_subdevs(char *devname, int fd, return 1; } close(tfd); - } - if ((stb.st_mode & S_IFMT) != S_IFBLK) { - fprintf(stderr, Name ": %s is not a " - "block device.\n", - dv->devname); - return 1; + if ((stb.st_mode & S_IFMT) != S_IFBLK) { + fprintf(stderr, Name ": %s is not a " + "block device.\n", + dv->devname); + return 1; + } } } switch(dv->disposition){ @@ -790,26 +820,36 @@ int Manage_subdevs(char *devname, int fd, return 1; } } - /* FIXME check that it is a current member */ - err = ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev); - if (err && errno == ENODEV) { + /* stb.st_rdev is only valid if we have a tfd that + * does not indicate an error on attempt to open + * the devname + */ + if (tfd >= 0) + err = ioctl(fd, HOT_REMOVE_DISK, + (unsigned long)stb.st_rdev); + if (tfd < 0 || (err && errno == ENODEV)) { /* Old kernels rejected this if no personality * registered */ - struct mdinfo *sra = 
sysfs_read(fd, 0, GET_DEVS); - struct mdinfo *dv = NULL; - if (sra) - dv = sra->devs; - for ( ; dv ; dv=dv->next) - if (dv->disk.major == major(stb.st_rdev) && - dv->disk.minor == minor(stb.st_rdev)) + if (!mdi) { + strcpy(&sys_name[4], + strrchr(dv->devname, '/') + 1); + mdi = sysfs_read(fd, 0, GET_DEVS | + KEEP_GONE_DEVS); + if (mdi) + dev = mdi->devs; + for ( ; dev ; dev=dev->next) { + if (strcmp(sys_name, dev->sys_name)) + continue; break; - if (dv) - err = sysfs_set_str(sra, dv, + } + } + if (dev) + err = sysfs_set_str(mdi, dev, "state", "remove"); else err = -1; - if (sra) - sysfs_free(sra); + if (mdi) + sysfs_free(mdi); } if (err) { fprintf(stderr, Name ": hot remove failed " @@ -844,11 +884,18 @@ int Manage_subdevs(char *devname, int fd, case 'f': /* set faulty */ /* FIXME check current member */ - if (ioctl(fd, SET_DISK_FAULTY, (unsigned long) stb.st_rdev)) { + if ((tfd >= 0 && ioctl(fd, SET_DISK_FAULTY, + (unsigned long) stb.st_rdev)) || + (tfd < 0 && sysfs_set_str(mdi, dev, "state", + "faulty"))) { fprintf(stderr, Name ": set device faulty failed for %s: %s\n", dnprintable, strerror(errno)); + if (mdi) + sysfs_free(mdi); return 1; - } + } + if (mdi) + sysfs_free(mdi); if (verbose >= 0) fprintf(stderr, Name ": set %s faulty in %s\n", dnprintable, devname); diff --git a/ReadMe.c b/ReadMe.c index 9d5a211..fd216ec 100644 --- a/ReadMe.c +++ b/ReadMe.c @@ -86,11 +86,12 @@ char Version[] = Name " - v3.1.2 - 10th March 2010\n"; * At the time if writing, there is only minimal support. 
*/ -char short_options[]="-ABCDEFGIQhVXWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; +char short_options[]= + "-ABCDEFGIQhVXWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; char short_bitmap_options[]= - "-ABCDEFGIQhVXWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; + "-ABCDEFGIQhVXWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; char short_bitmap_auto_options[]= - "-ABCDEFGIQhVXWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:"; + "-ABCDEFGIQhVXWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:"; struct option long_options[] = { {"manage", 0, 0, '@'}, @@ -213,7 +214,7 @@ char Help[] = " mdadm --grow options device\n" " resize/reshape an active array\n" " mdadm --incremental device\n" -" add a device to an array as appropriate\n" +" add/remove a device to/from an array as appropriate\n" " mdadm --monitor options...\n" " Monitor one or more array for significant changes.\n" " mdadm device options...\n" @@ -535,20 +536,26 @@ char Help_grow[] = ; char Help_incr[] = -"Usage: mdadm --incremental [-Rqrs] device\n" +"Usage: mdadm --incremental [-Rqrsf] device\n" "\n" "This usage allows for incremental assembly of md arrays. Devices can be\n" "added one at a time as they are discovered. 
Once an array has all expected\n" "devices, it will be started.\n" "\n" -"Options that are valid with incremental assembly (-I --incremental) more are:\n" -" --run -R : run arrays as soon as a minimal number of devices are\n" +"Optionally, the process can be reversed by using the fail option.\n" +"When fail mode is invoked, mdadm will see if the device belongs to an array\n" +"and then both fail (if needed) and remove the device from that array.\n" +"\n" +"Options that are valid with incremental assembly (-I --incremental) are:\n" +" --run -R : Run arrays as soon as a minimal number of devices are\n" " : present rather than waiting for all expected.\n" " --quiet -q : Don't print any information messages, just errors.\n" " --rebuild -r : Rebuild the 'map' file that mdadm uses for tracking\n" " : partial arrays.\n" " --scan -s : Use with -R to start any arrays that have the minimal\n" " : required number of devices, but are not yet started.\n" +" --fail -f : First fail (if needed) and then remove device from\n" +" : any array that it is a member of.\n" ; char Help_config[] = diff --git a/mdadm.8 b/mdadm.8 index 4edfc41..eaf9155 100644 --- a/mdadm.8 +++ b/mdadm.8 @@ -135,7 +135,11 @@ This provides a convenient interface to a .I hot-plug system. As each device is detected, .I mdadm -has a chance to include it in some array as appropriate. +has a chance to include it in some array as appropriate. Optionally, +if the +.I \-\-fail +flag is passed in then we will remove the device from any active array +instead of adding it. If a .B CONTAINER @@ -189,7 +193,7 @@ Change the size or shape of an active array. .TP .BR \-I ", " \-\-incremental -Add a single device into an appropriate array, and possibly start the array. +Add/remove a single device to/from an appropriate array, and possibly start the array. .TP .B \-\-auto-detect @@ -1235,6 +1239,12 @@ in .B mdadm.conf as requiring an external bitmap, that bitmap will be attached first. 
+.TP +.BR \-\-fail ", " \-f +This allows the hot-plug system to remove devices that have fully disappeared +from the kernel. It will first fail and then remove the device from any +array it belongs to. + .SH For Monitor mode: .TP .BR \-m ", " \-\-mail @@ -2141,6 +2151,10 @@ Usage: .I component-device .HP 12 Usage: +.B mdadm \-\-incremental \-\-fail +.I component-device +.HP 12 +Usage: .B mdadm \-\-incremental \-\-rebuild .HP 12 Usage: @@ -2153,6 +2167,11 @@ passed to .B "mdadm \-\-incremental" to be conditionally added to an appropriate array. +Conversely, it can also be used with the +.B \-\-fail +flag to do just the opposite and find whatever array a particular device +is part of and remove the device from the array. + If the device passed is a .B CONTAINER device created by a previous call to diff --git a/mdadm.c b/mdadm.c index d5e34c0..cd6fd8f 100644 --- a/mdadm.c +++ b/mdadm.c @@ -124,6 +124,7 @@ int main(int argc, char *argv[]) ident.name[0] = 0; ident.container = NULL; ident.member = NULL; + ident.member_index = -1; while ((option_index = -1) , (opt=getopt_long(argc, argv, @@ -774,6 +775,9 @@ int main(int argc, char *argv[]) devmode = 'r'; continue; case O(MANAGE,'f'): /* set faulty */ + case O(INCREMENTAL,'f'): /* r for incremental is taken, use f + * even though we will both fail and + * remove the device */ devmode = 'f'; continue; case O(INCREMENTAL,'R'): @@ -1517,6 +1521,11 @@ int main(int argc, char *argv[]) ": --incremental --scan meaningless without --run.\n"); break; } + if (devmode == 'f') { + fprintf(stderr, Name + ": --incremental --scan --fail not supported.\n"); + break; + } rv = IncrementalScan(verbose); } if (!devlist) { @@ -1533,6 +1542,10 @@ int main(int argc, char *argv[]) rv = 1; break; } + if (devmode == 'f') { + rv = IncrementalRemove(devlist->devname, verbose-quiet); + break; + } rv = Incremental(devlist->devname, verbose-quiet, runstop, ss, homehost, require_homehost, autof); break; diff --git a/mdadm.h b/mdadm.h index 
d8ab85f..c113d0f 100644 --- a/mdadm.h +++ b/mdadm.h @@ -315,6 +315,7 @@ typedef struct mddev_ident_s { * of some other entry. */ char *member; /* subarray within a container */ + int member_index; /* subarray index within a container */ struct mddev_ident_s *next; union { @@ -355,6 +356,10 @@ struct mdstat_ent { int raid_disks; int chunk_size; char * metadata_version; + struct dev_member { + char *name; + struct dev_member *next; + } *members; struct mdstat_ent *next; }; @@ -363,6 +368,7 @@ extern void free_mdstat(struct mdstat_ent *ms); extern void mdstat_wait(int seconds); extern void mdstat_wait_fd(int fd, const sigset_t *sigmask); extern int mddev_busy(int devnum); +extern int mdstat_check_active(char *devname); struct map_ent { struct map_ent *next; @@ -404,6 +410,7 @@ enum sysfs_read_flags { GET_STATE = (1 << 13), GET_ERROR = (1 << 14), SKIP_GONE_DEVS = (1 << 15), + KEEP_GONE_DEVS = (1 << 16), }; /* If fd >= 0, get the array it is open on, @@ -817,6 +824,7 @@ extern int Incremental_container(struct supertype *st, char *devname, int trustworthy); extern void RebuildMap(void); extern int IncrementalScan(int verbose); +extern int IncrementalRemove(char *devname, int verbose); extern int CreateBitmap(char *filename, int force, char uuid[16], unsigned long chunksize, unsigned long daemon_sleep, diff --git a/mdstat.c b/mdstat.c index 4a9f370..81d2212 100644 --- a/mdstat.c +++ b/mdstat.c @@ -83,6 +83,45 @@ #include #include +static void free_member_devnames(struct dev_member **m) +{ + struct dev_member *t; + if (!*m) + return; + while(*m) { + t = *m; + *m = (*m)->next; + if (t->name) + free(t->name); + free(t); + } + *m = NULL; +} + +static struct dev_member *add_member_devname(struct dev_member **m, char *name) +{ + struct dev_member *new; + char *t; + + if (!m || !name) + return NULL; + + new = malloc(sizeof(*new)); + if (!new) + return NULL; + if ((t = strchr(name, '[')) == NULL) + { + /* not a device */ + free(new); + return *m; + } + new->name = strndup(name, t 
- name); + new->next = *m; + *m = new; + + return new; +} + void free_mdstat(struct mdstat_ent *ms) { while (ms) { @@ -91,6 +130,7 @@ void free_mdstat(struct mdstat_ent *ms) if (ms->level) free(ms->level); if (ms->pattern) free(ms->pattern); if (ms->metadata_version) free(ms->metadata_version); + if (ms->members) free_member_devnames(&ms->members); t = ms; ms = ms->next; free(t); @@ -159,6 +199,7 @@ struct mdstat_ent *mdstat_read(int hold, int start) ent->raid_disks = 0; ent->chunk_size = 0; ent->devcnt = 0; + ent->members = NULL; ent->dev = strdup(line); ent->devnum = devnum; @@ -170,15 +211,23 @@ struct mdstat_ent *mdstat_read(int hold, int start) ent->active = 1; else if (strcmp(w, "inactive")==0) ent->active = 0; - else if (ent->active >=0 && + else if (ent->active > 0 && ent->level == NULL && w[0] != '(' /*readonly*/) { ent->level = strdup(w); in_devs = 1; } else if (in_devs && strcmp(w, "blocks")==0) in_devs = 0; - else if (in_devs) { + else if (in_devs || (ent->active == 0 && w[0] != '(' && + w[l - 1] == ')')) { + if (isdigit(w[0])) + continue; + in_devs = 1; ent->devcnt++; + if (!add_member_devname(&ent->members, w)) { + free_mdstat(ent); + break; + } if (strncmp(w, "md", 2)==0) { /* This has an md device as a component. 
* If that device is already in the @@ -310,3 +359,40 @@ int mddev_busy(int devnum) free_mdstat(mdstat); return me != NULL; } + +/* + * Finds name of the active array holding this device + * @param[in] devname name of member device + * @param[out] devname name of array + * + * @return found (0), or + * not found, failure (1) + */ + +int mdstat_check_active(char *devname) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + char *name; + + if (!devname) + return 1; + name = strrchr(devname, '/'); + if (name++ == NULL) + return 1; + + for (ent = mdstat; ent; ent = ent->next) { + struct dev_member *m; + if (ent->active && (strstr(ent->metadata_version,"imsm") || + strstr(ent->metadata_version,"ddf"))) + /* only return container matches, not subarrays */ + continue; + for (m = ent->members; m; m = m->next) { + if (!strcmp(m->name, name)) { + strcpy(devname, ent->dev); + return 0; + } + } + } + return 1; +} diff --git a/sysfs.c b/sysfs.c index ebf9d8a..65dd848 100644 --- a/sysfs.c +++ b/sysfs.c @@ -273,13 +273,16 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) strcpy(dbase, "block/dev"); if (load_sys(fname, buf)) { - free(dev); - if (options & SKIP_GONE_DEVS) + if (options & SKIP_GONE_DEVS) { + free(dev); continue; - else + } else if (options & KEEP_GONE_DEVS) { + dev->disk.major = dev->disk.minor = -1; + } else goto abort; - } - sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor); + } else + sscanf(buf, "%d:%d", &dev->disk.major, + &dev->disk.minor); /* special case check for block devices that can go 'offline' */ if (options & SKIP_GONE_DEVS) { diff --git a/udev-md-raid.rules b/udev-md-raid.rules index c9a4f0e..aff14fa 100644 --- a/udev-md-raid.rules +++ b/udev-md-raid.rules @@ -1,17 +1,16 @@ # do not edit this file, it will be overwritten on update SUBSYSTEM!="block", GOTO="md_end" -ACTION!="add|change", GOTO="md_end" +ACTION!="add|change|remove", GOTO="md_end" +ACTION=="remove", GOTO="md_remove" 
ACTION=="change", GOTO="md_no_incr" -# import data from a raid member and activate it -#ENV{ID_FS_TYPE}=="linux_raid_member", IMPORT{program}="/sbin/mdadm --examine --export $tempnode", RUN+="/sbin/mdadm --incremental $env{DEVNAME}" -# import data from a raid set +# we are adding a raid member, activate it +ENV{ID_FS_TYPE}=="linux_raid_member", RUN+="/sbin/mdadm -I $env{DEVNAME}" LABEL="md_no_incr" KERNEL!="md*", GOTO="md_end" -# partitions have no md/{array_state,metadata_version}, but should not -# for that reason be ignored. +# partitions have no md/{array_state,metadata_version} ENV{DEVTYPE}=="partition", GOTO="md_ignore_state" # container devices have a metadata version of e.g. 'external:ddf' and @@ -32,7 +31,12 @@ ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNA IMPORT{program}="/sbin/blkid -o udev -p $tempnode" OPTIONS+="link_priority=100" +OPTIONS+="watch" ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}" +GOTO="md_end" + +LABEL="md_remove" +ENV{ID_FS_TYPE}=="linux_raid_member", RUN+="/sbin/mdadm -If $env{DEVNAME}" LABEL="md_end" -- 1.6.6.1