From 804a6a6b5747eed8794a6b007279dc8e09432270 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Fri, 17 Oct 2025 17:04:25 +0800 Subject: [PATCH 70/74] mdadm/Incremental: wait a while before removing a member We encountered a regression where a member disk can't be removed in incremental remove mode: mdadm -If /dev/loop0 mdadm: Cannot remove member device loop0 from md127 A member can't be removed while the sync thread is running. mdadm -If sets the member disk faulty first, then removes the disk. If the sync thread is running, it is interrupted by setting a member faulty, but the sync thread hasn't been reaped yet. So mdadm needs to wait a while to let the kernel reap the sync thread. Signed-off-by: Xiao Ni --- Incremental.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/Incremental.c b/Incremental.c index ba3810e6157f..f30697fa684f 100644 --- a/Incremental.c +++ b/Incremental.c @@ -1715,6 +1715,7 @@ int Incremental_remove(char *devname, char *id_path, int verbose) struct mdstat_ent *ent; struct mdinfo mdi; int mdfd = -1; + int retry = 25; if (strcmp(devnm, devname) != 0) if (!is_devnode_path(devname)) { @@ -1790,11 +1791,26 @@ int Incremental_remove(char *devname, char *id_path, int verbose) /* Native arrays are handled separatelly to provide more detailed error handling */ rv = sysfs_set_memb_state(ent->devnm, devnm, MEMB_STATE_FAULTY); - if (rv && verbose >= 0) - pr_err("Cannot fail member device %s in array %s.\n", devnm, ent->devnm); + if (rv) { + if (verbose >= 0) + pr_err("Cannot fail member device %s in array %s.\n", devnm, ent->devnm); + goto out; + } - if (rv == MDADM_STATUS_SUCCESS) + /* + * If resync/recovery is running, sync thread is interrupted by setting member faulty. + * And it needs to wait some time to let kernel to reap sync thread. If not, it will + * fail to remove it. 
+ */ + while (retry) { rv = sysfs_set_memb_state(ent->devnm, devnm, MEMB_STATE_REMOVE); + if (rv) { + sleep_for(0, MSEC_TO_NSEC(200), true); + retry--; + continue; + } + break; + } if (rv && verbose >= 0) pr_err("Cannot remove member device %s from %s.\n", devnm, ent->devnm); -- 2.50.1