198 lines
6.5 KiB
Diff
198 lines
6.5 KiB
Diff
From 8da27191aa62b08075d8e7ec36c14083f528eb89 Mon Sep 17 00:00:00 2001
|
|
From: Nigel Croxon <ncroxon@redhat.com>
|
|
Date: Fri, 4 Apr 2025 08:44:47 -0400
|
|
Subject: [PATCH 1/1] mdadm: enable sync file for udev rules
|
|
|
|
Mounting an md device may fail during boot from mdadm's claim
|
|
on the device not being released before systemd attempts to mount.
|
|
|
|
In this case it was found that essentially there is a race condition
|
|
occurring in which the mount cannot happen without some kind of delay
|
|
being added BEFORE the mount itself triggers, or manual intervention
|
|
after a timeout.
|
|
|
|
The findings:
|
|
the inode was for a tmp block node made by mdadm for md0.
|
|
|
|
crash> detailedsearch ff1b0c398ff28380
|
|
ff1b0c398f079720: ff1b0c398ff28380 slab:filp state:alloc
|
|
obj:ff1b0c398f079700 size:256
|
|
ff1b0c398ff284f8: ff1b0c398ff28380 slab:shmem_inode_cache
|
|
state:alloc obj:ff1b0c398ff28308 size:768
|
|
|
|
crash> struct file.f_inode,f_path ff1b0c398f079700
|
|
f_inode = 0xff1b0c398ff28380,
|
|
f_path = {
|
|
mnt = 0xff1b0c594aecc7a0,
|
|
dentry = 0xff1b0c3a8c614f00
|
|
},
|
|
crash> struct dentry.d_name 0xff1b0c3a8c614f00
|
|
d_name = {
|
|
{
|
|
{ hash = 3714992780, len = 16 },
|
|
hash_len = 72434469516
|
|
},
|
|
name = 0xff1b0c3a8c614f38 ".tmp.md.1454:9:0"
|
|
},
|
|
|
|
For the race condition, mdadm and udev have some infrastructure for making
|
|
the device be ignored while under construction. e.g.
|
|
|
|
$ cat lib/udev/rules.d/01-md-raid-creating.rules
|
|
|
|
do not edit this file, it will be overwritten on update
|
|
While mdadm is creating an array, it creates a file
|
|
/run/mdadm/creating-mdXXX. If that file exists, then
|
|
the array is not "ready" and we should make sure the
|
|
content is ignored.
|
|
KERNEL=="md*", TEST=="/run/mdadm/creating-$kernel", ENV{SYSTEMD_READY}="0"
|
|
|
|
However, this feature currently is only used by the mdadm create command.
|
|
See calls to udev_block/udev_unblock in the mdadm code as to where and when
|
|
this behavior is used. Any md array being started by incremental or
|
|
normal assemble commands does not use this udev integration. So assembly
|
|
of an existing array does not look to have any explicit protection from
|
|
systemd/udev seeing an array as in a usable state before an mdadm instance
|
|
with O_EXCL closes its file handle.
|
|
This is for the sake of showing the use case for such an option and why
|
|
it would be helpful to delay the mount itself.
|
|
|
|
While mdadm is still constructing the array mdadm --incremental
|
|
that is called from within /usr/lib/udev/rules.d/64-md-raid-assembly.rules,
|
|
there is an attempt to mount the md device, but there is not a creation
|
|
of "/run/mdadm/creating-xxx" file when in incremental mode that
|
|
the rule is looking for. Therefore the device is not marked
|
|
as SYSTEMD_READY=0 in
|
|
"/usr/lib/udev/rules.d/01-md-raid-creating.rules" and missing
|
|
synchronization using the "/run/mdadm/creating-xxx" file.
|
|
|
|
As to this change affecting containers or IMSM...
|
|
(container's array state is inactive all the time)
|
|
|
|
Even if the "array_state" reports "inactive" when previous components
|
|
are added, the mdadm call for the very last array component that makes
|
|
it usable/ready, still needs to be synced properly - mdadm needs to drop
|
|
the claim first calling "close", then delete the "/run/mdadm/creating-xxx".
|
|
Then lets the udev know it is clear to act now (the "udev_unblock" in
|
|
mdadm code that generates a synthetic udev event so the rules are
|
|
reevalutated). It's this processing of the very last array component
|
|
that is the issue here (which is not IO error, but it is that trying to
|
|
open the dev returns -EBUSY because of the exclusive claim that mdadm
|
|
still holds while the mdadm device is being processed already by udev in
|
|
parallel, and that is what the
|
|
/run/mdadm/creating-xxx should prevent exactly).
|
|
|
|
The patch to Incremental.c is to enable creating the
|
|
"/run/mdadm/creating-xxx" file during incremental mode.
|
|
|
|
For the change to Create.c, the unlink is called right before dropping
|
|
the exculusive claim for the device. This should be the other way round
|
|
to avoid the race 100%. That is, if there's a "close" call and
|
|
"udev_unblock" call, the "close" should go first, then followed
|
|
"udev_unblock".
|
|
|
|
Signed-off-by: Nigel Croxon <ncroxon@redhat.com>
|
|
---
|
|
Create.c | 2 +-
|
|
Incremental.c | 20 +++++++++++++++-----
|
|
2 files changed, 16 insertions(+), 6 deletions(-)
|
|
|
|
diff --git mdadm-4.2/Create.c mdadm-4.2-fix/Create.c
|
|
--- mdadm-4.2/Create.c 2025-06-05 06:17:32.656879914 -0400
|
|
+++ mdadm-4.2-fix/Create.c 2025-06-05 07:40:05.583074551 -0400
|
|
@@ -1321,8 +1321,8 @@
|
|
} else {
|
|
pr_err("not starting array - not enough devices.\n");
|
|
}
|
|
- udev_unblock();
|
|
close(mdfd);
|
|
+ udev_unblock();
|
|
sysfs_uevent(&info, "change");
|
|
return 0;
|
|
|
|
diff --git mdadm-4.2/Incremental.c mdadm-4.2-fix/Incremental.c
|
|
--- mdadm-4.2/Incremental.c 2025-06-05 06:17:32.789874082 -0400
|
|
+++ mdadm-4.2-fix/Incremental.c 2025-06-05 07:44:27.126170772 -0400
|
|
@@ -29,6 +29,7 @@
|
|
*/
|
|
|
|
#include "mdadm.h"
|
|
+#include "udev.h"
|
|
#include <sys/wait.h>
|
|
#include <dirent.h>
|
|
#include <ctype.h>
|
|
@@ -296,7 +297,7 @@
|
|
|
|
/* Couldn't find an existing array, maybe make a new one */
|
|
mdfd = create_mddev(match ? match->devname : NULL,
|
|
- name_to_use, c->autof, trustworthy, chosen_name, 0);
|
|
+ name_to_use, c->autof, trustworthy, chosen_name, 1);
|
|
|
|
if (mdfd < 0)
|
|
goto out_unlock;
|
|
@@ -474,7 +475,6 @@
|
|
if (is_container(info.array.level)) {
|
|
char devnm[32];
|
|
/* Try to assemble within the container */
|
|
- sysfs_uevent(sra, "change");
|
|
if (!c->export && c->verbose >= 0)
|
|
pr_err("container %s now has %d device%s\n",
|
|
chosen_name, info.array.working_disks,
|
|
@@ -486,6 +486,8 @@
|
|
if (st->ss->load_container)
|
|
rv = st->ss->load_container(st, mdfd, NULL);
|
|
close(mdfd);
|
|
+ udev_unblock();
|
|
+ sysfs_uevent(sra, "change");
|
|
sysfs_free(sra);
|
|
if (!rv)
|
|
rv = Incremental_container(st, chosen_name, c, NULL);
|
|
@@ -494,6 +496,7 @@
|
|
* so that it can eg. try to rebuild degraded array */
|
|
if (st->ss->external)
|
|
ping_monitor(devnm);
|
|
+ udev_unblock();
|
|
return rv;
|
|
}
|
|
|
|
@@ -630,7 +633,11 @@
|
|
close(mdfd);
|
|
if (policy)
|
|
dev_policy_free(policy);
|
|
- sysfs_free(sra);
|
|
+ udev_unblock();
|
|
+ if (sra) {
|
|
+ sysfs_uevent(sra, "change");
|
|
+ sysfs_free(sra);
|
|
+ }
|
|
return rv;
|
|
out_unlock:
|
|
map_unlock(&map);
|
|
@@ -1577,7 +1584,7 @@
|
|
ra->name,
|
|
c->autof,
|
|
trustworthy,
|
|
- chosen_name, 0);
|
|
+ chosen_name, 1);
|
|
|
|
if (!is_fd_valid(mdfd)) {
|
|
pr_err("create_mddev failed with chosen name %s: %s.\n",
|
|
@@ -1597,6 +1604,8 @@
|
|
map_free(map);
|
|
map = NULL;
|
|
close_fd(&mdfd);
|
|
+ udev_unblock();
|
|
+ sysfs_uevent(&info, "change");
|
|
}
|
|
if (c->export && result) {
|
|
char sep = '=';
|
|
@@ -1623,6 +1632,8 @@
|
|
release:
|
|
map_free(map);
|
|
sysfs_free(list);
|
|
+ udev_unblock();
|
|
+ sysfs_uevent(&info, "change");
|
|
return rv;
|
|
}
|
|
|