Revert OL modifications
This commit is contained in:
parent
b906ab3d5b
commit
553406c146
@ -1,101 +0,0 @@
|
||||
From ac4b50f616004d236f9d8676693cffcebbe0f8af Mon Sep 17 00:00:00 2001
|
||||
From: Richard Li <tianqi.li@oracle.com>
|
||||
Date: Fri, 1 Aug 2025 22:02:00 +0000
|
||||
Subject: [PATCH] mdadm: Fix IMSM Raid assembly after disk link failure and
|
||||
reboot
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
This patch addresses a scenario observed in production where disk links
|
||||
go down. After a system reboot, depending on which disk becomes
|
||||
available first, the IMSM RAID array may either fully assemble or
|
||||
come up with missing disks.
|
||||
|
||||
Below is an example of the production case simulating disk link failures
|
||||
and subsequent system reboot.
|
||||
|
||||
(note: "echo "1" | sudo tee /sys/class/scsi_device/x:x:x:x/device/delete"
|
||||
is used here to fail/unplug/disconnect disks)
|
||||
|
||||
Raid Configuration: IMSM Raid1 with two disks
|
||||
|
||||
- When sda is unplugged first, then sdb, and after reboot sdb is
|
||||
reconnected first followed by sda, the container (/dev/md127) and
|
||||
subarrays (/dev/md125, /dev/md126) correctly assemble and become active.
|
||||
- However, when sda is reconnected first, then sdb, the subarrays fail to
|
||||
fully reconstruct — sda remains missing from the assembled subarrays.
|
||||
|
||||
This patch addresses this issue in monitor.c. Specifically, when an IMSM
|
||||
RAID is detected and the faulty disk found does not yet exist
|
||||
under /sys/block/CONTAINER_NAME (we do this check so the behavior of
|
||||
"mdadm --fail" is not impacted), the disk will be marked as a spare
|
||||
instead, allowing it to be reused during array reconstruction.
|
||||
|
||||
The patch improves resilience by ensuring consistent array reconstruction
|
||||
regardless of disk detection order. This aligns system behavior with
|
||||
expected RAID redundancy and reduces risk of unnecessary manual recovery
|
||||
steps after reboots in degraded hardware environments.
|
||||
|
||||
Orabug: 38317486
|
||||
Signed-off-by: Richard Li <tianqi.li@oracle.com>
|
||||
---
|
||||
monitor.c | 35 +++++++++++++++++++++++++++++++++--
|
||||
1 file changed, 33 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/monitor.c b/monitor.c
|
||||
index a4f707cc..02e1f1bc 100644
|
||||
--- a/monitor.c
|
||||
+++ b/monitor.c
|
||||
@@ -394,6 +394,24 @@ static void signal_manager(void)
|
||||
* - request a sync_action
|
||||
*
|
||||
*/
|
||||
+static int find_disk_in_container(struct supertype *container, struct mdinfo *mdi)
|
||||
+{
|
||||
+ struct mdinfo *fdi, *di;
|
||||
+
|
||||
+ fdi = sysfs_read(-1, container->container_devnm, GET_DEVS);
|
||||
+ if (!fdi)
|
||||
+ return 0;
|
||||
+
|
||||
+ for (di = fdi->devs; di; di = di->next) {
|
||||
+ if (di->disk.major == mdi->disk.major &&
|
||||
+ di->disk.minor == mdi->disk.minor) {
|
||||
+ dprintf("%d:%d found in container in sysfs\n",
|
||||
+ mdi->disk.major, mdi->disk.minor);
|
||||
+ return 1;
|
||||
+ }
|
||||
+ }
|
||||
+ return 0;
|
||||
+}
|
||||
|
||||
#define ARRAY_DIRTY 1
|
||||
#define ARRAY_BUSY 2
|
||||
@@ -546,8 +564,21 @@ static int read_and_act(struct active_array *a, fd_set *fds)
|
||||
*/
|
||||
for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
|
||||
if (mdi->curr_state & DS_FAULTY) {
|
||||
- a->container->ss->set_disk(a, mdi->disk.raid_disk,
|
||||
- mdi->curr_state);
|
||||
+ /* Mark faulty disk as spare to allow it to be reused during IMSM array
|
||||
+ * reconstruction. This fixes the issue when disk links go down
|
||||
+ * and up again after a reboot, IMSM RAID array may come up
|
||||
+ * with missing disks.
|
||||
+ */
|
||||
+ if (strcmp(a->container->ss->name, "imsm") == 0 &&
|
||||
+ !find_disk_in_container(a->container, mdi) &&
|
||||
+ !(mdi->curr_state & DS_SPARE)) {
|
||||
+ dprintf("Marking %d:%d as spare for reuse\n",
|
||||
+ mdi->disk.major, mdi->disk.minor);
|
||||
+ a->container->ss->set_disk(a, mdi->disk.raid_disk, DS_SPARE);
|
||||
+ } else {
|
||||
+ a->container->ss->set_disk(a, mdi->disk.raid_disk, mdi->curr_state);
|
||||
+ }
|
||||
+
|
||||
check_degraded = 1;
|
||||
if (mdi->curr_state & DS_BLOCKED)
|
||||
mdi->next_state |= DS_UNBLOCK;
|
||||
--
|
||||
2.47.1
|
||||
|
||||
0
md-auto-readd.sh
Executable file → Normal file
0
md-auto-readd.sh
Executable file → Normal file
0
mdadm-raid-check-sysconfig
Executable file → Normal file
0
mdadm-raid-check-sysconfig
Executable file → Normal file
@ -4,7 +4,7 @@ Name: mdadm
|
||||
Version: 4.4
|
||||
# extraversion is used to define rhel internal version
|
||||
%define extraversion 3
|
||||
Release: %{extraversion}.0.1%{?dist}
|
||||
Release: %{extraversion}%{?dist}
|
||||
Summary: The mdadm program controls Linux md devices (software RAID arrays)
|
||||
URL: https://git.kernel.org/pub/scm/utils/mdadm/mdadm.git
|
||||
License: GPLv2+
|
||||
@ -66,9 +66,6 @@ Patch196: mdadm-fix-building-errors.patch
|
||||
Patch197: mdadm-udev.patch
|
||||
Patch198: mdadm-2.5.2-static.patch
|
||||
|
||||
# Oracle patches
|
||||
Patch1001: 1001-mdadm-fix-IMSM-Raid-assembly-after-disk-link-failure.patch
|
||||
|
||||
BuildRequires: make
|
||||
BuildRequires: systemd-rpm-macros binutils-devel gcc systemd-devel
|
||||
%if %{with abrt}
|
||||
@ -147,9 +144,6 @@ install -m644 %{SOURCE5} %{buildroot}/etc/libreport/events.d
|
||||
/usr/share/mdadm/mdcheck
|
||||
|
||||
%changelog
|
||||
* Tue Apr 07 2026 EL Errata <el-errata_ww@oracle.com> - 4.4-3.0.1
|
||||
- mdadm: Fix IMSM Raid assembly after disk link failure and reboot. [Orabug: 38317486]
|
||||
|
||||
* Wed Nov 26 2025 Xiao Ni <xni@redhat.com> - 4.4-3
|
||||
- enable sync file for udev rules
|
||||
- Resolves: RHEL-130890
|
||||
|
||||
0
raid-check
Executable file → Normal file
0
raid-check
Executable file → Normal file
Loading…
Reference in New Issue
Block a user