f1b7707f26
For issue RHEL-40729, patch 0142 fixes it. For issue RHEL-31448, patch 0125 fixes it. Resolves: RHEL-31448,RHEL-40729,RHEL-52059 Signed-off-by: Xiao Ni <xni@redhat.com>
87 lines
2.4 KiB
Diff
87 lines
2.4 KiB
Diff
From 1a5c0e60308651a20d25ff52511230a20d830330 Mon Sep 17 00:00:00 2001
|
|
From: Logan Gunthorpe <logang@deltatee.com>
|
|
Date: Tue, 4 Jun 2024 10:38:36 -0600
|
|
Subject: [PATCH 102/201] mdadm: Fix hang race condition in
|
|
wait_for_zero_forks()
|
|
|
|
Running a create operation with --write-zeros can randomly hang
|
|
forever waiting for child processes. This happens roughly once in
|
|
ten runs when running with small (20MB) loop devices.
|
|
|
|
The bug is caused by the fact that signals can be coalesced into
|
|
one if they are not read by signalfd quickly enough. So if two children
|
|
finish at exactly the same time, only one SIGCHLD will be received
|
|
by the parent.
|
|
|
|
To fix this, wait on all processes with WNOHANG every time a SIGCHLD
|
|
is received and exit when all processes have been waited on.
|
|
|
|
Reported-by: Xiao Ni <xni@redhat.com>
|
|
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
|
|
Signed-off-by: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
|
|
---
|
|
Create.c | 28 +++++++++++++++-------------
|
|
1 file changed, 15 insertions(+), 13 deletions(-)
|
|
|
|
diff --git a/Create.c b/Create.c
|
|
index d033eb68..4f992a22 100644
|
|
--- a/Create.c
|
|
+++ b/Create.c
|
|
@@ -178,6 +178,7 @@ static int wait_for_zero_forks(int *zero_pids, int count)
|
|
bool interrupted = false;
|
|
sigset_t sigset;
|
|
ssize_t s;
|
|
+ pid_t pid;
|
|
|
|
for (i = 0; i < count; i++)
|
|
if (zero_pids[i])
|
|
@@ -196,7 +197,7 @@ static int wait_for_zero_forks(int *zero_pids, int count)
|
|
return 1;
|
|
}
|
|
|
|
- while (1) {
|
|
+ while (wait_count) {
|
|
s = read(sfd, &fdsi, sizeof(fdsi));
|
|
if (s != sizeof(fdsi)) {
|
|
pr_err("Invalid signalfd read: %s\n", strerror(errno));
|
|
@@ -209,23 +210,24 @@ static int wait_for_zero_forks(int *zero_pids, int count)
|
|
pr_info("Interrupting zeroing processes, please wait...\n");
|
|
interrupted = true;
|
|
} else if (fdsi.ssi_signo == SIGCHLD) {
|
|
- if (!--wait_count)
|
|
- break;
|
|
+ for (i = 0; i < count; i++) {
|
|
+ if (!zero_pids[i])
|
|
+ continue;
|
|
+
|
|
+ pid = waitpid(zero_pids[i], &wstatus, WNOHANG);
|
|
+ if (pid <= 0)
|
|
+ continue;
|
|
+
|
|
+ zero_pids[i] = 0;
|
|
+ if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus))
|
|
+ ret = 1;
|
|
+ wait_count--;
|
|
+ }
|
|
}
|
|
}
|
|
|
|
close(sfd);
|
|
|
|
- for (i = 0; i < count; i++) {
|
|
- if (!zero_pids[i])
|
|
- continue;
|
|
-
|
|
- waitpid(zero_pids[i], &wstatus, 0);
|
|
- zero_pids[i] = 0;
|
|
- if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus))
|
|
- ret = 1;
|
|
- }
|
|
-
|
|
if (interrupted) {
|
|
pr_err("zeroing interrupted!\n");
|
|
return 1;
|
|
--
|
|
2.41.0
|
|
|