import opal-prd-6.6-2.el8_3
This commit is contained in:
parent
97a9d4643c
commit
93949b31c5
@ -0,0 +1,147 @@
|
||||
commit 8cbd0de88d162e387f11569eee1bdecef8fad2e3
|
||||
Author: Oliver O'Halloran <oohall@gmail.com>
|
||||
Date: Wed Sep 23 16:12:20 2020 +1000
|
||||
|
||||
opal-prd: Have a worker process handle page offlining
|
||||
|
||||
The memory_error() hservice interface expects the memory_error() call to
|
||||
just accept the offline request and return without actually offlining the
|
||||
memory. Currently we will attempt to offline the marked pages before
|
||||
returning to HBRT which can result in an excessively long time spent in the
|
||||
memory_error() hservice call which blocks HBRT from processing other
|
||||
errors. Fix this by adding a worker process which performs the page
|
||||
offlining via the sysfs memory error interfaces.
|
||||
|
||||
Reviewed-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
|
||||
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
|
||||
|
||||
diff --git a/external/opal-prd/opal-prd.c b/external/opal-prd/opal-prd.c
|
||||
index 40e5a984..d74d8039 100644
|
||||
--- a/external/opal-prd/opal-prd.c
|
||||
+++ b/external/opal-prd/opal-prd.c
|
||||
@@ -27,6 +27,7 @@
|
||||
#include <stdarg.h>
|
||||
#include <time.h>
|
||||
#include <poll.h>
|
||||
+#include <signal.h>
|
||||
#include <dirent.h>
|
||||
|
||||
#include <endian.h>
|
||||
@@ -696,13 +697,42 @@ out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
+static int memory_error_worker(const char *sysfsfile, const char *type,
|
||||
+ uint64_t i_start_addr, uint64_t i_endAddr)
|
||||
+{
|
||||
+ int memfd, rc, n, ret = 0;
|
||||
+ char buf[ADDR_STRING_SZ];
|
||||
+ uint64_t addr;
|
||||
+
|
||||
+ memfd = open(sysfsfile, O_WRONLY);
|
||||
+ if (memfd < 0) {
|
||||
+ pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
|
||||
+ "Unable to open sysfs node %s: %m", sysfsfile);
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ for (addr = i_start_addr; addr <= i_endAddr; addr += ctx->page_size) {
|
||||
+ n = snprintf(buf, ADDR_STRING_SZ, "0x%lx", addr);
|
||||
+ rc = write(memfd, buf, n);
|
||||
+ if (rc != n) {
|
||||
+ pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
|
||||
+ "page addr: %016lx type: %s: %m",
|
||||
+ addr, type);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+ pr_log(LOG_CRIT, "MEM: Offlined %016lx,%016lx, type %s\n", /* no %m: errno is not meaningful for this summary */
|
||||
+ i_start_addr, i_endAddr, type); /* report the requested end; addr is one page past it after the loop */
|
||||
+
|
||||
+ close(memfd);
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
int hservice_memory_error(uint64_t i_start_addr, uint64_t i_endAddr,
|
||||
enum MemoryError_t i_errorType)
|
||||
{
|
||||
const char *sysfsfile, *typestr;
|
||||
- char buf[ADDR_STRING_SZ];
|
||||
- int memfd, rc, n, ret = 0;
|
||||
- uint64_t addr;
|
||||
+ pid_t pid;
|
||||
|
||||
switch(i_errorType) {
|
||||
case MEMORY_ERROR_CE:
|
||||
@@ -722,26 +752,21 @@ int hservice_memory_error(uint64_t i_start_addr, uint64_t i_endAddr,
|
||||
pr_log(LOG_ERR, "MEM: Memory error: range %016lx-%016lx, type: %s",
|
||||
i_start_addr, i_endAddr, typestr);
|
||||
|
||||
+ /*
|
||||
+ * HBRT expects the memory offlining process to happen in the background
|
||||
+ * after the notification is delivered.
|
||||
+ */
|
||||
+ pid = fork();
|
||||
+ if (pid == 0) /* fork() returns 0 in the child: only the child runs the worker and exits, the daemon keeps running */
|
||||
+ exit(memory_error_worker(sysfsfile, typestr, i_start_addr, i_endAddr));
|
||||
|
||||
- memfd = open(sysfsfile, O_WRONLY);
|
||||
- if (memfd < 0) {
|
||||
- pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
|
||||
- "Unable to open sysfs node %s: %m", sysfsfile);
|
||||
+ if (pid < 0) {
|
||||
+ pr_log(LOG_CRIT, "MEM: unable to fork worker to offline memory: %m"); /* use the daemon logger like the other MEM messages; %m supplies the errno text */
|
||||
return -1;
|
||||
}
|
||||
|
||||
- for (addr = i_start_addr; addr <= i_endAddr; addr += ctx->page_size) {
|
||||
- n = snprintf(buf, ADDR_STRING_SZ, "0x%lx", addr);
|
||||
- rc = write(memfd, buf, n);
|
||||
- if (rc != n) {
|
||||
- pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
|
||||
- "page addr: %016lx type: %d: %m",
|
||||
- addr, i_errorType);
|
||||
- ret = rc;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- return ret;
|
||||
+ pr_log(LOG_INFO, "MEM: forked off %d to handle mem error\n", pid);
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
uint64_t hservice_get_interface_capabilities(uint64_t set)
|
||||
@@ -2112,6 +2137,10 @@ static int init_control_socket(struct opal_prd_ctx *ctx)
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static struct sigaction sigchild_action = { /* SIGCHLD disposition installed via sigaction() in run_prd_daemon() */
|
||||
+ .sa_flags = SA_NOCLDWAIT | SA_RESTART, /* SA_NOCLDWAIT: exited children are reaped automatically; SA_RESTART: restart syscalls interrupted by the signal */
|
||||
+ .sa_handler = SIG_DFL,
|
||||
+};
|
||||
|
||||
static int run_prd_daemon(struct opal_prd_ctx *ctx)
|
||||
{
|
||||
@@ -2243,6 +2272,22 @@ static int run_prd_daemon(struct opal_prd_ctx *ctx)
|
||||
pr_debug("SCOM: f00f: %lx", be64toh(val));
|
||||
}
|
||||
|
||||
+ /*
|
||||
+ * Set up the SIGCHLD handler to automatically reap the worker processes
|
||||
+ * we use for memory offlining. We can't do this earlier since the
|
||||
+ * modprobe helper spawns workers and wants to check their exit status
|
||||
+ * with waitpid(). Auto-reaping breaks that so enable it just before
|
||||
+ * entering the attn loop.
|
||||
+ *
|
||||
+ * We also setup system call restarting on SIGCHLD since opal-prd
|
||||
+ * doesn't make any real attempt to handle blocking functions exiting
|
||||
+ * due to EINTR.
|
||||
+ */
|
||||
+ if (sigaction(SIGCHLD, &sigchild_action, NULL)) {
|
||||
+ pr_log(LOG_ERR, "CTRL: Failed to register signal handler %m\n");
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
run_attn_loop(ctx);
|
||||
rc = 0;
|
||||
|
@ -2,13 +2,16 @@
|
||||
|
||||
Name: opal-prd
|
||||
Version: 6.6
|
||||
Release: 1%{?dist}
|
||||
Release: 2%{?dist}
|
||||
Summary: OPAL Processor Recovery Diagnostics Daemon
|
||||
|
||||
Group: System Environment/Daemons
|
||||
License: ASL 2.0
|
||||
URL: http://github.com/open-power/skiboot
|
||||
|
||||
# upstream fix, opal-prd: Have a worker process handle page offlining
|
||||
Patch0: opal-prd-6.6.3-8cbd0de88d162e387f11569eee1bdecef8fad2e3.patch
|
||||
|
||||
# Presently opal-prd is supported on ppc64le architecture only.
|
||||
ExclusiveArch: ppc64le
|
||||
|
||||
@ -56,6 +59,7 @@ services to the OS (Linux) on IBM Power and OpenPower systems.
|
||||
|
||||
%prep
|
||||
%setup -q -n %{project}-%{version}
|
||||
%patch0 -p1 -b .have_a_worker_process_handle_page_offlining
|
||||
|
||||
%build
|
||||
OPAL_PRD_VERSION=%{version} make V=1 CC="gcc" CFLAGS="%{build_cflags}" LDFLAGS="%{build_ldflags}" ASFLAGS="-m64 -Wa,--generate-missing-build-notes=yes" -C external/opal-prd
|
||||
@ -123,6 +127,9 @@ install -m 644 %{SOURCE2} %{buildroot}/%{_sysconfdir}/logrotate.d/opal-prd
|
||||
%{_datadir}/qemu/
|
||||
|
||||
%changelog
|
||||
* Thu Nov 12 2020 Than Ngo <than@redhat.com> - 6.6-2
|
||||
- Resolves: #1896451, Have a worker process handle page offlining
|
||||
|
||||
* Fri Apr 24 2020 Than Ngo <than@redhat.com> - 6.6-1
|
||||
- Resolves: #1779211, rebase to 6.6
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user