import opal-prd-6.6-2.el8_3
This commit is contained in:
parent
97a9d4643c
commit
93949b31c5
@ -0,0 +1,147 @@
|
|||||||
|
commit 8cbd0de88d162e387f11569eee1bdecef8fad2e3
|
||||||
|
Author: Oliver O'Halloran <oohall@gmail.com>
|
||||||
|
Date: Wed Sep 23 16:12:20 2020 +1000
|
||||||
|
|
||||||
|
opal-prd: Have a worker process handle page offlining
|
||||||
|
|
||||||
|
The memory_error() hservice interface expects the memory_error() call to
|
||||||
|
just accept the offline request and return without actually offlining the
|
||||||
|
memory. Currently we will attempt to offline the marked pages before
|
||||||
|
returning to HBRT which can result in an excessively long time spent in the
|
||||||
|
memory_error() hservice call which blocks HBRT from processing other
|
||||||
|
errors. Fix this by adding a worker process which performs the page
|
||||||
|
offlining via the sysfs memory error interfaces.
|
||||||
|
|
||||||
|
Reviewed-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
|
||||||
|
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
|
||||||
|
|
||||||
|
diff --git a/external/opal-prd/opal-prd.c b/external/opal-prd/opal-prd.c
|
||||||
|
index 40e5a984..d74d8039 100644
|
||||||
|
--- a/external/opal-prd/opal-prd.c
|
||||||
|
+++ b/external/opal-prd/opal-prd.c
|
||||||
|
@@ -27,6 +27,7 @@
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <poll.h>
|
||||||
|
+#include <signal.h>
|
||||||
|
#include <dirent.h>
|
||||||
|
|
||||||
|
#include <endian.h>
|
||||||
|
@@ -696,13 +697,42 @@ out:
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
+static int memory_error_worker(const char *sysfsfile, const char *type,
|
||||||
|
+ uint64_t i_start_addr, uint64_t i_endAddr)
|
||||||
|
+{
|
||||||
|
+ int memfd, rc, n, ret = 0;
|
||||||
|
+ char buf[ADDR_STRING_SZ];
|
||||||
|
+ uint64_t addr;
|
||||||
|
+
|
||||||
|
+ memfd = open(sysfsfile, O_WRONLY);
|
||||||
|
+ if (memfd < 0) {
|
||||||
|
+ pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
|
||||||
|
+ "Unable to open sysfs node %s: %m", sysfsfile);
|
||||||
|
+ return -1;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ for (addr = i_start_addr; addr <= i_endAddr; addr += ctx->page_size) {
|
||||||
|
+ n = snprintf(buf, ADDR_STRING_SZ, "0x%lx", addr);
|
||||||
|
+ rc = write(memfd, buf, n);
|
||||||
|
+ if (rc != n) {
|
||||||
|
+ pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
|
||||||
|
+ "page addr: %016lx type: %s: %m",
|
||||||
|
+ addr, type);
|
||||||
|
+ ret = 1;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ pr_log(LOG_CRIT, "MEM: Offlined %016lx,%016lx, type %s: %m\n",
|
||||||
|
+ i_start_addr, addr, type);
|
||||||
|
+
|
||||||
|
+ close(memfd);
|
||||||
|
+ return ret;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
int hservice_memory_error(uint64_t i_start_addr, uint64_t i_endAddr,
|
||||||
|
enum MemoryError_t i_errorType)
|
||||||
|
{
|
||||||
|
const char *sysfsfile, *typestr;
|
||||||
|
- char buf[ADDR_STRING_SZ];
|
||||||
|
- int memfd, rc, n, ret = 0;
|
||||||
|
- uint64_t addr;
|
||||||
|
+ pid_t pid;
|
||||||
|
|
||||||
|
switch(i_errorType) {
|
||||||
|
case MEMORY_ERROR_CE:
|
||||||
|
@@ -722,26 +752,21 @@ int hservice_memory_error(uint64_t i_start_addr, uint64_t i_endAddr,
|
||||||
|
pr_log(LOG_ERR, "MEM: Memory error: range %016lx-%016lx, type: %s",
|
||||||
|
i_start_addr, i_endAddr, typestr);
|
||||||
|
|
||||||
|
+ /*
|
||||||
|
+ * HBRT expects the memory offlining process to happen in the background
|
||||||
|
+ * after the notification is delivered.
|
||||||
|
+ */
|
||||||
|
+ pid = fork();
|
||||||
|
+ if (pid > 0)
|
||||||
|
+ exit(memory_error_worker(sysfsfile, typestr, i_start_addr, i_endAddr));
|
||||||
|
|
||||||
|
- memfd = open(sysfsfile, O_WRONLY);
|
||||||
|
- if (memfd < 0) {
|
||||||
|
- pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
|
||||||
|
- "Unable to open sysfs node %s: %m", sysfsfile);
|
||||||
|
+ if (pid < 0) {
|
||||||
|
+ perror("MEM: unable to fork worker to offline memory!\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
- for (addr = i_start_addr; addr <= i_endAddr; addr += ctx->page_size) {
|
||||||
|
- n = snprintf(buf, ADDR_STRING_SZ, "0x%lx", addr);
|
||||||
|
- rc = write(memfd, buf, n);
|
||||||
|
- if (rc != n) {
|
||||||
|
- pr_log(LOG_CRIT, "MEM: Failed to offline memory! "
|
||||||
|
- "page addr: %016lx type: %d: %m",
|
||||||
|
- addr, i_errorType);
|
||||||
|
- ret = rc;
|
||||||
|
- }
|
||||||
|
- }
|
||||||
|
-
|
||||||
|
- return ret;
|
||||||
|
+ pr_log(LOG_INFO, "MEM: forked off %d to handle mem error\n", pid);
|
||||||
|
+ return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t hservice_get_interface_capabilities(uint64_t set)
|
||||||
|
@@ -2112,6 +2137,10 @@ static int init_control_socket(struct opal_prd_ctx *ctx)
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
+static struct sigaction sigchild_action = {
|
||||||
|
+ .sa_flags = SA_NOCLDWAIT | SA_RESTART,
|
||||||
|
+ .sa_handler = SIG_DFL,
|
||||||
|
+};
|
||||||
|
|
||||||
|
static int run_prd_daemon(struct opal_prd_ctx *ctx)
|
||||||
|
{
|
||||||
|
@@ -2243,6 +2272,22 @@ static int run_prd_daemon(struct opal_prd_ctx *ctx)
|
||||||
|
pr_debug("SCOM: f00f: %lx", be64toh(val));
|
||||||
|
}
|
||||||
|
|
||||||
|
+ /*
|
||||||
|
+ * Set up the SIGCHLD handler to automatically reap the worker processes
|
||||||
|
+ * we use for memory offlining. We can't do this earlier since the
|
||||||
|
+ * modprobe helper spawns workers and wants to check their exit status
|
||||||
|
+ * with waitpid(). Auto-reaping breaks that so enable it just before
|
||||||
|
+ * entering the attn loop.
|
||||||
|
+ *
|
||||||
|
+ * We also set up system call restarting on SIGCHLD since opal-prd
|
||||||
|
+ * doesn't make any real attempt to handle blocking functions exiting
|
||||||
|
+ * due to EINTR.
|
||||||
|
+ */
|
||||||
|
+ if (sigaction(SIGCHLD, &sigchild_action, NULL)) {
|
||||||
|
+ pr_log(LOG_ERR, "CTRL: Failed to register signal handler %m\n");
|
||||||
|
+ return -1;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
run_attn_loop(ctx);
|
||||||
|
rc = 0;
|
||||||
|
|
@ -2,13 +2,16 @@
|
|||||||
|
|
||||||
Name: opal-prd
|
Name: opal-prd
|
||||||
Version: 6.6
|
Version: 6.6
|
||||||
Release: 1%{?dist}
|
Release: 2%{?dist}
|
||||||
Summary: OPAL Processor Recovery Diagnostics Daemon
|
Summary: OPAL Processor Recovery Diagnostics Daemon
|
||||||
|
|
||||||
Group: System Environment/Daemons
|
Group: System Environment/Daemons
|
||||||
License: ASL 2.0
|
License: ASL 2.0
|
||||||
URL: http://github.com/open-power/skiboot
|
URL: http://github.com/open-power/skiboot
|
||||||
|
|
||||||
|
# upstream fix, opal-prd: Have a worker process handle page offlining
|
||||||
|
Patch0: opal-prd-6.6.3-8cbd0de88d162e387f11569eee1bdecef8fad2e3.patch
|
||||||
|
|
||||||
# Presently opal-prd is supported on ppc64le architecture only.
|
# Presently opal-prd is supported on ppc64le architecture only.
|
||||||
ExclusiveArch: ppc64le
|
ExclusiveArch: ppc64le
|
||||||
|
|
||||||
@ -56,6 +59,7 @@ services to the OS (Linux) on IBM Power and OpenPower systems.
|
|||||||
|
|
||||||
%prep
|
%prep
|
||||||
%setup -q -n %{project}-%{version}
|
%setup -q -n %{project}-%{version}
|
||||||
|
%patch0 -p1 -b .have_a_worker_process_handle_page_offlining
|
||||||
|
|
||||||
%build
|
%build
|
||||||
OPAL_PRD_VERSION=%{version} make V=1 CC="gcc" CFLAGS="%{build_cflags}" LDFLAGS="%{build_ldflags}" ASFLAGS="-m64 -Wa,--generate-missing-build-notes=yes" -C external/opal-prd
|
OPAL_PRD_VERSION=%{version} make V=1 CC="gcc" CFLAGS="%{build_cflags}" LDFLAGS="%{build_ldflags}" ASFLAGS="-m64 -Wa,--generate-missing-build-notes=yes" -C external/opal-prd
|
||||||
@ -123,6 +127,9 @@ install -m 644 %{SOURCE2} %{buildroot}/%{_sysconfdir}/logrotate.d/opal-prd
|
|||||||
%{_datadir}/qemu/
|
%{_datadir}/qemu/
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Thu Nov 12 2020 Than Ngo <than@redhat.com> - 6.6-2
|
||||||
|
- Resolves: #1896451, Have a worker process handle page offlining
|
||||||
|
|
||||||
* Fri Apr 24 2020 Than Ngo <than@redhat.com> - 6.6-1
|
* Fri Apr 24 2020 Than Ngo <than@redhat.com> - 6.6-1
|
||||||
- Resolves: #1779211, rebase to 6.6
|
- Resolves: #1779211, rebase to 6.6
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user