Add support for vendor specific information

Resolves: RHEL-68673

Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
This commit is contained in:
Aristeu Rozanski 2025-02-11 11:13:01 -05:00
parent 09ea5ccc0c
commit 0184702bc0
3 changed files with 178 additions and 1 deletions

View File

@ -0,0 +1,95 @@
commit 83a3ced797256dcb1c93f8de4266fd7545fbfb3b
Author: Avadhut Naik <avadnaik@amd.com>
Date: Tue Nov 21 14:04:19 2023 -0600
rasdaemon: Add support for vendor-specific machine check error information
Some CPU vendors may provide additional vendor-specific machine check
error information. AMD, for example, provides FRU Text through SYND 1/2
registers if BIT 9 of SMCA_CONFIG register is set.
Add support to display the additional vendor-specific error information,
if any.
Signed-off-by: Avadhut Naik <Avadhut.Naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
mce-amd-smca.c | 12 ++++++++++++
ras-mce-handler.c | 22 ++++++++++++++++++++++
ras-mce-handler.h | 3 +++
3 files changed, 37 insertions(+)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-11-27 10:18:13.765255836 -0500
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-11-27 10:18:23.014169756 -0500
@@ -999,6 +999,18 @@ if (bank_type == SMCA_UMC_V2 && xec == 0
channel, csrow);
}
+
+ if (e->vdata_len) {
+ uint64_t smca_config = e->vdata[2];
+
+ /*
+ * BIT 9 of the CONFIG register of a few SMCA Bank types indicates
+ * presence of FRU Text in SYND 1 / 2 registers
+ */
+ if (smca_config & BIT(9))
+ memcpy(e->frutext, e->vdata, 16);
+ }
+
}
int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-11-27 10:18:23.014169756 -0500
+++ rasdaemon-0.6.7/ras-mce-handler.c 2024-11-27 10:19:38.849463954 -0500
@@ -375,6 +375,25 @@ #if 0
if (e->microcode)
trace_seq_printf(s, ", microcode= %x", e->microcode);
+ if (!e->vdata_len)
+ return;
+
+ if (strlen(e->frutext)) {
+ trace_seq_printf(s, ", FRU Text= %s", e->frutext);
+ trace_seq_printf(s, ", Vendor Data= ");
+ for (int i = 2; i < e->vdata_len/8; i++) {
+ trace_seq_printf(s, "0x%lx", e->vdata[i]);
+ trace_seq_printf(s, " ");
+ }
+ } else {
+ trace_seq_printf(s, ", Vendor Data= ");
+ for (int i = 0; i < e->vdata_len/8; i ++) {
+ trace_seq_printf(s, "0x%lx", e->vdata[i]);
+ trace_seq_printf(s, " ");
+ }
+ }
+
+
/*
* FIXME: The original mcelog userspace tool uses DMI to map from
* address to DIMM. From the comments there, the code there doesn't
@@ -559,6 +578,9 @@ if (pevent_get_field_val(s, event, "ipid
if (!pevent_get_field_val(s, event, "microcode", record, &val, 1))
e.microcode = val;
+ /* Get Vendor-specific Data, if any */
+ e.vdata = pevent_get_field_raw(s, event, "v_data", record, &e.vdata_len, 1);
+
switch (mce->cputype) {
case CPU_GENERIC:
break;
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-11-27 10:18:23.014169756 -0500
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-11-27 10:20:05.249218250 -0500
@@ -76,8 +76,11 @@ struct mce_event {
uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
uint64_t ppin;
uint32_t microcode;
+ int32_t vdata_len;
+ const uint64_t *vdata;
/* Parsed data */
+ char frutext[17];
char timestamp[64];
char bank_name[64];
char error_msg[4096];

View File

@ -0,0 +1,75 @@
commit 8b536321cc0679fb82d4ea7521f9375d88cec0cc
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Thu Nov 7 06:24:44 2024 +0000
rasdaemon: Modify support for vendor-specific machine check error information
Commit 83a3ced797256d ("rasdaemon: Add support for vendor-specific
machine check error information") assumes that MCA_CONFIG MSR will be
exported as part of vendor-specific error information through the MCE
tracepoint.
The same, however, is not true anymore. MCA_CONFIG MSR will not be
exported through the MCE tracepoint. Instead, the data from MCA_SYND1/2
MSRs, exported as vendor-specific error information on newer AMD SOCs,
should always be interpreted as FRUText.
Modify the error decoding support accordingly.
Fixes: 83a3ced797256d ("rasdaemon: Add support for vendor-specific
machine check error information")
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
mce-amd-smca.c | 13 ++-----------
ras-mce-handler.c | 15 +--------------
2 files changed, 3 insertions(+), 25 deletions(-)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-11-27 10:20:29.777989960 -0500
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-11-27 10:21:28.731441278 -0500
@@ -1000,17 +1000,8 @@ if (bank_type == SMCA_UMC_V2 && xec == 0
}
- if (e->vdata_len) {
- uint64_t smca_config = e->vdata[2];
-
- /*
- * BIT 9 of the CONFIG register of a few SMCA Bank types indicates
- * presence of FRU Text in SYND 1 / 2 registers
- */
- if (smca_config & BIT(9))
- memcpy(e->frutext, e->vdata, 16);
- }
-
+ if (e->vdata_len)
+ memcpy(e->frutext, e->vdata, 16);
}
int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-11-27 10:20:29.777989960 -0500
+++ rasdaemon-0.6.7/ras-mce-handler.c 2024-11-27 10:21:01.517694557 -0500
@@ -378,21 +378,8 @@ #if 0
if (!e->vdata_len)
return;
- if (strlen(e->frutext)) {
+ if (strlen(e->frutext))
trace_seq_printf(s, ", FRU Text= %s", e->frutext);
- trace_seq_printf(s, ", Vendor Data= ");
- for (int i = 2; i < e->vdata_len/8; i++) {
- trace_seq_printf(s, "0x%lx", e->vdata[i]);
- trace_seq_printf(s, " ");
- }
- } else {
- trace_seq_printf(s, ", Vendor Data= ");
- for (int i = 0; i < e->vdata_len/8; i ++) {
- trace_seq_printf(s, "0x%lx", e->vdata[i]);
- trace_seq_printf(s, " ");
- }
- }
-
/*
* FIXME: The original mcelog userspace tool uses DMI to map from

View File

@ -1,6 +1,6 @@
Name: rasdaemon Name: rasdaemon
Version: 0.6.7 Version: 0.6.7
Release: 17%{?dist} Release: 18%{?dist}
Summary: Utility to receive RAS error tracings Summary: Utility to receive RAS error tracings
License: GPL-2.0-only License: GPL-2.0-only
URL: http://git.infradead.org/users/mchehab/rasdaemon.git URL: http://git.infradead.org/users/mchehab/rasdaemon.git
@ -44,6 +44,8 @@ Patch35: b1ace39286e287282a275b6edc90dc2f64e60a3c.patch
Patch36: 045ab08eaa00172d50621df9502f6910f3fe3af4.patch Patch36: 045ab08eaa00172d50621df9502f6910f3fe3af4.patch
Patch37: 79065939fc4bc1da72a3718937fab80e73a6dd75.patch Patch37: 79065939fc4bc1da72a3718937fab80e73a6dd75.patch
Patch38: 794530fbf270eae9f6f43c6d0bbd3ec6f2b210f3.patch Patch38: 794530fbf270eae9f6f43c6d0bbd3ec6f2b210f3.patch
Patch39: 83a3ced797256dcb1c93f8de4266fd7545fbfb3b.patch
Patch40: 8b536321cc0679fb82d4ea7521f9375d88cec0cc.patch
ExcludeArch: s390 s390x ExcludeArch: s390 s390x
BuildRequires: make BuildRequires: make
@ -117,6 +119,8 @@ an utility for reporting current error counts from the EDAC sysfs files.
%patch36 -p1 %patch36 -p1
%patch37 -p1 %patch37 -p1
%patch38 -p1 %patch38 -p1
%patch39 -p1
%patch40 -p1
# The tarball is locked in time the first time aclocal was run and will keep # The tarball is locked in time the first time aclocal was run and will keep
# requiring an older version of automake # requiring an older version of automake
@ -152,6 +156,9 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir
%{_sysconfdir}/sysconfig/rasdaemon %{_sysconfdir}/sysconfig/rasdaemon
%changelog %changelog
* Wed Nov 27 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-18
- Add support for vendor specific information [RHEL-68673]
* Tue Nov 19 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-17 * Tue Nov 19 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-17
- ras-events: quit loop in read_ras_event when kbuf data is broken [RHEL-68127] - ras-events: quit loop in read_ras_event when kbuf data is broken [RHEL-68127]