import rasdaemon-0.6.1-5.1.el8_4

This commit is contained in:
CentOS Sources 2021-11-02 03:50:26 -04:00 committed by Stepan Oksanichenko
parent 37d270264d
commit 6cdd3d6f35
5 changed files with 417 additions and 1 deletions

View File

@ -0,0 +1,66 @@
commit 2a1d217660351c08eb2f8bccebf939abba2f7e69
Author: Brian Woods <brian.woods@amd.com>, Yazen Ghannam <Yazen.Ghannam@amd.com>
Date: Fri Nov 1 15:48:13 2019 +0100
rasdaemon: rename CPU_NAPLES cputype
Change CPU_NAPLES to CPU_AMD_SMCA to reflect that it isn't just NAPLES
that is supported, but AMD's Scalable Machine Check Architecture (SMCA).
[ Yazen: change family check to feature check, and change CPU name. ]
CC: "mchehab+samsung@kernel.org" <mchehab+samsung@kernel.org>, "Namburu, Chandu-babu" <chandu@amd.com> # Thread-Topic: [PATCH 1/2] rasdaemon: rename CPU_NAPLES cputype
Signed-off-by: Brian Woods <brian.woods@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Cc: Chandu-babu Namburu <chandu@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
ras-mce-handler.c | 10 ++++++----
ras-mce-handler.h | 2 +-
2 files changed, 7 insertions(+), 5 deletions(-)
--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-05-26 15:16:24.699096556 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-05-26 15:18:06.543162745 -0400
@@ -55,7 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_KNIGHTS_LANDING] = "Knights Landing",
[CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_SKYLAKE_XEON] = "Skylake server",
- [CPU_NAPLES] = "AMD Family 17h Zen1"
+ [CPU_AMD_SMCA] = "AMD Scalable MCA",
};
static enum cputype select_intel_cputype(struct ras_events *ras)
@@ -191,8 +191,10 @@ ret = 0;
if (!strcmp(mce->vendor, "AuthenticAMD")) {
if (mce->family == 15)
mce->cputype = CPU_K8;
- if (mce->family == 23)
- mce->cputype = CPU_NAPLES;
+ if (strstr(mce->processor_flags, "smca")) {
+ mce->cputype = CPU_AMD_SMCA;
+ goto ret;
+ }
if (mce->family > 23) {
log(ALL, LOG_INFO,
"Can't parse MCE for this AMD CPU yet %d\n",
@@ -435,7 +437,7 @@ if (pevent_get_field_val(s, event, "ipid
case CPU_K8:
rc = parse_amd_k8_event(ras, &e);
break;
- case CPU_NAPLES:
+ case CPU_AMD_SMCA:
rc = parse_amd_smca_event(ras, &e);
break;
default: /* All other CPU types are Intel */
--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-05-26 15:17:15.409631590 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-05-26 15:18:20.102038424 -0400
@@ -50,7 +50,7 @@ enum cputype {
CPU_KNIGHTS_LANDING,
CPU_KNIGHTS_MILL,
CPU_SKYLAKE_XEON,
- CPU_NAPLES,
+ CPU_AMD_SMCA,
};
struct mce_event {

View File

@ -0,0 +1,38 @@
commit 854364ba44aee9bc5646f6537fc744b0b54aff37
Author: Muralidhara M K <muralimk@amd.com>
Date: Thu Aug 20 21:00:57 2020 +0530
rasdaemon: Add 8 channel decoding for SMCA systems
Current Scalable Machine Check Architecture (SMCA) systems support up
to 8 UMC channels.
To find the UMC channel represented by a bank, look at the 6th nibble
in the MCA_IPID[InstanceId] field.
Signed-off-by: Muralidhara M K <muralimk@amd.com>
[ Adjust commit message. ]
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index d0b6cb6..7c619fd 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -438,15 +438,7 @@ static void amd_decode_errcode(struct mce_event *e)
*/
static int find_umc_channel(struct mce_event *e)
{
- uint32_t umc_instance_id[] = {0x50f00, 0x150f00};
- uint32_t instance_id = EXTRACT(e->ipid, 0, 31);
- int i, channel = -1;
-
- for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++)
- if (umc_instance_id[i] == instance_id)
- channel = i;
-
- return channel;
+ return EXTRACT(e->ipid, 0, 31) >> 20;
}
/* Decode extended errors according to Scalable MCA specification */
static void decode_smca_error(struct mce_event *e)

View File

@ -0,0 +1,207 @@
commit 8704a85d8dc3483423ec2934fee8132f85f8fdb6
Author: Brian Woods <brian.woods@amd.com>, Yazen Ghannam <Yazen.Ghannam@amd.com>
Date: Fri Nov 1 15:48:14 2019 +0100
rasdaemon: add support for new AMD SMCA bank types
Going forward, the Scalable Machine Check Architecture (SMCA) has some
updated and additional bank types which show up in Zen2. The differing
bank types include: CS_V2, PSP_V2, SMU_V2, MP5, NBIO, and PCIE. The V2
bank types replace the original bank types but have unique HWID/MCAtype
IDs from the originals so there are no conflicts between different
versions or other bank types. All of the differing bank types have new
MCE descriptions which have been added as well.
CC: "mchehab+samsung@kernel.org" <mchehab+samsung@kernel.org>, "Namburu, Chandu-babu" <chandu@amd.com> # Thread-Topic: [PATCH 2/2] rasdaemon: add support for new AMD SMCA bank types
Signed-off-by: Brian Woods <brian.woods@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Cc: Chandu-babu Namburu <chandu@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 6c3e8a5..114e786 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -49,11 +49,17 @@ enum smca_bank_types {
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
+ SMCA_CS_V2, /* Coherent Slave V2 */
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
+ SMCA_PSP_V2, /* Platform Security Processor V2 */
SMCA_SMU, /* System Management Unit */
+ SMCA_SMU_V2, /* System Management Unit V2 */
+ SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_NBIO, /* Northbridge IO Unit */
+ SMCA_PCIE, /* PCI Express Unit */
N_SMCA_BANK_TYPES
};
@@ -165,6 +171,23 @@ static const char * const smca_cs_mce_desc[] = {
"Atomic request parity",
"ECC error on probe filter access",
};
+/* Coherent Slave Unit V2 */
+static const char * const smca_cs2_mce_desc[] = {
+ "Illegal Request",
+ "Address Violation",
+ "Security Violation",
+ "Illegal Response",
+ "Unexpected Response",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic Request Parity Error",
+ "SDP read response had no match in the CS queue",
+ "Probe Filter Protocol Error",
+ "Probe Filter ECC Error",
+ "SDP read response had an unexpected RETRY error",
+ "Counter overflow error",
+ "Counter underflow error",
+};
/* Power, Interrupt, etc.. */
static const char * const smca_pie_mce_desc[] = {
"HW assert",
@@ -189,10 +212,75 @@ static const char * const smca_pb_mce_desc[] = {
static const char * const smca_psp_mce_desc[] = {
"PSP RAM ECC or parity error",
};
+/* Platform Security Processor V2 */
+static const char * const smca_psp2_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Instruction Cache Bank 0 ECC or parity error",
+ "Instruction Cache Bank 1 ECC or parity error",
+ "Instruction Tag Ram 0 parity error",
+ "Instruction Tag Ram 1 parity error",
+ "Data Cache Bank 0 ECC or parity error",
+ "Data Cache Bank 1 ECC or parity error",
+ "Data Cache Bank 2 ECC or parity error",
+ "Data Cache Bank 3 ECC or parity error",
+ "Data Tag Bank 0 parity error",
+ "Data Tag Bank 1 parity error",
+ "Data Tag Bank 2 parity error",
+ "Data Tag Bank 3 parity error",
+ "Dirty Data Ram parity error",
+ "TLB Bank 0 parity error",
+ "TLB Bank 1 parity error",
+ "System Hub Read Buffer ECC or parity error",
+};
/* System Management Unit */
static const char * const smca_smu_mce_desc[] = {
"SMU RAM ECC or parity error",
};
+/* System Management Unit V2 */
+static const char * const smca_smu2_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "System Hub Read Buffer ECC or parity error",
+};
+/* Microprocessor 5 Unit */
+static const char * const smca_mp5_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+};
+/* Northbridge IO Unit */
+static const char * const smca_nbio_mce_desc[] = {
+ "ECC or Parity error",
+ "PCIE error",
+ "SDP ErrEvent error",
+ "SDP Egress Poison Error",
+ "IOHC Internal Poison Error",
+};
+/* PCI Express Unit */
+static const char * const smca_pcie_mce_desc[] = {
+ "CCIX PER Message logging",
+ "CCIX Read Response with Status: Non-Data Error",
+ "CCIX Write Response with Status: Non-Data Error",
+ "CCIX Read Response with Status: Data Error",
+ "CCIX Non-okay write response with data error",
+};
+
struct smca_mce_desc {
const char * const *descs;
@@ -208,11 +296,17 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
+ [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
+ [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
+ [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)},
+ [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
+ [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)},
+ [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)},
};
struct smca_hwid {
@@ -235,6 +329,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Data Fabric MCA types */
{ SMCA_CS, 0x0000002E },
+ { SMCA_CS_V2, 0x0002002E },
{ SMCA_PIE, 0x0001002E },
/* Unified Memory Controller MCA type */
@@ -245,9 +340,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Platform Security Processor MCA type */
{ SMCA_PSP, 0x000000FF },
+ { SMCA_PSP_V2, 0x000100FF },
/* System Management Unit MCA type */
{ SMCA_SMU, 0x00000001 },
+ { SMCA_SMU_V2, 0x00010001 },
+
+ /* Microprocessor 5 Unit MCA type */
+ { SMCA_MP5, 0x00020001 },
+
+ /* Northbridge IO Unit MCA type */
+ { SMCA_NBIO, 0x00000018 },
+
+ /* PCI Express Unit MCA type */
+ { SMCA_PCIE, 0x00000046 },
};
struct smca_bank_name {
@@ -264,11 +370,17 @@ static struct smca_bank_name smca_names[] = {
[SMCA_FP] = { "Floating Point Unit" },
[SMCA_L3_CACHE] = { "L3 Cache" },
[SMCA_CS] = { "Coherent Slave" },
+ [SMCA_CS_V2] = { "Coherent Slave" },
[SMCA_PIE] = { "Power, Interrupts, etc." },
[SMCA_UMC] = { "Unified Memory Controller" },
[SMCA_PB] = { "Parameter Block" },
[SMCA_PSP] = { "Platform Security Processor" },
+ [SMCA_PSP_V2] = { "Platform Security Processor" },
[SMCA_SMU] = { "System Management Unit" },
+ [SMCA_SMU_V2] = { "System Management Unit" },
+ [SMCA_MP5] = { "Microprocessor 5 Unit" },
+ [SMCA_NBIO] = { "Northbridge IO Unit" },
+ [SMCA_PCIE] = { "PCI Express Unit" },
};
static void amd_decode_errcode(struct mce_event *e)

View File

@ -0,0 +1,94 @@
commit cc2ce5c65ed5a42eaa97aa3659854add6d808da5
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Mon Jan 13 19:12:06 2020 +0530
rasdaemon: Add error decoding for new SMCA Load Store bank type
Future Scalable Machine Check Architecture (SMCA) systems will have a
new Load Store bank type.
Add the new type's (HWID, McaType) ID and error decoding.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
[ Adjust commit message. ]
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 114e786..d0b6cb6 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -38,9 +38,16 @@
* 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0]
*/
+/* MCA_STATUS REGISTER FOR FAMILY 19H
+ * Bits 24 ~ 29 contain AddressLsb
+ * 29: ADDRLS[5], 28: ADDRLS[4], 27: ADDRLS[3],
+ * 26: ADDRLS[2], 25: ADDRLS[1], 24: ADDRLS[0]
+ */
+
/* These may be used by multiple smca_hwid_mcatypes */
enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
+ SMCA_LS_V2, /* Load Store V2 */
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
@@ -88,6 +95,32 @@ static const char * const smca_ls_mce_desc[] = {
"DC tag error type 5",
"L2 fill data error",
};
+static const char * const smca_ls2_mce_desc[] = {
+ "An ECC error was detected on a data cache read by a probe or victimization",
+ "An ECC error or L2 poison was detected on a data cache read by a load",
+ "An ECC error was detected on a data cache read-modify-write by a store",
+ "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
+ "An ECC error or poison bit mismatch was detected on a tag read by a load",
+ "An ECC error or poison bit mismatch was detected on a tag read by a store",
+ "An ECC error was detected on an EMEM read by a load",
+ "An ECC error was detected on an EMEM read-modify-write by a store",
+ "A parity error was detected in an L1 TLB entry by any access",
+ "A parity error was detected in an L2 TLB entry by any access",
+ "A parity error was detected in a PWC entry by any access",
+ "A parity error was detected in an STQ entry by any access",
+ "A parity error was detected in an LDQ entry by any access",
+ "A parity error was detected in a MAB entry by any access",
+ "A parity error was detected in an SCB entry state field by any access",
+ "A parity error was detected in an SCB entry address field by any access",
+ "A parity error was detected in an SCB entry data field by any access",
+ "A parity error was detected in a WCB entry by any access",
+ "A poisoned line was detected in an SCB entry by any access",
+ "A SystemReadDataError error was reported on read data returned from L2 for a load",
+ "A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
+ "A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
+ "A hardware assertion error was reported",
+ "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
+};
/* Instruction Fetch */
static const char * const smca_if_mce_desc[] = {
"microtag probe port parity error",
@@ -289,6 +322,7 @@ struct smca_mce_desc {
static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
+ [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) },
[SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
[SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
[SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
@@ -319,6 +353,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* ZN Core (HWID=0xB0) MCA types */
{ SMCA_LS, 0x000000B0 },
+ { SMCA_LS_V2, 0x001000B0 },
{ SMCA_IF, 0x000100B0 },
{ SMCA_L2_CACHE, 0x000200B0 },
{ SMCA_DE, 0x000300B0 },
@@ -362,6 +397,7 @@ struct smca_bank_name {
static struct smca_bank_name smca_names[] = {
[SMCA_LS] = { "Load Store Unit" },
+ [SMCA_LS_V2] = { "Load Store Unit" },
[SMCA_IF] = { "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "L2 Cache" },
[SMCA_DE] = { "Decode Unit" },

View File

@ -1,6 +1,6 @@
Name: rasdaemon
Version: 0.6.1
-Release: 5%{?dist}
+Release: 5.1%{?dist}
Summary: Utility to receive RAS error tracings
Group: Applications/System
License: GPLv2
@ -27,6 +27,10 @@ Patch1: 60a91e4da4f2daf2b10143fc148a8043312b61e5.patch
Patch2: a16ca0711001957ee98f2c124abce0fa1f801529.patch
Patch3: add_upstream_labels.patch
Patch4: b22be68453b2497e86cbd273b9cd56fadc5859e3.patch
Patch5: 2a1d217660351c08eb2f8bccebf939abba2f7e69.patch
Patch6: 8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch
Patch7: cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch
Patch8: 854364ba44aee9bc5646f6537fc744b0b54aff37.patch
%description
%{name} is a RAS (Reliability, Availability and Serviceability) logging tool.
@ -44,6 +48,10 @@ an utility for reporting current error counts from the EDAC sysfs files.
%patch2 -p1
%patch3 -p1
%patch4 -p1
%patch5 -p1
%patch6 -p1
%patch7 -p1
%patch8 -p1
%build
%ifarch %{arm} aarch64
@ -70,6 +78,9 @@ rm INSTALL %{buildroot}/usr/include/*.h
%{_sysconfdir}/ras/dimm_labels.d
%changelog
* Wed May 26 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-5.1
- Add support for AMD SMCA [1975506]
* Wed Apr 08 2020 Aristeu Rozanski <aris@redhat.com> 0.6.1-5
- Fix high CPU usage when CPUs are offline [1683420]