From 649274b42d2a4db8134a4b6a3416ed9869758a3c Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Wed, 6 Oct 2021 09:52:01 -0400 Subject: [PATCH] import rasdaemon-0.6.1-5.1.el8_4 --- ...217660351c08eb2f8bccebf939abba2f7e69.patch | 66 ++++++ ...64ba44aee9bc5646f6537fc744b0b54aff37.patch | 38 ++++ ...a85d8dc3483423ec2934fee8132f85f8fdb6.patch | 207 ++++++++++++++++++ ...e5c65ed5a42eaa97aa3659854add6d808da5.patch | 94 ++++++++ SPECS/rasdaemon.spec | 13 +- 5 files changed, 417 insertions(+), 1 deletion(-) create mode 100644 SOURCES/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch create mode 100644 SOURCES/854364ba44aee9bc5646f6537fc744b0b54aff37.patch create mode 100644 SOURCES/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch create mode 100644 SOURCES/cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch diff --git a/SOURCES/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch b/SOURCES/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch new file mode 100644 index 0000000..1b5844d --- /dev/null +++ b/SOURCES/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch @@ -0,0 +1,66 @@ +commit 2a1d217660351c08eb2f8bccebf939abba2f7e69 +Author: Brian WoodsGhannam, Yazen +Date: Fri Nov 1 15:48:13 2019 +0100 + + rasdaemon: rename CPU_NAPLES cputype + + Change CPU_NAPLES to CPU_AMD_SMCA to reflect that it isn't just NAPLES + that is supported, but AMD's Scalable Machine Check Architecture (SMCA). + + [ Yazen: change family check to feature check, and change CPU name. ] + + CC: "mchehab+samsung@kernel.org" , "Namburu, Chandu-babu" # Thread-Topic: [PATCH 1/2] rasdaemon: rename CPU_NAPLES cputype + Signed-off-by: Brian Woods + Signed-off-by: Yazen Ghannam + Cc: Chandu-babu Namburu + Signed-off-by: Mauro Carvalho Chehab + +--- + ras-mce-handler.c | 10 ++++++---- + ras-mce-handler.h | 2 +- + 2 files changed, 7 insertions(+), 5 deletions(-) + +--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-05-26 15:16:24.699096556 -0400 ++++ rasdaemon-0.6.1/ras-mce-handler.c 2021-05-26 15:18:06.543162745 -0400 +@@ -55,7 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series + [CPU_KNIGHTS_LANDING] = "Knights Landing", + [CPU_KNIGHTS_MILL] = "Knights Mill", + [CPU_SKYLAKE_XEON] = "Skylake server", +- [CPU_NAPLES] = "AMD Family 17h Zen1" ++ [CPU_AMD_SMCA] = "AMD Scalable MCA", + }; + + static enum cputype select_intel_cputype(struct ras_events *ras) +@@ -191,8 +191,10 @@ ret = 0; + if (!strcmp(mce->vendor, "AuthenticAMD")) { + if (mce->family == 15) + mce->cputype = CPU_K8; +- if (mce->family == 23) +- mce->cputype = CPU_NAPLES; ++ if (strstr(mce->processor_flags, "smca")) { ++ mce->cputype = CPU_AMD_SMCA; ++ goto ret; ++ } + if (mce->family > 23) { + log(ALL, LOG_INFO, + "Can't parse MCE for this AMD CPU yet %d\n", +@@ -435,7 +437,7 @@ if (pevent_get_field_val(s, event, "ipid + case CPU_K8: + rc = parse_amd_k8_event(ras, &e); + break; +- case CPU_NAPLES: ++ case CPU_AMD_SMCA: + rc = parse_amd_smca_event(ras, &e); + break; + default: /* All other CPU types are Intel */ +--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-05-26 15:17:15.409631590 -0400 ++++ rasdaemon-0.6.1/ras-mce-handler.h 2021-05-26 15:18:20.102038424 -0400 +@@ -50,7 +50,7 @@ enum cputype { + CPU_KNIGHTS_LANDING, + CPU_KNIGHTS_MILL, + CPU_SKYLAKE_XEON, +- CPU_NAPLES, ++ CPU_AMD_SMCA, + }; + + struct mce_event { diff --git a/SOURCES/854364ba44aee9bc5646f6537fc744b0b54aff37.patch b/SOURCES/854364ba44aee9bc5646f6537fc744b0b54aff37.patch new file mode 100644 index 0000000..91bad1b --- /dev/null +++ b/SOURCES/854364ba44aee9bc5646f6537fc744b0b54aff37.patch @@ -0,0 +1,38 @@ +commit 854364ba44aee9bc5646f6537fc744b0b54aff37 +Author: Muralidhara M K +Date: Thu Aug 20 21:00:57 2020 +0530 + + rasdaemon: Add 8 channel decoding for SMCA systems + + Current Scalable Machine Check Architecture (SMCA) systems support up + to 8 UMC channels. + + To find the UMC channel represented by a bank, look at the 6th nibble + in the MCA_IPID[InstanceId] field. + + Signed-off-by: Muralidhara M K + [ Adjust commit message. ] + Signed-off-by: Yazen Ghannam + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index d0b6cb6..7c619fd 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -438,15 +438,7 @@ static void amd_decode_errcode(struct mce_event *e) + */ + static int find_umc_channel(struct mce_event *e) + { +- uint32_t umc_instance_id[] = {0x50f00, 0x150f00}; +- uint32_t instance_id = EXTRACT(e->ipid, 0, 31); +- int i, channel = -1; +- +- for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++) +- if (umc_instance_id[i] == instance_id) +- channel = i; +- +- return channel; ++ return EXTRACT(e->ipid, 0, 31) >> 20; + } + /* Decode extended errors according to Scalable MCA specification */ + static void decode_smca_error(struct mce_event *e) diff --git a/SOURCES/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch b/SOURCES/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch new file mode 100644 index 0000000..e3617fc --- /dev/null +++ b/SOURCES/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch @@ -0,0 +1,207 @@ +commit 8704a85d8dc3483423ec2934fee8132f85f8fdb6 +Author: Brian WoodsGhannam, Yazen +Date: Fri Nov 1 15:48:14 2019 +0100 + + rasdaemon: add support for new AMD SMCA bank types + + Going forward, the Scalable Machine Check Architecture (SMCA) has some + updated and additional bank types which show up in Zen2. The differing + bank types include: CS_V2, PSP_V2, SMU_V2, MP5, NBIO, and PCIE. The V2 + bank types replace the original bank types but have unique HWID/MCAtype + IDs from the originals so there's no conflicts between different + versions or other bank types. All of the differing bank types have new + MCE descriptions which have been added as well. + + CC: "mchehab+samsung@kernel.org" , "Namburu, Chandu-babu" # Thread-Topic: [PATCH 2/2] rasdaemon: add support for new AMD SMCA bank types + Signed-off-by: Brian Woods + Signed-off-by: Yazen Ghannam + Cc: Chandu-babu Namburu + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 6c3e8a5..114e786 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -49,11 +49,17 @@ enum smca_bank_types { + SMCA_FP, /* Floating Point */ + SMCA_L3_CACHE, /* L3 Cache */ + SMCA_CS, /* Coherent Slave */ ++ SMCA_CS_V2, /* Coherent Slave V2 */ + SMCA_PIE, /* Power, Interrupts, etc. */ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ ++ SMCA_PSP_V2, /* Platform Security Processor V2 */ + SMCA_SMU, /* System Management Unit */ ++ SMCA_SMU_V2, /* System Management Unit V2 */ ++ SMCA_MP5, /* Microprocessor 5 Unit */ ++ SMCA_NBIO, /* Northbridge IO Unit */ ++ SMCA_PCIE, /* PCI Express Unit */ + N_SMCA_BANK_TYPES + }; + +@@ -165,6 +171,23 @@ static const char * const smca_cs_mce_desc[] = { + "Atomic request parity", + "ECC error on probe filter access", + }; ++/* Coherent Slave Unit V2 */ ++static const char * const smca_cs2_mce_desc[] = { ++ "Illegal Request", ++ "Address Violation", ++ "Security Violation", ++ "Illegal Response", ++ "Unexpected Response", ++ "Request or Probe Parity Error", ++ "Read Response Parity Error", ++ "Atomic Request Parity Error", ++ "SDP read response had no match in the CS queue", ++ "Probe Filter Protocol Error", ++ "Probe Filter ECC Error", ++ "SDP read response had an unexpected RETRY error", ++ "Counter overflow error", ++ "Counter underflow error", ++}; + /* Power, Interrupt, etc.. */ + static const char * const smca_pie_mce_desc[] = { + "HW assert", +@@ -189,10 +212,75 @@ static const char * const smca_pb_mce_desc[] = { + static const char * const smca_psp_mce_desc[] = { + "PSP RAM ECC or parity error", + }; ++/* Platform Security Processor V2 */ ++static const char * const smca_psp2_mce_desc[] = { ++ "High SRAM ECC or parity error", ++ "Low SRAM ECC or parity error", ++ "Instruction Cache Bank 0 ECC or parity error", ++ "Instruction Cache Bank 1 ECC or parity error", ++ "Instruction Tag Ram 0 parity error", ++ "Instruction Tag Ram 1 parity error", ++ "Data Cache Bank 0 ECC or parity error", ++ "Data Cache Bank 1 ECC or parity error", ++ "Data Cache Bank 2 ECC or parity error", ++ "Data Cache Bank 3 ECC or parity error", ++ "Data Tag Bank 0 parity error", ++ "Data Tag Bank 1 parity error", ++ "Data Tag Bank 2 parity error", ++ "Data Tag Bank 3 parity error", ++ "Dirty Data Ram parity error", ++ "TLB Bank 0 parity error", ++ "TLB Bank 1 parity error", ++ "System Hub Read Buffer ECC or parity error", ++}; + /* System Management Unit */ + static const char * const smca_smu_mce_desc[] = { + "SMU RAM ECC or parity error", + }; ++/* System Management Unit V2 */ ++static const char * const smca_smu2_mce_desc[] = { ++ "High SRAM ECC or parity error", ++ "Low SRAM ECC or parity error", ++ "Data Cache Bank A ECC or parity error", ++ "Data Cache Bank B ECC or parity error", ++ "Data Tag Cache Bank A ECC or parity error", ++ "Data Tag Cache Bank B ECC or parity error", ++ "Instruction Cache Bank A ECC or parity error", ++ "Instruction Cache Bank B ECC or parity error", ++ "Instruction Tag Cache Bank A ECC or parity error", ++ "Instruction Tag Cache Bank B ECC or parity error", ++ "System Hub Read Buffer ECC or parity error", ++}; ++/* Microprocessor 5 Unit */ ++static const char * const smca_mp5_mce_desc[] = { ++ "High SRAM ECC or parity error", ++ "Low SRAM ECC or parity error", ++ "Data Cache Bank A ECC or parity error", ++ "Data Cache Bank B ECC or parity error", ++ "Data Tag Cache Bank A ECC or parity error", ++ "Data Tag Cache Bank B ECC or parity error", ++ "Instruction Cache Bank A ECC or parity error", ++ "Instruction Cache Bank B ECC or parity error", ++ "Instruction Tag Cache Bank A ECC or parity error", ++ "Instruction Tag Cache Bank B ECC or parity error", ++}; ++/* Northbridge IO Unit */ ++static const char * const smca_nbio_mce_desc[] = { ++ "ECC or Parity error", ++ "PCIE error", ++ "SDP ErrEvent error", ++ "SDP Egress Poison Error", ++ "IOHC Internal Poison Error", ++}; ++/* PCI Express Unit */ ++static const char * const smca_pcie_mce_desc[] = { ++ "CCIX PER Message logging", ++ "CCIX Read Response with Status: Non-Data Error", ++ "CCIX Write Response with Status: Non-Data Error", ++ "CCIX Read Response with Status: Data Error", ++ "CCIX Non-okay write response with data error", ++}; ++ + + struct smca_mce_desc { + const char * const *descs; +@@ -208,11 +296,17 @@ static struct smca_mce_desc smca_mce_descs[] = { + [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, + [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, + [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, ++ [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, + [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, + [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, + [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, + [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, ++ [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)}, + [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, ++ [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)}, ++ [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, ++ [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)}, ++ [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)}, + }; + + struct smca_hwid { +@@ -235,6 +329,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + + /* Data Fabric MCA types */ + { SMCA_CS, 0x0000002E }, ++ { SMCA_CS_V2, 0x0002002E }, + { SMCA_PIE, 0x0001002E }, + + /* Unified Memory Controller MCA type */ +@@ -245,9 +340,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + + /* Platform Security Processor MCA type */ + { SMCA_PSP, 0x000000FF }, ++ { SMCA_PSP_V2, 0x000100FF }, + + /* System Management Unit MCA type */ + { SMCA_SMU, 0x00000001 }, ++ { SMCA_SMU_V2, 0x00010001 }, ++ ++ /* Microprocessor 5 Unit MCA type */ ++ { SMCA_MP5, 0x00020001 }, ++ ++ /* Northbridge IO Unit MCA type */ ++ { SMCA_NBIO, 0x00000018 }, ++ ++ /* PCI Express Unit MCA type */ ++ { SMCA_PCIE, 0x00000046 }, + }; + + struct smca_bank_name { +@@ -264,11 +370,17 @@ static struct smca_bank_name smca_names[] = { + [SMCA_FP] = { "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "L3 Cache" }, + [SMCA_CS] = { "Coherent Slave" }, ++ [SMCA_CS_V2] = { "Coherent Slave" }, + [SMCA_PIE] = { "Power, Interrupts, etc." }, + [SMCA_UMC] = { "Unified Memory Controller" }, + [SMCA_PB] = { "Parameter Block" }, + [SMCA_PSP] = { "Platform Security Processor" }, ++ [SMCA_PSP_V2] = { "Platform Security Processor" }, + [SMCA_SMU] = { "System Management Unit" }, ++ [SMCA_SMU_V2] = { "System Management Unit" }, ++ [SMCA_MP5] = { "Microprocessor 5 Unit" }, ++ [SMCA_NBIO] = { "Northbridge IO Unit" }, ++ [SMCA_PCIE] = { "PCI Express Unit" }, + }; + + static void amd_decode_errcode(struct mce_event *e) diff --git a/SOURCES/cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch b/SOURCES/cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch new file mode 100644 index 0000000..36c019d --- /dev/null +++ b/SOURCES/cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch @@ -0,0 +1,94 @@ +commit cc2ce5c65ed5a42eaa97aa3659854add6d808da5 +Author: Muralidhara M K +Date: Mon Jan 13 19:12:06 2020 +0530 + + rasdaemon: Add error decoding for new SMCA Load Store bank type + + Future Scalable Machine Check Architecture (SMCA) systems will have a + new Load Store bank type. + + Add the new type's (HWID, McaType) ID and error decoding. + + Signed-off-by: Muralidhara M K + [ Adjust commit message. ] + Signed-off-by: Yazen Ghannam + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 114e786..d0b6cb6 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -38,9 +38,16 @@ + * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0] + */ + ++/* MCA_STATUS REGISTER FOR FAMILY 19H ++ * The bits 24 ~ 29 contains AddressLsb ++ * 29: ADDRLS[5], 28: ADDRLS[4], 27: ADDRLS[3], ++ * 26: ADDRLS[2], 25: ADDRLS[1], 24: ADDRLS[0] ++ */ ++ + /* These may be used by multiple smca_hwid_mcatypes */ + enum smca_bank_types { + SMCA_LS = 0, /* Load Store */ ++ SMCA_LS_V2, /* Load Store */ + SMCA_IF, /* Instruction Fetch */ + SMCA_L2_CACHE, /* L2 Cache */ + SMCA_DE, /* Decoder Unit */ +@@ -88,6 +95,32 @@ static const char * const smca_ls_mce_desc[] = { + "DC tag error type 5", + "L2 fill data error", + }; ++static const char * const smca_ls2_mce_desc[] = { ++ "An ECC error was detected on a data cache read by a probe or victimization", ++ "An ECC error or L2 poison was detected on a data cache read by a load", ++ "An ECC error was detected on a data cache read-modify-write by a store", ++ "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization", ++ "An ECC error or poison bit mismatch was detected on a tag read by a load", ++ "An ECC error or poison bit mismatch was detected on a tag read by a store", ++ "An ECC error was detected on an EMEM read by a load", ++ "An ECC error was detected on an EMEM read-modify-write by a store", ++ "A parity error was detected in an L1 TLB entry by any access", ++ "A parity error was detected in an L2 TLB entry by any access", ++ "A parity error was detected in a PWC entry by any access", ++ "A parity error was detected in an STQ entry by any access", ++ "A parity error was detected in an LDQ entry by any access", ++ "A parity error was detected in a MAB entry by any access", ++ "A parity error was detected in an SCB entry state field by any access", ++ "A parity error was detected in an SCB entry address field by any access", ++ "A parity error was detected in an SCB entry data field by any access", ++ "A parity error was detected in a WCB entry by any access", ++ "A poisoned line was detected in an SCB entry by any access", ++ "A SystemReadDataError error was reported on read data returned from L2 for a load", ++ "A SystemReadDataError error was reported on read data returned from L2 for an SCB store", ++ "A SystemReadDataError error was reported on read data returned from L2 for a WCB store", ++ "A hardware assertion error was reported", ++ "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access", ++}; + /* Instruction Fetch */ + static const char * const smca_if_mce_desc[] = { + "microtag probe port parity error", +@@ -289,6 +322,7 @@ struct smca_mce_desc { + + static struct smca_mce_desc smca_mce_descs[] = { + [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, ++ [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) }, + [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, + [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, + [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, +@@ -319,6 +353,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + + /* ZN Core (HWID=0xB0) MCA types */ + { SMCA_LS, 0x000000B0 }, ++ { SMCA_LS_V2, 0x001000B0 }, + { SMCA_IF, 0x000100B0 }, + { SMCA_L2_CACHE, 0x000200B0 }, + { SMCA_DE, 0x000300B0 }, +@@ -362,6 +397,7 @@ struct smca_bank_name { + + static struct smca_bank_name smca_names[] = { + [SMCA_LS] = { "Load Store Unit" }, ++ [SMCA_LS_V2] = { "Load Store Unit" }, + [SMCA_IF] = { "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "L2 Cache" }, + [SMCA_DE] = { "Decode Unit" }, diff --git a/SPECS/rasdaemon.spec b/SPECS/rasdaemon.spec index 2746928..3c21a31 100644 --- a/SPECS/rasdaemon.spec +++ b/SPECS/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.1 -Release: 5%{?dist} +Release: 5.1%{?dist} Summary: Utility to receive RAS error tracings Group: Applications/System License: GPLv2 @@ -27,6 +27,10 @@ Patch1: 60a91e4da4f2daf2b10143fc148a8043312b61e5.patch Patch2: a16ca0711001957ee98f2c124abce0fa1f801529.patch Patch3: add_upstream_labels.patch Patch4: b22be68453b2497e86cbd273b9cd56fadc5859e3.patch +Patch5: 2a1d217660351c08eb2f8bccebf939abba2f7e69.patch +Patch6: 8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch +Patch7: cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch +Patch8: 854364ba44aee9bc5646f6537fc744b0b54aff37.patch %description %{name} is a RAS (Reliability, Availability and Serviceability) logging tool. @@ -44,6 +48,10 @@ an utility for reporting current error counts from the EDAC sysfs files. %patch2 -p1 %patch3 -p1 %patch4 -p1 +%patch5 -p1 +%patch6 -p1 +%patch7 -p1 +%patch8 -p1 %build %ifarch %{arm} aarch64 @@ -70,6 +78,9 @@ rm INSTALL %{buildroot}/usr/include/*.h %{_sysconfdir}/ras/dimm_labels.d %changelog +* Wed May 26 2021 Aristeu Rozanski 0.6.1-5.1 +- Add support for AMD SMCA [1975506] + * Wed Apr 08 2020 Aristeu Rozanski 0.6.1-5 - Fix high CPU usage when CPUs are offline [1683420]