diff --git a/SOURCES/28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch b/SOURCES/28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch new file mode 100644 index 0000000..fdc509b --- /dev/null +++ b/SOURCES/28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch @@ -0,0 +1,28 @@ +commit 28ea956acc2dab7c18b4701f9657afb9ab3ddc79 +Author: Muralidhara M K +Date: Mon Jul 12 05:18:43 2021 -0500 + + rasdaemon: set SMCA maximum number of banks to 64 + + Newer AMD systems with SMCA banks support up to 64 MCA banks per CPU. + + This patch is based on the commit below upstremed into the kernel: + a0bc32b3cacf ("x86/mce: Increase maximum number of banks to 64") + + Signed-off-by: Muralidhara M K + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index e0cf512..3c346f4 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -75,6 +75,9 @@ enum smca_bank_types { + N_SMCA_BANK_TYPES + }; + ++/* Maximum number of MCA banks per CPU. */ ++#define MAX_NR_BANKS 64 ++ + /* SMCA Extended error strings */ + /* Load Store */ + static const char * const smca_ls_mce_desc[] = { diff --git a/SOURCES/7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch b/SOURCES/7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch new file mode 100644 index 0000000..76afc8e --- /dev/null +++ b/SOURCES/7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch @@ -0,0 +1,24 @@ +commit 7937f0d6c2aaaed096f3a3d306416743c0dcb7a4 +Author: Muralidhara M K +Date: Wed Jul 28 01:52:12 2021 -0500 + + rasdaemon: Support MCE for AMD CPU family 19h + + Add support for family 19h x86 CPUs from AMD. + + Signed-off-by: Muralidhara M K + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-mce-handler.c b/ras-mce-handler.c +index 805004a..f2b53d4 100644 +--- a/ras-mce-handler.c ++++ b/ras-mce-handler.c +@@ -208,7 +208,7 @@ static int detect_cpu(struct ras_events *ras) + mce->cputype = CPU_AMD_SMCA; + goto ret; + } +- if (mce->family > 23) { ++ if (mce->family > 25) { + log(ALL, LOG_INFO, + "Can't parse MCE for this AMD CPU yet %d\n", + mce->family); diff --git a/SOURCES/9acef39f13833f7d53ef96abc5a72e79384260f4.patch b/SOURCES/9acef39f13833f7d53ef96abc5a72e79384260f4.patch new file mode 100644 index 0000000..c4c8af1 --- /dev/null +++ b/SOURCES/9acef39f13833f7d53ef96abc5a72e79384260f4.patch @@ -0,0 +1,230 @@ +commit 9acef39f13833f7d53ef96abc5a72e79384260f4 +Author: Naveen Krishna Chatradhi +Date: Tue Jun 1 11:01:17 2021 +0530 + + rasdaemon: Add new SMCA bank types with error decoding + + Upcoming systems with Scalable Machine Check Architecture (SMCA) have + new MCA banks added. + + This patch adds the (HWID, MCATYPE) tuple, name and error decoding for + those new SMCA banks. + While at it, optimize the string names in smca_bank_name[]. + + Signed-off-by: Muralidhara M K + Signed-off-by: Naveen Krishna Chatradhi + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 7c619fd..e0cf512 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -47,7 +47,7 @@ + /* These may be used by multiple smca_hwid_mcatypes */ + enum smca_bank_types { + SMCA_LS = 0, /* Load Store */ +- SMCA_LS_V2, /* Load Store */ ++ SMCA_LS_V2, + SMCA_IF, /* Instruction Fetch */ + SMCA_L2_CACHE, /* L2 Cache */ + SMCA_DE, /* Decoder Unit */ +@@ -56,17 +56,22 @@ enum smca_bank_types { + SMCA_FP, /* Floating Point */ + SMCA_L3_CACHE, /* L3 Cache */ + SMCA_CS, /* Coherent Slave */ +- SMCA_CS_V2, /* Coherent Slave V2 */ ++ SMCA_CS_V2, + SMCA_PIE, /* Power, Interrupts, etc. */ + SMCA_UMC, /* Unified Memory Controller */ ++ SMCA_UMC_V2, + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ +- SMCA_PSP_V2, /* Platform Security Processor V2 */ ++ SMCA_PSP_V2, + SMCA_SMU, /* System Management Unit */ +- SMCA_SMU_V2, /* System Management Unit V2 */ ++ SMCA_SMU_V2, + SMCA_MP5, /* Microprocessor 5 Unit */ + SMCA_NBIO, /* Northbridge IO Unit */ + SMCA_PCIE, /* PCI Express Unit */ ++ SMCA_PCIE_V2, ++ SMCA_XGMI_PCS, /* xGMI PCS Unit */ ++ SMCA_XGMI_PHY, /* xGMI PHY Unit */ ++ SMCA_WAFL_PHY, /* WAFL PHY Unit */ + N_SMCA_BANK_TYPES + }; + +@@ -237,6 +242,22 @@ static const char * const smca_umc_mce_desc[] = { + "Command/address parity error", + "Write data CRC error", + }; ++ ++static const char * const smca_umc2_mce_desc[] = { ++ "DRAM ECC error", ++ "Data poison error", ++ "SDP parity error", ++ "Reserved", ++ "Address/Command parity error", ++ "Write data parity error", ++ "DCQ SRAM ECC error", ++ "Reserved", ++ "Read data parity error", ++ "Rdb SRAM ECC error", ++ "RdRsp SRAM ECC error", ++ "LM32 MP errors", ++}; ++ + /* Parameter Block */ + static const char * const smca_pb_mce_desc[] = { + "Parameter Block RAM ECC error", +@@ -314,6 +335,55 @@ static const char * const smca_pcie_mce_desc[] = { + "CCIX Non-okay write response with data error", + }; + ++static const char * const smca_pcie2_mce_desc[] = { ++ "SDP Parity Error logging", ++}; ++ ++static const char * const smca_xgmipcs_mce_desc[] = { ++ "Data Loss Error", ++ "Training Error", ++ "Flow Control Acknowledge Error", ++ "Rx Fifo Underflow Error", ++ "Rx Fifo Overflow Error", ++ "CRC Error", ++ "BER Exceeded Error", ++ "Tx Vcid Data Error", ++ "Replay Buffer Parity Error", ++ "Data Parity Error", ++ "Replay Fifo Overflow Error", ++ "Replay Fifo Underflow Error", ++ "Elastic Fifo Overflow Error", ++ "Deskew Error", ++ "Flow Control CRC Error", ++ "Data Startup Limit Error", ++ "FC Init Timeout Error", ++ "Recovery Timeout Error", ++ "Ready Serial Timeout Error", ++ "Ready Serial Attempt Error", ++ "Recovery Attempt Error", ++ "Recovery Relock Attempt Error", ++ "Replay Attempt Error", ++ "Sync Header Error", ++ "Tx Replay Timeout Error", ++ "Rx Replay Timeout Error", ++ "LinkSub Tx Timeout Error", ++ "LinkSub Rx Timeout Error", ++ "Rx CMD Pocket Error", ++}; ++ ++static const char * const smca_xgmiphy_mce_desc[] = { ++ "RAM ECC Error", ++ "ARC instruction buffer parity error", ++ "ARC data buffer parity error", ++ "PHY APB error", ++}; ++ ++static const char * const smca_waflphy_mce_desc[] = { ++ "RAM ECC Error", ++ "ARC instruction buffer parity error", ++ "ARC data buffer parity error", ++ "PHY APB error", ++}; + + struct smca_mce_desc { + const char * const *descs; +@@ -333,6 +403,7 @@ static struct smca_mce_desc smca_mce_descs[] = { + [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, + [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, + [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, ++ [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) }, + [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, + [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, + [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)}, +@@ -341,6 +412,10 @@ static struct smca_mce_desc smca_mce_descs[] = { + [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, + [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)}, + [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)}, ++ [SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) }, ++ [SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) }, ++ [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, ++ [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) }, + }; + + struct smca_hwid { +@@ -369,6 +444,8 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + + /* Unified Memory Controller MCA type */ + { SMCA_UMC, 0x00000096 }, ++ /* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */ ++ { SMCA_UMC_V2, 0x00010096 }, + + /* Parameter Block MCA type */ + { SMCA_PB, 0x00000005 }, +@@ -389,6 +466,16 @@ static struct smca_hwid smca_hwid_mcatypes[] = { + + /* PCI Express Unit MCA type */ + { SMCA_PCIE, 0x00000046 }, ++ { SMCA_PCIE_V2, 0x00010046 }, ++ ++ /* Ext Global Memory Interconnect PCS MCA type */ ++ { SMCA_XGMI_PCS, 0x00000050 }, ++ ++ /* Ext Global Memory Interconnect PHY MCA type */ ++ { SMCA_XGMI_PHY, 0x00000259 }, ++ ++ /* WAFL PHY MCA type */ ++ { SMCA_WAFL_PHY, 0x00000267 }, + }; + + struct smca_bank_name { +@@ -396,27 +483,28 @@ struct smca_bank_name { + }; + + static struct smca_bank_name smca_names[] = { +- [SMCA_LS] = { "Load Store Unit" }, +- [SMCA_LS_V2] = { "Load Store Unit" }, +- [SMCA_IF] = { "Instruction Fetch Unit" }, +- [SMCA_L2_CACHE] = { "L2 Cache" }, +- [SMCA_DE] = { "Decode Unit" }, +- [SMCA_RESERVED] = { "Reserved" }, +- [SMCA_EX] = { "Execution Unit" }, +- [SMCA_FP] = { "Floating Point Unit" }, +- [SMCA_L3_CACHE] = { "L3 Cache" }, +- [SMCA_CS] = { "Coherent Slave" }, +- [SMCA_CS_V2] = { "Coherent Slave" }, +- [SMCA_PIE] = { "Power, Interrupts, etc." }, +- [SMCA_UMC] = { "Unified Memory Controller" }, +- [SMCA_PB] = { "Parameter Block" }, +- [SMCA_PSP] = { "Platform Security Processor" }, +- [SMCA_PSP_V2] = { "Platform Security Processor" }, +- [SMCA_SMU] = { "System Management Unit" }, +- [SMCA_SMU_V2] = { "System Management Unit" }, +- [SMCA_MP5] = { "Microprocessor 5 Unit" }, +- [SMCA_NBIO] = { "Northbridge IO Unit" }, +- [SMCA_PCIE] = { "PCI Express Unit" }, ++ [SMCA_LS ... SMCA_LS_V2] = { "Load Store Unit" }, ++ [SMCA_IF] = { "Instruction Fetch Unit" }, ++ [SMCA_L2_CACHE] = { "L2 Cache" }, ++ [SMCA_DE] = { "Decode Unit" }, ++ [SMCA_RESERVED] = { "Reserved" }, ++ [SMCA_EX] = { "Execution Unit" }, ++ [SMCA_FP] = { "Floating Point Unit" }, ++ [SMCA_L3_CACHE] = { "L3 Cache" }, ++ [SMCA_CS ... SMCA_CS_V2] = { "Coherent Slave" }, ++ [SMCA_PIE] = { "Power, Interrupts, etc." }, ++ [SMCA_UMC] = { "Unified Memory Controller" }, ++ [SMCA_UMC_V2] = { "Unified Memory Controller V2" }, ++ [SMCA_PB] = { "Parameter Block" }, ++ [SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" }, ++ [SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" }, ++ [SMCA_MP5] = { "Microprocessor 5 Unit" }, ++ [SMCA_NBIO] = { "Northbridge IO Unit" }, ++ [SMCA_PCIE ... SMCA_PCIE_V2] = { "PCI Express Unit" }, ++ [SMCA_XGMI_PCS] = { "Ext Global Memory Interconnect PCS Unit" }, ++ [SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" }, ++ [SMCA_WAFL_PHY] = { "WAFL PHY Unit" }, ++ + }; + + static void amd_decode_errcode(struct mce_event *e) diff --git a/SOURCES/aecf33aa70331670c06db6b652712b476e24051c.patch b/SOURCES/aecf33aa70331670c06db6b652712b476e24051c.patch new file mode 100644 index 0000000..fd557ec --- /dev/null +++ b/SOURCES/aecf33aa70331670c06db6b652712b476e24051c.patch @@ -0,0 +1,107 @@ +commit aecf33aa70331670c06db6b652712b476e24051c +Author: Muralidhara M K +Date: Mon Jul 12 05:40:46 2021 -0500 + + rasdaemon: Enumerate memory on noncpu nodes + + On newer heterogeneous systems from AMD with GPU nodes (with HBM2 memory + banks) connected via xGMI links to the CPUs. + + The node id information is available in the InstanceHI[47:44] of + the IPID register. + + The UMC Phys on Aldeberan nodes are enumerated as csrow + The UMC channels connected to HBMs are enumerated as ranks. + + Signed-off-by: Muralidhara M K + Signed-off-by: Naveen Krishna Chatradhi + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 3c346f4..f3379fc 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -78,6 +78,12 @@ enum smca_bank_types { + /* Maximum number of MCA banks per CPU. */ + #define MAX_NR_BANKS 64 + ++/* ++ * On Newer heterogeneous systems from AMD with CPU and GPU nodes connected ++ * via xGMI links, the NON CPU Nodes are enumerated from index 8 ++ */ ++#define NONCPU_NODE_INDEX 8 ++ + /* SMCA Extended error strings */ + /* Load Store */ + static const char * const smca_ls_mce_desc[] = { +@@ -531,6 +537,26 @@ static int find_umc_channel(struct mce_event *e) + { + return EXTRACT(e->ipid, 0, 31) >> 20; + } ++ ++/* ++ * The HBM memory managed by the UMCCH of the noncpu node ++ * can be calculated based on the [15:12]bits of IPID ++ */ ++static int find_hbm_channel(struct mce_event *e) ++{ ++ int umc, tmp; ++ ++ umc = EXTRACT(e->ipid, 0, 31) >> 20; ++ ++ /* ++ * The HBM channel managed by the UMC of the noncpu node ++ * can be calculated based on the [15:12]bits of IPID as follows ++ */ ++ tmp = ((e->ipid >> 12) & 0xf); ++ ++ return (umc % 2) ? tmp + 4 : tmp; ++} ++ + /* Decode extended errors according to Scalable MCA specification */ + static void decode_smca_error(struct mce_event *e) + { +@@ -539,6 +565,7 @@ static void decode_smca_error(struct mce_event *e) + unsigned short xec = (e->status >> 16) & 0x3f; + const struct smca_hwid *s_hwid; + uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63); ++ uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47); + unsigned int csrow = -1, channel = -1; + unsigned int i; + +@@ -548,14 +575,16 @@ static void decode_smca_error(struct mce_event *e) + bank_type = s_hwid->bank_type; + break; + } ++ if (mcatype_instancehi >= NONCPU_NODE_INDEX) ++ bank_type = SMCA_UMC_V2; + } + +- if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) { ++ if (i >= MAX_NR_BANKS) { + strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID"); + return; + } + +- if (bank_type >= N_SMCA_BANK_TYPES) { ++ if (bank_type >= MAX_NR_BANKS) { + strcpy(e->mcastatus_msg, "Don't know how to decode this bank"); + return; + } +@@ -580,6 +609,16 @@ static void decode_smca_error(struct mce_event *e) + mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d", + channel, csrow); + } ++ ++ if (bank_type == SMCA_UMC_V2 && xec == 0) { ++ /* The UMCPHY is reported as csrow in case of noncpu nodes */ ++ csrow = find_umc_channel(e) / 2; ++ /* UMCCH is managing the HBM memory */ ++ channel = find_hbm_channel(e); ++ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d", ++ channel, csrow); ++ } ++ + } + + int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e) diff --git a/SOURCES/rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch b/SOURCES/rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch new file mode 100644 index 0000000..c4517a4 --- /dev/null +++ b/SOURCES/rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch @@ -0,0 +1,47 @@ +From: Muralidhara M K + +This patch removes trailing spaces at the end of a line from +file location and fixes --layout option to parse dimm nodes +to get the size from ras-mc-ctl. + +Issue is reported https://github.com/mchehab/rasdaemon/issues/43 +Where '> ras-mc-ctl --layout' reports all 0s + +With this change the layout prints the correct dimm sizes +> sudo ras-mc-ctl --layout + +-----------------------------------------------+ + | mc0 | + | csrow0 | csrow1 | csrow2 | csrow3 | +----------+-----------------------------------------------+ +... +channel7: | 16384 MB | 0 MB | 0 MB | 0 MB | +channel6: | 16384 MB | 0 MB | 0 MB | 0 MB | +... +----------+-----------------------------------------------+ + +Signed-off-by: Muralidhara M K +Signed-off-by: Naveen Krishna Chatradhi +--- + util/ras-mc-ctl.in | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 1e3aeb7..b22dd60 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -246,6 +246,7 @@ sub parse_dimm_nodes + if (($file =~ /max_location$/)) { + open IN, $file; + my $location = ; ++ $location =~ s/\s+$//; + close IN; + my @temp = split(/ /, $location); + +@@ -288,6 +289,7 @@ sub parse_dimm_nodes + + open IN, $file; + my $location = ; ++ $location =~ s/\s+$//; + close IN; + + my @pos; diff --git a/SPECS/rasdaemon.spec b/SPECS/rasdaemon.spec index 3c21a31..e5768ad 100644 --- a/SPECS/rasdaemon.spec +++ b/SPECS/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.1 -Release: 5.1%{?dist} +Release: 7%{?dist} Summary: Utility to receive RAS error tracings Group: Applications/System License: GPLv2 @@ -31,6 +31,11 @@ Patch5: 2a1d217660351c08eb2f8bccebf939abba2f7e69.patch Patch6: 8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch Patch7: cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch Patch8: 854364ba44aee9bc5646f6537fc744b0b54aff37.patch +Patch9: 9acef39f13833f7d53ef96abc5a72e79384260f4.patch +Patch10: 28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch +Patch11: aecf33aa70331670c06db6b652712b476e24051c.patch +Patch12: 7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch +Patch13: rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch %description %{name} is a RAS (Reliability, Availability and Serviceability) logging tool. @@ -52,6 +57,11 @@ an utility for reporting current error counts from the EDAC sysfs files. %patch6 -p1 %patch7 -p1 %patch8 -p1 +%patch9 -p1 +%patch10 -p1 +%patch11 -p1 +%patch12 -p1 +%patch13 -p1 %build %ifarch %{arm} aarch64 @@ -78,8 +88,11 @@ rm INSTALL %{buildroot}/usr/include/*.h %{_sysconfdir}/ras/dimm_labels.d %changelog -* Wed May 26 2021 Aristeu Rozanski 0.6.1-5.1 -- Add support for AMD SMCA [1975506] +* Thu Aug 26 2021 Aristeu Rozanski 0.6.1-7 +- Add support for AMD SMCA banks for family 19 [1991955] + +* Wed May 26 2021 Aristeu Rozanski 0.6.1-6 +- Add support for AMD SMCA [1965011] * Wed Apr 08 2020 Aristeu Rozanski 0.6.1-5 - Fix high CPU usage when CPUs are offline [1683420]