diff --git a/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch b/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch new file mode 100644 index 0000000..88356fb --- /dev/null +++ b/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch @@ -0,0 +1,94 @@ +commit ced615cf8146f51b5d6fe7a29107a2adc77407ca +Author: Sathya Priya Kumar +Date: Thu Jan 11 01:20:07 2024 -0600 + + rasdaemon: Add error decoding for MCA_CTL_SMU extended bits + + Enable error decoding support for the newly added extended + error bit descriptions from MCA_CTL_SMU. + b'0:11 can be decoded from existing array smca_smu2_mce_desc. + Define a function to append the newly defined b'58:62 to the + smca_smu2_mce_desc. This reduces the maintaining Reserved bits + from b'12:57 in the code. + + Signed-off-by: Sathya Priya Kumar + Signed-off-by: Mauro Carvalho Chehab + +--- + mce-amd-smca.c | 33 ++++++++++++++++++++++++++++++++- + ras-mce-handler.h | 1 + + 2 files changed, 33 insertions(+), 1 deletion(-) + +--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-06-28 10:34:16.453522865 -0400 ++++ rasdaemon-0.6.7/mce-amd-smca.c 2024-06-28 10:34:46.049124270 -0400 +@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d + "An ECC or parity error in an SMU RAM instance", + }; + +-static const char * const smca_smu2_mce_desc[] = { ++static const char * smca_smu2_mce_desc[64] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", +@@ -409,6 +409,15 @@ static const char * const smca_smu2_mce_ + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", ++ "PHY RAS ECC Error", ++}; ++ ++static const char * smca_smu2_ext_mce_desc[] = { ++ "A correctable error from a GFX Sub-IP", ++ "A fatal error from a GFX Sub-IP", ++ "Reserved", ++ "Reserved", ++ "A poison error from a GFX Sub-IP", + }; + + static const char * const smca_mp5_mce_desc[] = { +@@ -815,6 +824,27 @@ static struct 
smca_bank_name smca_names[ + [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" }, + }; + ++void smca_smu2_ext_err_desc(void) ++{ ++ int i, j; ++ int smu2_bits = 62; ++ ++ /* ++ * MCA_CTL_SMU error strings are defined for b'58:59 and b'62 ++ * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU ++ * ++ * b'0:11 can be decoded from existing array smca_smu2_mce_desc. ++ * b'12:57 are marked Reserved and b'58:62 are copied from ++ * smca_smu2_ext_mce_desc into smca_smu2_mce_desc. ++ */ ++ for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) { ++ for ( ; i < 58; i++) ++ smca_smu2_mce_desc[i] = "Reserved"; ++ ++ smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j]; ++ } ++} ++ + void amd_decode_errcode(struct mce_event *e) + { + +@@ -906,6 +936,7 @@ unsigned short xec = (e->status >> 16) & + mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID, + (ipid_high & MCI_IPID_MCATYPE) >> 16); + ++ smca_smu2_ext_err_desc(); + fixup_hwid(m, &mcatype_hwid); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { +--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-06-28 10:34:16.453522865 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.h 2024-06-28 10:34:17.795508302 -0400 +@@ -121,6 +121,7 @@ int set_intel_imc_log(enum cputype cputy + /* Undertake AMD SMCA Error Decoding */ + void decode_smca_error(struct mce_event *e, struct mce_priv *m); + void amd_decode_errcode(struct mce_event *e); ++void smca_smu2_ext_err_desc(void); + + /* Per-CPU-type decoders for Intel CPUs */ + void p4_decode_model(struct mce_event *e); diff --git a/rasdaemon.spec b/rasdaemon.spec index fb2aa7a..d8eaa5d 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,6 +1,6 @@ Name: rasdaemon Version: 0.6.7 -Release: 12%{?dist} +Release: 13%{?dist} Summary: Utility to receive RAS error tracings License: GPL-2.0-only URL: http://git.infradead.org/users/mchehab/rasdaemon.git @@ -37,6 +37,7 @@ Patch28: 9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch Patch29: 9bd84aef87978b806178a73ed33c39d6c442fc1f.patch Patch30: 
885e546add918457c453bd3f753ac7df90b39e36.patch Patch31: 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch +Patch32: ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch ExcludeArch: s390 s390x BuildRequires: make @@ -103,6 +104,7 @@ an utility for reporting current error counts from the EDAC sysfs files. %patch29 -p1 %patch30 -p1 %patch31 -p1 +%patch32 -p1 # The tarball is locked in time the first time aclocal was ran and will keep # requiring an older version of automake @@ -138,6 +140,9 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir %{_sysconfdir}/sysconfig/rasdaemon %changelog +* Fri Jun 28 2024 Aristeu Rozanski 0.6.7-13 +- rasdaemon: Add error decoding for MCA_CTL_SMU extended bits [RHEL-35718] + * Thu Jun 20 2024 Aristeu Rozanski 0.6.7-12 - mce-amd-smca: update smca_hwid to use smca_bank_types [RHEL-24170]