diff --git a/SOURCES/73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch b/SOURCES/73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch new file mode 100644 index 0000000..2655de8 --- /dev/null +++ b/SOURCES/73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch @@ -0,0 +1,93 @@ +commit 73d8177ce0d2fcb7693cacee4778d0845ebd3788 +Author: sathya priya kumar +Date: Thu Jun 13 05:29:09 2024 +0000 + + rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits + + Optimize smca_smu2_mce_desc in better way from the commit ced615c. + + Update existing array with extended error descriptions instead + of creating new array, simplifying the code. + + Signed-off-by: Sathya Priya Kumar + Signed-off-by: Mauro Carvalho Chehab + +--- + mce-amd-smca.c | 29 +++-------------------------- + ras-mce-handler.h | 1 - + 2 files changed, 3 insertions(+), 27 deletions(-) + +--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-07-18 11:14:26.008582740 -0400 ++++ rasdaemon-0.6.7/mce-amd-smca.c 2024-07-18 11:15:05.510270132 -0400 +@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d + "An ECC or parity error in an SMU RAM instance", + }; + +-static const char * smca_smu2_mce_desc[64] = { ++static const char * const smca_smu2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", +@@ -410,14 +410,13 @@ static const char * smca_smu2_mce_desc[6 + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", + "PHY RAS ECC Error", +-}; +- +-static const char * smca_smu2_ext_mce_desc[] = { ++ [12 ... 57] = "Reserved", + "A correctable error from a GFX Sub-IP", + "A fatal error from a GFX Sub-IP", + "Reserved", + "Reserved", + "A poison error from a GFX Sub-IP", ++ "Reserved", + }; + + static const char * const smca_mp5_mce_desc[] = { +@@ -824,27 +823,6 @@ static struct smca_bank_name smca_names[ + [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" }, + }; + +-void smca_smu2_ext_err_desc(void) +-{ +- int i, j; +- int smu2_bits = 62; +- +- /* +- * MCA_CTL_SMU error stings are defined for b'58:59 and b'62 +- * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU +- * +- * b'0:11 can be decoded from existing array smca_smu2_mce_desc. +- * b'12:57 are Reserved and b'58:62 are appended to the +- * smca_smu2_mce_desc. +- */ +- for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) { +- for ( ; i < 58; i++) +- smca_smu2_mce_desc[i] = "Reserved"; +- +- smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j]; +- } +-} +- + void amd_decode_errcode(struct mce_event *e) + { + +@@ -936,7 +914,6 @@ unsigned short xec = (e->status >> 16) & + mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID, + (ipid_high & MCI_IPID_MCATYPE) >> 16); + +- smca_smu2_ext_err_desc(); + fixup_hwid(m, &mcatype_hwid); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { +--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-07-18 11:14:26.008582740 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.h 2024-07-18 11:14:28.987559165 -0400 +@@ -121,7 +121,6 @@ int set_intel_imc_log(enum cputype cputy + /* Undertake AMD SMCA Error Decoding */ + void decode_smca_error(struct mce_event *e, struct mce_priv *m); + void amd_decode_errcode(struct mce_event *e); +-void smca_smu2_ext_err_desc(void); + + /* Per-CPU-type decoders for Intel CPUs */ + void p4_decode_model(struct mce_event *e); diff --git a/SOURCES/7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch b/SOURCES/7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch new file mode 100644 index 0000000..b9615bd --- /dev/null +++ b/SOURCES/7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch @@ -0,0 +1,34 @@ +commit 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e +Author: Aristeu Rozanski +Date: Tue Apr 9 10:06:30 2024 -0400 + + mce-amd-smca: update smca_hwid to use smca_bank_types + + bank_type is used as smca_bank_types everywhere, there's no point in + declaring it as unsigned int. It also upsets covscan: + + 3. rasdaemon-0.6.7/mce-amd-smca.c:914: assignment: Assigning: "bank_type" = "s_hwid->bank_type". + 7. rasdaemon-0.6.7/mce-amd-smca.c:926: cond_at_most: Checking "bank_type >= 64U" implies that "bank_type" and "s_hwid->bank_type" may be up to 63 on the false branch. + 14. rasdaemon-0.6.7/mce-amd-smca.c:942: overrun-local: Overrunning array "smca_mce_descs" of 38 16-byte elements at element index 63 (byte offset 1023) using index "bank_type" (which evaluates to 63). + # 940| /* Only print the descriptor of valid extended error code */ + # 941| if (xec < smca_mce_descs[bank_type].num_descs) + # 942|-> mce_snprintf(e->mcastatus_msg, + # 943| "%s. Ext Err Code: %d", + # 944| smca_mce_descs[bank_type].descs[xec], + + Signed-off-by: Aristeu Rozanski + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 7521ff7..6632663 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -706,7 +706,7 @@ static struct smca_mce_desc smca_mce_descs[] = { + }; + + struct smca_hwid { +- unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/ ++ enum smca_bank_types bank_type; + uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/ + }; + diff --git a/SOURCES/885e546add918457c453bd3f753ac7df90b39e36.patch b/SOURCES/885e546add918457c453bd3f753ac7df90b39e36.patch new file mode 100644 index 0000000..e5a2e94 --- /dev/null +++ b/SOURCES/885e546add918457c453bd3f753ac7df90b39e36.patch @@ -0,0 +1,22 @@ +commit 885e546add918457c453bd3f753ac7df90b39e36 +Author: weidongkl +Date: Tue Sep 19 16:29:21 2023 +0800 + + Add a space between "diskerror_event" and "store" + + Signed-off-by: weidongkl + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-record.c b/ras-record.c +index a5f99ae..6b050bb 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -484,7 +484,7 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev + + if (!priv || !priv->stmt_diskerror_event) + return 0; +- log(TERM, LOG_INFO, "diskerror_eventstore: %p\n", priv->stmt_diskerror_event); ++ log(TERM, LOG_INFO, "diskerror_event store: %p\n", priv->stmt_diskerror_event); + + sqlite3_bind_text(priv->stmt_diskerror_event, 1, ev->timestamp, -1, NULL); + sqlite3_bind_text(priv->stmt_diskerror_event, 2, ev->dev, -1, NULL); diff --git a/SOURCES/9bd84aef87978b806178a73ed33c39d6c442fc1f.patch b/SOURCES/9bd84aef87978b806178a73ed33c39d6c442fc1f.patch new file mode 100644 index 0000000..adecd79 --- /dev/null +++ b/SOURCES/9bd84aef87978b806178a73ed33c39d6c442fc1f.patch @@ -0,0 +1,24 @@ +commit 9bd84aef87978b806178a73ed33c39d6c442fc1f +Author: weidong +Date: Tue Aug 8 08:59:12 2023 +0000 + + add ':' before error output + + All prints except disk are preceded by a colon + + Signed-off-by: weidong + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index dc326d3..13078c2 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1469,7 +1469,7 @@ sub errors + $out .= "\n"; + } + if ($out ne "") { +- print "Disk errors\n$out\n"; ++ print "Disk errors:\n$out\n"; + } else { + print "No disk errors.\n\n"; + } diff --git a/SOURCES/9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch b/SOURCES/9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch new file mode 100644 index 0000000..fe85c48 --- /dev/null +++ b/SOURCES/9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch @@ -0,0 +1,117 @@ +commit 9c86f6255f67a8bae28cd46c54500fc16bfc7a30 +Author: Yang Shi +Date: Mon Apr 4 16:34:05 2022 -0700 + + rasdaemon: use the new block_rq_error tracepoint + + Since Linux 5.18-rc1 a new block tracepoint called block_rq_error is + available for tracing disk error events dedicatedly. Currently + rasdaemon is using block_rq_complete which also traces successful cases. + It incurs excessive tracing logs and somehow overhead since the event is + triggered quite often. + + Use the new tracepoint for disk error reporting, and the new trace point + has the same format as block_rq_complete. + + Signed-off-by: Yang Shi + Signed-off-by: Mauro Carvalho Chehab + +--- + ras-events.c | 53 ++++++++++------------------------------------------- + ras-record.c | 2 +- + 2 files changed, 11 insertions(+), 44 deletions(-) + +--- rasdaemon-0.6.7.orig/ras-events.c 2024-05-14 11:05:40.020599541 -0400 ++++ rasdaemon-0.6.7/ras-events.c 2024-05-14 11:06:38.831067957 -0400 +@@ -27,6 +27,7 @@ * Foundation, Inc., 51 Franklin Street, + #include + #include + #include ++#include + #include "libtrace/kbuffer.h" + #include "libtrace/event-parse.h" + #include "ras-mc-handler.h" +@@ -229,7 +230,7 @@ if (rc < 0) { + #endif + + #ifdef HAVE_DISKERROR +- rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable); ++ rc |= __toggle_ras_mc_event(ras, "block", "block_rq_error", enable); + #endif + + #ifdef HAVE_MEMORY_FAILURE +@@ -241,37 +242,6 @@ free_ras: + return rc; + } + +-/* +- * Set kernel filter. libtrace doesn't provide an API for setting filters +- * in kernel, we have to implement it here. +- */ +-static int filter_ras_mc_event(struct ras_events *ras, char *group, char *event, +- const char *filter_str) +-{ +- int fd, rc; +- char fname[MAX_PATH + 1]; +- +- snprintf(fname, sizeof(fname), "events/%s/%s/filter", group, event); +- fd = open_trace(ras, fname, O_RDWR | O_APPEND); +- if (fd < 0) { +- log(ALL, LOG_WARNING, "Can't open filter file\n"); +- return errno; +- } +- +- rc = write(fd, filter_str ,strlen(filter_str)); +- if (rc < 0) { +- log(ALL, LOG_WARNING, "Can't write to filter file\n"); +- close(fd); +- return rc; +- } +- close(fd); +- if (!rc) { +- log(ALL, LOG_WARNING, "Nothing was written on filter file\n"); +- return EIO; +- } +- +- return 0; +-} + + /* + * Tracing read code +@@ -901,17 +871,14 @@ (void)open("/sys/kernel/debug/ras/daemon + #endif + + #ifdef HAVE_DISKERROR +- rc = filter_ras_mc_event(ras, "block", "block_rq_complete", "error != 0"); +- if (!rc) { +- rc = add_event_handler(ras, pevent, page_size, "block", +- "block_rq_complete", ras_diskerror_event_handler, +- NULL, DISKERROR_EVENT); +- if (!rc) +- num_events++; +- else +- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", +- "block", "block_rq_complete"); +- } ++ rc = add_event_handler(ras, pevent, page_size, "block", ++ "block_rq_error", ras_diskerror_event_handler, ++ NULL, DISKERROR_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "block", "block_rq_error"); + #endif + + #ifdef HAVE_MEMORY_FAILURE +--- rasdaemon-0.6.7.orig/ras-record.c 2024-05-14 11:07:24.573654494 -0400 ++++ rasdaemon-0.6.7/ras-record.c 2024-05-14 11:07:07.626807674 -0400 +@@ -456,7 +456,7 @@ return 0; + #endif + + /* +- * Table and functions to handle block:block_rq_complete ++ * Table and functions to handle block:block_rq_error + */ + + #ifdef HAVE_DISKERROR diff --git a/SOURCES/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch b/SOURCES/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch new file mode 100644 index 0000000..88356fb --- /dev/null +++ b/SOURCES/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch @@ -0,0 +1,94 @@ +commit ced615cf8146f51b5d6fe7a29107a2adc77407ca +Author: Sathya Priya Kumar +Date: Thu Jan 11 01:20:07 2024 -0600 + + rasdaemon: Add error decoding for MCA_CTL_SMU extended bits + + Enable error decoding support for the newly added extended + error bit descriptions from MCA_CTL_SMU. + b'0:11 can be decoded from existing array smca_smu2_mce_desc. + Define a function to append the newly defined b'58:62 to the + smca_smu2_mce_desc. This reduces the maintaining Reserved bits + from b'12:57 in the code. + + Signed-off-by: Sathya Priya Kumar + Signed-off-by: Mauro Carvalho Chehab + +--- + mce-amd-smca.c | 33 ++++++++++++++++++++++++++++++++- + ras-mce-handler.h | 1 + + 2 files changed, 33 insertions(+), 1 deletion(-) + +--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-06-28 10:34:16.453522865 -0400 ++++ rasdaemon-0.6.7/mce-amd-smca.c 2024-06-28 10:34:46.049124270 -0400 +@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d + "An ECC or parity error in an SMU RAM instance", + }; + +-static const char * const smca_smu2_mce_desc[] = { ++static const char * smca_smu2_mce_desc[64] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", +@@ -409,6 +409,15 @@ static const char * const smca_smu2_mce_ + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", ++ "PHY RAS ECC Error", ++}; ++ ++static const char * smca_smu2_ext_mce_desc[] = { ++ "A correctable error from a GFX Sub-IP", ++ "A fatal error from a GFX Sub-IP", ++ "Reserved", ++ "Reserved", ++ "A poison error from a GFX Sub-IP", + }; + + static const char * const smca_mp5_mce_desc[] = { +@@ -815,6 +824,27 @@ static struct smca_bank_name smca_names[ + [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" }, + }; + ++void smca_smu2_ext_err_desc(void) ++{ ++ int i, j; ++ int smu2_bits = 62; ++ ++ /* ++ * MCA_CTL_SMU error stings are defined for b'58:59 and b'62 ++ * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU ++ * ++ * b'0:11 can be decoded from existing array smca_smu2_mce_desc. ++ * b'12:57 are Reserved and b'58:62 are appended to the ++ * smca_smu2_mce_desc. ++ */ ++ for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) { ++ for ( ; i < 58; i++) ++ smca_smu2_mce_desc[i] = "Reserved"; ++ ++ smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j]; ++ } ++} ++ + void amd_decode_errcode(struct mce_event *e) + { + +@@ -906,6 +936,7 @@ unsigned short xec = (e->status >> 16) & + mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID, + (ipid_high & MCI_IPID_MCATYPE) >> 16); + ++ smca_smu2_ext_err_desc(); + fixup_hwid(m, &mcatype_hwid); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { +--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-06-28 10:34:16.453522865 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.h 2024-06-28 10:34:17.795508302 -0400 +@@ -121,6 +121,7 @@ int set_intel_imc_log(enum cputype cputy + /* Undertake AMD SMCA Error Decoding */ + void decode_smca_error(struct mce_event *e, struct mce_priv *m); + void amd_decode_errcode(struct mce_event *e); ++void smca_smu2_ext_err_desc(void); + + /* Per-CPU-type decoders for Intel CPUs */ + void p4_decode_model(struct mce_event *e); diff --git a/SPECS/rasdaemon.spec b/SPECS/rasdaemon.spec index 04d39dc..07eecb5 100644 --- a/SPECS/rasdaemon.spec +++ b/SPECS/rasdaemon.spec @@ -1,8 +1,8 @@ Name: rasdaemon Version: 0.6.7 -Release: 9%{?dist} +Release: 15%{?dist} Summary: Utility to receive RAS error tracings -License: GPLv2 +License: GPL-2.0-only URL: http://git.infradead.org/users/mchehab/rasdaemon.git Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2 Patch0: labels.patch @@ -33,6 +33,12 @@ Patch24: 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch Patch25: 2d15882a0cbfce0b905039bebc811ac8311cd739.patch Patch26: c785d309dcbdeb7ecd219975244f3944a8d047e9.patch Patch27: b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch +Patch28: 9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch +Patch29: 9bd84aef87978b806178a73ed33c39d6c442fc1f.patch +Patch30: 885e546add918457c453bd3f753ac7df90b39e36.patch +Patch31: 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch +Patch32: ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch +Patch33: 73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch ExcludeArch: s390 s390x BuildRequires: make @@ -95,6 +101,12 @@ an utility for reporting current error counts from the EDAC sysfs files. %patch25 -p1 %patch26 -p1 %patch27 -p1 +%patch28 -p1 +%patch29 -p1 +%patch30 -p1 +%patch31 -p1 +%patch32 -p1 +%patch33 -p1 # The tarball is locked in time the first time aclocal was ran and will keep # requiring an older version of automake @@ -130,6 +142,21 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir %{_sysconfdir}/sysconfig/rasdaemon %changelog +* Thu Jul 18 2024 Aristeu Rozanski 0.6.7-14 +- rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits [RHEL-48819] + +* Fri Jun 28 2024 Aristeu Rozanski 0.6.7-13 +- rasdaemon: Add error decoding for MCA_CTL_SMU extended bits [RHEL-35718] + +* Thu Jun 20 2024 Aristeu Rozanski 0.6.7-12 +- mce-amd-smca: update smca_hwid to use smca_bank_types [RHEL-24170] + +* Wed May 08 2024 Aristeu Rozanski 0.6.7-11 +- Fix excessive block messages [RHEL-8708] + +* Wed Jan 10 2024 Aristeu Rozanski 0.6.7-10 +- Update License string to use SPDX [RHELMISC-1262] + * Thu Oct 26 2023 Aristeu Rozanski 0.6.7-9 - Update SMCA support for AMD processors [RHEL-11092]