diff --git a/SOURCES/045ab08eaa00172d50621df9502f6910f3fe3af4.patch b/SOURCES/045ab08eaa00172d50621df9502f6910f3fe3af4.patch new file mode 100644 index 0000000..99887d4 --- /dev/null +++ b/SOURCES/045ab08eaa00172d50621df9502f6910f3fe3af4.patch @@ -0,0 +1,154 @@ +commit 045ab08eaa00172d50621df9502f6910f3fe3af4 +Author: Avadhut Naik +Date: Mon Apr 1 23:33:07 2024 -0500 + + rasdaemon: Add support to parse the PPIN field of mce tracepoint + + Support for exporting the PPIN (Protected Processor Inventory Number) + is being added to the mce_record tracepoint. + + Add the required, corresponding support in the rasdaemon for the field + to be parsed and logged or added to the database and viewed later through + ras-mc-ctl utility. + + Signed-off-by: Avadhut Naik + Signed-off-by: Mauro Carvalho Chehab + +--- + ras-mce-handler.c | 7 +++++++ + ras-mce-handler.h | 1 + + ras-record.c | 42 ++++++++++++++++++++++-------------------- + util/ras-mc-ctl.in | 7 ++++--- + 4 files changed, 34 insertions(+), 23 deletions(-) + +--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-08-22 14:30:41.581276901 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.c 2024-08-22 14:33:19.940957855 -0400 +@@ -369,6 +369,9 @@ #if 0 + + trace_seq_printf(s, ", apicid= %x", e->apicid); + ++ if (e->ppin) ++ trace_seq_printf(s, ", ppin= %llx", (long long)e->ppin); ++ + /* + * FIXME: The original mcelog userspace tool uses DMI to map from + * address to DIMM. From the comments there, the code there doesn't +@@ -545,6 +548,10 @@ if (pevent_get_field_val(s, event, "ipid + return -1; + e.ipid = val; + ++ /* Get PPIN */ ++ if (!pevent_get_field_val(s, event, "ppin", record, &val, 1)) ++ e.ppin = val; ++ + switch (mce->cputype) { + case CPU_GENERIC: + break; +--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-08-22 14:30:41.581276901 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.h 2024-08-22 14:33:47.312729865 -0400 +@@ -74,6 +74,7 @@ struct mce_event { + uint8_t cpuvendor; + uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */ + uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ ++ uint64_t ppin; + + /* Parsed data */ + char timestamp[64]; +--- rasdaemon-0.6.7.orig/ras-record.c 2024-08-22 14:30:41.581276901 -0400 ++++ rasdaemon-0.6.7/ras-record.c 2024-08-22 14:37:20.884941279 -0400 +@@ -330,19 +330,20 @@ static const struct db_fields mce_record + { .name="ip", .type="INTEGER" }, + { .name="tsc", .type="INTEGER" }, + { .name="walltime", .type="INTEGER" }, +- { .name="cpu", .type="INTEGER" }, // 10 ++ { .name = "ppin", .type = "INTEGER" }, // 10 ++ { .name="cpu", .type="INTEGER" }, + { .name="cpuid", .type="INTEGER" }, + { .name="apicid", .type="INTEGER" }, + { .name="socketid", .type="INTEGER" }, +- { .name="cs", .type="INTEGER" }, +- { .name="bank", .type="INTEGER" }, //15 ++ { .name="cs", .type="INTEGER" }, //15 ++ { .name="bank", .type="INTEGER" }, + { .name="cpuvendor", .type="INTEGER" }, + + /* Parsed data - will likely change */ + { .name="bank_name", .type="TEXT" }, + { .name="error_msg", .type="TEXT" }, +- { .name="mcgstatus_msg", .type="TEXT" }, +- { .name="mcistatus_msg", .type="TEXT" }, // 20 ++ { .name="mcgstatus_msg", .type="TEXT" }, // 20 ++ { .name="mcistatus_msg", .type="TEXT" }, + { .name="mcastatus_msg", .type="TEXT" }, + { .name="user_action", .type="TEXT" }, + { .name="mc_location", .type="TEXT" }, +@@ -372,21 +373,22 @@ return 0; + sqlite3_bind_int64 (priv->stmt_mce_record, 7, ev->ip); + sqlite3_bind_int64 (priv->stmt_mce_record, 8, ev->tsc); + sqlite3_bind_int64 (priv->stmt_mce_record, 9, ev->walltime); +- sqlite3_bind_int (priv->stmt_mce_record, 10, ev->cpu); +- sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpuid); +- sqlite3_bind_int (priv->stmt_mce_record, 12, ev->apicid); +- sqlite3_bind_int (priv->stmt_mce_record, 13, ev->socketid); +- sqlite3_bind_int (priv->stmt_mce_record, 14, ev->cs); +- sqlite3_bind_int (priv->stmt_mce_record, 15, ev->bank); +- sqlite3_bind_int (priv->stmt_mce_record, 16, ev->cpuvendor); +- +- sqlite3_bind_text(priv->stmt_mce_record, 17, ev->bank_name, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 18, ev->error_msg, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 19, ev->mcgstatus_msg, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcistatus_msg, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcastatus_msg, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 22, ev->user_action, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 23, ev->mc_location, -1, NULL); ++ sqlite3_bind_int64(priv->stmt_mce_record, 10, ev->ppin); ++ sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpu); ++ sqlite3_bind_int (priv->stmt_mce_record, 12, ev->cpuid); ++ sqlite3_bind_int (priv->stmt_mce_record, 13, ev->apicid); ++ sqlite3_bind_int (priv->stmt_mce_record, 14, ev->socketid); ++ sqlite3_bind_int (priv->stmt_mce_record, 15, ev->cs); ++ sqlite3_bind_int (priv->stmt_mce_record, 16, ev->bank); ++ sqlite3_bind_int (priv->stmt_mce_record, 17, ev->cpuvendor); ++ ++ sqlite3_bind_text(priv->stmt_mce_record, 18, ev->bank_name, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 19, ev->error_msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcgstatus_msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcistatus_msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcastatus_msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 23, ev->user_action, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 24, ev->mc_location, -1, NULL); + + rc = sqlite3_step(priv->stmt_mce_record); + if (rc != SQLITE_OK && rc != SQLITE_DONE) +--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-22 14:29:51.058697724 -0400 ++++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-22 14:34:53.803175632 -0400 +@@ -1317,7 +1317,7 @@ sub errors + { + require DBI; + my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out); +- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location); ++ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location); + my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data); + my ($bus_name, $dev_name, $driver_name, $reporter_name); + my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd); +@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add + + # MCE mce_record errors + if ($has_mce == 1) { +- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id"; ++ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location)); ++ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location)); + $out = ""; + while($query_handle->fetch()) { + $out .= "$id $time error: $msg"; +@@ -1507,6 +1507,7 @@ $out .= sprintf ", misc=0x%08x", $misc i + $out .= sprintf ", ip=0x%08x", $ip if ($ip); + $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc); + $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime); ++ $out .= sprintf ", ppin=0x%08x", $ppin if ($ppin); + $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu); + $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid); + $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid); diff --git a/SOURCES/73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch b/SOURCES/73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch new file mode 100644 index 0000000..2655de8 --- /dev/null +++ b/SOURCES/73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch @@ -0,0 +1,93 @@ +commit 73d8177ce0d2fcb7693cacee4778d0845ebd3788 +Author: sathya priya kumar +Date: Thu Jun 13 05:29:09 2024 +0000 + + rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits + + Optimize smca_smu2_mce_desc in better way from the commit ced615c. + + Update existing array with extended error descriptions instead + of creating new array, simplifying the code. + + Signed-off-by: Sathya Priya Kumar + Signed-off-by: Mauro Carvalho Chehab + +--- + mce-amd-smca.c | 29 +++-------------------------- + ras-mce-handler.h | 1 - + 2 files changed, 3 insertions(+), 27 deletions(-) + +--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-07-18 11:14:26.008582740 -0400 ++++ rasdaemon-0.6.7/mce-amd-smca.c 2024-07-18 11:15:05.510270132 -0400 +@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d + "An ECC or parity error in an SMU RAM instance", + }; + +-static const char * smca_smu2_mce_desc[64] = { ++static const char * const smca_smu2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", +@@ -410,14 +410,13 @@ static const char * smca_smu2_mce_desc[6 + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", + "PHY RAS ECC Error", +-}; +- +-static const char * smca_smu2_ext_mce_desc[] = { ++ [12 ... 57] = "Reserved", + "A correctable error from a GFX Sub-IP", + "A fatal error from a GFX Sub-IP", + "Reserved", + "Reserved", + "A poison error from a GFX Sub-IP", ++ "Reserved", + }; + + static const char * const smca_mp5_mce_desc[] = { +@@ -824,27 +823,6 @@ static struct smca_bank_name smca_names[ + [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" }, + }; + +-void smca_smu2_ext_err_desc(void) +-{ +- int i, j; +- int smu2_bits = 62; +- +- /* +- * MCA_CTL_SMU error stings are defined for b'58:59 and b'62 +- * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU +- * +- * b'0:11 can be decoded from existing array smca_smu2_mce_desc. +- * b'12:57 are Reserved and b'58:62 are appended to the +- * smca_smu2_mce_desc. +- */ +- for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) { +- for ( ; i < 58; i++) +- smca_smu2_mce_desc[i] = "Reserved"; +- +- smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j]; +- } +-} +- + void amd_decode_errcode(struct mce_event *e) + { + +@@ -936,7 +914,6 @@ unsigned short xec = (e->status >> 16) & + mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID, + (ipid_high & MCI_IPID_MCATYPE) >> 16); + +- smca_smu2_ext_err_desc(); + fixup_hwid(m, &mcatype_hwid); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { +--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-07-18 11:14:26.008582740 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.h 2024-07-18 11:14:28.987559165 -0400 +@@ -121,7 +121,6 @@ int set_intel_imc_log(enum cputype cputy + /* Undertake AMD SMCA Error Decoding */ + void decode_smca_error(struct mce_event *e, struct mce_priv *m); + void amd_decode_errcode(struct mce_event *e); +-void smca_smu2_ext_err_desc(void); + + /* Per-CPU-type decoders for Intel CPUs */ + void p4_decode_model(struct mce_event *e); diff --git a/SOURCES/79065939fc4bc1da72a3718937fab80e73a6dd75.patch b/SOURCES/79065939fc4bc1da72a3718937fab80e73a6dd75.patch new file mode 100644 index 0000000..8f4e2a3 --- /dev/null +++ b/SOURCES/79065939fc4bc1da72a3718937fab80e73a6dd75.patch @@ -0,0 +1,128 @@ +commit 79065939fc4bc1da72a3718937fab80e73a6dd75 +Author: Avadhut Naik +Date: Tue Apr 2 00:07:38 2024 -0500 + + rasdaemon: Add support to parse microcode field of mce tracepoint + + Support for exporting the Microcode Revision is being added to the + mce_record tracepoint. + + Add the required, corresponding support in the rasdaemon for the field + to be parsed and logged or added to the database and viewed later through + ras-mc-ctl utility. + + Signed-off-by: Avadhut Naik + Signed-off-by: Mauro Carvalho Chehab + +--- + ras-mce-handler.c | 7 +++++++ + ras-mce-handler.h | 1 + + ras-record.c | 20 +++++++++++--------- + util/ras-mc-ctl.in | 7 ++++--- + 4 files changed, 23 insertions(+), 12 deletions(-) + +--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-08-22 14:44:51.352160832 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.c 2024-08-22 14:44:51.361160757 -0400 +@@ -372,6 +372,9 @@ #if 0 + if (e->ppin) + trace_seq_printf(s, ", ppin= %llx", (long long)e->ppin); + ++ if (e->microcode) ++ trace_seq_printf(s, ", microcode= %x", e->microcode); ++ + /* + * FIXME: The original mcelog userspace tool uses DMI to map from + * address to DIMM. From the comments there, the code there doesn't +@@ -552,6 +555,10 @@ if (pevent_get_field_val(s, event, "ipid + if (!pevent_get_field_val(s, event, "ppin", record, &val, 1)) + e.ppin = val; + ++ /* Get Microcode Revision */ ++ if (!pevent_get_field_val(s, event, "microcode", record, &val, 1)) ++ e.microcode = val; ++ + switch (mce->cputype) { + case CPU_GENERIC: + break; +--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-08-22 14:44:51.352160832 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.h 2024-08-22 14:44:51.361160757 -0400 +@@ -75,6 +75,7 @@ struct mce_event { + uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */ + uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ + uint64_t ppin; ++ uint32_t microcode; + + /* Parsed data */ + char timestamp[64]; +--- rasdaemon-0.6.7.orig/ras-record.c 2024-08-22 14:44:51.353160824 -0400 ++++ rasdaemon-0.6.7/ras-record.c 2024-08-22 14:44:51.362160748 -0400 +@@ -338,11 +338,12 @@ { .name = "ppin", .type = "INTEGER" }, + { .name="cs", .type="INTEGER" }, //15 + { .name="bank", .type="INTEGER" }, + { .name="cpuvendor", .type="INTEGER" }, ++ { .name = "microcode", .type = "INTEGER" }, + + /* Parsed data - will likely change */ + { .name="bank_name", .type="TEXT" }, +- { .name="error_msg", .type="TEXT" }, +- { .name="mcgstatus_msg", .type="TEXT" }, // 20 ++ { .name="error_msg", .type="TEXT" }, // 20 ++ { .name="mcgstatus_msg", .type="TEXT" }, + { .name="mcistatus_msg", .type="TEXT" }, + { .name="mcastatus_msg", .type="TEXT" }, + { .name="user_action", .type="TEXT" }, +@@ -381,14 +382,15 @@ sqlite3_bind_int64(priv->stmt_mce_record + sqlite3_bind_int (priv->stmt_mce_record, 15, ev->cs); + sqlite3_bind_int (priv->stmt_mce_record, 16, ev->bank); + sqlite3_bind_int (priv->stmt_mce_record, 17, ev->cpuvendor); ++ sqlite3_bind_int (priv->stmt_mce_record, 18, ev->microcode); + +- sqlite3_bind_text(priv->stmt_mce_record, 18, ev->bank_name, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 19, ev->error_msg, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcgstatus_msg, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcistatus_msg, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcastatus_msg, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 23, ev->user_action, -1, NULL); +- sqlite3_bind_text(priv->stmt_mce_record, 24, ev->mc_location, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 19, ev->bank_name, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 20, ev->error_msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcgstatus_msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcistatus_msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 23, ev->mcastatus_msg, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 24, ev->user_action, -1, NULL); ++ sqlite3_bind_text(priv->stmt_mce_record, 25, ev->mc_location, -1, NULL); + + rc = sqlite3_step(priv->stmt_mce_record); + if (rc != SQLITE_OK && rc != SQLITE_DONE) +--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-22 14:44:51.353160824 -0400 ++++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-22 14:44:51.362160748 -0400 +@@ -1317,7 +1317,7 @@ sub errors + { + require DBI; + my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out); +- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location); ++ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $microcode, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location); + my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data); + my ($bus_name, $dev_name, $driver_name, $reporter_name); + my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd); +@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add + + # MCE mce_record errors + if ($has_mce == 1) { +- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id"; ++ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, microcode, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location)); ++ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $microcode, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location)); + $out = ""; + while($query_handle->fetch()) { + $out .= "$id $time error: $msg"; +@@ -1514,6 +1514,7 @@ $out .= sprintf ", apicid=0x%08x", $apic + $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid); + $out .= sprintf ", cs=0x%08x", $cs if ($cs); + $out .= sprintf ", bank=0x%08x", $bank if ($bank); ++ $out .= sprintf ", microcode=0x%08x", $microcode if ($microcode); + + $out .= "\n"; + } diff --git a/SOURCES/794530fbf270eae9f6f43c6d0bbd3ec6f2b210f3.patch b/SOURCES/794530fbf270eae9f6f43c6d0bbd3ec6f2b210f3.patch new file mode 100644 index 0000000..7d3e6a1 --- /dev/null +++ b/SOURCES/794530fbf270eae9f6f43c6d0bbd3ec6f2b210f3.patch @@ -0,0 +1,35 @@ +commit 794530fbf270eae9f6f43c6d0bbd3ec6f2b210f3 +Author: hubin +Date: Thu May 18 16:14:41 2023 +0800 + + ras-events: quit loop in read_ras_event when kbuf data is broken + + when kbuf data is broken, kbuffer_next_event() may move kbuf->index back to + the current kbuf->index position, causing dead loop. + + In this situation, rasdaemon will repeatedly parse an invalid event, and + print warning like "ug! negative record size -8!", pushing cpu utilization + rate to 100%. + + when kbuf data is broken, discard current page and continue reading next page + kbuf. + + Signed-off-by: hubin + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-events.c b/ras-events.c +index 2662467..fced7ab 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -512,6 +512,11 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata, + kbuffer_load_subbuffer(kbuf, page); + + while ((data = kbuffer_read_event(kbuf, &time_stamp))) { ++ if (kbuffer_curr_size(kbuf) < 0) { ++ log(TERM, LOG_ERR, "invalid kbuf data, discard\n"); ++ break; ++ } ++ + parse_ras_data(&pdata[i], + kbuf, data, time_stamp); + diff --git a/SOURCES/7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch b/SOURCES/7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch new file mode 100644 index 0000000..b9615bd --- /dev/null +++ b/SOURCES/7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch @@ -0,0 +1,34 @@ +commit 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e +Author: Aristeu Rozanski +Date: Tue Apr 9 10:06:30 2024 -0400 + + mce-amd-smca: update smca_hwid to use smca_bank_types + + bank_type is used as smca_bank_types everywhere, there's no point in + declaring it as unsigned int. It also upsets covscan: + + 3. rasdaemon-0.6.7/mce-amd-smca.c:914: assignment: Assigning: "bank_type" = "s_hwid->bank_type". + 7. rasdaemon-0.6.7/mce-amd-smca.c:926: cond_at_most: Checking "bank_type >= 64U" implies that "bank_type" and "s_hwid->bank_type" may be up to 63 on the false branch. + 14. rasdaemon-0.6.7/mce-amd-smca.c:942: overrun-local: Overrunning array "smca_mce_descs" of 38 16-byte elements at element index 63 (byte offset 1023) using index "bank_type" (which evaluates to 63). + # 940| /* Only print the descriptor of valid extended error code */ + # 941| if (xec < smca_mce_descs[bank_type].num_descs) + # 942|-> mce_snprintf(e->mcastatus_msg, + # 943| "%s. Ext Err Code: %d", + # 944| smca_mce_descs[bank_type].descs[xec], + + Signed-off-by: Aristeu Rozanski + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 7521ff7..6632663 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -706,7 +706,7 @@ static struct smca_mce_desc smca_mce_descs[] = { + }; + + struct smca_hwid { +- unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/ ++ enum smca_bank_types bank_type; + uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/ + }; + diff --git a/SOURCES/83a3ced797256dcb1c93f8de4266fd7545fbfb3b.patch b/SOURCES/83a3ced797256dcb1c93f8de4266fd7545fbfb3b.patch new file mode 100644 index 0000000..4ffb2aa --- /dev/null +++ b/SOURCES/83a3ced797256dcb1c93f8de4266fd7545fbfb3b.patch @@ -0,0 +1,95 @@ +commit 83a3ced797256dcb1c93f8de4266fd7545fbfb3b +Author: Avadhut Naik +Date: Tue Nov 21 14:04:19 2023 -0600 + + rasdaemon: Add support for vendor-specific machine check error information + + Some CPU vendors may provide additional vendor-specific machine check + error information. AMD, for example, provides FRU Text through SYND 1/2 + registers if BIT 9 of SMCA_CONFIG register is set. + + Add support to display the additional vendor-specific error information, + if any. + + Signed-off-by: Avadhut Naik + Signed-off-by: Mauro Carvalho Chehab + +--- + mce-amd-smca.c | 12 ++++++++++++ + ras-mce-handler.c | 22 ++++++++++++++++++++++ + ras-mce-handler.h | 3 +++ + 3 files changed, 37 insertions(+) + +--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-11-27 10:18:13.765255836 -0500 ++++ rasdaemon-0.6.7/mce-amd-smca.c 2024-11-27 10:18:23.014169756 -0500 +@@ -999,6 +999,18 @@ if (bank_type == SMCA_UMC_V2 && xec == 0 + channel, csrow); + } + ++ ++ if (e->vdata_len) { ++ uint64_t smca_config = e->vdata[2]; ++ ++ /* ++ * BIT 9 of the CONFIG register of a few SMCA Bank types indicates ++ * presence of FRU Text in SYND 1 / 2 registers ++ */ ++ if (smca_config & BIT(9)) ++ memcpy(e->frutext, e->vdata, 16); ++ } ++ + } + + int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e) +--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-11-27 10:18:23.014169756 -0500 ++++ rasdaemon-0.6.7/ras-mce-handler.c 2024-11-27 10:19:38.849463954 -0500 +@@ -375,6 +375,25 @@ #if 0 + if (e->microcode) + trace_seq_printf(s, ", microcode= %x", e->microcode); + ++ if (!e->vdata_len) ++ return; ++ ++ if (strlen(e->frutext)) { ++ trace_seq_printf(s, ", FRU Text= %s", e->frutext); ++ trace_seq_printf(s, ", Vendor Data= "); ++ for (int i = 2; i < e->vdata_len/8; i++) { ++ trace_seq_printf(s, "0x%lx", e->vdata[i]); ++ trace_seq_printf(s, " "); ++ } ++ } else { ++ trace_seq_printf(s, ", Vendor Data= "); ++ for (int i = 0; i < e->vdata_len/8; i ++) { ++ trace_seq_printf(s, "0x%lx", e->vdata[i]); ++ trace_seq_printf(s, " "); ++ } ++ } ++ ++ + /* + * FIXME: The original mcelog userspace tool uses DMI to map from + * address to DIMM. From the comments there, the code there doesn't +@@ -559,6 +578,9 @@ if (pevent_get_field_val(s, event, "ipid + if (!pevent_get_field_val(s, event, "microcode", record, &val, 1)) + e.microcode = val; + ++ /* Get Vendor-specfic Data, if any */ ++ e.vdata = pevent_get_field_raw(s, event, "v_data", record, &e.vdata_len, 1); ++ + switch (mce->cputype) { + case CPU_GENERIC: + break; +--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-11-27 10:18:23.014169756 -0500 ++++ rasdaemon-0.6.7/ras-mce-handler.h 2024-11-27 10:20:05.249218250 -0500 +@@ -76,8 +76,11 @@ struct mce_event { + uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ + uint64_t ppin; + uint32_t microcode; ++ int32_t vdata_len; ++ const uint64_t *vdata; + + /* Parsed data */ ++ char frutext[17]; + char timestamp[64]; + char bank_name[64]; + char error_msg[4096]; diff --git a/SOURCES/885e546add918457c453bd3f753ac7df90b39e36.patch b/SOURCES/885e546add918457c453bd3f753ac7df90b39e36.patch new file mode 100644 index 0000000..e5a2e94 --- /dev/null +++ b/SOURCES/885e546add918457c453bd3f753ac7df90b39e36.patch @@ -0,0 +1,22 @@ +commit 885e546add918457c453bd3f753ac7df90b39e36 +Author: weidongkl +Date: Tue Sep 19 16:29:21 2023 +0800 + + Add a space between "diskerror_event" and "store" + + Signed-off-by: weidongkl + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-record.c b/ras-record.c +index a5f99ae..6b050bb 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -484,7 +484,7 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev + + if (!priv || !priv->stmt_diskerror_event) + return 0; +- log(TERM, LOG_INFO, "diskerror_eventstore: %p\n", priv->stmt_diskerror_event); ++ log(TERM, LOG_INFO, "diskerror_event store: %p\n", priv->stmt_diskerror_event); + + sqlite3_bind_text(priv->stmt_diskerror_event, 1, ev->timestamp, -1, NULL); + sqlite3_bind_text(priv->stmt_diskerror_event, 2, ev->dev, -1, NULL); diff --git a/SOURCES/8b536321cc0679fb82d4ea7521f9375d88cec0cc.patch b/SOURCES/8b536321cc0679fb82d4ea7521f9375d88cec0cc.patch new file mode 100644 index 0000000..d9c0b34 --- /dev/null +++ b/SOURCES/8b536321cc0679fb82d4ea7521f9375d88cec0cc.patch @@ -0,0 +1,75 @@ +commit 8b536321cc0679fb82d4ea7521f9375d88cec0cc +Author: Avadhut Naik +Date: Thu Nov 7 06:24:44 2024 +0000 + + rasdaemon: Modify support for vendor-specific machine check error information + + Commit 83a3ced797256d ("rasdaemon: Add support for vendor-specific + machine check error information") assumes that MCA_CONFIG MSR will be + exported as part of vendor-specific error information through the MCE + tracepoint. + + The same, however, is not true anymore. MCA_CONFIG MSR will not be + exported through the MCE tracepoint. Instead, the data from MCA_SYND1/2 + MSRs, exported as vendor-specific error information on newer AMD SOCs, + should always be interpreted as FRUText. + + Modify the error decoding support accordingly. + + Fixes: 83a3ced797256d ("rasdaemon: Add support for vendor-specific + machine check error information") + Signed-off-by: Avadhut Naik + Signed-off-by: Mauro Carvalho Chehab + +--- + mce-amd-smca.c | 13 ++----------- + ras-mce-handler.c | 15 +-------------- + 2 files changed, 3 insertions(+), 25 deletions(-) + +--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-11-27 10:20:29.777989960 -0500 ++++ rasdaemon-0.6.7/mce-amd-smca.c 2024-11-27 10:21:28.731441278 -0500 +@@ -1000,17 +1000,8 @@ if (bank_type == SMCA_UMC_V2 && xec == 0 + } + + +- if (e->vdata_len) { +- uint64_t smca_config = e->vdata[2]; +- +- /* +- * BIT 9 of the CONFIG register of a few SMCA Bank types indicates +- * presence of FRU Text in SYND 1 / 2 registers +- */ +- if (smca_config & BIT(9)) +- memcpy(e->frutext, e->vdata, 16); +- } +- ++ if (e->vdata_len) ++ memcpy(e->frutext, e->vdata, 16); + } + + int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e) +--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-11-27 10:20:29.777989960 -0500 ++++ rasdaemon-0.6.7/ras-mce-handler.c 2024-11-27 10:21:01.517694557 -0500 +@@ -378,21 +378,8 @@ #if 0 + if (!e->vdata_len) + return; + +- if (strlen(e->frutext)) { ++ if (strlen(e->frutext)) + trace_seq_printf(s, ", FRU Text= %s", e->frutext); +- trace_seq_printf(s, ", Vendor Data= "); +- for (int i = 2; i < e->vdata_len/8; i++) { +- trace_seq_printf(s, "0x%lx", e->vdata[i]); +- trace_seq_printf(s, " "); +- } +- } else { +- trace_seq_printf(s, ", Vendor Data= "); +- for (int i = 0; i < e->vdata_len/8; i ++) { +- trace_seq_printf(s, "0x%lx", e->vdata[i]); +- trace_seq_printf(s, " "); +- } +- } +- + + /* + * FIXME: The original mcelog userspace tool uses DMI to map from diff --git a/SOURCES/9bd84aef87978b806178a73ed33c39d6c442fc1f.patch b/SOURCES/9bd84aef87978b806178a73ed33c39d6c442fc1f.patch new file mode 100644 index 0000000..adecd79 --- /dev/null +++ b/SOURCES/9bd84aef87978b806178a73ed33c39d6c442fc1f.patch @@ -0,0 +1,24 @@ +commit 9bd84aef87978b806178a73ed33c39d6c442fc1f +Author: weidong +Date: Tue Aug 8 08:59:12 2023 +0000 + + add ':' before error output + + All prints except disk are preceded by a colon + + Signed-off-by: weidong + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index dc326d3..13078c2 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1469,7 +1469,7 @@ sub errors + $out .= "\n"; + } + if ($out ne "") { +- print "Disk errors\n$out\n"; ++ print "Disk errors:\n$out\n"; + } else { + print "No disk errors.\n\n"; + } diff --git a/SOURCES/9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch b/SOURCES/9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch new file mode 100644 index 0000000..fe85c48 --- /dev/null +++ b/SOURCES/9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch @@ -0,0 +1,117 @@ +commit 9c86f6255f67a8bae28cd46c54500fc16bfc7a30 +Author: Yang Shi +Date: Mon Apr 4 16:34:05 2022 -0700 + + rasdaemon: use the new block_rq_error tracepoint + + Since Linux 5.18-rc1 a new block tracepoint called block_rq_error is + available for tracing disk error events dedicatedly. Currently + rasdaemon is using block_rq_complete which also traces successful cases. + It incurs excessive tracing logs and somehow overhead since the event is + triggered quite often. + + Use the new tracepoint for disk error reporting, and the new trace point + has the same format as block_rq_complete. + + Signed-off-by: Yang Shi + Signed-off-by: Mauro Carvalho Chehab + +--- + ras-events.c | 53 ++++++++++------------------------------------------- + ras-record.c | 2 +- + 2 files changed, 11 insertions(+), 44 deletions(-) + +--- rasdaemon-0.6.7.orig/ras-events.c 2024-05-14 11:05:40.020599541 -0400 ++++ rasdaemon-0.6.7/ras-events.c 2024-05-14 11:06:38.831067957 -0400 +@@ -27,6 +27,7 @@ * Foundation, Inc., 51 Franklin Street, + #include + #include + #include ++#include + #include "libtrace/kbuffer.h" + #include "libtrace/event-parse.h" + #include "ras-mc-handler.h" +@@ -229,7 +230,7 @@ if (rc < 0) { + #endif + + #ifdef HAVE_DISKERROR +- rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable); ++ rc |= __toggle_ras_mc_event(ras, "block", "block_rq_error", enable); + #endif + + #ifdef HAVE_MEMORY_FAILURE +@@ -241,37 +242,6 @@ free_ras: + return rc; + } + +-/* +- * Set kernel filter. libtrace doesn't provide an API for setting filters +- * in kernel, we have to implement it here. +- */ +-static int filter_ras_mc_event(struct ras_events *ras, char *group, char *event, +- const char *filter_str) +-{ +- int fd, rc; +- char fname[MAX_PATH + 1]; +- +- snprintf(fname, sizeof(fname), "events/%s/%s/filter", group, event); +- fd = open_trace(ras, fname, O_RDWR | O_APPEND); +- if (fd < 0) { +- log(ALL, LOG_WARNING, "Can't open filter file\n"); +- return errno; +- } +- +- rc = write(fd, filter_str ,strlen(filter_str)); +- if (rc < 0) { +- log(ALL, LOG_WARNING, "Can't write to filter file\n"); +- close(fd); +- return rc; +- } +- close(fd); +- if (!rc) { +- log(ALL, LOG_WARNING, "Nothing was written on filter file\n"); +- return EIO; +- } +- +- return 0; +-} + + /* + * Tracing read code +@@ -901,17 +871,14 @@ (void)open("/sys/kernel/debug/ras/daemon + #endif + + #ifdef HAVE_DISKERROR +- rc = filter_ras_mc_event(ras, "block", "block_rq_complete", "error != 0"); +- if (!rc) { +- rc = add_event_handler(ras, pevent, page_size, "block", +- "block_rq_complete", ras_diskerror_event_handler, +- NULL, DISKERROR_EVENT); +- if (!rc) +- num_events++; +- else +- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", +- "block", "block_rq_complete"); +- } ++ rc = add_event_handler(ras, pevent, page_size, "block", ++ "block_rq_error", ras_diskerror_event_handler, ++ NULL, DISKERROR_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "block", "block_rq_error"); + #endif + + #ifdef HAVE_MEMORY_FAILURE +--- rasdaemon-0.6.7.orig/ras-record.c 2024-05-14 11:07:24.573654494 -0400 ++++ rasdaemon-0.6.7/ras-record.c 2024-05-14 11:07:07.626807674 -0400 +@@ -456,7 +456,7 @@ return 0; + #endif + + /* +- * Table and functions to handle block:block_rq_complete ++ * Table and functions to handle block:block_rq_error + */ + + #ifdef HAVE_DISKERROR diff --git a/SOURCES/ad0444190e02bca309a61a4bad51bc0e16c0aef5.patch b/SOURCES/ad0444190e02bca309a61a4bad51bc0e16c0aef5.patch new file mode 100644 index 0000000..d289ea4 --- /dev/null +++ b/SOURCES/ad0444190e02bca309a61a4bad51bc0e16c0aef5.patch @@ -0,0 +1,134 @@ +commit ad0444190e02bca309a61a4bad51bc0e16c0aef5 +Author: Avadhut Naik +Date: Fri May 10 13:20:19 2024 -0500 + + rasdaemon: Update SMCA bank error descriptions + + Update error descriptions of SMCA bank types to support AMD's new Family + 1Ah-based processors. + Also, modify some existing error descriptions to better reflect the error + received. + + Signed-off-by: Avadhut Naik + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/mce-amd-smca.c b/mce-amd-smca.c +index 6632663..a55e013 100644 +--- a/mce-amd-smca.c ++++ b/mce-amd-smca.c +@@ -108,7 +108,7 @@ static const char * const smca_ls_mce_desc[] = { + "Store queue parity", + "Miss address buffer payload parity", + "L1 TLB parity", +- "Reserved", ++ "DC Tag error type 5", + "DC tag error type 6", + "DC tag error type 1", + "Internal error type 1", +@@ -125,6 +125,12 @@ static const char * const smca_ls_mce_desc[] = { + "DC tag error type 3", + "DC tag error type 5", + "L2 fill data error", ++ "Error on SCB cacheline state or address field", ++ "Error on SCB data, commit pipe 0", ++ "Error on SCB data, commit pipe 1", ++ "Error on SCB data for non-cacheable DRAM or IO", ++ "System Read Data Error detected by write combine buffer", ++ "Hardware Asserts", + }; + + static const char * const smca_ls2_mce_desc[] = { +@@ -168,7 +174,7 @@ static const char * const smca_if_mce_desc[] = { + "BP L1-BTB Multi-Hit Error", + "BP L2-BTB Multi-Hit Error", + "L2 Cache Response Poison error", +- "L2 Cache Error Response", ++ "System Read Data error", + "Hardware Assertion Error", + "L1-TLB Multi-Hit", + "L2-TLB Multi-Hit", +@@ -182,6 +188,7 @@ static const char * const smca_l2_mce_desc[] = { + "L2M Data Array ECC Error", + "Hardware Assert Error", + "SDP Read Response Parity Error", ++ "Error initiated by programmable state machine", + }; + + static const char * const smca_de_mce_desc[] = { +@@ -193,7 +200,7 @@ static const char * const smca_de_mce_desc[] = { + "Fetch address FIFO parity error", + "Patch RAM data parity error", + "Patch RAM sequencer parity error", +- "Micro-op buffer parity error", ++ "Micro-op fetch queue parity error", + "Hardware Assertion MCA Error", + }; + +@@ -235,6 +242,7 @@ static const char * const smca_l3_mce_desc[] = { + "L3 victim queue Data Fabric error", + "L3 Hardware Assertion", + "XI WCB Parity Poison Creation event", ++ "Machine check error initiated by DSM action", + }; + + static const char * const smca_cs_mce_desc[] = { +@@ -268,6 +276,9 @@ static const char * const smca_cs2_mce_desc[] = { + "Address Violation on the no data channel", + "Security Violation on the no data channel", + "Hardware Assert Error", ++ "Shadow Tag Array Protocol Error", ++ "Shadow Tag ECC Error", ++ "Shadow Tag Transaction Error", + }; + + /* +@@ -303,6 +314,8 @@ static const char * const smca_pie_mce_desc[] = { + "A deferred error was detected in the DF", + "Watch Dog Timer", + "An SRAM ECC error was detected in the CNLI block", ++ "Register access during DF Cstate", ++ "DSM Error", + }; + + static const char * const smca_umc_mce_desc[] = { +@@ -318,6 +331,11 @@ static const char * const smca_umc_mce_desc[] = { + "ECS Error", + "UMC Throttling Error", + "Read CRC Error", ++ "Reserved", ++ "Reserved", ++ "Reserved", ++ "Reserved", ++ "RFM SRAM ECC error", + }; + + static const char * const smca_umc_quirk_mce_desc[] = { +@@ -391,6 +409,12 @@ static const char * const smca_psp2_mce_desc[] = { + "TLB Bank 0 parity error", + "TLB Bank 1 parity error", + "System Hub Read Buffer ECC or parity error", ++ "FUSE IP SRAM ECC or parity error", ++ "PCRU FUSE SRAM ECC or parity error", ++ "SIB SRAM parity error", ++ "mpASP SECEMC Error", ++ "mpASP A5 Hang", ++ "SIB WDT error", + }; + + static const char * const smca_smu_mce_desc[] = { +@@ -431,6 +455,7 @@ static const char * const smca_mp5_mce_desc[] = { + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", ++ "Fuse SRAM ECC or parity error", + }; + + static const char * const smca_mpdma_mce_desc[] = { +@@ -483,6 +508,7 @@ static const char * const smca_mpdma_mce_desc[] = { + "MPDMA PTE Internal Data FIFO ECC or parity error", + "MPDMA PTE Command Memory DMA ECC or parity error", + "MPDMA PTE Command Memory Internal ECC or parity error", ++ "MPDMA TVF SDP Master Memory 7 ECC or parity error", + }; + + static const char * const smca_nbio_mce_desc[] = { diff --git a/SOURCES/b1ace39286e287282a275b6edc90dc2f64e60a3c.patch b/SOURCES/b1ace39286e287282a275b6edc90dc2f64e60a3c.patch new file mode 100644 index 0000000..0f5280f --- /dev/null +++ b/SOURCES/b1ace39286e287282a275b6edc90dc2f64e60a3c.patch @@ -0,0 +1,56 @@ +commit b1ace39286e287282a275b6edc90dc2f64e60a3c +Author: Avadhut Naik +Date: Mon Mar 25 23:06:08 2024 -0500 + + rasdaemon: ras-mc-ctl: Add support to display mcastatus_msg string + + Currently, the mcastatus_msg string of struct mce_event is added to the + SQLite database by the rasdaemon when it is recording errors. The same + however, is not outputted by the ras-mc-ctl utility. + + The string provides important error information relating to the received + MCE. For example, on AMD SMCA systems, the string outputs extended error + code and description. As such, the string should be present in the + output of ras-mc-ctl utility. + + Add support to output the string through the ras-mc-ctl utility. + + Signed-off-by: Avadhut Naik + Signed-off-by: Mauro Carvalho Chehab + +--- + util/ras-mc-ctl.in | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-19 15:08:29.246429487 -0400 ++++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-19 15:10:55.478162148 -0400 +@@ -1317,7 +1317,7 @@ sub errors + { + require DBI; + my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out); +- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location); ++ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location); + my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data); + my ($bus_name, $dev_name, $driver_name, $reporter_name); + my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd); +@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add + + # MCE mce_record errors + if ($has_mce == 1) { +- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id"; ++ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id"; + $query_handle = $dbh->prepare($query); + $query_handle->execute(); +- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location)); ++ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location)); + $out = ""; + while($query_handle->fetch()) { + $out .= "$id $time error: $msg"; +@@ -1496,6 +1496,7 @@ $out .= sprintf "address=0x%08x, ", $add + $out .= ", bank $bank_name" if ($bank_name); + $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg); + $out .= ", mci $mcistatus_msg" if ($mcistatus_msg); ++ $out .= ", mca $mcastatus_msg" if ($mcastatus_msg); + $out .= ", $mc_location" if ($mc_location); + $out .= ", $user_action" if ($user_action); + $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap); diff --git a/SOURCES/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch b/SOURCES/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch new file mode 100644 index 0000000..88356fb --- /dev/null +++ b/SOURCES/ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch @@ -0,0 +1,94 @@ +commit ced615cf8146f51b5d6fe7a29107a2adc77407ca +Author: Sathya Priya Kumar +Date: Thu Jan 11 01:20:07 2024 -0600 + + rasdaemon: Add error decoding for MCA_CTL_SMU extended bits + + Enable error decoding support for the newly added extended + error bit descriptions from MCA_CTL_SMU. + b'0:11 can be decoded from existing array smca_smu2_mce_desc. + Define a function to append the newly defined b'58:62 to the + smca_smu2_mce_desc. This reduces the maintaining Reserved bits + from b'12:57 in the code. + + Signed-off-by: Sathya Priya Kumar + Signed-off-by: Mauro Carvalho Chehab + +--- + mce-amd-smca.c | 33 ++++++++++++++++++++++++++++++++- + ras-mce-handler.h | 1 + + 2 files changed, 33 insertions(+), 1 deletion(-) + +--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-06-28 10:34:16.453522865 -0400 ++++ rasdaemon-0.6.7/mce-amd-smca.c 2024-06-28 10:34:46.049124270 -0400 +@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d + "An ECC or parity error in an SMU RAM instance", + }; + +-static const char * const smca_smu2_mce_desc[] = { ++static const char * smca_smu2_mce_desc[64] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", +@@ -409,6 +409,15 @@ static const char * const smca_smu2_mce_ + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", ++ "PHY RAS ECC Error", ++}; ++ ++static const char * smca_smu2_ext_mce_desc[] = { ++ "A correctable error from a GFX Sub-IP", ++ "A fatal error from a GFX Sub-IP", ++ "Reserved", ++ "Reserved", ++ "A poison error from a GFX Sub-IP", + }; + + static const char * const smca_mp5_mce_desc[] = { +@@ -815,6 +824,27 @@ static struct smca_bank_name smca_names[ + [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" }, + }; + ++void smca_smu2_ext_err_desc(void) ++{ ++ int i, j; ++ int smu2_bits = 62; ++ ++ /* ++ * MCA_CTL_SMU error stings are defined for b'58:59 and b'62 ++ * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU ++ * ++ * b'0:11 can be decoded from existing array smca_smu2_mce_desc. ++ * b'12:57 are Reserved and b'58:62 are appended to the ++ * smca_smu2_mce_desc. ++ */ ++ for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) { ++ for ( ; i < 58; i++) ++ smca_smu2_mce_desc[i] = "Reserved"; ++ ++ smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j]; ++ } ++} ++ + void amd_decode_errcode(struct mce_event *e) + { + +@@ -906,6 +936,7 @@ unsigned short xec = (e->status >> 16) & + mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID, + (ipid_high & MCI_IPID_MCATYPE) >> 16); + ++ smca_smu2_ext_err_desc(); + fixup_hwid(m, &mcatype_hwid); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { +--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-06-28 10:34:16.453522865 -0400 ++++ rasdaemon-0.6.7/ras-mce-handler.h 2024-06-28 10:34:17.795508302 -0400 +@@ -121,6 +121,7 @@ int set_intel_imc_log(enum cputype cputy + /* Undertake AMD SMCA Error Decoding */ + void decode_smca_error(struct mce_event *e, struct mce_priv *m); + void amd_decode_errcode(struct mce_event *e); ++void smca_smu2_ext_err_desc(void); + + /* Per-CPU-type decoders for Intel CPUs */ + void p4_decode_model(struct mce_event *e); diff --git a/SPECS/rasdaemon.spec b/SPECS/rasdaemon.spec index 04d39dc..12d37a6 100644 --- a/SPECS/rasdaemon.spec +++ b/SPECS/rasdaemon.spec @@ -1,8 +1,8 @@ Name: rasdaemon Version: 0.6.7 -Release: 9%{?dist} +Release: 18%{?dist} Summary: Utility to receive RAS error tracings -License: GPLv2 +License: GPL-2.0-only URL: http://git.infradead.org/users/mchehab/rasdaemon.git Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2 Patch0: labels.patch @@ -33,6 +33,19 @@ Patch24: 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch Patch25: 2d15882a0cbfce0b905039bebc811ac8311cd739.patch Patch26: c785d309dcbdeb7ecd219975244f3944a8d047e9.patch Patch27: b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch +Patch28: 9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch +Patch29: 9bd84aef87978b806178a73ed33c39d6c442fc1f.patch +Patch30: 885e546add918457c453bd3f753ac7df90b39e36.patch +Patch31: 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch +Patch32: ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch +Patch33: 73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch +Patch34: ad0444190e02bca309a61a4bad51bc0e16c0aef5.patch +Patch35: b1ace39286e287282a275b6edc90dc2f64e60a3c.patch +Patch36: 045ab08eaa00172d50621df9502f6910f3fe3af4.patch +Patch37: 79065939fc4bc1da72a3718937fab80e73a6dd75.patch +Patch38: 794530fbf270eae9f6f43c6d0bbd3ec6f2b210f3.patch +Patch39: 83a3ced797256dcb1c93f8de4266fd7545fbfb3b.patch +Patch40: 8b536321cc0679fb82d4ea7521f9375d88cec0cc.patch ExcludeArch: s390 s390x BuildRequires: make @@ -95,6 +108,19 @@ an utility for reporting current error counts from the EDAC sysfs files. %patch25 -p1 %patch26 -p1 %patch27 -p1 +%patch28 -p1 +%patch29 -p1 +%patch30 -p1 +%patch31 -p1 +%patch32 -p1 +%patch33 -p1 +%patch34 -p1 +%patch35 -p1 +%patch36 -p1 +%patch37 -p1 +%patch38 -p1 +%patch39 -p1 +%patch40 -p1 # The tarball is locked in time the first time aclocal was ran and will keep # requiring an older version of automake @@ -130,6 +156,33 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir %{_sysconfdir}/sysconfig/rasdaemon %changelog +* Wed Nov 27 2024 Aristeu Rozanski 0.6.7-18 +- Add support for vendor specific information [RHEL-68673] + +* Tue Nov 19 2024 Aristeu Rozanski 0.6.7-17 +- ras-events: quit loop in read_ras_event when kbuf data is broken [RHEL-68127] + +* Thu Sep 05 2024 Aristeu Rozanski 0.6.7-16 +- rasdaemon: Add support to parse the PPIN field of mce tracepoint [RHEL-52911] +- rasdaemon: Add support to parse microcode field of mce tracepoint [RHEL-52911] +- rasdaemon: Update SMCA bank error descriptions [RHEL-52911] +- rasdaemon: ras-mc-ctl: Add support to display mcastatus_msg string [RHEL-52911] + +* Thu Jul 18 2024 Aristeu Rozanski 0.6.7-15 +- rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits [RHEL-48819] + +* Fri Jun 28 2024 Aristeu Rozanski 0.6.7-13 +- rasdaemon: Add error decoding for MCA_CTL_SMU extended bits [RHEL-35718] + +* Thu Jun 20 2024 Aristeu Rozanski 0.6.7-12 +- mce-amd-smca: update smca_hwid to use smca_bank_types [RHEL-24170] + +* Wed May 08 2024 Aristeu Rozanski 0.6.7-11 +- Fix excessive block messages [RHEL-8708] + +* Wed Jan 10 2024 Aristeu Rozanski 0.6.7-10 +- Update License string to use SPDX [RHELMISC-1262] + * Thu Oct 26 2023 Aristeu Rozanski 0.6.7-9 - Update SMCA support for AMD processors [RHEL-11092]