rasdaemon: include requested fixes

- rasdaemon: Add support to parse the PPIN field of mce tracepoint
- rasdaemon: Add support to parse microcode field of mce tracepoint
- rasdaemon: Update SMCA bank error descriptions
- rasdaemon: ras-mc-ctl: Add support to display mcastatus_msg string

Resolves: RHEL-52911

Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
This commit is contained in:
Aristeu Rozanski 2024-10-01 10:47:32 -04:00
parent ec6406197f
commit 0d211e2538
5 changed files with 488 additions and 2 deletions

View File

@ -0,0 +1,154 @@
commit 045ab08eaa00172d50621df9502f6910f3fe3af4
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Mon Apr 1 23:33:07 2024 -0500
rasdaemon: Add support to parse the PPIN field of mce tracepoint
Support for exporting the PPIN (Protected Processor Inventory Number)
is being added to the mce_record tracepoint.
Add the corresponding support to rasdaemon so that the field is parsed and
logged or added to the database, and can be viewed later through the
ras-mc-ctl utility.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
ras-mce-handler.c | 7 +++++++
ras-mce-handler.h | 1 +
ras-record.c | 42 ++++++++++++++++++++++--------------------
util/ras-mc-ctl.in | 7 ++++---
4 files changed, 34 insertions(+), 23 deletions(-)
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-08-22 14:30:41.581276901 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.c 2024-08-22 14:33:19.940957855 -0400
@@ -369,6 +369,9 @@ #if 0
trace_seq_printf(s, ", apicid= %x", e->apicid);
+ if (e->ppin)
+ trace_seq_printf(s, ", ppin= %llx", (long long)e->ppin);
+
/*
* FIXME: The original mcelog userspace tool uses DMI to map from
* address to DIMM. From the comments there, the code there doesn't
@@ -545,6 +548,10 @@ if (pevent_get_field_val(s, event, "ipid
return -1;
e.ipid = val;
+ /* Get PPIN */
+ if (!pevent_get_field_val(s, event, "ppin", record, &val, 1))
+ e.ppin = val;
+
switch (mce->cputype) {
case CPU_GENERIC:
break;
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-08-22 14:30:41.581276901 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-08-22 14:33:47.312729865 -0400
@@ -74,6 +74,7 @@ struct mce_event {
uint8_t cpuvendor;
uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */
uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
+ uint64_t ppin;
/* Parsed data */
char timestamp[64];
--- rasdaemon-0.6.7.orig/ras-record.c 2024-08-22 14:30:41.581276901 -0400
+++ rasdaemon-0.6.7/ras-record.c 2024-08-22 14:37:20.884941279 -0400
@@ -330,19 +330,20 @@ static const struct db_fields mce_record
{ .name="ip", .type="INTEGER" },
{ .name="tsc", .type="INTEGER" },
{ .name="walltime", .type="INTEGER" },
- { .name="cpu", .type="INTEGER" }, // 10
+ { .name = "ppin", .type = "INTEGER" }, // 10
+ { .name="cpu", .type="INTEGER" },
{ .name="cpuid", .type="INTEGER" },
{ .name="apicid", .type="INTEGER" },
{ .name="socketid", .type="INTEGER" },
- { .name="cs", .type="INTEGER" },
- { .name="bank", .type="INTEGER" }, //15
+ { .name="cs", .type="INTEGER" }, //15
+ { .name="bank", .type="INTEGER" },
{ .name="cpuvendor", .type="INTEGER" },
/* Parsed data - will likely change */
{ .name="bank_name", .type="TEXT" },
{ .name="error_msg", .type="TEXT" },
- { .name="mcgstatus_msg", .type="TEXT" },
- { .name="mcistatus_msg", .type="TEXT" }, // 20
+ { .name="mcgstatus_msg", .type="TEXT" }, // 20
+ { .name="mcistatus_msg", .type="TEXT" },
{ .name="mcastatus_msg", .type="TEXT" },
{ .name="user_action", .type="TEXT" },
{ .name="mc_location", .type="TEXT" },
@@ -372,21 +373,22 @@ return 0;
sqlite3_bind_int64 (priv->stmt_mce_record, 7, ev->ip);
sqlite3_bind_int64 (priv->stmt_mce_record, 8, ev->tsc);
sqlite3_bind_int64 (priv->stmt_mce_record, 9, ev->walltime);
- sqlite3_bind_int (priv->stmt_mce_record, 10, ev->cpu);
- sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpuid);
- sqlite3_bind_int (priv->stmt_mce_record, 12, ev->apicid);
- sqlite3_bind_int (priv->stmt_mce_record, 13, ev->socketid);
- sqlite3_bind_int (priv->stmt_mce_record, 14, ev->cs);
- sqlite3_bind_int (priv->stmt_mce_record, 15, ev->bank);
- sqlite3_bind_int (priv->stmt_mce_record, 16, ev->cpuvendor);
-
- sqlite3_bind_text(priv->stmt_mce_record, 17, ev->bank_name, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 18, ev->error_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 19, ev->mcgstatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcistatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcastatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 22, ev->user_action, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 23, ev->mc_location, -1, NULL);
+ sqlite3_bind_int64(priv->stmt_mce_record, 10, ev->ppin);
+ sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpu);
+ sqlite3_bind_int (priv->stmt_mce_record, 12, ev->cpuid);
+ sqlite3_bind_int (priv->stmt_mce_record, 13, ev->apicid);
+ sqlite3_bind_int (priv->stmt_mce_record, 14, ev->socketid);
+ sqlite3_bind_int (priv->stmt_mce_record, 15, ev->cs);
+ sqlite3_bind_int (priv->stmt_mce_record, 16, ev->bank);
+ sqlite3_bind_int (priv->stmt_mce_record, 17, ev->cpuvendor);
+
+ sqlite3_bind_text(priv->stmt_mce_record, 18, ev->bank_name, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 19, ev->error_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcgstatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcistatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcastatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 23, ev->user_action, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 24, ev->mc_location, -1, NULL);
rc = sqlite3_step(priv->stmt_mce_record);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-22 14:29:51.058697724 -0400
+++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-22 14:34:53.803175632 -0400
@@ -1317,7 +1317,7 @@ sub errors
{
require DBI;
my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
+ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
my ($bus_name, $dev_name, $driver_name, $reporter_name);
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add
# MCE mce_record errors
if ($has_mce == 1) {
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
$out = "";
while($query_handle->fetch()) {
$out .= "$id $time error: $msg";
@@ -1507,6 +1507,7 @@ $out .= sprintf ", misc=0x%08x", $misc i
$out .= sprintf ", ip=0x%08x", $ip if ($ip);
$out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
$out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
+ $out .= sprintf ", ppin=0x%08x", $ppin if ($ppin);
$out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
$out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
$out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);

View File

@ -0,0 +1,128 @@
commit 79065939fc4bc1da72a3718937fab80e73a6dd75
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Tue Apr 2 00:07:38 2024 -0500
rasdaemon: Add support to parse microcode field of mce tracepoint
Support for exporting the Microcode Revision is being added to the
mce_record tracepoint.
Add the corresponding support to rasdaemon so that the field is parsed and
logged or added to the database, and can be viewed later through the
ras-mc-ctl utility.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
ras-mce-handler.c | 7 +++++++
ras-mce-handler.h | 1 +
ras-record.c | 20 +++++++++++---------
util/ras-mc-ctl.in | 7 ++++---
4 files changed, 23 insertions(+), 12 deletions(-)
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-08-22 14:44:51.352160832 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.c 2024-08-22 14:44:51.361160757 -0400
@@ -372,6 +372,9 @@ #if 0
if (e->ppin)
trace_seq_printf(s, ", ppin= %llx", (long long)e->ppin);
+ if (e->microcode)
+ trace_seq_printf(s, ", microcode= %x", e->microcode);
+
/*
* FIXME: The original mcelog userspace tool uses DMI to map from
* address to DIMM. From the comments there, the code there doesn't
@@ -552,6 +555,10 @@ if (pevent_get_field_val(s, event, "ipid
if (!pevent_get_field_val(s, event, "ppin", record, &val, 1))
e.ppin = val;
+ /* Get Microcode Revision */
+ if (!pevent_get_field_val(s, event, "microcode", record, &val, 1))
+ e.microcode = val;
+
switch (mce->cputype) {
case CPU_GENERIC:
break;
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-08-22 14:44:51.352160832 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-08-22 14:44:51.361160757 -0400
@@ -75,6 +75,7 @@ struct mce_event {
uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */
uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
uint64_t ppin;
+ uint32_t microcode;
/* Parsed data */
char timestamp[64];
--- rasdaemon-0.6.7.orig/ras-record.c 2024-08-22 14:44:51.353160824 -0400
+++ rasdaemon-0.6.7/ras-record.c 2024-08-22 14:44:51.362160748 -0400
@@ -338,11 +338,12 @@ { .name = "ppin", .type = "INTEGER" },
{ .name="cs", .type="INTEGER" }, //15
{ .name="bank", .type="INTEGER" },
{ .name="cpuvendor", .type="INTEGER" },
+ { .name = "microcode", .type = "INTEGER" },
/* Parsed data - will likely change */
{ .name="bank_name", .type="TEXT" },
- { .name="error_msg", .type="TEXT" },
- { .name="mcgstatus_msg", .type="TEXT" }, // 20
+ { .name="error_msg", .type="TEXT" }, // 20
+ { .name="mcgstatus_msg", .type="TEXT" },
{ .name="mcistatus_msg", .type="TEXT" },
{ .name="mcastatus_msg", .type="TEXT" },
{ .name="user_action", .type="TEXT" },
@@ -381,14 +382,15 @@ sqlite3_bind_int64(priv->stmt_mce_record
sqlite3_bind_int (priv->stmt_mce_record, 15, ev->cs);
sqlite3_bind_int (priv->stmt_mce_record, 16, ev->bank);
sqlite3_bind_int (priv->stmt_mce_record, 17, ev->cpuvendor);
+ sqlite3_bind_int (priv->stmt_mce_record, 18, ev->microcode);
- sqlite3_bind_text(priv->stmt_mce_record, 18, ev->bank_name, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 19, ev->error_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcgstatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcistatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcastatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 23, ev->user_action, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 24, ev->mc_location, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 19, ev->bank_name, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 20, ev->error_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcgstatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcistatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 23, ev->mcastatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 24, ev->user_action, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 25, ev->mc_location, -1, NULL);
rc = sqlite3_step(priv->stmt_mce_record);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-22 14:44:51.353160824 -0400
+++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-22 14:44:51.362160748 -0400
@@ -1317,7 +1317,7 @@ sub errors
{
require DBI;
my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
+ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $microcode, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
my ($bus_name, $dev_name, $driver_name, $reporter_name);
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add
# MCE mce_record errors
if ($has_mce == 1) {
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, microcode, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $microcode, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
$out = "";
while($query_handle->fetch()) {
$out .= "$id $time error: $msg";
@@ -1514,6 +1514,7 @@ $out .= sprintf ", apicid=0x%08x", $apic
$out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
$out .= sprintf ", cs=0x%08x", $cs if ($cs);
$out .= sprintf ", bank=0x%08x", $bank if ($bank);
+ $out .= sprintf ", microcode=0x%08x", $microcode if ($microcode);
$out .= "\n";
}

View File

@ -0,0 +1,134 @@
commit ad0444190e02bca309a61a4bad51bc0e16c0aef5
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Fri May 10 13:20:19 2024 -0500
rasdaemon: Update SMCA bank error descriptions
Update error descriptions of SMCA bank types to support AMD's new Family
1Ah-based processors.
Also, modify some existing error descriptions to better reflect the error
received.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 6632663..a55e013 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -108,7 +108,7 @@ static const char * const smca_ls_mce_desc[] = {
"Store queue parity",
"Miss address buffer payload parity",
"L1 TLB parity",
- "Reserved",
+ "DC Tag error type 5",
"DC tag error type 6",
"DC tag error type 1",
"Internal error type 1",
@@ -125,6 +125,12 @@ static const char * const smca_ls_mce_desc[] = {
"DC tag error type 3",
"DC tag error type 5",
"L2 fill data error",
+ "Error on SCB cacheline state or address field",
+ "Error on SCB data, commit pipe 0",
+ "Error on SCB data, commit pipe 1",
+ "Error on SCB data for non-cacheable DRAM or IO",
+ "System Read Data Error detected by write combine buffer",
+ "Hardware Asserts",
};
static const char * const smca_ls2_mce_desc[] = {
@@ -168,7 +174,7 @@ static const char * const smca_if_mce_desc[] = {
"BP L1-BTB Multi-Hit Error",
"BP L2-BTB Multi-Hit Error",
"L2 Cache Response Poison error",
- "L2 Cache Error Response",
+ "System Read Data error",
"Hardware Assertion Error",
"L1-TLB Multi-Hit",
"L2-TLB Multi-Hit",
@@ -182,6 +188,7 @@ static const char * const smca_l2_mce_desc[] = {
"L2M Data Array ECC Error",
"Hardware Assert Error",
"SDP Read Response Parity Error",
+ "Error initiated by programmable state machine",
};
static const char * const smca_de_mce_desc[] = {
@@ -193,7 +200,7 @@ static const char * const smca_de_mce_desc[] = {
"Fetch address FIFO parity error",
"Patch RAM data parity error",
"Patch RAM sequencer parity error",
- "Micro-op buffer parity error",
+ "Micro-op fetch queue parity error",
"Hardware Assertion MCA Error",
};
@@ -235,6 +242,7 @@ static const char * const smca_l3_mce_desc[] = {
"L3 victim queue Data Fabric error",
"L3 Hardware Assertion",
"XI WCB Parity Poison Creation event",
+ "Machine check error initiated by DSM action",
};
static const char * const smca_cs_mce_desc[] = {
@@ -268,6 +276,9 @@ static const char * const smca_cs2_mce_desc[] = {
"Address Violation on the no data channel",
"Security Violation on the no data channel",
"Hardware Assert Error",
+ "Shadow Tag Array Protocol Error",
+ "Shadow Tag ECC Error",
+ "Shadow Tag Transaction Error",
};
/*
@@ -303,6 +314,8 @@ static const char * const smca_pie_mce_desc[] = {
"A deferred error was detected in the DF",
"Watch Dog Timer",
"An SRAM ECC error was detected in the CNLI block",
+ "Register access during DF Cstate",
+ "DSM Error",
};
static const char * const smca_umc_mce_desc[] = {
@@ -318,6 +331,11 @@ static const char * const smca_umc_mce_desc[] = {
"ECS Error",
"UMC Throttling Error",
"Read CRC Error",
+ "Reserved",
+ "Reserved",
+ "Reserved",
+ "Reserved",
+ "RFM SRAM ECC error",
};
static const char * const smca_umc_quirk_mce_desc[] = {
@@ -391,6 +409,12 @@ static const char * const smca_psp2_mce_desc[] = {
"TLB Bank 0 parity error",
"TLB Bank 1 parity error",
"System Hub Read Buffer ECC or parity error",
+ "FUSE IP SRAM ECC or parity error",
+ "PCRU FUSE SRAM ECC or parity error",
+ "SIB SRAM parity error",
+ "mpASP SECEMC Error",
+ "mpASP A5 Hang",
+ "SIB WDT error",
};
static const char * const smca_smu_mce_desc[] = {
@@ -431,6 +455,7 @@ static const char * const smca_mp5_mce_desc[] = {
"Instruction Cache Bank B ECC or parity error",
"Instruction Tag Cache Bank A ECC or parity error",
"Instruction Tag Cache Bank B ECC or parity error",
+ "Fuse SRAM ECC or parity error",
};
static const char * const smca_mpdma_mce_desc[] = {
@@ -483,6 +508,7 @@ static const char * const smca_mpdma_mce_desc[] = {
"MPDMA PTE Internal Data FIFO ECC or parity error",
"MPDMA PTE Command Memory DMA ECC or parity error",
"MPDMA PTE Command Memory Internal ECC or parity error",
+ "MPDMA TVF SDP Master Memory 7 ECC or parity error",
};
static const char * const smca_nbio_mce_desc[] = {

View File

@ -0,0 +1,56 @@
commit b1ace39286e287282a275b6edc90dc2f64e60a3c
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Mon Mar 25 23:06:08 2024 -0500
rasdaemon: ras-mc-ctl: Add support to display mcastatus_msg string
Currently, the mcastatus_msg string of struct mce_event is added to the
SQLite database by the rasdaemon when it is recording errors. However, it
is not output by the ras-mc-ctl utility.
The string provides important error information relating to the received
MCE. For example, on AMD SMCA systems, the string outputs extended error
code and description. As such, the string should be present in the
output of the ras-mc-ctl utility.
Add support to output the string through the ras-mc-ctl utility.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
util/ras-mc-ctl.in | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-19 15:08:29.246429487 -0400
+++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-19 15:10:55.478162148 -0400
@@ -1317,7 +1317,7 @@ sub errors
{
require DBI;
my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
+ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
my ($bus_name, $dev_name, $driver_name, $reporter_name);
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add
# MCE mce_record errors
if ($has_mce == 1) {
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
$out = "";
while($query_handle->fetch()) {
$out .= "$id $time error: $msg";
@@ -1496,6 +1496,7 @@ $out .= sprintf "address=0x%08x, ", $add
$out .= ", bank $bank_name" if ($bank_name);
$out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
$out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
+ $out .= ", mca $mcastatus_msg" if ($mcastatus_msg);
$out .= ", $mc_location" if ($mc_location);
$out .= ", $user_action" if ($user_action);
$out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);

View File

@ -1,6 +1,6 @@
Name: rasdaemon
Version: 0.6.7
Release: 15%{?dist}
Release: 16%{?dist}
Summary: Utility to receive RAS error tracings
License: GPL-2.0-only
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
@ -39,6 +39,10 @@ Patch30: 885e546add918457c453bd3f753ac7df90b39e36.patch
Patch31: 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch
Patch32: ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch
Patch33: 73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch
Patch34: ad0444190e02bca309a61a4bad51bc0e16c0aef5.patch
Patch35: b1ace39286e287282a275b6edc90dc2f64e60a3c.patch
Patch36: 045ab08eaa00172d50621df9502f6910f3fe3af4.patch
Patch37: 79065939fc4bc1da72a3718937fab80e73a6dd75.patch
ExcludeArch: s390 s390x
BuildRequires: make
@ -107,6 +111,10 @@ an utility for reporting current error counts from the EDAC sysfs files.
%patch31 -p1
%patch32 -p1
%patch33 -p1
%patch34 -p1
%patch35 -p1
%patch36 -p1
%patch37 -p1
# The tarball is locked in time the first time aclocal was run and will keep
# requiring an older version of automake
@ -142,7 +150,13 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir
%{_sysconfdir}/sysconfig/rasdaemon
%changelog
* Thu Jul 18 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-14
* Thu Sep 05 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-16
- rasdaemon: Add support to parse the PPIN field of mce tracepoint [RHEL-52911]
- rasdaemon: Add support to parse microcode field of mce tracepoint [RHEL-52911]
- rasdaemon: Update SMCA bank error descriptions [RHEL-52911]
- rasdaemon: ras-mc-ctl: Add support to display mcastatus_msg string [RHEL-52911]
* Thu Jul 18 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-15
- rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits [RHEL-48819]
* Fri Jun 28 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-13