rasdaemon: include requested fixes
- rasdaemon: Add support to parse the PPIN field of mce tracepoint
- rasdaemon: Add support to parse microcode field of mce tracepoint
- rasdaemon: Update SMCA bank error descriptions
- rasdaemon: ras-mc-ctl: Add support to display mcastatus_msg string

Resolves: RHEL-52911
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
This commit is contained in:
parent
ec6406197f
commit
0d211e2538
154
045ab08eaa00172d50621df9502f6910f3fe3af4.patch
Normal file
154
045ab08eaa00172d50621df9502f6910f3fe3af4.patch
Normal file
@ -0,0 +1,154 @@
|
||||
commit 045ab08eaa00172d50621df9502f6910f3fe3af4
|
||||
Author: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Date: Mon Apr 1 23:33:07 2024 -0500
|
||||
|
||||
rasdaemon: Add support to parse the PPIN field of mce tracepoint
|
||||
|
||||
Support for exporting the PPIN (Protected Processor Inventory Number)
|
||||
is being added to the mce_record tracepoint.
|
||||
|
||||
Add the required, corresponding support in the rasdaemon for the field
|
||||
to be parsed and logged or added to the database and viewed later through
|
||||
ras-mc-ctl utility.
|
||||
|
||||
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||
|
||||
---
|
||||
ras-mce-handler.c | 7 +++++++
|
||||
ras-mce-handler.h | 1 +
|
||||
ras-record.c | 42 ++++++++++++++++++++++--------------------
|
||||
util/ras-mc-ctl.in | 7 ++++---
|
||||
4 files changed, 34 insertions(+), 23 deletions(-)
|
||||
|
||||
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-08-22 14:30:41.581276901 -0400
|
||||
+++ rasdaemon-0.6.7/ras-mce-handler.c 2024-08-22 14:33:19.940957855 -0400
|
||||
@@ -369,6 +369,9 @@ #if 0
|
||||
|
||||
trace_seq_printf(s, ", apicid= %x", e->apicid);
|
||||
|
||||
+ if (e->ppin)
|
||||
+ trace_seq_printf(s, ", ppin= %llx", (long long)e->ppin);
|
||||
+
|
||||
/*
|
||||
* FIXME: The original mcelog userspace tool uses DMI to map from
|
||||
* address to DIMM. From the comments there, the code there doesn't
|
||||
@@ -545,6 +548,10 @@ if (pevent_get_field_val(s, event, "ipid
|
||||
return -1;
|
||||
e.ipid = val;
|
||||
|
||||
+ /* Get PPIN */
|
||||
+ if (!pevent_get_field_val(s, event, "ppin", record, &val, 1))
|
||||
+ e.ppin = val;
|
||||
+
|
||||
switch (mce->cputype) {
|
||||
case CPU_GENERIC:
|
||||
break;
|
||||
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-08-22 14:30:41.581276901 -0400
|
||||
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-08-22 14:33:47.312729865 -0400
|
||||
@@ -74,6 +74,7 @@ struct mce_event {
|
||||
uint8_t cpuvendor;
|
||||
uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */
|
||||
uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
|
||||
+ uint64_t ppin;
|
||||
|
||||
/* Parsed data */
|
||||
char timestamp[64];
|
||||
--- rasdaemon-0.6.7.orig/ras-record.c 2024-08-22 14:30:41.581276901 -0400
|
||||
+++ rasdaemon-0.6.7/ras-record.c 2024-08-22 14:37:20.884941279 -0400
|
||||
@@ -330,19 +330,20 @@ static const struct db_fields mce_record
|
||||
{ .name="ip", .type="INTEGER" },
|
||||
{ .name="tsc", .type="INTEGER" },
|
||||
{ .name="walltime", .type="INTEGER" },
|
||||
- { .name="cpu", .type="INTEGER" }, // 10
|
||||
+ { .name = "ppin", .type = "INTEGER" }, // 10
|
||||
+ { .name="cpu", .type="INTEGER" },
|
||||
{ .name="cpuid", .type="INTEGER" },
|
||||
{ .name="apicid", .type="INTEGER" },
|
||||
{ .name="socketid", .type="INTEGER" },
|
||||
- { .name="cs", .type="INTEGER" },
|
||||
- { .name="bank", .type="INTEGER" }, //15
|
||||
+ { .name="cs", .type="INTEGER" }, //15
|
||||
+ { .name="bank", .type="INTEGER" },
|
||||
{ .name="cpuvendor", .type="INTEGER" },
|
||||
|
||||
/* Parsed data - will likely change */
|
||||
{ .name="bank_name", .type="TEXT" },
|
||||
{ .name="error_msg", .type="TEXT" },
|
||||
- { .name="mcgstatus_msg", .type="TEXT" },
|
||||
- { .name="mcistatus_msg", .type="TEXT" }, // 20
|
||||
+ { .name="mcgstatus_msg", .type="TEXT" }, // 20
|
||||
+ { .name="mcistatus_msg", .type="TEXT" },
|
||||
{ .name="mcastatus_msg", .type="TEXT" },
|
||||
{ .name="user_action", .type="TEXT" },
|
||||
{ .name="mc_location", .type="TEXT" },
|
||||
@@ -372,21 +373,22 @@ return 0;
|
||||
sqlite3_bind_int64 (priv->stmt_mce_record, 7, ev->ip);
|
||||
sqlite3_bind_int64 (priv->stmt_mce_record, 8, ev->tsc);
|
||||
sqlite3_bind_int64 (priv->stmt_mce_record, 9, ev->walltime);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 10, ev->cpu);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpuid);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 12, ev->apicid);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 13, ev->socketid);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 14, ev->cs);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 15, ev->bank);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 16, ev->cpuvendor);
|
||||
-
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 17, ev->bank_name, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 18, ev->error_msg, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 19, ev->mcgstatus_msg, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcistatus_msg, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcastatus_msg, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 22, ev->user_action, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 23, ev->mc_location, -1, NULL);
|
||||
+ sqlite3_bind_int64(priv->stmt_mce_record, 10, ev->ppin);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpu);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 12, ev->cpuid);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 13, ev->apicid);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 14, ev->socketid);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 15, ev->cs);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 16, ev->bank);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 17, ev->cpuvendor);
|
||||
+
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 18, ev->bank_name, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 19, ev->error_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcgstatus_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcistatus_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcastatus_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 23, ev->user_action, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 24, ev->mc_location, -1, NULL);
|
||||
|
||||
rc = sqlite3_step(priv->stmt_mce_record);
|
||||
if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-22 14:29:51.058697724 -0400
|
||||
+++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-22 14:34:53.803175632 -0400
|
||||
@@ -1317,7 +1317,7 @@ sub errors
|
||||
{
|
||||
require DBI;
|
||||
my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
|
||||
- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
|
||||
+ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
|
||||
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
|
||||
my ($bus_name, $dev_name, $driver_name, $reporter_name);
|
||||
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
|
||||
@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add
|
||||
|
||||
# MCE mce_record errors
|
||||
if ($has_mce == 1) {
|
||||
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
|
||||
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
|
||||
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
$out .= "$id $time error: $msg";
|
||||
@@ -1507,6 +1507,7 @@ $out .= sprintf ", misc=0x%08x", $misc i
|
||||
$out .= sprintf ", ip=0x%08x", $ip if ($ip);
|
||||
$out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
|
||||
$out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
|
||||
+ $out .= sprintf ", ppin=0x%08x", $ppin if ($ppin);
|
||||
$out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
|
||||
$out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
|
||||
$out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
|
128
79065939fc4bc1da72a3718937fab80e73a6dd75.patch
Normal file
128
79065939fc4bc1da72a3718937fab80e73a6dd75.patch
Normal file
@ -0,0 +1,128 @@
|
||||
commit 79065939fc4bc1da72a3718937fab80e73a6dd75
|
||||
Author: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Date: Tue Apr 2 00:07:38 2024 -0500
|
||||
|
||||
rasdaemon: Add support to parse microcode field of mce tracepoint
|
||||
|
||||
Support for exporting the Microcode Revision is being added to the
|
||||
mce_record tracepoint.
|
||||
|
||||
Add the required, corresponding support in the rasdaemon for the field
|
||||
to be parsed and logged or added to the database and viewed later through
|
||||
ras-mc-ctl utility.
|
||||
|
||||
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||
|
||||
---
|
||||
ras-mce-handler.c | 7 +++++++
|
||||
ras-mce-handler.h | 1 +
|
||||
ras-record.c | 20 +++++++++++---------
|
||||
util/ras-mc-ctl.in | 7 ++++---
|
||||
4 files changed, 23 insertions(+), 12 deletions(-)
|
||||
|
||||
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-08-22 14:44:51.352160832 -0400
|
||||
+++ rasdaemon-0.6.7/ras-mce-handler.c 2024-08-22 14:44:51.361160757 -0400
|
||||
@@ -372,6 +372,9 @@ #if 0
|
||||
if (e->ppin)
|
||||
trace_seq_printf(s, ", ppin= %llx", (long long)e->ppin);
|
||||
|
||||
+ if (e->microcode)
|
||||
+ trace_seq_printf(s, ", microcode= %x", e->microcode);
|
||||
+
|
||||
/*
|
||||
* FIXME: The original mcelog userspace tool uses DMI to map from
|
||||
* address to DIMM. From the comments there, the code there doesn't
|
||||
@@ -552,6 +555,10 @@ if (pevent_get_field_val(s, event, "ipid
|
||||
if (!pevent_get_field_val(s, event, "ppin", record, &val, 1))
|
||||
e.ppin = val;
|
||||
|
||||
+ /* Get Microcode Revision */
|
||||
+ if (!pevent_get_field_val(s, event, "microcode", record, &val, 1))
|
||||
+ e.microcode = val;
|
||||
+
|
||||
switch (mce->cputype) {
|
||||
case CPU_GENERIC:
|
||||
break;
|
||||
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-08-22 14:44:51.352160832 -0400
|
||||
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-08-22 14:44:51.361160757 -0400
|
||||
@@ -75,6 +75,7 @@ struct mce_event {
|
||||
uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */
|
||||
uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
|
||||
uint64_t ppin;
|
||||
+ uint32_t microcode;
|
||||
|
||||
/* Parsed data */
|
||||
char timestamp[64];
|
||||
--- rasdaemon-0.6.7.orig/ras-record.c 2024-08-22 14:44:51.353160824 -0400
|
||||
+++ rasdaemon-0.6.7/ras-record.c 2024-08-22 14:44:51.362160748 -0400
|
||||
@@ -338,11 +338,12 @@ { .name = "ppin", .type = "INTEGER" },
|
||||
{ .name="cs", .type="INTEGER" }, //15
|
||||
{ .name="bank", .type="INTEGER" },
|
||||
{ .name="cpuvendor", .type="INTEGER" },
|
||||
+ { .name = "microcode", .type = "INTEGER" },
|
||||
|
||||
/* Parsed data - will likely change */
|
||||
{ .name="bank_name", .type="TEXT" },
|
||||
- { .name="error_msg", .type="TEXT" },
|
||||
- { .name="mcgstatus_msg", .type="TEXT" }, // 20
|
||||
+ { .name="error_msg", .type="TEXT" }, // 20
|
||||
+ { .name="mcgstatus_msg", .type="TEXT" },
|
||||
{ .name="mcistatus_msg", .type="TEXT" },
|
||||
{ .name="mcastatus_msg", .type="TEXT" },
|
||||
{ .name="user_action", .type="TEXT" },
|
||||
@@ -381,14 +382,15 @@ sqlite3_bind_int64(priv->stmt_mce_record
|
||||
sqlite3_bind_int (priv->stmt_mce_record, 15, ev->cs);
|
||||
sqlite3_bind_int (priv->stmt_mce_record, 16, ev->bank);
|
||||
sqlite3_bind_int (priv->stmt_mce_record, 17, ev->cpuvendor);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 18, ev->microcode);
|
||||
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 18, ev->bank_name, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 19, ev->error_msg, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcgstatus_msg, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcistatus_msg, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcastatus_msg, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 23, ev->user_action, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 24, ev->mc_location, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 19, ev->bank_name, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 20, ev->error_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcgstatus_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcistatus_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 23, ev->mcastatus_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 24, ev->user_action, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 25, ev->mc_location, -1, NULL);
|
||||
|
||||
rc = sqlite3_step(priv->stmt_mce_record);
|
||||
if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-22 14:44:51.353160824 -0400
|
||||
+++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-22 14:44:51.362160748 -0400
|
||||
@@ -1317,7 +1317,7 @@ sub errors
|
||||
{
|
||||
require DBI;
|
||||
my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
|
||||
- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
|
||||
+ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $microcode, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
|
||||
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
|
||||
my ($bus_name, $dev_name, $driver_name, $reporter_name);
|
||||
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
|
||||
@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add
|
||||
|
||||
# MCE mce_record errors
|
||||
if ($has_mce == 1) {
|
||||
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
|
||||
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, microcode, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
|
||||
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $microcode, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
$out .= "$id $time error: $msg";
|
||||
@@ -1514,6 +1514,7 @@ $out .= sprintf ", apicid=0x%08x", $apic
|
||||
$out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
|
||||
$out .= sprintf ", cs=0x%08x", $cs if ($cs);
|
||||
$out .= sprintf ", bank=0x%08x", $bank if ($bank);
|
||||
+ $out .= sprintf ", microcode=0x%08x", $microcode if ($microcode);
|
||||
|
||||
$out .= "\n";
|
||||
}
|
134
ad0444190e02bca309a61a4bad51bc0e16c0aef5.patch
Normal file
134
ad0444190e02bca309a61a4bad51bc0e16c0aef5.patch
Normal file
@ -0,0 +1,134 @@
|
||||
commit ad0444190e02bca309a61a4bad51bc0e16c0aef5
|
||||
Author: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Date: Fri May 10 13:20:19 2024 -0500
|
||||
|
||||
rasdaemon: Update SMCA bank error descriptions
|
||||
|
||||
Update error descriptions of SMCA bank types to support AMD's new Family
|
||||
1Ah-based processors.
|
||||
Also, modify some existing error descriptions to better reflect the error
|
||||
received.
|
||||
|
||||
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||
|
||||
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
|
||||
index 6632663..a55e013 100644
|
||||
--- a/mce-amd-smca.c
|
||||
+++ b/mce-amd-smca.c
|
||||
@@ -108,7 +108,7 @@ static const char * const smca_ls_mce_desc[] = {
|
||||
"Store queue parity",
|
||||
"Miss address buffer payload parity",
|
||||
"L1 TLB parity",
|
||||
- "Reserved",
|
||||
+ "DC Tag error type 5",
|
||||
"DC tag error type 6",
|
||||
"DC tag error type 1",
|
||||
"Internal error type 1",
|
||||
@@ -125,6 +125,12 @@ static const char * const smca_ls_mce_desc[] = {
|
||||
"DC tag error type 3",
|
||||
"DC tag error type 5",
|
||||
"L2 fill data error",
|
||||
+ "Error on SCB cacheline state or address field",
|
||||
+ "Error on SCB data, commit pipe 0",
|
||||
+ "Error on SCB data, commit pipe 1",
|
||||
+ "Error on SCB data for non-cacheable DRAM or IO",
|
||||
+ "System Read Data Error detected by write combine buffer",
|
||||
+ "Hardware Asserts",
|
||||
};
|
||||
|
||||
static const char * const smca_ls2_mce_desc[] = {
|
||||
@@ -168,7 +174,7 @@ static const char * const smca_if_mce_desc[] = {
|
||||
"BP L1-BTB Multi-Hit Error",
|
||||
"BP L2-BTB Multi-Hit Error",
|
||||
"L2 Cache Response Poison error",
|
||||
- "L2 Cache Error Response",
|
||||
+ "System Read Data error",
|
||||
"Hardware Assertion Error",
|
||||
"L1-TLB Multi-Hit",
|
||||
"L2-TLB Multi-Hit",
|
||||
@@ -182,6 +188,7 @@ static const char * const smca_l2_mce_desc[] = {
|
||||
"L2M Data Array ECC Error",
|
||||
"Hardware Assert Error",
|
||||
"SDP Read Response Parity Error",
|
||||
+ "Error initiated by programmable state machine",
|
||||
};
|
||||
|
||||
static const char * const smca_de_mce_desc[] = {
|
||||
@@ -193,7 +200,7 @@ static const char * const smca_de_mce_desc[] = {
|
||||
"Fetch address FIFO parity error",
|
||||
"Patch RAM data parity error",
|
||||
"Patch RAM sequencer parity error",
|
||||
- "Micro-op buffer parity error",
|
||||
+ "Micro-op fetch queue parity error",
|
||||
"Hardware Assertion MCA Error",
|
||||
};
|
||||
|
||||
@@ -235,6 +242,7 @@ static const char * const smca_l3_mce_desc[] = {
|
||||
"L3 victim queue Data Fabric error",
|
||||
"L3 Hardware Assertion",
|
||||
"XI WCB Parity Poison Creation event",
|
||||
+ "Machine check error initiated by DSM action",
|
||||
};
|
||||
|
||||
static const char * const smca_cs_mce_desc[] = {
|
||||
@@ -268,6 +276,9 @@ static const char * const smca_cs2_mce_desc[] = {
|
||||
"Address Violation on the no data channel",
|
||||
"Security Violation on the no data channel",
|
||||
"Hardware Assert Error",
|
||||
+ "Shadow Tag Array Protocol Error",
|
||||
+ "Shadow Tag ECC Error",
|
||||
+ "Shadow Tag Transaction Error",
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -303,6 +314,8 @@ static const char * const smca_pie_mce_desc[] = {
|
||||
"A deferred error was detected in the DF",
|
||||
"Watch Dog Timer",
|
||||
"An SRAM ECC error was detected in the CNLI block",
|
||||
+ "Register access during DF Cstate",
|
||||
+ "DSM Error",
|
||||
};
|
||||
|
||||
static const char * const smca_umc_mce_desc[] = {
|
||||
@@ -318,6 +331,11 @@ static const char * const smca_umc_mce_desc[] = {
|
||||
"ECS Error",
|
||||
"UMC Throttling Error",
|
||||
"Read CRC Error",
|
||||
+ "Reserved",
|
||||
+ "Reserved",
|
||||
+ "Reserved",
|
||||
+ "Reserved",
|
||||
+ "RFM SRAM ECC error",
|
||||
};
|
||||
|
||||
static const char * const smca_umc_quirk_mce_desc[] = {
|
||||
@@ -391,6 +409,12 @@ static const char * const smca_psp2_mce_desc[] = {
|
||||
"TLB Bank 0 parity error",
|
||||
"TLB Bank 1 parity error",
|
||||
"System Hub Read Buffer ECC or parity error",
|
||||
+ "FUSE IP SRAM ECC or parity error",
|
||||
+ "PCRU FUSE SRAM ECC or parity error",
|
||||
+ "SIB SRAM parity error",
|
||||
+ "mpASP SECEMC Error",
|
||||
+ "mpASP A5 Hang",
|
||||
+ "SIB WDT error",
|
||||
};
|
||||
|
||||
static const char * const smca_smu_mce_desc[] = {
|
||||
@@ -431,6 +455,7 @@ static const char * const smca_mp5_mce_desc[] = {
|
||||
"Instruction Cache Bank B ECC or parity error",
|
||||
"Instruction Tag Cache Bank A ECC or parity error",
|
||||
"Instruction Tag Cache Bank B ECC or parity error",
|
||||
+ "Fuse SRAM ECC or parity error",
|
||||
};
|
||||
|
||||
static const char * const smca_mpdma_mce_desc[] = {
|
||||
@@ -483,6 +508,7 @@ static const char * const smca_mpdma_mce_desc[] = {
|
||||
"MPDMA PTE Internal Data FIFO ECC or parity error",
|
||||
"MPDMA PTE Command Memory DMA ECC or parity error",
|
||||
"MPDMA PTE Command Memory Internal ECC or parity error",
|
||||
+ "MPDMA TVF SDP Master Memory 7 ECC or parity error",
|
||||
};
|
||||
|
||||
static const char * const smca_nbio_mce_desc[] = {
|
56
b1ace39286e287282a275b6edc90dc2f64e60a3c.patch
Normal file
56
b1ace39286e287282a275b6edc90dc2f64e60a3c.patch
Normal file
@ -0,0 +1,56 @@
|
||||
commit b1ace39286e287282a275b6edc90dc2f64e60a3c
|
||||
Author: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Date: Mon Mar 25 23:06:08 2024 -0500
|
||||
|
||||
rasdaemon: ras-mc-ctl: Add support to display mcastatus_msg string
|
||||
|
||||
Currently, the mcastatus_msg string of struct mce_event is added to the
|
||||
SQLite database by the rasdaemon when it is recording errors. The same
|
||||
however, is not outputted by the ras-mc-ctl utility.
|
||||
|
||||
The string provides important error information relating to the received
|
||||
MCE. For example, on AMD SMCA systems, the string outputs extended error
|
||||
code and description. As such, the string should be present in the
|
||||
output of ras-mc-ctl utility.
|
||||
|
||||
Add support to output the string through the ras-mc-ctl utility.
|
||||
|
||||
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
||||
|
||||
---
|
||||
util/ras-mc-ctl.in | 7 ++++---
|
||||
1 file changed, 4 insertions(+), 3 deletions(-)
|
||||
|
||||
--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-19 15:08:29.246429487 -0400
|
||||
+++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-19 15:10:55.478162148 -0400
|
||||
@@ -1317,7 +1317,7 @@ sub errors
|
||||
{
|
||||
require DBI;
|
||||
my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
|
||||
- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
|
||||
+ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
|
||||
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
|
||||
my ($bus_name, $dev_name, $driver_name, $reporter_name);
|
||||
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
|
||||
@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add
|
||||
|
||||
# MCE mce_record errors
|
||||
if ($has_mce == 1) {
|
||||
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
|
||||
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
|
||||
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
|
||||
$out = "";
|
||||
while($query_handle->fetch()) {
|
||||
$out .= "$id $time error: $msg";
|
||||
@@ -1496,6 +1496,7 @@ $out .= sprintf "address=0x%08x, ", $add
|
||||
$out .= ", bank $bank_name" if ($bank_name);
|
||||
$out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
|
||||
$out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
|
||||
+ $out .= ", mca $mcastatus_msg" if ($mcastatus_msg);
|
||||
$out .= ", $mc_location" if ($mc_location);
|
||||
$out .= ", $user_action" if ($user_action);
|
||||
$out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
|
@ -1,6 +1,6 @@
|
||||
Name: rasdaemon
|
||||
Version: 0.6.7
|
||||
Release: 15%{?dist}
|
||||
Release: 16%{?dist}
|
||||
Summary: Utility to receive RAS error tracings
|
||||
License: GPL-2.0-only
|
||||
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
|
||||
@ -39,6 +39,10 @@ Patch30: 885e546add918457c453bd3f753ac7df90b39e36.patch
|
||||
Patch31: 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch
|
||||
Patch32: ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch
|
||||
Patch33: 73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch
|
||||
Patch34: ad0444190e02bca309a61a4bad51bc0e16c0aef5.patch
|
||||
Patch35: b1ace39286e287282a275b6edc90dc2f64e60a3c.patch
|
||||
Patch36: 045ab08eaa00172d50621df9502f6910f3fe3af4.patch
|
||||
Patch37: 79065939fc4bc1da72a3718937fab80e73a6dd75.patch
|
||||
|
||||
ExcludeArch: s390 s390x
|
||||
BuildRequires: make
|
||||
@ -107,6 +111,10 @@ an utility for reporting current error counts from the EDAC sysfs files.
|
||||
%patch31 -p1
|
||||
%patch32 -p1
|
||||
%patch33 -p1
|
||||
%patch34 -p1
|
||||
%patch35 -p1
|
||||
%patch36 -p1
|
||||
%patch37 -p1
|
||||
|
||||
# The tarball is locked in time the first time aclocal was run and will keep
|
||||
# requiring an older version of automake
|
||||
@ -142,7 +150,13 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir
|
||||
%{_sysconfdir}/sysconfig/rasdaemon
|
||||
|
||||
%changelog
|
||||
* Thu Jul 18 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-14
|
||||
* Thu Sep 05 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-16
|
||||
- rasdaemon: Add support to parse the PPIN field of mce tracepoint [RHEL-52911]
|
||||
- rasdaemon: Add support to parse microcode field of mce tracepoint [RHEL-52911]
|
||||
- rasdaemon: Update SMCA bank error descriptions [RHEL-52911]
|
||||
- rasdaemon: ras-mc-ctl: Add support to display mcastatus_msg string [RHEL-52911]
|
||||
|
||||
* Thu Jul 18 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-15
|
||||
- rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits [RHEL-48819]
|
||||
|
||||
* Fri Jun 28 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-13
|
||||
|
Loading…
Reference in New Issue
Block a user