Import from AlmaLinux stable repository

This commit is contained in:
eabdullin 2025-12-04 13:30:27 +00:00
parent 2761f36654
commit b1b2b36e10
14 changed files with 1116 additions and 2 deletions

View File

@ -0,0 +1,154 @@
commit 045ab08eaa00172d50621df9502f6910f3fe3af4
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Mon Apr 1 23:33:07 2024 -0500
rasdaemon: Add support to parse the PPIN field of mce tracepoint
Support for exporting the PPIN (Protected Processor Inventory Number)
is being added to the mce_record tracepoint.
Add the required, corresponding support in the rasdaemon for the field
to be parsed and logged or added to the database and viewed later through
ras-mc-ctl utility.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
ras-mce-handler.c | 7 +++++++
ras-mce-handler.h | 1 +
ras-record.c | 42 ++++++++++++++++++++++--------------------
util/ras-mc-ctl.in | 7 ++++---
4 files changed, 34 insertions(+), 23 deletions(-)
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-08-22 14:30:41.581276901 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.c 2024-08-22 14:33:19.940957855 -0400
@@ -369,6 +369,9 @@ #if 0
trace_seq_printf(s, ", apicid= %x", e->apicid);
+ if (e->ppin)
+ trace_seq_printf(s, ", ppin= %llx", (long long)e->ppin);
+
/*
* FIXME: The original mcelog userspace tool uses DMI to map from
* address to DIMM. From the comments there, the code there doesn't
@@ -545,6 +548,10 @@ if (pevent_get_field_val(s, event, "ipid
return -1;
e.ipid = val;
+ /* Get PPIN */
+ if (!pevent_get_field_val(s, event, "ppin", record, &val, 1))
+ e.ppin = val;
+
switch (mce->cputype) {
case CPU_GENERIC:
break;
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-08-22 14:30:41.581276901 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-08-22 14:33:47.312729865 -0400
@@ -74,6 +74,7 @@ struct mce_event {
uint8_t cpuvendor;
uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */
uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
+ uint64_t ppin;
/* Parsed data */
char timestamp[64];
--- rasdaemon-0.6.7.orig/ras-record.c 2024-08-22 14:30:41.581276901 -0400
+++ rasdaemon-0.6.7/ras-record.c 2024-08-22 14:37:20.884941279 -0400
@@ -330,19 +330,20 @@ static const struct db_fields mce_record
{ .name="ip", .type="INTEGER" },
{ .name="tsc", .type="INTEGER" },
{ .name="walltime", .type="INTEGER" },
- { .name="cpu", .type="INTEGER" }, // 10
+ { .name = "ppin", .type = "INTEGER" }, // 10
+ { .name="cpu", .type="INTEGER" },
{ .name="cpuid", .type="INTEGER" },
{ .name="apicid", .type="INTEGER" },
{ .name="socketid", .type="INTEGER" },
- { .name="cs", .type="INTEGER" },
- { .name="bank", .type="INTEGER" }, //15
+ { .name="cs", .type="INTEGER" }, //15
+ { .name="bank", .type="INTEGER" },
{ .name="cpuvendor", .type="INTEGER" },
/* Parsed data - will likely change */
{ .name="bank_name", .type="TEXT" },
{ .name="error_msg", .type="TEXT" },
- { .name="mcgstatus_msg", .type="TEXT" },
- { .name="mcistatus_msg", .type="TEXT" }, // 20
+ { .name="mcgstatus_msg", .type="TEXT" }, // 20
+ { .name="mcistatus_msg", .type="TEXT" },
{ .name="mcastatus_msg", .type="TEXT" },
{ .name="user_action", .type="TEXT" },
{ .name="mc_location", .type="TEXT" },
@@ -372,21 +373,22 @@ return 0;
sqlite3_bind_int64 (priv->stmt_mce_record, 7, ev->ip);
sqlite3_bind_int64 (priv->stmt_mce_record, 8, ev->tsc);
sqlite3_bind_int64 (priv->stmt_mce_record, 9, ev->walltime);
- sqlite3_bind_int (priv->stmt_mce_record, 10, ev->cpu);
- sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpuid);
- sqlite3_bind_int (priv->stmt_mce_record, 12, ev->apicid);
- sqlite3_bind_int (priv->stmt_mce_record, 13, ev->socketid);
- sqlite3_bind_int (priv->stmt_mce_record, 14, ev->cs);
- sqlite3_bind_int (priv->stmt_mce_record, 15, ev->bank);
- sqlite3_bind_int (priv->stmt_mce_record, 16, ev->cpuvendor);
-
- sqlite3_bind_text(priv->stmt_mce_record, 17, ev->bank_name, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 18, ev->error_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 19, ev->mcgstatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcistatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcastatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 22, ev->user_action, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 23, ev->mc_location, -1, NULL);
+ sqlite3_bind_int64(priv->stmt_mce_record, 10, ev->ppin);
+ sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpu);
+ sqlite3_bind_int (priv->stmt_mce_record, 12, ev->cpuid);
+ sqlite3_bind_int (priv->stmt_mce_record, 13, ev->apicid);
+ sqlite3_bind_int (priv->stmt_mce_record, 14, ev->socketid);
+ sqlite3_bind_int (priv->stmt_mce_record, 15, ev->cs);
+ sqlite3_bind_int (priv->stmt_mce_record, 16, ev->bank);
+ sqlite3_bind_int (priv->stmt_mce_record, 17, ev->cpuvendor);
+
+ sqlite3_bind_text(priv->stmt_mce_record, 18, ev->bank_name, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 19, ev->error_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcgstatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcistatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcastatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 23, ev->user_action, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 24, ev->mc_location, -1, NULL);
rc = sqlite3_step(priv->stmt_mce_record);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-22 14:29:51.058697724 -0400
+++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-22 14:34:53.803175632 -0400
@@ -1317,7 +1317,7 @@ sub errors
{
require DBI;
my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
+ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
my ($bus_name, $dev_name, $driver_name, $reporter_name);
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add
# MCE mce_record errors
if ($has_mce == 1) {
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
$out = "";
while($query_handle->fetch()) {
$out .= "$id $time error: $msg";
@@ -1507,6 +1507,7 @@ $out .= sprintf ", misc=0x%08x", $misc i
$out .= sprintf ", ip=0x%08x", $ip if ($ip);
$out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
$out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
+ $out .= sprintf ", ppin=0x%08x", $ppin if ($ppin);
$out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
$out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
$out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);

View File

@ -0,0 +1,93 @@
commit 73d8177ce0d2fcb7693cacee4778d0845ebd3788
Author: sathya priya kumar <SathyaPriya.K@amd.com>
Date: Thu Jun 13 05:29:09 2024 +0000
rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits
Improve on the smca_smu2_mce_desc handling from commit ced615c.
Update the existing array with the extended error descriptions instead
of creating a new array, which simplifies the code.
Signed-off-by: Sathya Priya Kumar <sathyapriya.k@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
mce-amd-smca.c | 29 +++--------------------------
ras-mce-handler.h | 1 -
2 files changed, 3 insertions(+), 27 deletions(-)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-07-18 11:14:26.008582740 -0400
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-07-18 11:15:05.510270132 -0400
@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d
"An ECC or parity error in an SMU RAM instance",
};
-static const char * smca_smu2_mce_desc[64] = {
+static const char * const smca_smu2_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
"Data Cache Bank A ECC or parity error",
@@ -410,14 +410,13 @@ static const char * smca_smu2_mce_desc[6
"Instruction Tag Cache Bank B ECC or parity error",
"System Hub Read Buffer ECC or parity error",
"PHY RAS ECC Error",
-};
-
-static const char * smca_smu2_ext_mce_desc[] = {
+ [12 ... 57] = "Reserved",
"A correctable error from a GFX Sub-IP",
"A fatal error from a GFX Sub-IP",
"Reserved",
"Reserved",
"A poison error from a GFX Sub-IP",
+ "Reserved",
};
static const char * const smca_mp5_mce_desc[] = {
@@ -824,27 +823,6 @@ static struct smca_bank_name smca_names[
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
-void smca_smu2_ext_err_desc(void)
-{
- int i, j;
- int smu2_bits = 62;
-
- /*
- * MCA_CTL_SMU error stings are defined for b'58:59 and b'62
- * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU
- *
- * b'0:11 can be decoded from existing array smca_smu2_mce_desc.
- * b'12:57 are Reserved and b'58:62 are appended to the
- * smca_smu2_mce_desc.
- */
- for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) {
- for ( ; i < 58; i++)
- smca_smu2_mce_desc[i] = "Reserved";
-
- smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j];
- }
-}
-
void amd_decode_errcode(struct mce_event *e)
{
@@ -936,7 +914,6 @@ unsigned short xec = (e->status >> 16) &
mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
(ipid_high & MCI_IPID_MCATYPE) >> 16);
- smca_smu2_ext_err_desc();
fixup_hwid(m, &mcatype_hwid);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-07-18 11:14:26.008582740 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-07-18 11:14:28.987559165 -0400
@@ -121,7 +121,6 @@ int set_intel_imc_log(enum cputype cputy
/* Undertake AMD SMCA Error Decoding */
void decode_smca_error(struct mce_event *e, struct mce_priv *m);
void amd_decode_errcode(struct mce_event *e);
-void smca_smu2_ext_err_desc(void);
/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);

View File

@ -0,0 +1,128 @@
commit 79065939fc4bc1da72a3718937fab80e73a6dd75
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Tue Apr 2 00:07:38 2024 -0500
rasdaemon: Add support to parse microcode field of mce tracepoint
Support for exporting the Microcode Revision is being added to the
mce_record tracepoint.
Add the required, corresponding support in the rasdaemon for the field
to be parsed and logged or added to the database and viewed later through
ras-mc-ctl utility.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
ras-mce-handler.c | 7 +++++++
ras-mce-handler.h | 1 +
ras-record.c | 20 +++++++++++---------
util/ras-mc-ctl.in | 7 ++++---
4 files changed, 23 insertions(+), 12 deletions(-)
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-08-22 14:44:51.352160832 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.c 2024-08-22 14:44:51.361160757 -0400
@@ -372,6 +372,9 @@ #if 0
if (e->ppin)
trace_seq_printf(s, ", ppin= %llx", (long long)e->ppin);
+ if (e->microcode)
+ trace_seq_printf(s, ", microcode= %x", e->microcode);
+
/*
* FIXME: The original mcelog userspace tool uses DMI to map from
* address to DIMM. From the comments there, the code there doesn't
@@ -552,6 +555,10 @@ if (pevent_get_field_val(s, event, "ipid
if (!pevent_get_field_val(s, event, "ppin", record, &val, 1))
e.ppin = val;
+ /* Get Microcode Revision */
+ if (!pevent_get_field_val(s, event, "microcode", record, &val, 1))
+ e.microcode = val;
+
switch (mce->cputype) {
case CPU_GENERIC:
break;
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-08-22 14:44:51.352160832 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-08-22 14:44:51.361160757 -0400
@@ -75,6 +75,7 @@ struct mce_event {
uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */
uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
uint64_t ppin;
+ uint32_t microcode;
/* Parsed data */
char timestamp[64];
--- rasdaemon-0.6.7.orig/ras-record.c 2024-08-22 14:44:51.353160824 -0400
+++ rasdaemon-0.6.7/ras-record.c 2024-08-22 14:44:51.362160748 -0400
@@ -338,11 +338,12 @@ { .name = "ppin", .type = "INTEGER" },
{ .name="cs", .type="INTEGER" }, //15
{ .name="bank", .type="INTEGER" },
{ .name="cpuvendor", .type="INTEGER" },
+ { .name = "microcode", .type = "INTEGER" },
/* Parsed data - will likely change */
{ .name="bank_name", .type="TEXT" },
- { .name="error_msg", .type="TEXT" },
- { .name="mcgstatus_msg", .type="TEXT" }, // 20
+ { .name="error_msg", .type="TEXT" }, // 20
+ { .name="mcgstatus_msg", .type="TEXT" },
{ .name="mcistatus_msg", .type="TEXT" },
{ .name="mcastatus_msg", .type="TEXT" },
{ .name="user_action", .type="TEXT" },
@@ -381,14 +382,15 @@ sqlite3_bind_int64(priv->stmt_mce_record
sqlite3_bind_int (priv->stmt_mce_record, 15, ev->cs);
sqlite3_bind_int (priv->stmt_mce_record, 16, ev->bank);
sqlite3_bind_int (priv->stmt_mce_record, 17, ev->cpuvendor);
+ sqlite3_bind_int (priv->stmt_mce_record, 18, ev->microcode);
- sqlite3_bind_text(priv->stmt_mce_record, 18, ev->bank_name, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 19, ev->error_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcgstatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcistatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcastatus_msg, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 23, ev->user_action, -1, NULL);
- sqlite3_bind_text(priv->stmt_mce_record, 24, ev->mc_location, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 19, ev->bank_name, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 20, ev->error_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcgstatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 22, ev->mcistatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 23, ev->mcastatus_msg, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 24, ev->user_action, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mce_record, 25, ev->mc_location, -1, NULL);
rc = sqlite3_step(priv->stmt_mce_record);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-22 14:44:51.353160824 -0400
+++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-22 14:44:51.362160748 -0400
@@ -1317,7 +1317,7 @@ sub errors
{
require DBI;
my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
+ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $microcode, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
my ($bus_name, $dev_name, $driver_name, $reporter_name);
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add
# MCE mce_record errors
if ($has_mce == 1) {
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, ppin, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, microcode, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $ppin, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $microcode, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
$out = "";
while($query_handle->fetch()) {
$out .= "$id $time error: $msg";
@@ -1514,6 +1514,7 @@ $out .= sprintf ", apicid=0x%08x", $apic
$out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
$out .= sprintf ", cs=0x%08x", $cs if ($cs);
$out .= sprintf ", bank=0x%08x", $bank if ($bank);
+ $out .= sprintf ", microcode=0x%08x", $microcode if ($microcode);
$out .= "\n";
}

View File

@ -0,0 +1,35 @@
commit 794530fbf270eae9f6f43c6d0bbd3ec6f2b210f3
Author: hubin <hubin73@huawei.com>
Date: Thu May 18 16:14:41 2023 +0800
ras-events: quit loop in read_ras_event when kbuf data is broken
When kbuf data is broken, kbuffer_next_event() may move kbuf->index back to
the current kbuf->index position, causing an infinite loop.
In this situation, rasdaemon will repeatedly parse an invalid event and
print warnings like "ug! negative record size -8!", pushing CPU utilization
to 100%.
When kbuf data is broken, discard the current page and continue reading the
next page.
Signed-off-by: hubin <hubin73@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-events.c b/ras-events.c
index 2662467..fced7ab 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -512,6 +512,11 @@ static int read_ras_event_all_cpus(struct pthread_data *pdata,
kbuffer_load_subbuffer(kbuf, page);
while ((data = kbuffer_read_event(kbuf, &time_stamp))) {
+ if (kbuffer_curr_size(kbuf) < 0) {
+ log(TERM, LOG_ERR, "invalid kbuf data, discard\n");
+ break;
+ }
+
parse_ras_data(&pdata[i],
kbuf, data, time_stamp);

View File

@ -0,0 +1,34 @@
commit 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e
Author: Aristeu Rozanski <arozansk@redhat.com>
Date: Tue Apr 9 10:06:30 2024 -0400
mce-amd-smca: update smca_hwid to use smca_bank_types
bank_type is used as smca_bank_types everywhere, there's no point in
declaring it as unsigned int. It also upsets covscan:
3. rasdaemon-0.6.7/mce-amd-smca.c:914: assignment: Assigning: "bank_type" = "s_hwid->bank_type".
7. rasdaemon-0.6.7/mce-amd-smca.c:926: cond_at_most: Checking "bank_type >= 64U" implies that "bank_type" and "s_hwid->bank_type" may be up to 63 on the false branch.
14. rasdaemon-0.6.7/mce-amd-smca.c:942: overrun-local: Overrunning array "smca_mce_descs" of 38 16-byte elements at element index 63 (byte offset 1023) using index "bank_type" (which evaluates to 63).
# 940| /* Only print the descriptor of valid extended error code */
# 941| if (xec < smca_mce_descs[bank_type].num_descs)
# 942|-> mce_snprintf(e->mcastatus_msg,
# 943| "%s. Ext Err Code: %d",
# 944| smca_mce_descs[bank_type].descs[xec],
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7521ff7..6632663 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -706,7 +706,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
};
struct smca_hwid {
- unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/
+ enum smca_bank_types bank_type;
uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/
};

View File

@ -0,0 +1,95 @@
commit 83a3ced797256dcb1c93f8de4266fd7545fbfb3b
Author: Avadhut Naik <avadnaik@amd.com>
Date: Tue Nov 21 14:04:19 2023 -0600
rasdaemon: Add support for vendor-specific machine check error information
Some CPU vendors may provide additional vendor-specific machine check
error information. AMD, for example, provides FRU Text through SYND 1/2
registers if BIT 9 of SMCA_CONFIG register is set.
Add support to display the additional vendor-specific error information,
if any.
Signed-off-by: Avadhut Naik <Avadhut.Naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
mce-amd-smca.c | 12 ++++++++++++
ras-mce-handler.c | 22 ++++++++++++++++++++++
ras-mce-handler.h | 3 +++
3 files changed, 37 insertions(+)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-11-27 10:18:13.765255836 -0500
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-11-27 10:18:23.014169756 -0500
@@ -999,6 +999,18 @@ if (bank_type == SMCA_UMC_V2 && xec == 0
channel, csrow);
}
+
+ if (e->vdata_len) {
+ uint64_t smca_config = e->vdata[2];
+
+ /*
+ * BIT 9 of the CONFIG register of a few SMCA Bank types indicates
+ * presence of FRU Text in SYND 1 / 2 registers
+ */
+ if (smca_config & BIT(9))
+ memcpy(e->frutext, e->vdata, 16);
+ }
+
}
int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-11-27 10:18:23.014169756 -0500
+++ rasdaemon-0.6.7/ras-mce-handler.c 2024-11-27 10:19:38.849463954 -0500
@@ -375,6 +375,25 @@ #if 0
if (e->microcode)
trace_seq_printf(s, ", microcode= %x", e->microcode);
+ if (!e->vdata_len)
+ return;
+
+ if (strlen(e->frutext)) {
+ trace_seq_printf(s, ", FRU Text= %s", e->frutext);
+ trace_seq_printf(s, ", Vendor Data= ");
+ for (int i = 2; i < e->vdata_len/8; i++) {
+ trace_seq_printf(s, "0x%lx", e->vdata[i]);
+ trace_seq_printf(s, " ");
+ }
+ } else {
+ trace_seq_printf(s, ", Vendor Data= ");
+ for (int i = 0; i < e->vdata_len/8; i ++) {
+ trace_seq_printf(s, "0x%lx", e->vdata[i]);
+ trace_seq_printf(s, " ");
+ }
+ }
+
+
/*
* FIXME: The original mcelog userspace tool uses DMI to map from
* address to DIMM. From the comments there, the code there doesn't
@@ -559,6 +578,9 @@ if (pevent_get_field_val(s, event, "ipid
if (!pevent_get_field_val(s, event, "microcode", record, &val, 1))
e.microcode = val;
+ /* Get Vendor-specfic Data, if any */
+ e.vdata = pevent_get_field_raw(s, event, "v_data", record, &e.vdata_len, 1);
+
switch (mce->cputype) {
case CPU_GENERIC:
break;
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-11-27 10:18:23.014169756 -0500
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-11-27 10:20:05.249218250 -0500
@@ -76,8 +76,11 @@ struct mce_event {
uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
uint64_t ppin;
uint32_t microcode;
+ int32_t vdata_len;
+ const uint64_t *vdata;
/* Parsed data */
+ char frutext[17];
char timestamp[64];
char bank_name[64];
char error_msg[4096];

View File

@ -0,0 +1,22 @@
commit 885e546add918457c453bd3f753ac7df90b39e36
Author: weidongkl <weidongkl@sina.com>
Date: Tue Sep 19 16:29:21 2023 +0800
Add a space between "diskerror_event" and "store"
Signed-off-by: weidongkl <weidongkl@sina.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-record.c b/ras-record.c
index a5f99ae..6b050bb 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -484,7 +484,7 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev
if (!priv || !priv->stmt_diskerror_event)
return 0;
- log(TERM, LOG_INFO, "diskerror_eventstore: %p\n", priv->stmt_diskerror_event);
+ log(TERM, LOG_INFO, "diskerror_event store: %p\n", priv->stmt_diskerror_event);
sqlite3_bind_text(priv->stmt_diskerror_event, 1, ev->timestamp, -1, NULL);
sqlite3_bind_text(priv->stmt_diskerror_event, 2, ev->dev, -1, NULL);

View File

@ -0,0 +1,75 @@
commit 8b536321cc0679fb82d4ea7521f9375d88cec0cc
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Thu Nov 7 06:24:44 2024 +0000
rasdaemon: Modify support for vendor-specific machine check error information
Commit 83a3ced797256d ("rasdaemon: Add support for vendor-specific
machine check error information") assumes that MCA_CONFIG MSR will be
exported as part of vendor-specific error information through the MCE
tracepoint.
The same, however, is not true anymore. MCA_CONFIG MSR will not be
exported through the MCE tracepoint. Instead, the data from MCA_SYND1/2
MSRs, exported as vendor-specific error information on newer AMD SOCs,
should always be interpreted as FRUText.
Modify the error decoding support accordingly.
Fixes: 83a3ced797256d ("rasdaemon: Add support for vendor-specific
machine check error information")
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
mce-amd-smca.c | 13 ++-----------
ras-mce-handler.c | 15 +--------------
2 files changed, 3 insertions(+), 25 deletions(-)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-11-27 10:20:29.777989960 -0500
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-11-27 10:21:28.731441278 -0500
@@ -1000,17 +1000,8 @@ if (bank_type == SMCA_UMC_V2 && xec == 0
}
- if (e->vdata_len) {
- uint64_t smca_config = e->vdata[2];
-
- /*
- * BIT 9 of the CONFIG register of a few SMCA Bank types indicates
- * presence of FRU Text in SYND 1 / 2 registers
- */
- if (smca_config & BIT(9))
- memcpy(e->frutext, e->vdata, 16);
- }
-
+ if (e->vdata_len)
+ memcpy(e->frutext, e->vdata, 16);
}
int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2024-11-27 10:20:29.777989960 -0500
+++ rasdaemon-0.6.7/ras-mce-handler.c 2024-11-27 10:21:01.517694557 -0500
@@ -378,21 +378,8 @@ #if 0
if (!e->vdata_len)
return;
- if (strlen(e->frutext)) {
+ if (strlen(e->frutext))
trace_seq_printf(s, ", FRU Text= %s", e->frutext);
- trace_seq_printf(s, ", Vendor Data= ");
- for (int i = 2; i < e->vdata_len/8; i++) {
- trace_seq_printf(s, "0x%lx", e->vdata[i]);
- trace_seq_printf(s, " ");
- }
- } else {
- trace_seq_printf(s, ", Vendor Data= ");
- for (int i = 0; i < e->vdata_len/8; i ++) {
- trace_seq_printf(s, "0x%lx", e->vdata[i]);
- trace_seq_printf(s, " ");
- }
- }
-
/*
* FIXME: The original mcelog userspace tool uses DMI to map from

View File

@ -0,0 +1,24 @@
commit 9bd84aef87978b806178a73ed33c39d6c442fc1f
Author: weidong <weidongkl@sina.com>
Date: Tue Aug 8 08:59:12 2023 +0000
add ':' before error output
All prints except disk are preceded by a colon
Signed-off-by: weidong <weidongkl@sina.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index dc326d3..13078c2 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1469,7 +1469,7 @@ sub errors
$out .= "\n";
}
if ($out ne "") {
- print "Disk errors\n$out\n";
+ print "Disk errors:\n$out\n";
} else {
print "No disk errors.\n\n";
}

View File

@ -0,0 +1,117 @@
commit 9c86f6255f67a8bae28cd46c54500fc16bfc7a30
Author: Yang Shi <shy828301@gmail.com>
Date: Mon Apr 4 16:34:05 2022 -0700
rasdaemon: use the new block_rq_error tracepoint
Since Linux 5.18-rc1, a new block tracepoint called block_rq_error is
available that is dedicated to tracing disk error events. Currently
rasdaemon uses block_rq_complete, which also traces successful cases.
This produces excessive trace logs and some overhead, since the event is
triggered quite often.
Use the new tracepoint for disk error reporting, and the new trace point
has the same format as block_rq_complete.
Signed-off-by: Yang Shi <shy828301@gmail.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
ras-events.c | 53 ++++++++++-------------------------------------------
ras-record.c | 2 +-
2 files changed, 11 insertions(+), 44 deletions(-)
--- rasdaemon-0.6.7.orig/ras-events.c 2024-05-14 11:05:40.020599541 -0400
+++ rasdaemon-0.6.7/ras-events.c 2024-05-14 11:06:38.831067957 -0400
@@ -27,6 +27,7 @@ * Foundation, Inc., 51 Franklin Street,
#include <sys/poll.h>
#include <signal.h>
#include <sys/signalfd.h>
+#include <linux/version.h>
#include "libtrace/kbuffer.h"
#include "libtrace/event-parse.h"
#include "ras-mc-handler.h"
@@ -229,7 +230,7 @@ if (rc < 0) {
#endif
#ifdef HAVE_DISKERROR
- rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable);
+ rc |= __toggle_ras_mc_event(ras, "block", "block_rq_error", enable);
#endif
#ifdef HAVE_MEMORY_FAILURE
@@ -241,37 +242,6 @@ free_ras:
return rc;
}
-/*
- * Set kernel filter. libtrace doesn't provide an API for setting filters
- * in kernel, we have to implement it here.
- */
-static int filter_ras_mc_event(struct ras_events *ras, char *group, char *event,
- const char *filter_str)
-{
- int fd, rc;
- char fname[MAX_PATH + 1];
-
- snprintf(fname, sizeof(fname), "events/%s/%s/filter", group, event);
- fd = open_trace(ras, fname, O_RDWR | O_APPEND);
- if (fd < 0) {
- log(ALL, LOG_WARNING, "Can't open filter file\n");
- return errno;
- }
-
- rc = write(fd, filter_str ,strlen(filter_str));
- if (rc < 0) {
- log(ALL, LOG_WARNING, "Can't write to filter file\n");
- close(fd);
- return rc;
- }
- close(fd);
- if (!rc) {
- log(ALL, LOG_WARNING, "Nothing was written on filter file\n");
- return EIO;
- }
-
- return 0;
-}
/*
* Tracing read code
@@ -901,17 +871,14 @@ (void)open("/sys/kernel/debug/ras/daemon
#endif
#ifdef HAVE_DISKERROR
- rc = filter_ras_mc_event(ras, "block", "block_rq_complete", "error != 0");
- if (!rc) {
- rc = add_event_handler(ras, pevent, page_size, "block",
- "block_rq_complete", ras_diskerror_event_handler,
- NULL, DISKERROR_EVENT);
- if (!rc)
- num_events++;
- else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "block", "block_rq_complete");
- }
+ rc = add_event_handler(ras, pevent, page_size, "block",
+ "block_rq_error", ras_diskerror_event_handler,
+ NULL, DISKERROR_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "block", "block_rq_error");
#endif
#ifdef HAVE_MEMORY_FAILURE
--- rasdaemon-0.6.7.orig/ras-record.c 2024-05-14 11:07:24.573654494 -0400
+++ rasdaemon-0.6.7/ras-record.c 2024-05-14 11:07:07.626807674 -0400
@@ -456,7 +456,7 @@ return 0;
#endif
/*
- * Table and functions to handle block:block_rq_complete
+ * Table and functions to handle block:block_rq_error
*/
#ifdef HAVE_DISKERROR

View File

@ -0,0 +1,134 @@
commit ad0444190e02bca309a61a4bad51bc0e16c0aef5
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Fri May 10 13:20:19 2024 -0500
rasdaemon: Update SMCA bank error descriptions
Update error descriptions of SMCA bank types to support AMD's new Family
1Ah-based processors.
Also, modify some existing error descriptions to better reflect the error
received.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 6632663..a55e013 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -108,7 +108,7 @@ static const char * const smca_ls_mce_desc[] = {
"Store queue parity",
"Miss address buffer payload parity",
"L1 TLB parity",
- "Reserved",
+ "DC Tag error type 5",
"DC tag error type 6",
"DC tag error type 1",
"Internal error type 1",
@@ -125,6 +125,12 @@ static const char * const smca_ls_mce_desc[] = {
"DC tag error type 3",
"DC tag error type 5",
"L2 fill data error",
+ "Error on SCB cacheline state or address field",
+ "Error on SCB data, commit pipe 0",
+ "Error on SCB data, commit pipe 1",
+ "Error on SCB data for non-cacheable DRAM or IO",
+ "System Read Data Error detected by write combine buffer",
+ "Hardware Asserts",
};
static const char * const smca_ls2_mce_desc[] = {
@@ -168,7 +174,7 @@ static const char * const smca_if_mce_desc[] = {
"BP L1-BTB Multi-Hit Error",
"BP L2-BTB Multi-Hit Error",
"L2 Cache Response Poison error",
- "L2 Cache Error Response",
+ "System Read Data error",
"Hardware Assertion Error",
"L1-TLB Multi-Hit",
"L2-TLB Multi-Hit",
@@ -182,6 +188,7 @@ static const char * const smca_l2_mce_desc[] = {
"L2M Data Array ECC Error",
"Hardware Assert Error",
"SDP Read Response Parity Error",
+ "Error initiated by programmable state machine",
};
static const char * const smca_de_mce_desc[] = {
@@ -193,7 +200,7 @@ static const char * const smca_de_mce_desc[] = {
"Fetch address FIFO parity error",
"Patch RAM data parity error",
"Patch RAM sequencer parity error",
- "Micro-op buffer parity error",
+ "Micro-op fetch queue parity error",
"Hardware Assertion MCA Error",
};
@@ -235,6 +242,7 @@ static const char * const smca_l3_mce_desc[] = {
"L3 victim queue Data Fabric error",
"L3 Hardware Assertion",
"XI WCB Parity Poison Creation event",
+ "Machine check error initiated by DSM action",
};
static const char * const smca_cs_mce_desc[] = {
@@ -268,6 +276,9 @@ static const char * const smca_cs2_mce_desc[] = {
"Address Violation on the no data channel",
"Security Violation on the no data channel",
"Hardware Assert Error",
+ "Shadow Tag Array Protocol Error",
+ "Shadow Tag ECC Error",
+ "Shadow Tag Transaction Error",
};
/*
@@ -303,6 +314,8 @@ static const char * const smca_pie_mce_desc[] = {
"A deferred error was detected in the DF",
"Watch Dog Timer",
"An SRAM ECC error was detected in the CNLI block",
+ "Register access during DF Cstate",
+ "DSM Error",
};
static const char * const smca_umc_mce_desc[] = {
@@ -318,6 +331,11 @@ static const char * const smca_umc_mce_desc[] = {
"ECS Error",
"UMC Throttling Error",
"Read CRC Error",
+ "Reserved",
+ "Reserved",
+ "Reserved",
+ "Reserved",
+ "RFM SRAM ECC error",
};
static const char * const smca_umc_quirk_mce_desc[] = {
@@ -391,6 +409,12 @@ static const char * const smca_psp2_mce_desc[] = {
"TLB Bank 0 parity error",
"TLB Bank 1 parity error",
"System Hub Read Buffer ECC or parity error",
+ "FUSE IP SRAM ECC or parity error",
+ "PCRU FUSE SRAM ECC or parity error",
+ "SIB SRAM parity error",
+ "mpASP SECEMC Error",
+ "mpASP A5 Hang",
+ "SIB WDT error",
};
static const char * const smca_smu_mce_desc[] = {
@@ -431,6 +455,7 @@ static const char * const smca_mp5_mce_desc[] = {
"Instruction Cache Bank B ECC or parity error",
"Instruction Tag Cache Bank A ECC or parity error",
"Instruction Tag Cache Bank B ECC or parity error",
+ "Fuse SRAM ECC or parity error",
};
static const char * const smca_mpdma_mce_desc[] = {
@@ -483,6 +508,7 @@ static const char * const smca_mpdma_mce_desc[] = {
"MPDMA PTE Internal Data FIFO ECC or parity error",
"MPDMA PTE Command Memory DMA ECC or parity error",
"MPDMA PTE Command Memory Internal ECC or parity error",
+ "MPDMA TVF SDP Master Memory 7 ECC or parity error",
};
static const char * const smca_nbio_mce_desc[] = {

View File

@ -0,0 +1,56 @@
commit b1ace39286e287282a275b6edc90dc2f64e60a3c
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Mon Mar 25 23:06:08 2024 -0500
rasdaemon: ras-mc-ctl: Add support to display mcastatus_msg string
Currently, the mcastatus_msg string of struct mce_event is added to the
SQLite database by the rasdaemon when it is recording errors. The same
however, is not outputted by the ras-mc-ctl utility.
The string provides important error information relating to the received
MCE. For example, on AMD SMCA systems, the string outputs extended error
code and description. As such, the string should be present in the
output of ras-mc-ctl utility.
Add support to output the string through the ras-mc-ctl utility.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
util/ras-mc-ctl.in | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- rasdaemon-0.6.7.orig/util/ras-mc-ctl.in 2024-08-19 15:08:29.246429487 -0400
+++ rasdaemon-0.6.7/util/ras-mc-ctl.in 2024-08-19 15:10:55.478162148 -0400
@@ -1317,7 +1317,7 @@ sub errors
{
require DBI;
my ($query, $query_handle, $id, $time, $devname, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
- my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
+ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
my ($bus_name, $dev_name, $driver_name, $reporter_name);
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
@@ -1485,10 +1485,10 @@ $out .= sprintf "address=0x%08x, ", $add
# MCE mce_record errors
if ($has_mce == 1) {
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, mcastatus_msg, user_action, mc_location from mce_record order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $mcastatus_msg, $user_action, $mc_location));
$out = "";
while($query_handle->fetch()) {
$out .= "$id $time error: $msg";
@@ -1496,6 +1496,7 @@ $out .= sprintf "address=0x%08x, ", $add
$out .= ", bank $bank_name" if ($bank_name);
$out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
$out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
+ $out .= ", mca $mcastatus_msg" if ($mcastatus_msg);
$out .= ", $mc_location" if ($mc_location);
$out .= ", $user_action" if ($user_action);
$out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);

View File

@ -0,0 +1,94 @@
commit ced615cf8146f51b5d6fe7a29107a2adc77407ca
Author: Sathya Priya Kumar <sathyapriya.k@amd.com>
Date: Thu Jan 11 01:20:07 2024 -0600
rasdaemon: Add error decoding for MCA_CTL_SMU extended bits
Enable error decoding support for the newly added extended
error bit descriptions from MCA_CTL_SMU.
b'0:11 can be decoded from existing array smca_smu2_mce_desc.
Define a function to append the newly defined b'58:62 to the
smca_smu2_mce_desc. This avoids having to maintain Reserved entries
for b'12:57 in the code.
Signed-off-by: Sathya Priya Kumar <sathyapriya.k@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
mce-amd-smca.c | 33 ++++++++++++++++++++++++++++++++-
ras-mce-handler.h | 1 +
2 files changed, 33 insertions(+), 1 deletion(-)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-06-28 10:34:16.453522865 -0400
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-06-28 10:34:46.049124270 -0400
@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d
"An ECC or parity error in an SMU RAM instance",
};
-static const char * const smca_smu2_mce_desc[] = {
+static const char * smca_smu2_mce_desc[64] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
"Data Cache Bank A ECC or parity error",
@@ -409,6 +409,15 @@ static const char * const smca_smu2_mce_
"Instruction Tag Cache Bank A ECC or parity error",
"Instruction Tag Cache Bank B ECC or parity error",
"System Hub Read Buffer ECC or parity error",
+ "PHY RAS ECC Error",
+};
+
+static const char * smca_smu2_ext_mce_desc[] = {
+ "A correctable error from a GFX Sub-IP",
+ "A fatal error from a GFX Sub-IP",
+ "Reserved",
+ "Reserved",
+ "A poison error from a GFX Sub-IP",
};
static const char * const smca_mp5_mce_desc[] = {
@@ -815,6 +824,27 @@ static struct smca_bank_name smca_names[
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
+void smca_smu2_ext_err_desc(void)
+{
+ int i, j;
+ int smu2_bits = 62;
+
+ /*
+ * MCA_CTL_SMU error stings are defined for b'58:59 and b'62
+ * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU
+ *
+ * b'0:11 can be decoded from existing array smca_smu2_mce_desc.
+ * b'12:57 are Reserved and b'58:62 are appended to the
+ * smca_smu2_mce_desc.
+ */
+ for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) {
+ for ( ; i < 58; i++)
+ smca_smu2_mce_desc[i] = "Reserved";
+
+ smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j];
+ }
+}
+
void amd_decode_errcode(struct mce_event *e)
{
@@ -906,6 +936,7 @@ unsigned short xec = (e->status >> 16) &
mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
(ipid_high & MCI_IPID_MCATYPE) >> 16);
+ smca_smu2_ext_err_desc();
fixup_hwid(m, &mcatype_hwid);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-06-28 10:34:16.453522865 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-06-28 10:34:17.795508302 -0400
@@ -121,6 +121,7 @@ int set_intel_imc_log(enum cputype cputy
/* Undertake AMD SMCA Error Decoding */
void decode_smca_error(struct mce_event *e, struct mce_priv *m);
void amd_decode_errcode(struct mce_event *e);
+void smca_smu2_ext_err_desc(void);
/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);

View File

@ -1,8 +1,8 @@
Name: rasdaemon
Version: 0.6.7
Release: 9%{?dist}
Release: 18%{?dist}
Summary: Utility to receive RAS error tracings
License: GPLv2
License: GPL-2.0-only
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2
Patch0: labels.patch
@ -33,6 +33,19 @@ Patch24: 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch
Patch25: 2d15882a0cbfce0b905039bebc811ac8311cd739.patch
Patch26: c785d309dcbdeb7ecd219975244f3944a8d047e9.patch
Patch27: b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch
Patch28: 9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch
Patch29: 9bd84aef87978b806178a73ed33c39d6c442fc1f.patch
Patch30: 885e546add918457c453bd3f753ac7df90b39e36.patch
Patch31: 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch
Patch32: ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch
Patch33: 73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch
Patch34: ad0444190e02bca309a61a4bad51bc0e16c0aef5.patch
Patch35: b1ace39286e287282a275b6edc90dc2f64e60a3c.patch
Patch36: 045ab08eaa00172d50621df9502f6910f3fe3af4.patch
Patch37: 79065939fc4bc1da72a3718937fab80e73a6dd75.patch
Patch38: 794530fbf270eae9f6f43c6d0bbd3ec6f2b210f3.patch
Patch39: 83a3ced797256dcb1c93f8de4266fd7545fbfb3b.patch
Patch40: 8b536321cc0679fb82d4ea7521f9375d88cec0cc.patch
ExcludeArch: s390 s390x
BuildRequires: make
@ -95,6 +108,19 @@ an utility for reporting current error counts from the EDAC sysfs files.
%patch25 -p1
%patch26 -p1
%patch27 -p1
%patch28 -p1
%patch29 -p1
%patch30 -p1
%patch31 -p1
%patch32 -p1
%patch33 -p1
%patch34 -p1
%patch35 -p1
%patch36 -p1
%patch37 -p1
%patch38 -p1
%patch39 -p1
%patch40 -p1
# The tarball is locked in time the first time aclocal was ran and will keep
# requiring an older version of automake
@ -130,6 +156,33 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir
%{_sysconfdir}/sysconfig/rasdaemon
%changelog
* Wed Nov 27 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-18
- Add support for vendor specific information [RHEL-68673]
* Tue Nov 19 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-17
- ras-events: quit loop in read_ras_event when kbuf data is broken [RHEL-68127]
* Thu Sep 05 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-16
- rasdaemon: Add support to parse the PPIN field of mce tracepoint [RHEL-52911]
- rasdaemon: Add support to parse microcode field of mce tracepoint [RHEL-52911]
- rasdaemon: Update SMCA bank error descriptions [RHEL-52911]
- rasdaemon: ras-mc-ctl: Add support to display mcastatus_msg string [RHEL-52911]
* Thu Jul 18 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-15
- rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits [RHEL-48819]
* Fri Jun 28 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-13
- rasdaemon: Add error decoding for MCA_CTL_SMU extended bits [RHEL-35718]
* Thu Jun 20 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-12
- mce-amd-smca: update smca_hwid to use smca_bank_types [RHEL-24170]
* Wed May 08 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-11
- Fix excessive block messages [RHEL-8708]
* Wed Jan 10 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-10
- Update License string to use SPDX [RHELMISC-1262]
* Thu Oct 26 2023 Aristeu Rozanski <aris@redhat.com> 0.6.7-9
- Update SMCA support for AMD processors [RHEL-11092]