200 lines
7.2 KiB
Diff
200 lines
7.2 KiB
Diff
commit 70acd500302d2db318bb0e35b551f74fd4baebc4
|
|
Author: Shiju Jose <shiju.jose@huawei.com>
|
|
Date: Mon Feb 12 10:27:58 2024 +0000
|
|
|
|
rasdaemon: ras-mc-ctl: Add support for CXL AER uncorrectable trace events
|
|
|
|
Add support for CXL AER uncorrectable events to the ras-mc-ctl tool.
|
|
|
|
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
|
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
|
(cherry picked from commit f8b6da812eddc063ea739970f941fdd24fb984ae)
|
|
|
|
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
|
index 1cc19b3..c0a2ec6 100755
|
|
--- a/util/ras-mc-ctl.in
|
|
+++ b/util/ras-mc-ctl.in
|
|
@@ -43,6 +43,7 @@ my $modprobe = find_prog ("modprobe") or exit (1);
|
|
|
|
my $has_aer = 0;
|
|
my $has_arm = 0;
|
|
+my $has_cxl = 0;
|
|
my $has_devlink = 0;
|
|
my $has_disk_errors = 0;
|
|
my $has_extlog = 0;
|
|
@@ -51,6 +52,7 @@ my $has_mce = 0;
|
|
|
|
@WITH_AER_TRUE@$has_aer = 1;
|
|
@WITH_ARM_TRUE@$has_arm = 1;
|
|
+@WITH_CXL_TRUE@$has_cxl = 1;
|
|
@WITH_DEVLINK_TRUE@$has_devlink = 1;
|
|
@WITH_DISKERROR_TRUE@$has_disk_errors = 1;
|
|
@WITH_EXTLOG_TRUE@$has_extlog = 1;
|
|
@@ -1156,6 +1158,78 @@ sub get_uuid_le
|
|
return $out;
|
|
}
|
|
|
|
# Bit masks for the CXL AER uncorrectable error status word, one bit per
# error cause.  get_cxl_ue_error_status_text() tests a status value against
# these masks.  No mask is defined here for bits 12 and 13.
use constant {
    CXL_AER_UE_CACHE_DATA_PARITY => 1 << 0,
    CXL_AER_UE_CACHE_ADDR_PARITY => 1 << 1,
    CXL_AER_UE_CACHE_BE_PARITY   => 1 << 2,
    CXL_AER_UE_CACHE_DATA_ECC    => 1 << 3,
    CXL_AER_UE_MEM_DATA_PARITY   => 1 << 4,
    CXL_AER_UE_MEM_ADDR_PARITY   => 1 << 5,
    CXL_AER_UE_MEM_BE_PARITY     => 1 << 6,
    CXL_AER_UE_MEM_DATA_ECC      => 1 << 7,
    CXL_AER_UE_REINIT_THRESH     => 1 << 8,
    CXL_AER_UE_RSVD_ENCODE       => 1 << 9,
    CXL_AER_UE_POISON            => 1 << 10,
    CXL_AER_UE_RECV_OVERFLOW     => 1 << 11,
    CXL_AER_UE_INTERNAL_ERR      => 1 << 14,
    CXL_AER_UE_IDE_TX_ERR        => 1 << 15,
    CXL_AER_UE_IDE_RX_ERR        => 1 << 16,
};
|
|
+
|
|
# get_cxl_ue_error_status_text($error_status)
#
# Decode a CXL AER uncorrectable error status bitmask into readable text.
#
# Parameters:
#   $error_status - integer bitmask tested against the CXL_AER_UE_* masks.
#                   An undefined value is treated as 0 (no bits set).
#
# Returns a single string with one single-quoted label per set bit, in
# ascending bit order, joined by ", ".  Each quoted label is followed by a
# space (so two hits read "'A' , 'B' "); that spacing is kept exactly as
# before so existing output does not change.  Returns "" when no known bit
# is set.
sub get_cxl_ue_error_status_text
{
    my ($error_status) = @_;

    # A missing status decodes to "nothing set" rather than raising an
    # uninitialized-value warning on every bit test below.
    $error_status = 0 unless defined $error_status;

    # Mask/label pairs in the order the labels must appear in the output.
    # Plain commas are used on purpose: "=>" would quote the constant name
    # instead of calling the constant.
    my @flags = (
        [ CXL_AER_UE_CACHE_DATA_PARITY, "Cache Data Parity Error" ],
        [ CXL_AER_UE_CACHE_ADDR_PARITY, "Cache Address Parity Error" ],
        [ CXL_AER_UE_CACHE_BE_PARITY,   "Cache Byte Enable Parity Error" ],
        [ CXL_AER_UE_CACHE_DATA_ECC,    "Cache Data ECC Error" ],
        [ CXL_AER_UE_MEM_DATA_PARITY,   "Memory Data Parity Error" ],
        [ CXL_AER_UE_MEM_ADDR_PARITY,   "Memory Address Parity Error" ],
        [ CXL_AER_UE_MEM_BE_PARITY,     "Memory Byte Enable Parity Error" ],
        [ CXL_AER_UE_MEM_DATA_ECC,      "Memory Data ECC Error" ],
        [ CXL_AER_UE_REINIT_THRESH,     "REINIT Threshold Hit" ],
        [ CXL_AER_UE_RSVD_ENCODE,       "Received Unrecognized Encoding" ],
        [ CXL_AER_UE_POISON,            "Received Poison From Peer" ],
        [ CXL_AER_UE_RECV_OVERFLOW,     "Receiver Overflow" ],
        [ CXL_AER_UE_INTERNAL_ERR,      "Component Specific Error" ],
        [ CXL_AER_UE_IDE_TX_ERR,        "IDE Tx Error" ],
        [ CXL_AER_UE_IDE_RX_ERR,        "IDE Rx Error" ],
    );

    my @out;
    for my $flag (@flags) {
        my ($mask, $label) = @$flag;
        # Labels are plain text, so they are concatenated directly instead
        # of being passed through sprintf as a format string.
        push @out, "'$label' " if ($error_status & $mask);
    }

    return join (", ", @out);
}
|
|
+
|
|
sub summary
|
|
{
|
|
require DBI;
|
|
@@ -1163,7 +1237,7 @@ sub summary
|
|
my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result);
|
|
my ($etype, $severity, $etype_string, $severity_string);
|
|
my ($dev_name, $dev);
|
|
- my ($mpidr);
|
|
+ my ($mpidr, $memdev);
|
|
|
|
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
|
|
|
@@ -1219,6 +1293,25 @@ sub summary
|
|
$query_handle->finish;
|
|
}
|
|
|
|
+ # CXL errors
|
|
+ if ($has_cxl == 1) {
|
|
+ # CXL AER uncorrectable errors
|
|
+ $query = "select memdev, count(*) from cxl_aer_ue_event$conf{opt}{since} group by memdev";
|
|
+ $query_handle = $dbh->prepare($query);
|
|
+ $query_handle->execute();
|
|
+ $query_handle->bind_columns(\($memdev, $count));
|
|
+ $out = "";
|
|
+ while($query_handle->fetch()) {
|
|
+ $out .= "\t$memdev errors: $count\n";
|
|
+ }
|
|
+ if ($out ne "") {
|
|
+ print "CXL AER uncorrectable events summary:\n$out\n";
|
|
+ } else {
|
|
+ print "No CXL AER uncorrectable errors.\n\n";
|
|
+ }
|
|
+ $query_handle->finish;
|
|
+ }
|
|
+
|
|
# extlog errors
|
|
if ($has_extlog == 1) {
|
|
$query = "select etype, severity, count(*) from extlog_event group by etype, severity";
|
|
@@ -1324,6 +1417,7 @@ sub errors
|
|
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
|
|
my ($error_count, $affinity, $mpidr, $r_state, $psci_state);
|
|
my ($pfn, $page_type, $action_result);
|
|
+ my ($memdev, $host, $serial, $error_status, $first_error, $header_log);
|
|
|
|
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
|
|
|
@@ -1389,6 +1483,44 @@ sub errors
|
|
$query_handle->finish;
|
|
}
|
|
|
|
+ # CXL errors
|
|
+ if ($has_cxl == 1) {
|
|
+ # CXL AER uncorrectable errors
|
|
+ use constant SZ_512 => 0x200;
|
|
+ use constant CXL_HEADERLOG_SIZE_U32 => SZ_512/32;
|
|
+ $query = "select id, timestamp, memdev, host, serial, error_status, first_error, header_log from cxl_aer_ue_event$conf{opt}{since} order by id";
|
|
+ $query_handle = $dbh->prepare($query);
|
|
+ $query_handle->execute();
|
|
+ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $error_status, $first_error, $header_log));
|
|
+ $out = "";
|
|
+ while($query_handle->fetch()) {
|
|
+ $out .= "$id $timestamp error: ";
|
|
+ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev);
|
|
+ $out .= "host=$host, " if (defined $host && length $host);
|
|
+ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial);
|
|
+ if (defined $error_status && length $error_status) {
|
|
+ $out .= sprintf "error_status: %s, ", get_cxl_ue_error_status_text($error_status);
|
|
+ }
|
|
+ if (defined $first_error && length $first_error) {
|
|
+ $out .= sprintf "first_error: %s, ", get_cxl_ue_error_status_text($first_error);
|
|
+ }
|
|
+ if (defined $header_log && length $header_log) {
|
|
+ $out .= sprintf "header_log:\n";
|
|
+ my @bytes = unpack "C*", $header_log;
|
|
+ for (my $i = 0; $i < CXL_HEADERLOG_SIZE_U32; $i++) {
|
|
+ $out .= sprintf "%08x ", $bytes[$i];
|
|
+ }
|
|
+ }
|
|
+ $out .= "\n";
|
|
+ }
|
|
+ if ($out ne "") {
|
|
+ print "CXL AER uncorrectable events:\n$out\n";
|
|
+ } else {
|
|
+ print "No CXL AER uncorrectable errors.\n\n";
|
|
+ }
|
|
+ $query_handle->finish;
|
|
+ }
|
|
+
|
|
# Extlog errors
|
|
if ($has_extlog == 1) {
|
|
$query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
|