import rasdaemon-0.6.1-3.el8

This commit is contained in:
CentOS Sources 2020-01-21 17:42:50 -05:00 committed by Andrew Lukoshko
commit 84623fe140
5 changed files with 980 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
SOURCES/rasdaemon-0.6.1.tar.bz2

1
.rasdaemon.metadata Normal file
View File

@ -0,0 +1 @@
742eda555cccb8ca8f9b6a18bab1f4a732c11318 SOURCES/rasdaemon-0.6.1.tar.bz2

View File

@ -0,0 +1,149 @@
commit 60a91e4da4f2daf2b10143fc148a8043312b61e5
Author: Aristeu Rozanski <aris@redhat.com>
Date: Wed Aug 1 16:29:58 2018 -0400
rasdaemon: ras-mc-ctl: add option to show error counts
In some scenarios it might not be desirable to have a daemon running
to parse and store the errors provided by EDAC and only having the
number of CEs and UEs is enough. This patch implements this feature
as an ras-mc-ctl option.
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 38b7824..aee431a 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -50,6 +50,8 @@ my %dimm_location = ();
my %csrow_size = ();
my %rank_size = ();
my %csrow_ranks = ();
+my %dimm_ce_count = ();
+my %dimm_ue_count = ();
my @layers;
my @max_pos;
@@ -76,6 +78,7 @@ Usage: $prog [OPTIONS...]
--layout Display the memory layout.
--summary Presents a summary of the logged errors.
--errors Shows the errors stored at the error database.
+ --error-count Shows the corrected and uncorrected error counts using sysfs.
--help This help message.
EOF
@@ -83,7 +86,7 @@ parse_cmdline();
if ( $conf{opt}{mainboard} || $conf{opt}{print_labels}
|| $conf{opt}{register_labels} || $conf{opt}{display_memory_layout}
- || $conf{opt}{guess_dimm_label}) {
+ || $conf{opt}{guess_dimm_label} || $conf{opt}{error_count}) {
get_mainboard_info();
@@ -105,6 +108,9 @@ if ( $conf{opt}{mainboard} || $conf{opt}{print_labels}
if ($conf{opt}{guess_dimm_label}) {
guess_dimm_label ();
}
+ if ($conf{opt}{error_count}) {
+ display_error_count ();
+ }
}
if ($conf{opt}{status}) {
@@ -134,6 +140,7 @@ sub parse_cmdline
$conf{opt}{guess_dimm_label} = 0;
$conf{opt}{summary} = 0;
$conf{opt}{errors} = 0;
+ $conf{opt}{error_count} = 0;
my $rref = \$conf{opt}{report};
my $mref = \$conf{opt}{mainboard};
@@ -150,7 +157,8 @@ sub parse_cmdline
"status" => \$conf{opt}{status},
"layout" => \$conf{opt}{display_memory_layout},
"summary" => \$conf{opt}{summary},
- "errors" => \$conf{opt}{errors}
+ "errors" => \$conf{opt}{errors},
+ "error-count" => \$conf{opt}{error_count}
);
usage(1) if !$rc;
@@ -284,6 +292,30 @@ sub parse_dimm_nodes
$dimm_label_file{$str_loc} = $file;
$dimm_location{$str_loc} = $location;
+ my $count;
+
+ $file =~s/dimm_label/dimm_ce_count/;
+ if (-e $file) {
+ open IN, $file;
+ chomp($count = <IN>);
+ close IN;
+ } else {
+ log_error ("dimm_ce_count not found in sysfs. Old kernel?\n");
+ exit -1;
+ }
+ $dimm_ce_count{$str_loc} = $count;
+
+ $file =~s/dimm_ce_count/dimm_ue_count/;
+ if (-e $file) {
+ open IN, $file;
+ chomp($count = <IN>);
+ close IN;
+ } else {
+ log_error ("dimm_ue_count not found in sysfs. Old kernel?\n");
+ exit -1;
+ }
+ $dimm_ue_count{$str_loc} = $count;
+
return;
}
}
@@ -906,6 +938,45 @@ sub display_memory_layout
dimm_display_mem();
}
+# Print a table of corrected (CE) and uncorrected (UE) error counts, one
+# row per DIMM, using values read from the EDAC sysfs tree.
+#
+# parse_dimm_nodes() is run over the sysfs directory first; the table is
+# then built from the file-level hashes %dimm_node, %dimm_label_file,
+# %dimm_ce_count and %dimm_ue_count.  Exits with -1 when %dimm_node is
+# empty.  A label file that cannot be read is reported through
+# log_error() and shown as an empty label rather than aborting the table.
+sub display_error_count
+{
+    my $sysfs_dir = "/sys/devices/system/edac/mc";
+    my $max_width = 0;
+    my %dimm_labels = ();
+
+    find ({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir);
+
+    if (!scalar(keys %dimm_node)) {
+        log_error ("No DIMMs found in /sys or new sysfs EDAC interface not found.\n");
+        exit -1;
+    }
+
+    # Sort once so both passes, and repeated runs, list DIMMs in the same order.
+    my @dimm_keys = sort keys %dimm_node;
+
+    foreach my $key (@dimm_keys) {
+        my $label_file = $dimm_label_file{$key};
+        my $label;
+
+        # Three-argument open on a lexical handle, with the result checked,
+        # so a missing or unreadable label file is diagnosed instead of
+        # silently reading from an unopened handle.
+        if (!defined $label_file) {
+            log_error ("No label file known for DIMM $key\n");
+        } elsif (open(my $in, '<', $label_file)) {
+            $label = <$in>;
+            close $in;
+        } else {
+            log_error ("Cannot open $label_file: $!\n");
+        }
+
+        # An empty or unreadable file leaves $label undefined.
+        $label = "" unless defined $label;
+        chomp($label);
+
+        my $label_width = length $label;
+        if ($label_width > $max_width) {
+            $max_width = $label_width;
+        }
+        $dimm_labels{$key} = $label;
+    }
+
+    # Pad the header to the widest label; skip padding when every label is
+    # shorter than the header itself (a negative repeat count only warns).
+    my $header = "Label";
+    my $padding = $max_width - length $header;
+    $header .= " " x $padding if $padding > 0;
+    print($header . "\tCE\tUE\n");
+
+    foreach my $key (@dimm_keys) {
+        my $ce_count = $dimm_ce_count{$key};
+        my $ue_count = $dimm_ue_count{$key};
+
+        print("$dimm_labels{$key}\t$ce_count\t$ue_count\n");
+    }
+}
+
+
sub find_prog
{
my ($file) = @_;

View File

@ -0,0 +1,670 @@
commit a16ca0711001957ee98f2c124abce0fa1f801529
Author: Chandu-babu Namburu <chandu@amd.com>
Date: Wed Jan 30 20:36:45 2019 +0530
rasdaemon: add support for AMD Scalable MCA
Add logic here to decode errors from all known IP blocks for
AMD Scalable MCA supported processors
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Chandu-babu Namburu <chandu@amd.com>
---
mce-amd-smca.c | 371 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
mce-amd.c | 122 +++++++++++++++++
ras-mce-handler.c | 24 +++
ras-mce-handler.h | 15 ++
4 files changed, 530 insertions(+), 2 deletions(-)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/mce-amd-smca.c 2019-07-12 11:35:04.836470461 -0400
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2018, AMD, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "ras-mce-handler.h"
+#include "bitfield.h"
+
+/* MCA_STATUS REGISTER FOR FAMILY 17H
+ *********************** Higher 32-bits *****************************
+ * 63: VALIDERROR, 62: OVERFLOW, 61: UC, 60: Err ENABLE,
+ * 59: Misc Valid, 58: Addr Valid, 57: PCC, 56: ErrCoreID Valid,
+ * 55: TCC, 54: RES, 53: Syndrome Valid, 52: Transparent,
+ * 51: RES, 50: RES, 49: RES, 48: RES,
+ * 47: RES, 46: CECC, 45: UECC, 44: Deferred,
+ * 43: Poison, 42: RES, 41: RES, 40: RES,
+ * 39: RES, 38: RES, 37: ErrCoreID[5], 36: ErrCoreID[4],
+ * 35: ErrCoreID[3], 34: ErrCoreID[2] 33: ErrCoreID[1] 32: ErrCoreID[0]
+ *********************** Lower 32-bits ******************************
+ * 31: RES, 30: RES, 29: RES, 28: RES,
+ * 27: RES, 26: RES, 25: RES, 24: RES
+ * 23: RES, 22: RES, 21: XEC[5], 20: XEC[4],
+ * 19: XEC[3], 18: XEC[2], 17: XEC[1], 16: XEC[0]
+ * 15: EC[15], 14: EC[14], 13: EC[13], 12: EC[12],
+ * 11: EC[11], 10: EC[10], 09: EC[9], 08: EC[8],
+ * 07: EC[7], 06: EC[6], 05: EC[5], 04: EC[4],
+ * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0]
+ */
+
+/* These may be used by multiple smca_hwid_mcatypes */
+enum smca_bank_types {
+ SMCA_LS = 0, /* Load Store */
+ SMCA_IF, /* Instruction Fetch */
+ SMCA_L2_CACHE, /* L2 Cache */
+ SMCA_DE, /* Decoder Unit */
+ SMCA_RESERVED, /* Reserved */
+ SMCA_EX, /* Execution Unit */
+ SMCA_FP, /* Floating Point */
+ SMCA_L3_CACHE, /* L3 Cache */
+ SMCA_CS, /* Coherent Slave */
+ SMCA_PIE, /* Power, Interrupts, etc. */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_SMCA_BANK_TYPES
+};
+
+/* SMCA Extended error strings */
+/* Load Store */
+static const char * const smca_ls_mce_desc[] = {
+ "Load queue parity",
+ "Store queue parity",
+ "Miss address buffer payload parity",
+ "L1 TLB parity",
+ "Reserved",
+ "DC tag error type 6",
+ "DC tag error type 1",
+ "Internal error type 1",
+ "Internal error type 2",
+ "Sys Read data error thread 0",
+ "Sys read data error thread 1",
+ "DC tag error type 2",
+ "DC data error type 1 (poison consumption)",
+ "DC data error type 2",
+ "DC data error type 3",
+ "DC tag error type 4",
+ "L2 TLB parity",
+ "PDC parity error",
+ "DC tag error type 3",
+ "DC tag error type 5",
+ "L2 fill data error",
+};
+/* Instruction Fetch */
+static const char * const smca_if_mce_desc[] = {
+ "microtag probe port parity error",
+ "IC microtag or full tag multi-hit error",
+ "IC full tag parity",
+ "IC data array parity",
+ "Decoupling queue phys addr parity error",
+ "L0 ITLB parity error",
+ "L1 ITLB parity error",
+ "L2 ITLB parity error",
+ "BPQ snoop parity on Thread 0",
+ "BPQ snoop parity on Thread 1",
+ "L1 BTB multi-match error",
+ "L2 BTB multi-match error",
+ "L2 Cache Response Poison error",
+ "System Read Data error",
+};
+/* L2 Cache */
+static const char * const smca_l2_mce_desc[] = {
+ "L2M tag multi-way-hit error",
+ "L2M tag ECC error",
+ "L2M data ECC error",
+ "HW assert",
+};
+/* Decoder Unit */
+static const char * const smca_de_mce_desc[] = {
+ "uop cache tag parity error",
+ "uop cache data parity error",
+ "Insn buffer parity error",
+ "uop queue parity error",
+ "Insn dispatch queue parity error",
+ "Fetch address FIFO parity",
+ "Patch RAM data parity",
+ "Patch RAM sequencer parity",
+ "uop buffer parity"
+};
+/* Execution Unit */
+static const char * const smca_ex_mce_desc[] = {
+ "Watchdog timeout error",
+ "Phy register file parity",
+ "Flag register file parity",
+ "Immediate displacement register file parity",
+ "Address generator payload parity",
+ "EX payload parity",
+ "Checkpoint queue parity",
+ "Retire dispatch queue parity",
+ "Retire status queue parity error",
+ "Scheduling queue parity error",
+ "Branch buffer queue parity error",
+};
+/* Floating Point Unit */
+static const char * const smca_fp_mce_desc[] = {
+ "Physical register file parity",
+ "Freelist parity error",
+ "Schedule queue parity",
+ "NSQ parity error",
+ "Retire queue parity",
+ "Status register file parity",
+ "Hardware assertion",
+};
+/* L3 Cache */
+static const char * const smca_l3_mce_desc[] = {
+ "Shadow tag macro ECC error",
+ "Shadow tag macro multi-way-hit error",
+ "L3M tag ECC error",
+ "L3M tag multi-way-hit error",
+ "L3M data ECC error",
+ "XI parity, L3 fill done channel error",
+ "L3 victim queue parity",
+ "L3 HW assert",
+};
+/* Coherent Slave Unit */
+static const char * const smca_cs_mce_desc[] = {
+ "Illegal request from transport layer",
+ "Address violation",
+ "Security violation",
+ "Illegal response from transport layer",
+ "Unexpected response",
+ "Parity error on incoming request or probe response data",
+ "Parity error on incoming read response data",
+ "Atomic request parity",
+ "ECC error on probe filter access",
+};
+/* Power, Interrupt, etc.. */
+static const char * const smca_pie_mce_desc[] = {
+ "HW assert",
+ "Internal PIE register security violation",
+ "Error on GMI link",
+ "Poison data written to internal PIE register",
+};
+/* Unified Memory Controller */
+static const char * const smca_umc_mce_desc[] = {
+ "DRAM ECC error",
+ "Data poison error on DRAM",
+ "SDP parity error",
+ "Advanced peripheral bus error",
+ "Command/address parity error",
+ "Write data CRC error",
+};
+/* Parameter Block */
+static const char * const smca_pb_mce_desc[] = {
+ "Parameter Block RAM ECC error",
+};
+/* Platform Security Processor */
+static const char * const smca_psp_mce_desc[] = {
+ "PSP RAM ECC or parity error",
+};
+/* System Management Unit */
+static const char * const smca_smu_mce_desc[] = {
+ "SMU RAM ECC or parity error",
+};
+
+/* One bank type's extended-error description table and its length. */
+struct smca_mce_desc {
+ const char * const *descs;
+ unsigned int num_descs;
+};
+
+/*
+ * Description tables indexed by enum smca_bank_types.
+ *
+ * There is no initializer for SMCA_RESERVED, so that slot is
+ * zero-initialized (descs == NULL, num_descs == 0).  decode_smca_error()
+ * compares the extended error code against num_descs before using it as
+ * an index into descs.
+ */
+static struct smca_mce_desc smca_mce_descs[] = {
+ [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
+ [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
+ [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
+ [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
+ [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) },
+ [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
+ [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
+ [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
+ [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
+ [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
+ [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
+ [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
+ [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
+};
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/
+ uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/
+};
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
+ /* { bank_type, mcatype_hwid } */
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, 0x000000B0 },
+ { SMCA_IF, 0x000100B0 },
+ { SMCA_L2_CACHE, 0x000200B0 },
+ { SMCA_DE, 0x000300B0 },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, 0x000500B0 },
+ { SMCA_FP, 0x000600B0 },
+ { SMCA_L3_CACHE, 0x000700B0 },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, 0x0000002E },
+ { SMCA_PIE, 0x0001002E },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, 0x00000096 },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, 0x00000005 },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, 0x000000FF },
+
+ /* System Management Unit MCA type */
+ { SMCA_SMU, 0x00000001 },
+};
+
+struct smca_bank_name {
+ const char *name;
+};
+
+static struct smca_bank_name smca_names[] = {
+ [SMCA_LS] = { "Load Store Unit" },
+ [SMCA_IF] = { "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "L2 Cache" },
+ [SMCA_DE] = { "Decode Unit" },
+ [SMCA_RESERVED] = { "Reserved" },
+ [SMCA_EX] = { "Execution Unit" },
+ [SMCA_FP] = { "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "L3 Cache" },
+ [SMCA_CS] = { "Coherent Slave" },
+ [SMCA_PIE] = { "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "Unified Memory Controller" },
+ [SMCA_PB] = { "Parameter Block" },
+ [SMCA_PSP] = { "Platform Security Processor" },
+ [SMCA_SMU] = { "System Management Unit" },
+};
+
+/*
+ * Run the common AMD error-code decoder, then record the two status
+ * flags handled here (poison, task context corrupt) in e->mcistatus_msg.
+ */
+static void amd_decode_errcode(struct mce_event *e)
+{
+	int poisoned, tcc;
+
+	decode_amd_errcode(e);
+
+	/* Sampled after the common decoder has run, as the checks were before. */
+	poisoned = (e->status & MCI_STATUS_POISON) != 0;
+	tcc = (e->status & MCI_STATUS_TCC) != 0;
+
+	if (poisoned) {
+		mce_snprintf(e->mcistatus_msg, "Poison consumed");
+	}
+	if (tcc) {
+		mce_snprintf(e->mcistatus_msg, "Task_context_corrupt");
+	}
+}
+/*
+ * Map a UMC bank to its channel number.  The bank's instance_id sits in
+ * the lower 32 bits of its IPID; the channel is the position of that id
+ * in the table of known ids, or -1 when the id is not listed.
+ */
+static int find_umc_channel(struct mce_event *e)
+{
+	const uint32_t known_ids[] = { 0x50f00, 0x150f00 };
+	const uint32_t wanted = EXTRACT(e->ipid, 0, 31);
+	int idx;
+
+	/* Scan from the end so the highest matching index is the one returned. */
+	for (idx = (int)ARRAY_SIZE(known_ids) - 1; idx >= 0; idx--) {
+		if (known_ids[idx] == wanted)
+			return idx;
+	}
+
+	return -1;
+}
+/*
+ * Decode extended errors according to Scalable MCA specification.
+ *
+ * The upper 32 bits of the IPID (mcatype and hwid) select the bank type
+ * from smca_hwid_mcatypes[].  On success the bank name is written to
+ * e->bank_name and, when the extended error code (MCA_STATUS bits 21:16)
+ * has a description for that bank type, the description is recorded in
+ * e->mcastatus_msg.  For a UMC bank with extended error code 0 the
+ * memory channel and chip-select row are recorded in e->mc_location.
+ * When the bank type cannot be determined, only an explanatory message
+ * is stored in e->mcastatus_msg.
+ */
+static void decode_smca_error(struct mce_event *e)
+{
+	/* Out-of-range default so the variable is never read uninitialized. */
+	enum smca_bank_types bank_type = N_SMCA_BANK_TYPES;
+	const char *ip_name;
+	unsigned short xec = (e->status >> 16) & 0x3f;
+	const struct smca_hwid *s_hwid;
+	uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+		s_hwid = &smca_hwid_mcatypes[i];
+		if (mcatype_hwid == s_hwid->mcatype_hwid) {
+			bank_type = s_hwid->bank_type;
+			break;
+		}
+	}
+
+	if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
+		strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
+		return;
+	}
+
+	if (bank_type >= N_SMCA_BANK_TYPES) {
+		strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
+		return;
+	}
+
+	if (bank_type == SMCA_RESERVED) {
+		strcpy(e->mcastatus_msg, "Bank 4 is reserved.\n");
+		return;
+	}
+
+	ip_name = smca_names[bank_type].name;
+
+	mce_snprintf(e->bank_name, "%s (bank=%d)", ip_name, e->bank);
+
+	/* Only print the descriptor of valid extended error code */
+	if (xec < smca_mce_descs[bank_type].num_descs)
+		mce_snprintf(e->mcastatus_msg,
+			     " %s.\n", smca_mce_descs[bank_type].descs[xec]);
+
+	if (bank_type == SMCA_UMC && xec == 0) {
+		/*
+		 * Signed on purpose: find_umc_channel() returns -1 for an
+		 * unknown instance id, and both values are printed with %d.
+		 */
+		int channel = find_umc_channel(e);
+		int csrow = e->synd & 0x7;	/* Bit 0, 1 ,2 */
+
+		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
+			     channel, csrow);
+	}
+}
+
+/*
+ * Entry point for events from AMD Scalable MCA processors: records the
+ * raw MCG status and its RIPV/EIPV/MCIP flags in e->mcgstatus_msg, then
+ * runs the SMCA bank decoder followed by the AMD error-code decoder.
+ * Always returns 0.
+ */
+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
+{
+	static const struct {
+		uint64_t mask;
+		const char *name;
+	} mcg_flags[] = {
+		{ MCG_STATUS_RIPV, "RIPV" },
+		{ MCG_STATUS_EIPV, "EIPV" },
+		{ MCG_STATUS_MCIP, "MCIP" },
+	};
+	const uint64_t mcgstatus = e->mcgstatus;
+	unsigned int i;
+
+	mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld",
+		     (long long)e->mcgstatus);
+
+	/* Flags are reported in table order: RIPV, EIPV, MCIP. */
+	for (i = 0; i < sizeof(mcg_flags) / sizeof(mcg_flags[0]); i++) {
+		if (mcgstatus & mcg_flags[i].mask)
+			mce_snprintf(e->mcgstatus_msg, "%s", mcg_flags[i].name);
+	}
+
+	decode_smca_error(e);
+	amd_decode_errcode(e);
+	return 0;
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/mce-amd.c 2019-07-12 11:35:04.836470461 -0400
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, The AMD, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "ras-mce-handler.h"
+
+/* Error Code Types */
+#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010)
+#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100)
+#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800)
+#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400)
+
+/* Error code: transaction type (TT) */
+static char *transaction[] = {
+ "instruction", "data", "generic", "reserved"
+};
+/* Error codes: cache level (LL) */
+static char *cachelevel[] = {
+ "reserved", "L1", "L2", "L3/generic"
+};
+/* Error codes: memory transaction type (RRRR) */
+static char *memtrans[] = {
+ "generic", "generic read", "generic write", "data read",
+ "data write", "instruction fetch", "prefetch", "evict", "snoop",
+ "?", "?", "?", "?", "?", "?", "?"
+};
+/* Participation Processor */
+static char *partproc[] = {
+ "local node origin", "local node response",
+ "local node observed", "generic participation"
+};
+/* Timeout */
+static char *timeout[] = {
+ "request didn't time out",
+ "request timed out"
+};
+/* internal unclassified error code */
+static char *internal[] = { "reserved",
+ "reserved",
+ "hardware assert",
+ "reserved" };
+
+#define TT(x) (((x) >> 2) & 0x3) /*bit 2, bit 3*/
+#define TT_MSG(x) transaction[TT(x)]
+#define LL(x) ((x) & 0x3) /*bit 0, bit 1*/
+#define LL_MSG(x) cachelevel[LL(x)]
+
+#define R4(x) (((x) >> 4) & 0xF) /*bit 4, bit 5, bit 6, bit 7 */
+#define R4_MSG(x) ((R4(x) < 9) ? memtrans[R4(x)] : "Wrong R4!")
+
+#define TO(x) (((x) >> 8) & 0x1) /*bit 8*/
+#define TO_MSG(x) timeout[TO(x)]
+#define PP(x) (((x) >> 9) & 0x3) /*bit 9, bit 10*/
+#define PP_MSG(x) partproc[PP(x)]
+
+#define UU(x) (((x) >> 8) & 0x3) /*bit 8, bit 9*/
+#define UU_MSG(x) internal[UU(x)]
+
+/*
+ * Decode the common part of an AMD MCA_STATUS value.
+ *
+ * e->error_msg receives the overall severity:
+ *   UC set and PCC set         -> "System Fatal error."
+ *   UC set and MCG RIPV set    -> "Uncorrected, software restartable error."
+ *   UC set otherwise           -> "Uncorrected, software containable error."
+ *   UC clear and DEFERRED set  -> "Deferred error, no action required."
+ *   otherwise                  -> "Corrected error, no action required."
+ *
+ * Status flags (invalid, overflow, PCC, CECC/UECC) are recorded in
+ * e->mcistatus_msg, and the 16-bit error code is described in
+ * e->mcastatus_msg as an internal, TLB, memory or bus error.
+ */
+void decode_amd_errcode(struct mce_event *e)
+{
+	uint16_t ec = e->status & 0xffff;
+	uint16_t ecc = (e->status >> 45) & 0x3;
+
+	if (e->status & MCI_STATUS_UC) {
+		/*
+		 * The three cases are mutually exclusive; without the
+		 * else chain the last strcpy() always won and the fatal
+		 * and restartable messages were never reported.
+		 */
+		if (e->status & MCI_STATUS_PCC)
+			strcpy(e->error_msg, "System Fatal error.");
+		else if (e->mcgstatus & MCG_STATUS_RIPV)
+			strcpy(e->error_msg,
+			       "Uncorrected, software restartable error.");
+		else
+			strcpy(e->error_msg,
+			       "Uncorrected, software containable error.");
+	} else if (e->status & MCI_STATUS_DEFERRED)
+		strcpy(e->error_msg, "Deferred error, no action required.");
+	else
+		strcpy(e->error_msg, "Corrected error, no action required.");
+
+	if (!(e->status & MCI_STATUS_VAL))
+		mce_snprintf(e->mcistatus_msg, "MCE_INVALID");
+
+	if (e->status & MCI_STATUS_OVER)
+		mce_snprintf(e->mcistatus_msg, "Error_overflow");
+
+	if (e->status & MCI_STATUS_PCC)
+		mce_snprintf(e->mcistatus_msg, "Processor_context_corrupt");
+
+	/* ecc holds status bits 46:45; a value of 2 means only bit 46 is set. */
+	if (ecc)
+		mce_snprintf(e->mcistatus_msg,
+			     "%sECC", ((ecc == 2) ? "C" : "U"));
+
+	/* Internal errors carry no transaction/level fields to decode. */
+	if (INT_ERROR(ec)) {
+		mce_snprintf(e->mcastatus_msg, "Internal '%s'", UU_MSG(ec));
+		return;
+	}
+
+	if (TLB_ERROR(ec))
+		mce_snprintf(e->mcastatus_msg,
+			     "TLB Error 'tx: %s, level: %s'",
+			     TT_MSG(ec), LL_MSG(ec));
+	else if (MEM_ERROR(ec))
+		mce_snprintf(e->mcastatus_msg,
+			     "Memory Error 'mem-tx: %s, tx: %s, level: %s'",
+			     R4_MSG(ec), TT_MSG(ec), LL_MSG(ec));
+	else if (BUS_ERROR(ec))
+		mce_snprintf(e->mcastatus_msg,
+			     "Bus Error '%s, %s, mem-tx: %s, level: %s'",
+			     PP_MSG(ec), TO_MSG(ec),
+			     R4_MSG(ec), LL_MSG(ec));
+}
--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2019-07-12 11:35:01.585502811 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.c 2019-07-12 11:35:04.836470461 -0400
@@ -55,6 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_KNIGHTS_LANDING] = "Knights Landing",
[CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_SKYLAKE_XEON] = "Skylake server",
+ [CPU_NAPLES] = "AMD Family 17h Zen1"
};
static enum cputype select_intel_cputype(struct ras_events *ras)
@@ -190,9 +191,12 @@ ret = 0;
if (!strcmp(mce->vendor, "AuthenticAMD")) {
if (mce->family == 15)
mce->cputype = CPU_K8;
- if (mce->family > 15) {
+ if (mce->family == 23)
+ mce->cputype = CPU_NAPLES;
+ if (mce->family > 23) {
log(ALL, LOG_INFO,
- "Can't parse MCE for this AMD CPU yet\n");
+ "Can't parse MCE for this AMD CPU yet %d\n",
+ mce->family);
ret = EINVAL;
}
goto ret;
@@ -331,6 +335,12 @@ #if 0
if (e->status & MCI_STATUS_ADDRV)
trace_seq_printf(s, ", addr= %llx", (long long)e->addr);
+ if (e->status & MCI_STATUS_SYNDV)
+ trace_seq_printf(s, ", synd= %llx", (long long)e->synd);
+
+ if (e->ipid)
+ trace_seq_printf(s, ", ipid= %llx", (long long)e->ipid);
+
if (e->mcgstatus_msg)
trace_seq_printf(s, ", %s", e->mcgstatus_msg);
else
@@ -411,6 +421,13 @@ if (pevent_get_field_val(s, event, "bank
if (pevent_get_field_val(s, event, "cpuvendor", record, &val, 1) < 0)
return -1;
e.cpuvendor = val;
+ /* Get New entries */
+ if (pevent_get_field_val(s, event, "synd", record, &val, 1) < 0)
+ return -1;
+ e.synd = val;
+ if (pevent_get_field_val(s, event, "ipid", record, &val, 1) < 0)
+ return -1;
+ e.ipid = val;
switch (mce->cputype) {
case CPU_GENERIC:
@@ -418,6 +435,9 @@ if (pevent_get_field_val(s, event, "cpuv
case CPU_K8:
rc = parse_amd_k8_event(ras, &e);
break;
+ case CPU_NAPLES:
+ rc = parse_amd_smca_event(ras, &e);
+ break;
default: /* All other CPU types are Intel */
rc = parse_intel_event(ras, &e);
}
--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2019-07-12 11:35:01.585502811 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.h 2019-07-12 11:35:04.836470461 -0400
@@ -50,6 +50,7 @@ enum cputype {
CPU_KNIGHTS_LANDING,
CPU_KNIGHTS_MILL,
CPU_SKYLAKE_XEON,
+ CPU_NAPLES,
};
struct mce_event {
@@ -69,6 +70,8 @@ struct mce_event {
uint8_t cs;
uint8_t bank;
uint8_t cpuvendor;
+ uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */
+ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
/* Parsed data */
char timestamp[64];
@@ -129,6 +132,9 @@ void broadwell_de_decode_model(struct ra
void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e);
void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e);
+/* AMD error code decode function */
+void decode_amd_errcode(struct mce_event *e);
+
/* Software defined banks */
#define MCE_EXTENDED_BANK 128
@@ -144,6 +150,13 @@ #define MCI_STATUS_EN (1ULL<<60) /*
#define MCI_STATUS_S (1ULL<<56) /* signalled */
#define MCI_STATUS_AR (1ULL<<55) /* action-required */
+/* AMD-specific bits */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */
+/* uncorrected error,deferred exception */
+#define MCI_STATUS_DEFERRED (1ULL<<44)
+#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
+
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
@@ -154,4 +167,6 @@ int parse_intel_event(struct ras_events
int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e);
+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e);
+
#endif
--- rasdaemon-0.6.1.orig/Makefile.in 2018-04-25 06:29:05.000000000 -0400
+++ rasdaemon-0.6.1/Makefile.in 2019-07-15 14:41:22.308278851 -0400
@@ -100,7 +100,7 @@ sbin_PROGRAMS = rasdaemon$(EXEEXT)
@WITH_MCE_TRUE@ mce-intel-dunnington.c mce-intel-tulsa.c \
@WITH_MCE_TRUE@ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
@WITH_MCE_TRUE@ mce-intel-knl.c mce-intel-broadwell-de.c \
-@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c
+@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c mce-amd.c mce-amd-smca.c
@WITH_EXTLOG_TRUE@am__append_6 = ras-extlog-handler.c
@WITH_ABRT_REPORT_TRUE@am__append_7 = ras-report.c
@@ -132,7 +132,7 @@ am__rasdaemon_SOURCES_DIST = rasdaemon.c
mce-intel-ivb.c mce-intel-haswell.c mce-intel-knl.c \
mce-intel-broadwell-de.c mce-intel-broadwell-epex.c \
mce-intel-skylake-xeon.c ras-extlog-handler.c ras-report.c \
- non-standard-hisi_hip07.c
+ non-standard-hisi_hip07.c mce-amd-smca.c mce-amd.c
@WITH_SQLITE3_TRUE@am__objects_1 = ras-record.$(OBJEXT)
@WITH_AER_TRUE@am__objects_2 = ras-aer-handler.$(OBJEXT)
@WITH_NON_STANDARD_TRUE@am__objects_3 = \
@@ -149,7 +149,9 @@ non-standard-hisi_hip07.c
@WITH_MCE_TRUE@ mce-intel-knl.$(OBJEXT) \
@WITH_MCE_TRUE@ mce-intel-broadwell-de.$(OBJEXT) \
@WITH_MCE_TRUE@ mce-intel-broadwell-epex.$(OBJEXT) \
-@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT)
+@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) \
+@WITH_MCE_TRUE@ mce-amd-smca.$(OBJEXT) \
+@WITH_MCE_TRUE@ mce-amd.$(OBJEXT)
@WITH_EXTLOG_TRUE@am__objects_6 = ras-extlog-handler.$(OBJEXT)
@WITH_ABRT_REPORT_TRUE@am__objects_7 = ras-report.$(OBJEXT)
@WITH_HISI_NS_DECODE_TRUE@am__objects_8 = \
@@ -595,6 +597,8 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bitfield.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-k8.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-smca.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-de.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-epex.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-dunnington.Po@am__quote@

159
SPECS/rasdaemon.spec Normal file
View File

@ -0,0 +1,159 @@
Name: rasdaemon
Version: 0.6.1
Release: 3%{?dist}
Summary: Utility to receive RAS error tracings
Group: Applications/System
License: GPLv2
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2
ExcludeArch: s390 s390x
BuildRequires: gettext-devel
BuildRequires: perl-generators
BuildRequires: sqlite-devel
BuildRequires: systemd
Provides: bundled(kernel-event-lib)
Requires: hwdata
Requires: perl-DBD-SQLite
%ifarch %{ix86} x86_64
Requires: dmidecode
%endif
Requires(post): systemd
Requires(preun): systemd
Requires(postun): systemd
Patch1: 60a91e4da4f2daf2b10143fc148a8043312b61e5.patch
Patch2: a16ca0711001957ee98f2c124abce0fa1f801529.patch
%description
%{name} is a RAS (Reliability, Availability and Serviceability) logging tool.
It currently records memory errors, using the EDAC tracing events.
EDAC is a set of drivers in the Linux kernel that handle detection of ECC errors
from memory controllers for most chipsets on i386 and x86_64 architectures.
EDAC drivers for other architectures like arm also exist.
This userspace component consists of an init script which makes sure
EDAC drivers and DIMM labels are loaded at system startup, as well as
a utility for reporting current error counts from the EDAC sysfs files.
%prep
%setup -q
%patch1 -p1
%patch2 -p1
%build
%ifarch %{arm} aarch64
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm
%else
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report
%endif
make %{?_smp_mflags}
%install
make install DESTDIR=%{buildroot}
# Install the systemd units shipped under misc/ into the unit directory.
install -D -p -m 0644 misc/rasdaemon.service %{buildroot}%{_unitdir}/rasdaemon.service
install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service
# Drop the INSTALL file and the headers placed in the include directory;
# use the macro so the path follows the one %%configure was given.
rm INSTALL %{buildroot}%{_includedir}/*.h
%files
%doc AUTHORS ChangeLog COPYING README TODO
%{_sbindir}/rasdaemon
%{_sbindir}/ras-mc-ctl
%{_mandir}/*/*
%{_unitdir}/*.service
%{_sharedstatedir}/rasdaemon
%{_sysconfdir}/ras/dimm_labels.d
%changelog
* Thu Jul 11 2019 Aristeu Rozanski <aris@redhat.com> 0.6.1-3
- Add support for AMD scalable MCA [1725488]
* Mon Aug 20 2018 Aristeu Rozanski <aris@redhat.com> 0.6.1-2
- Add support for error count display [1573685]
* Wed Apr 25 2018 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.1-1
- Bump to version 0.6.1 adding support for Skylake Xeon MSCOD, a bug fix and some new DELL labels
* Fri Feb 09 2018 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.0-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild
* Sat Oct 14 2017 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.6.0-1
- Bump to version 0.6.0 adding support for Arm and Hisilicon events and update Dell Skylake labels
* Thu Aug 03 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-6
- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild
* Thu Jul 27 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-5
- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Mass_Rebuild
* Sat Feb 11 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-4
- Rebuilt for https://fedoraproject.org/wiki/Fedora_26_Mass_Rebuild
* Fri Apr 15 2016 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.8-3
- Add a virtual provide, per BZ#104132
* Fri Apr 15 2016 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.8-2
- Bump to version 0.5.8 with support for Broadwell EP/EX MSCOD/DE MSCOD
* Thu Feb 04 2016 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.6-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_24_Mass_Rebuild
* Fri Jul 03 2015 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.6-1
- Bump to version 0.5.6 with support for LMCE and some fixes
* Thu Jun 18 2015 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.5-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_23_Mass_Rebuild
* Wed Jun 03 2015 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.5-1
- Bump to version 0.5.5 with support for newer Intel platforms & some fixes
* Tue Sep 16 2014 Peter Robinson <pbrobinson@fedoraproject.org> 0.5.4-3
- aarch64/ppc64 have edac capabilities
- spec cleanups
- No need to run autoreconf
* Sun Aug 17 2014 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.4-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_22_Mass_Rebuild
* Fri Aug 15 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.4-1
- Bump to version 0.5.4 with some fixes, mainly for amd64
* Sun Aug 10 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.3-1
- Bump to version 0.5.3 and enable ABRT and ExtLog
* Sun Jun 08 2014 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.2-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_Mass_Rebuild
* Thu Apr 03 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.2-1
- fix and enable ABRT report support
* Fri Mar 28 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.1-1
- Do some fixes at the service files and add some documentation for --record
* Sun Feb 16 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.0-1
- Add experimental ABRT support
* Tue Sep 10 2013 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.4.2-1
- Fix ras-mc-ctl layout filling
* Sun Aug 04 2013 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.4.1-5
- Rebuilt for https://fedoraproject.org/wiki/Fedora_20_Mass_Rebuild
* Wed Jul 17 2013 Petr Pisar <ppisar@redhat.com> - 0.4.1-4
- Perl 5.18 rebuild
* Sun Jun 2 2013 Peter Robinson <pbrobinson@fedoraproject.org> 0.4.1-3
- ARM has EDMA drivers (currently supported in Calxeda highbank)
* Wed May 29 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-2
- Fix the name of perl-DBD-SQLite package
* Wed May 29 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-1
- Updated to version 0.4.1 which contains some bug fixes
* Tue May 28 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.0-1
- Updated to version 0.4.0 and added support for mce, aer and sqlite3 storage
* Mon May 20 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.3.0-1
- Package created