commit 84623fe14076c4f087e2c984f635c3aeb6a9bef6 Author: CentOS Sources Date: Tue Jan 21 17:42:50 2020 -0500 import rasdaemon-0.6.1-3.el8 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e69cfd0 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +SOURCES/rasdaemon-0.6.1.tar.bz2 diff --git a/.rasdaemon.metadata b/.rasdaemon.metadata new file mode 100644 index 0000000..e6215b6 --- /dev/null +++ b/.rasdaemon.metadata @@ -0,0 +1 @@ +742eda555cccb8ca8f9b6a18bab1f4a732c11318 SOURCES/rasdaemon-0.6.1.tar.bz2 diff --git a/SOURCES/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch b/SOURCES/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch new file mode 100644 index 0000000..57a4e46 --- /dev/null +++ b/SOURCES/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch @@ -0,0 +1,149 @@ +commit 60a91e4da4f2daf2b10143fc148a8043312b61e5 +Author: Aristeu Rozanski +Date: Wed Aug 1 16:29:58 2018 -0400 + + rasdaemon: ras-mc-ctl: add option to show error counts + + In some scenarios it might not be desirable to have a daemon running + to parse and store the errors provided by EDAC and only having the + number of CEs and UEs is enough. This patch implements this feature + as an ras-mc-ctl option. + + Signed-off-by: Aristeu Rozanski + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 38b7824..aee431a 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -50,6 +50,8 @@ my %dimm_location = (); + my %csrow_size = (); + my %rank_size = (); + my %csrow_ranks = (); ++my %dimm_ce_count = (); ++my %dimm_ue_count = (); + + my @layers; + my @max_pos; +@@ -76,6 +78,7 @@ Usage: $prog [OPTIONS...] + --layout Display the memory layout. + --summary Presents a summary of the logged errors. + --errors Shows the errors stored at the error database. ++ --error-count Shows the corrected and uncorrected error counts using sysfs. + --help This help message. 
+ EOF + +@@ -83,7 +86,7 @@ parse_cmdline(); + + if ( $conf{opt}{mainboard} || $conf{opt}{print_labels} + || $conf{opt}{register_labels} || $conf{opt}{display_memory_layout} +- || $conf{opt}{guess_dimm_label}) { ++ || $conf{opt}{guess_dimm_label} || $conf{opt}{error_count}) { + + get_mainboard_info(); + +@@ -105,6 +108,9 @@ if ( $conf{opt}{mainboard} || $conf{opt}{print_labels} + if ($conf{opt}{guess_dimm_label}) { + guess_dimm_label (); + } ++ if ($conf{opt}{error_count}) { ++ display_error_count (); ++ } + } + + if ($conf{opt}{status}) { +@@ -134,6 +140,7 @@ sub parse_cmdline + $conf{opt}{guess_dimm_label} = 0; + $conf{opt}{summary} = 0; + $conf{opt}{errors} = 0; ++ $conf{opt}{error_count} = 0; + + my $rref = \$conf{opt}{report}; + my $mref = \$conf{opt}{mainboard}; +@@ -150,7 +157,8 @@ sub parse_cmdline + "status" => \$conf{opt}{status}, + "layout" => \$conf{opt}{display_memory_layout}, + "summary" => \$conf{opt}{summary}, +- "errors" => \$conf{opt}{errors} ++ "errors" => \$conf{opt}{errors}, ++ "error-count" => \$conf{opt}{error_count} + ); + + usage(1) if !$rc; +@@ -284,6 +292,30 @@ sub parse_dimm_nodes + $dimm_label_file{$str_loc} = $file; + $dimm_location{$str_loc} = $location; + ++ my $count; ++ ++ $file =~s/dimm_label/dimm_ce_count/; ++ if (-e $file) { ++ open IN, $file; ++ chomp($count = <IN>); ++ close IN; ++ } else { ++ log_error ("dimm_ce_count not found in sysfs. Old kernel?\n"); ++ exit -1; ++ } ++ $dimm_ce_count{$str_loc} = $count; ++ ++ $file =~s/dimm_ce_count/dimm_ue_count/; ++ if (-e $file) { ++ open IN, $file; ++ chomp($count = <IN>); ++ close IN; ++ } else { ++ log_error ("dimm_ue_count not found in sysfs. 
Old kernel?\n"); ++ exit -1; ++ } ++ $dimm_ue_count{$str_loc} = $count; ++ + return; + } + } +@@ -906,6 +938,45 @@ sub display_memory_layout + dimm_display_mem(); + } + ++sub display_error_count ++{ ++ my $sysfs_dir = "/sys/devices/system/edac/mc"; ++ my $key; ++ my $max_width = 0; ++ my %dimm_labels = (); ++ ++ find ({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir); ++ ++ if (!scalar(keys %dimm_node)) { ++ log_error ("No DIMMs found in /sys or new sysfs EDAC interface not found.\n"); ++ exit -1; ++ } ++ ++ foreach $key (keys %dimm_node) { ++ my $label_width; ++ ++ open IN, $dimm_label_file{$key}; ++ chomp(my $label = <IN>); ++ close IN; ++ $label_width = length $label; ++ ++ if ($label_width > $max_width) { ++ $max_width = $label_width; ++ } ++ $dimm_labels{$key} = $label; ++ } ++ my $string = "Label"; ++ $string .= " " x ($max_width - length $string); ++ print($string . "\tCE\tUE\n"); ++ ++ foreach $key (keys %dimm_node) { ++ my $ce_count = $dimm_ce_count{$key}; ++ my $ue_count = $dimm_ue_count{$key}; ++ ++ print("$dimm_labels{$key}\t$ce_count\t$ue_count\n"); ++ } ++} ++ + sub find_prog + { + my ($file) = @_; diff --git a/SOURCES/a16ca0711001957ee98f2c124abce0fa1f801529.patch b/SOURCES/a16ca0711001957ee98f2c124abce0fa1f801529.patch new file mode 100644 index 0000000..3a96263 --- /dev/null +++ b/SOURCES/a16ca0711001957ee98f2c124abce0fa1f801529.patch @@ -0,0 +1,670 @@ +commit a16ca0711001957ee98f2c124abce0fa1f801529 +Author: Chandu-babu Namburu +Date: Wed Jan 30 20:36:45 2019 +0530 + + rasdaemon: add support for AMD Scalable MCA + + Add logic here to decode errors from all known IP blocks for + AMD Scalable MCA supported processors + + Reviewed-by: Yazen Ghannam + Signed-off-by: Chandu-babu Namburu + +--- + mce-amd-smca.c | 371 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + mce-amd.c | 122 +++++++++++++++++ + ras-mce-handler.c | 24 +++ + ras-mce-handler.h | 15 ++ + 4 files changed, 530 insertions(+), 2 deletions(-) + +--- /dev/null 1970-01-01 
00:00:00.000000000 +0000 ++++ rasdaemon-0.6.1/mce-amd-smca.c 2019-07-12 11:35:04.836470461 -0400 +@@ -0,0 +1,371 @@ ++/* ++ * Copyright (c) 2018, AMD, Inc. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 and ++ * only version 2 as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++ ++#include <stdio.h> ++#include <string.h> ++ ++#include "ras-mce-handler.h" ++#include "bitfield.h" ++ ++/* MCA_STATUS REGISTER FOR FAMILY 17H ++ *********************** Higher 32-bits ***************************** ++ * 63: VALIDERROR, 62: OVERFLOW, 61: UC, 60: Err ENABLE, ++ * 59: Misc Valid, 58: Addr Valid, 57: PCC, 56: ErrCoreID Valid, ++ * 55: TCC, 54: RES, 53: Syndrome Valid, 52: Transparent, ++ * 51: RES, 50: RES, 49: RES, 48: RES, ++ * 47: RES, 46: CECC, 45: UECC, 44: Deferred, ++ * 43: Poison, 42: RES, 41: RES, 40: RES, ++ * 39: RES, 38: RES, 37: ErrCoreID[5], 36: ErrCoreID[4], ++ * 35: ErrCoreID[3], 34: ErrCoreID[2] 33: ErrCoreID[1] 32: ErrCoreID[0] ++ *********************** Lower 32-bits ****************************** ++ * 31: RES, 30: RES, 29: RES, 28: RES, ++ * 27: RES, 26: RES, 25: RES, 24: RES ++ * 23: RES, 22: RES, 21: XEC[5], 20: XEC[4], ++ * 19: XEC[3], 18: XEC[2], 17: XEC[1], 16: XEC[0] ++ * 15: EC[15], 14: EC[14], 13: EC[13], 12: EC[12], ++ * 11: EC[11], 10: EC[10], 09: EC[9], 08: EC[8], ++ * 07: EC[7], 06: EC[6], 05: EC[5], 04: EC[4], ++ * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0] ++ */ ++ ++/* These may be used by multiple smca_hwid_mcatypes */ ++enum smca_bank_types { ++ SMCA_LS = 0, /* Load Store */ ++ SMCA_IF, /* Instruction Fetch */ ++ SMCA_L2_CACHE, /* L2 Cache */ ++ SMCA_DE, /* Decoder Unit */ ++ 
SMCA_RESERVED, /* Reserved */ ++ SMCA_EX, /* Execution Unit */ ++ SMCA_FP, /* Floating Point */ ++ SMCA_L3_CACHE, /* L3 Cache */ ++ SMCA_CS, /* Coherent Slave */ ++ SMCA_PIE, /* Power, Interrupts, etc. */ ++ SMCA_UMC, /* Unified Memory Controller */ ++ SMCA_PB, /* Parameter Block */ ++ SMCA_PSP, /* Platform Security Processor */ ++ SMCA_SMU, /* System Management Unit */ ++ N_SMCA_BANK_TYPES ++}; ++ ++/* SMCA Extended error strings */ ++/* Load Store */ ++static const char * const smca_ls_mce_desc[] = { ++ "Load queue parity", ++ "Store queue parity", ++ "Miss address buffer payload parity", ++ "L1 TLB parity", ++ "Reserved", ++ "DC tag error type 6", ++ "DC tag error type 1", ++ "Internal error type 1", ++ "Internal error type 2", ++ "Sys Read data error thread 0", ++ "Sys read data error thread 1", ++ "DC tag error type 2", ++ "DC data error type 1 (poison consumption)", ++ "DC data error type 2", ++ "DC data error type 3", ++ "DC tag error type 4", ++ "L2 TLB parity", ++ "PDC parity error", ++ "DC tag error type 3", ++ "DC tag error type 5", ++ "L2 fill data error", ++}; ++/* Instruction Fetch */ ++static const char * const smca_if_mce_desc[] = { ++ "microtag probe port parity error", ++ "IC microtag or full tag multi-hit error", ++ "IC full tag parity", ++ "IC data array parity", ++ "Decoupling queue phys addr parity error", ++ "L0 ITLB parity error", ++ "L1 ITLB parity error", ++ "L2 ITLB parity error", ++ "BPQ snoop parity on Thread 0", ++ "BPQ snoop parity on Thread 1", ++ "L1 BTB multi-match error", ++ "L2 BTB multi-match error", ++ "L2 Cache Response Poison error", ++ "System Read Data error", ++}; ++/* L2 Cache */ ++static const char * const smca_l2_mce_desc[] = { ++ "L2M tag multi-way-hit error", ++ "L2M tag ECC error", ++ "L2M data ECC error", ++ "HW assert", ++}; ++/* Decoder Unit */ ++static const char * const smca_de_mce_desc[] = { ++ "uop cache tag parity error", ++ "uop cache data parity error", ++ "Insn buffer parity error", ++ "uop queue parity 
error", ++ "Insn dispatch queue parity error", ++ "Fetch address FIFO parity", ++ "Patch RAM data parity", ++ "Patch RAM sequencer parity", ++ "uop buffer parity" ++}; ++/* Execution Unit */ ++static const char * const smca_ex_mce_desc[] = { ++ "Watchdog timeout error", ++ "Phy register file parity", ++ "Flag register file parity", ++ "Immediate displacement register file parity", ++ "Address generator payload parity", ++ "EX payload parity", ++ "Checkpoint queue parity", ++ "Retire dispatch queue parity", ++ "Retire status queue parity error", ++ "Scheduling queue parity error", ++ "Branch buffer queue parity error", ++}; ++/* Floating Point Unit */ ++static const char * const smca_fp_mce_desc[] = { ++ "Physical register file parity", ++ "Freelist parity error", ++ "Schedule queue parity", ++ "NSQ parity error", ++ "Retire queue parity", ++ "Status register file parity", ++ "Hardware assertion", ++}; ++/* L3 Cache */ ++static const char * const smca_l3_mce_desc[] = { ++ "Shadow tag macro ECC error", ++ "Shadow tag macro multi-way-hit error", ++ "L3M tag ECC error", ++ "L3M tag multi-way-hit error", ++ "L3M data ECC error", ++ "XI parity, L3 fill done channel error", ++ "L3 victim queue parity", ++ "L3 HW assert", ++}; ++/* Coherent Slave Unit */ ++static const char * const smca_cs_mce_desc[] = { ++ "Illegal request from transport layer", ++ "Address violation", ++ "Security violation", ++ "Illegal response from transport layer", ++ "Unexpected response", ++ "Parity error on incoming request or probe response data", ++ "Parity error on incoming read response data", ++ "Atomic request parity", ++ "ECC error on probe filter access", ++}; ++/* Power, Interrupt, etc.. 
*/ ++static const char * const smca_pie_mce_desc[] = { ++ "HW assert", ++ "Internal PIE register security violation", ++ "Error on GMI link", ++ "Poison data written to internal PIE register", ++}; ++/* Unified Memory Controller */ ++static const char * const smca_umc_mce_desc[] = { ++ "DRAM ECC error", ++ "Data poison error on DRAM", ++ "SDP parity error", ++ "Advanced peripheral bus error", ++ "Command/address parity error", ++ "Write data CRC error", ++}; ++/* Parameter Block */ ++static const char * const smca_pb_mce_desc[] = { ++ "Parameter Block RAM ECC error", ++}; ++/* Platform Security Processor */ ++static const char * const smca_psp_mce_desc[] = { ++ "PSP RAM ECC or parity error", ++}; ++/* System Management Unit */ ++static const char * const smca_smu_mce_desc[] = { ++ "SMU RAM ECC or parity error", ++}; ++ ++struct smca_mce_desc { ++ const char * const *descs; ++ unsigned int num_descs; ++}; ++ ++static struct smca_mce_desc smca_mce_descs[] = { ++ [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, ++ [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, ++ [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, ++ [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, ++ [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) }, ++ [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, ++ [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, ++ [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, ++ [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, ++ [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, ++ [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, ++ [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, ++ [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, ++}; ++ ++struct smca_hwid { ++ unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/ ++ uint32_t mcatype_hwid; 
/* mcatype,hwid bit 63-32 in MCx_IPID Register*/ ++}; ++ ++static struct smca_hwid smca_hwid_mcatypes[] = { ++ /* { bank_type, mcatype_hwid } */ ++ ++ /* ZN Core (HWID=0xB0) MCA types */ ++ { SMCA_LS, 0x000000B0 }, ++ { SMCA_IF, 0x000100B0 }, ++ { SMCA_L2_CACHE, 0x000200B0 }, ++ { SMCA_DE, 0x000300B0 }, ++ /* HWID 0xB0 MCATYPE 0x4 is Reserved */ ++ { SMCA_EX, 0x000500B0 }, ++ { SMCA_FP, 0x000600B0 }, ++ { SMCA_L3_CACHE, 0x000700B0 }, ++ ++ /* Data Fabric MCA types */ ++ { SMCA_CS, 0x0000002E }, ++ { SMCA_PIE, 0x0001002E }, ++ ++ /* Unified Memory Controller MCA type */ ++ { SMCA_UMC, 0x00000096 }, ++ ++ /* Parameter Block MCA type */ ++ { SMCA_PB, 0x00000005 }, ++ ++ /* Platform Security Processor MCA type */ ++ { SMCA_PSP, 0x000000FF }, ++ ++ /* System Management Unit MCA type */ ++ { SMCA_SMU, 0x00000001 }, ++}; ++ ++struct smca_bank_name { ++ const char *name; ++}; ++ ++static struct smca_bank_name smca_names[] = { ++ [SMCA_LS] = { "Load Store Unit" }, ++ [SMCA_IF] = { "Instruction Fetch Unit" }, ++ [SMCA_L2_CACHE] = { "L2 Cache" }, ++ [SMCA_DE] = { "Decode Unit" }, ++ [SMCA_RESERVED] = { "Reserved" }, ++ [SMCA_EX] = { "Execution Unit" }, ++ [SMCA_FP] = { "Floating Point Unit" }, ++ [SMCA_L3_CACHE] = { "L3 Cache" }, ++ [SMCA_CS] = { "Coherent Slave" }, ++ [SMCA_PIE] = { "Power, Interrupts, etc." }, ++ [SMCA_UMC] = { "Unified Memory Controller" }, ++ [SMCA_PB] = { "Parameter Block" }, ++ [SMCA_PSP] = { "Platform Security Processor" }, ++ [SMCA_SMU] = { "System Management Unit" }, ++}; ++ ++static void amd_decode_errcode(struct mce_event *e) ++{ ++ ++ decode_amd_errcode(e); ++ ++ if (e->status & MCI_STATUS_POISON) ++ mce_snprintf(e->mcistatus_msg, "Poison consumed"); ++ ++ if (e->status & MCI_STATUS_TCC) ++ mce_snprintf(e->mcistatus_msg, "Task_context_corrupt"); ++ ++} ++/* ++ * To find the UMC channel represented by this bank we need to match on its ++ * instance_id. The instance_id of a bank is held in the lower 32 bits of its ++ * IPID. 
++ */ ++static int find_umc_channel(struct mce_event *e) ++{ ++ uint32_t umc_instance_id[] = {0x50f00, 0x150f00}; ++ uint32_t instance_id = EXTRACT(e->ipid, 0, 31); ++ int i, channel = -1; ++ ++ for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++) ++ if (umc_instance_id[i] == instance_id) ++ channel = i; ++ ++ return channel; ++} ++/* Decode extended errors according to Scalable MCA specification */ ++static void decode_smca_error(struct mce_event *e) ++{ ++ enum smca_bank_types bank_type; ++ const char *ip_name; ++ unsigned short xec = (e->status >> 16) & 0x3f; ++ const struct smca_hwid *s_hwid; ++ uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63); ++ unsigned int csrow = -1, channel = -1; ++ unsigned int i; ++ ++ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { ++ s_hwid = &smca_hwid_mcatypes[i]; ++ if (mcatype_hwid == s_hwid->mcatype_hwid) { ++ bank_type = s_hwid->bank_type; ++ break; ++ } ++ } ++ ++ if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) { ++ strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID"); ++ return; ++ } ++ ++ if (bank_type >= N_SMCA_BANK_TYPES) { ++ strcpy(e->mcastatus_msg, "Don't know how to decode this bank"); ++ return; ++ } ++ ++ if (bank_type == SMCA_RESERVED) { ++ strcpy(e->mcastatus_msg, "Bank 4 is reserved.\n"); ++ return; ++ } ++ ++ ip_name = smca_names[bank_type].name; ++ ++ mce_snprintf(e->bank_name, "%s (bank=%d)", ip_name, e->bank); ++ ++ /* Only print the descriptor of valid extended error code */ ++ if (xec < smca_mce_descs[bank_type].num_descs) ++ mce_snprintf(e->mcastatus_msg, ++ " %s.\n", smca_mce_descs[bank_type].descs[xec]); ++ ++ if (bank_type == SMCA_UMC && xec == 0) { ++ channel = find_umc_channel(e); ++ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */ ++ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d", ++ channel, csrow); ++ } ++} ++ ++int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e) ++{ ++ uint64_t mcgstatus = e->mcgstatus; ++ ++ mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld", ++ (long 
long)e->mcgstatus); ++ ++ if (mcgstatus & MCG_STATUS_RIPV) ++ mce_snprintf(e->mcgstatus_msg, "RIPV"); ++ if (mcgstatus & MCG_STATUS_EIPV) ++ mce_snprintf(e->mcgstatus_msg, "EIPV"); ++ if (mcgstatus & MCG_STATUS_MCIP) ++ mce_snprintf(e->mcgstatus_msg, "MCIP"); ++ ++ decode_smca_error(e); ++ amd_decode_errcode(e); ++ return 0; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ rasdaemon-0.6.1/mce-amd.c 2019-07-12 11:35:04.836470461 -0400 +@@ -0,0 +1,122 @@ ++/* ++ * Copyright (c) 2018, The AMD, Inc. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 and ++ * only version 2 as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++ ++#include <stdio.h> ++#include <string.h> ++ ++#include "ras-mce-handler.h" ++ ++/* Error Code Types */ ++#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) ++#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) ++#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800) ++#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400) ++ ++/* Error code: transaction type (TT) */ ++static char *transaction[] = { ++ "instruction", "data", "generic", "reserved" ++}; ++/* Error codes: cache level (LL) */ ++static char *cachelevel[] = { ++ "reserved", "L1", "L2", "L3/generic" ++}; ++/* Error codes: memory transaction type (RRRR) */ ++static char *memtrans[] = { ++ "generic", "generic read", "generic write", "data read", ++ "data write", "instruction fetch", "prefetch", "evict", "snoop", ++ "?", "?", "?", "?", "?", "?", "?" 
++}; ++/* Participation Processor */ ++static char *partproc[] = { ++ "local node origin", "local node response", ++ "local node observed", "generic participation" ++}; ++/* Timeout */ ++static char *timeout[] = { ++ "request didn't time out", ++ "request timed out" ++}; ++/* internal unclassified error code */ ++static char *internal[] = { "reserved", ++ "reserved", ++ "hardware assert", ++ "reserved" }; ++ ++#define TT(x) (((x) >> 2) & 0x3) /*bit 2, bit 3*/ ++#define TT_MSG(x) transaction[TT(x)] ++#define LL(x) ((x) & 0x3) /*bit 0, bit 1*/ ++#define LL_MSG(x) cachelevel[LL(x)] ++ ++#define R4(x) (((x) >> 4) & 0xF) /*bit 4, bit 5, bit 6, bit 7 */ ++#define R4_MSG(x) ((R4(x) < 9) ? memtrans[R4(x)] : "Wrong R4!") ++ ++#define TO(x) (((x) >> 8) & 0x1) /*bit 8*/ ++#define TO_MSG(x) timeout[TO(x)] ++#define PP(x) (((x) >> 9) & 0x3) /*bit 9, bit 10*/ ++#define PP_MSG(x) partproc[PP(x)] ++ ++#define UU(x) (((x) >> 8) & 0x3) /*bit 8, bit 9*/ ++#define UU_MSG(x) internal[UU(x)] ++ ++void decode_amd_errcode(struct mce_event *e) ++{ ++ uint16_t ec = e->status & 0xffff; ++ uint16_t ecc = (e->status >> 45) & 0x3; ++ ++ if (e->status & MCI_STATUS_UC) { /* exactly one severity message: fatal, else restartable, else containable */ ++ if (e->status & MCI_STATUS_PCC) ++ strcpy(e->error_msg, "System Fatal error."); ++ else if (e->mcgstatus & MCG_STATUS_RIPV) ++ strcpy(e->error_msg, ++ "Uncorrected, software restartable error."); ++ else ++ strcpy(e->error_msg, "Uncorrected, software containable error."); ++ } else if (e->status & MCI_STATUS_DEFERRED) ++ strcpy(e->error_msg, "Deferred error, no action required."); ++ else ++ strcpy(e->error_msg, "Corrected error, no action required."); ++ ++ if (!(e->status & MCI_STATUS_VAL)) ++ mce_snprintf(e->mcistatus_msg, "MCE_INVALID"); ++ ++ if (e->status & MCI_STATUS_OVER) ++ mce_snprintf(e->mcistatus_msg, "Error_overflow"); ++ ++ if (e->status & MCI_STATUS_PCC) ++ mce_snprintf(e->mcistatus_msg, "Processor_context_corrupt"); ++ ++ if (ecc) ++ mce_snprintf(e->mcistatus_msg, ++ "%sECC", ((ecc == 2) ? 
"C" : "U")); ++ ++ if (INT_ERROR(ec)) { ++ mce_snprintf(e->mcastatus_msg, "Internal '%s'", UU_MSG(ec)); ++ return; ++ } ++ ++ if (TLB_ERROR(ec)) ++ mce_snprintf(e->mcastatus_msg, ++ "TLB Error 'tx: %s, level: %s'", ++ TT_MSG(ec), LL_MSG(ec)); ++ else if (MEM_ERROR(ec)) ++ mce_snprintf(e->mcastatus_msg, ++ "Memory Error 'mem-tx: %s, tx: %s, level: %s'", ++ R4_MSG(ec), TT_MSG(ec), LL_MSG(ec)); ++ else if (BUS_ERROR(ec)) ++ mce_snprintf(e->mcastatus_msg, ++ "Bus Error '%s, %s, mem-tx: %s, level: %s'", ++ PP_MSG(ec), TO_MSG(ec), ++ R4_MSG(ec), LL_MSG(ec)); ++ return; ++ ++} +--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2019-07-12 11:35:01.585502811 -0400 ++++ rasdaemon-0.6.1/ras-mce-handler.c 2019-07-12 11:35:04.836470461 -0400 +@@ -55,6 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series + [CPU_KNIGHTS_LANDING] = "Knights Landing", + [CPU_KNIGHTS_MILL] = "Knights Mill", + [CPU_SKYLAKE_XEON] = "Skylake server", ++ [CPU_NAPLES] = "AMD Family 17h Zen1" + }; + + static enum cputype select_intel_cputype(struct ras_events *ras) +@@ -190,9 +191,12 @@ ret = 0; + if (!strcmp(mce->vendor, "AuthenticAMD")) { + if (mce->family == 15) + mce->cputype = CPU_K8; +- if (mce->family > 15) { ++ if (mce->family == 23) ++ mce->cputype = CPU_NAPLES; ++ if (mce->family > 23) { + log(ALL, LOG_INFO, +- "Can't parse MCE for this AMD CPU yet\n"); ++ "Can't parse MCE for this AMD CPU yet %d\n", ++ mce->family); + ret = EINVAL; + } + goto ret; +@@ -331,6 +335,12 @@ #if 0 + if (e->status & MCI_STATUS_ADDRV) + trace_seq_printf(s, ", addr= %llx", (long long)e->addr); + ++ if (e->status & MCI_STATUS_SYNDV) ++ trace_seq_printf(s, ", synd= %llx", (long long)e->synd); ++ ++ if (e->ipid) ++ trace_seq_printf(s, ", ipid= %llx", (long long)e->ipid); ++ + if (e->mcgstatus_msg) + trace_seq_printf(s, ", %s", e->mcgstatus_msg); + else +@@ -411,6 +421,13 @@ if (pevent_get_field_val(s, event, "bank + if (pevent_get_field_val(s, event, "cpuvendor", record, &val, 1) < 0) + return -1; + e.cpuvendor = val; ++ /* Get New 
entries */ ++ if (pevent_get_field_val(s, event, "synd", record, &val, 1) < 0) ++ return -1; ++ e.synd = val; ++ if (pevent_get_field_val(s, event, "ipid", record, &val, 1) < 0) ++ return -1; ++ e.ipid = val; + + switch (mce->cputype) { + case CPU_GENERIC: +@@ -418,6 +435,9 @@ if (pevent_get_field_val(s, event, "cpuv + case CPU_K8: + rc = parse_amd_k8_event(ras, &e); + break; ++ case CPU_NAPLES: ++ rc = parse_amd_smca_event(ras, &e); ++ break; + default: /* All other CPU types are Intel */ + rc = parse_intel_event(ras, &e); + } +--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2019-07-12 11:35:01.585502811 -0400 ++++ rasdaemon-0.6.1/ras-mce-handler.h 2019-07-12 11:35:04.836470461 -0400 +@@ -50,6 +50,7 @@ enum cputype { + CPU_KNIGHTS_LANDING, + CPU_KNIGHTS_MILL, + CPU_SKYLAKE_XEON, ++ CPU_NAPLES, + }; + + struct mce_event { +@@ -69,6 +70,8 @@ struct mce_event { + uint8_t cs; + uint8_t bank; + uint8_t cpuvendor; ++ uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */ ++ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ + + /* Parsed data */ + char timestamp[64]; +@@ -129,6 +132,9 @@ void broadwell_de_decode_model(struct ra + void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e); + void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e); + ++/* AMD error code decode function */ ++void decode_amd_errcode(struct mce_event *e); ++ + /* Software defined banks */ + #define MCE_EXTENDED_BANK 128 + +@@ -144,6 +150,13 @@ #define MCI_STATUS_EN (1ULL<<60) /* + #define MCI_STATUS_S (1ULL<<56) /* signalled */ + #define MCI_STATUS_AR (1ULL<<55) /* action-required */ + ++/* AMD-specific bits */ ++#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ ++#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. 
valid */ ++/* uncorrected error,deferred exception */ ++#define MCI_STATUS_DEFERRED (1ULL<<44) ++#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ ++ + #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ + #define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */ + #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ +@@ -154,4 +167,6 @@ int parse_intel_event(struct ras_events + + int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e); + ++int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e); ++ + #endif +--- rasdaemon-0.6.1.orig/Makefile.in 2018-04-25 06:29:05.000000000 -0400 ++++ rasdaemon-0.6.1/Makefile.in 2019-07-15 14:41:22.308278851 -0400 +@@ -100,7 +100,7 @@ sbin_PROGRAMS = rasdaemon$(EXEEXT) + @WITH_MCE_TRUE@ mce-intel-dunnington.c mce-intel-tulsa.c \ + @WITH_MCE_TRUE@ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ + @WITH_MCE_TRUE@ mce-intel-knl.c mce-intel-broadwell-de.c \ +-@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c ++@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c mce-amd.c mce-amd-smca.c + + @WITH_EXTLOG_TRUE@am__append_6 = ras-extlog-handler.c + @WITH_ABRT_REPORT_TRUE@am__append_7 = ras-report.c +@@ -132,7 +132,7 @@ am__rasdaemon_SOURCES_DIST = rasdaemon.c + mce-intel-ivb.c mce-intel-haswell.c mce-intel-knl.c \ + mce-intel-broadwell-de.c mce-intel-broadwell-epex.c \ + mce-intel-skylake-xeon.c ras-extlog-handler.c ras-report.c \ +- non-standard-hisi_hip07.c ++ non-standard-hisi_hip07.c mce-amd-smca.c mce-amd.c + @WITH_SQLITE3_TRUE@am__objects_1 = ras-record.$(OBJEXT) + @WITH_AER_TRUE@am__objects_2 = ras-aer-handler.$(OBJEXT) + @WITH_NON_STANDARD_TRUE@am__objects_3 = \ +@@ -149,7 +149,9 @@ non-standard-hisi_hip07.c + @WITH_MCE_TRUE@ mce-intel-knl.$(OBJEXT) \ + @WITH_MCE_TRUE@ mce-intel-broadwell-de.$(OBJEXT) \ + @WITH_MCE_TRUE@ mce-intel-broadwell-epex.$(OBJEXT) \ +-@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) 
++@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) \ ++@WITH_MCE_TRUE@ mce-amd-smca.$(OBJEXT) \ ++@WITH_MCE_TRUE@ mce-amd.$(OBJEXT) + @WITH_EXTLOG_TRUE@am__objects_6 = ras-extlog-handler.$(OBJEXT) + @WITH_ABRT_REPORT_TRUE@am__objects_7 = ras-report.$(OBJEXT) + @WITH_HISI_NS_DECODE_TRUE@am__objects_8 = \ +@@ -595,6 +597,8 @@ distclean-compile: + + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bitfield.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-k8.Po@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd.Po@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-smca.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-de.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-epex.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-dunnington.Po@am__quote@ diff --git a/SPECS/rasdaemon.spec b/SPECS/rasdaemon.spec new file mode 100644 index 0000000..abd4bf5 --- /dev/null +++ b/SPECS/rasdaemon.spec @@ -0,0 +1,159 @@ +Name: rasdaemon +Version: 0.6.1 +Release: 3%{?dist} +Summary: Utility to receive RAS error tracings +Group: Applications/System +License: GPLv2 +URL: http://git.infradead.org/users/mchehab/rasdaemon.git +Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2 + +ExcludeArch: s390 s390x +BuildRequires: gettext-devel +BuildRequires: perl-generators +BuildRequires: sqlite-devel +BuildRequires: systemd +Provides: bundled(kernel-event-lib) +Requires: hwdata +Requires: perl-DBD-SQLite +%ifarch %{ix86} x86_64 +Requires: dmidecode +%endif + +Requires(post): systemd +Requires(preun): systemd +Requires(postun): systemd + +Patch1: 60a91e4da4f2daf2b10143fc148a8043312b61e5.patch +Patch2: a16ca0711001957ee98f2c124abce0fa1f801529.patch + +%description +%{name} is a RAS (Reliability, Availability and Serviceability) logging tool. +It currently records memory errors, using the EDAC tracing events. 
+EDAC is a set of drivers in the Linux kernel that handle detection of ECC errors +from memory controllers for most chipsets on i386 and x86_64 architectures. +EDAC drivers for other architectures like arm also exist. +This userspace component consists of an init script which makes sure +EDAC drivers and DIMM labels are loaded at system startup, as well as +a utility for reporting current error counts from the EDAC sysfs files. + +%prep +%setup -q +%patch1 -p1 +%patch2 -p1 + +%build +%ifarch %{arm} aarch64 +%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm +%else +%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report +%endif +make %{?_smp_mflags} + +%install +make install DESTDIR=%{buildroot} +install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service +install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service +rm INSTALL %{buildroot}/usr/include/*.h + +%files +%doc AUTHORS ChangeLog COPYING README TODO +%{_sbindir}/rasdaemon +%{_sbindir}/ras-mc-ctl +%{_mandir}/*/* +%{_unitdir}/*.service +%{_sharedstatedir}/rasdaemon +%{_sysconfdir}/ras/dimm_labels.d + +%changelog +* Thu Jul 11 2019 Aristeu Rozanski 0.6.1-3 +- Add support for AMD scalable MCA [1725488] + +* Mon Aug 20 2018 Aristeu Rozanski 0.6.1-2 +- Add support for error count display [1573685] + +* Wed Apr 25 2018 Mauro Carvalho Chehab 0.6.1-1 +- Bump to version 0.6.1 adding support for Skylake Xeon MSCOD, a bug fix and some new DELL labels + +* Fri Feb 09 2018 Fedora Release Engineering - 0.6.0-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild + +* Sat Oct 14 2017 Mauro Carvalho Chehab 0.6.0-1 +- Bump to version 0.6.0 adding support for Arm and Hisilicon events and update Dell Skylake labels + +* Thu Aug 03 2017 Fedora Release Engineering - 0.5.8-6 +- Rebuilt for 
https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild + +* Thu Jul 27 2017 Fedora Release Engineering - 0.5.8-5 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Mass_Rebuild + +* Sat Feb 11 2017 Fedora Release Engineering - 0.5.8-4 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_26_Mass_Rebuild + +* Fri Apr 15 2016 Mauro Carvalho Chehab 0.5.8-3 +- Add a virtual provide, per BZ#104132 + +* Fri Apr 15 2016 Mauro Carvalho Chehab 0.5.8-2 +- Bump to version 0.5.8 with support for Broadwell EP/EX MSCOD/DE MSCOD + +* Thu Feb 04 2016 Fedora Release Engineering - 0.5.6-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_24_Mass_Rebuild + +* Fri Jul 03 2015 Mauro Carvalho Chehab 0.5.6-1 +- Bump to version 0.5.6 with support for LMCE and some fixes + +* Thu Jun 18 2015 Fedora Release Engineering - 0.5.5-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_23_Mass_Rebuild + +* Wed Jun 03 2015 Mauro Carvalho Chehab 0.5.5-1 +- Bump to version 0.5.5 with support for newer Intel platforms & some fixes + +* Tue Sep 16 2014 Peter Robinson 0.5.4-3 +- aarch64/ppc64 have edac capabilities +- spec cleanups +- No need to run autoreconf + +* Sun Aug 17 2014 Fedora Release Engineering - 0.5.4-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_22_Mass_Rebuild + +* Fri Aug 15 2014 Mauro Carvalho Chehab 0.5.4-1 +- Bump to version 0.5.4 with some fixes, mainly for amd64 + +* Sun Aug 10 2014 Mauro Carvalho Chehab 0.5.3-1 +- Bump to version 0.5.3 and enable ABRT and ExtLog + +* Sun Jun 08 2014 Fedora Release Engineering - 0.5.2-2 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_Mass_Rebuild + +* Thu Apr 03 2014 Mauro Carvalho Chehab 0.5.2-1 +- fix and enable ABRT report support + +* Fri Mar 28 2014 Mauro Carvalho Chehab 0.5.1-1 +- Do some fixes at the service files and add some documentation for --record + +* Sun Feb 16 2014 Mauro Carvalho Chehab 0.5.0-1 +- Add experimental ABRT support + +* Tue Sep 10 2013 Mauro Carvalho Chehab 0.4.2-1 +- Fix ras-mc-ctl 
layout filling + +* Sun Aug 04 2013 Fedora Release Engineering - 0.4.1-5 +- Rebuilt for https://fedoraproject.org/wiki/Fedora_20_Mass_Rebuild + +* Wed Jul 17 2013 Petr Pisar - 0.4.1-4 +- Perl 5.18 rebuild + +* Sun Jun 2 2013 Peter Robinson 0.4.1-3 +- ARM has EDAC drivers (currently supported in Calxeda highbank) + +* Wed May 29 2013 Mauro Carvalho Chehab 0.4.1-2 +- Fix the name of perl-DBD-SQLite package + +* Wed May 29 2013 Mauro Carvalho Chehab 0.4.1-1 +- Updated to version 0.4.1 which contains some bug fixes + +* Tue May 28 2013 Mauro Carvalho Chehab 0.4.0-1 +- Updated to version 0.4.0 and added support for mce, aer and sqlite3 storage + +* Mon May 20 2013 Mauro Carvalho Chehab 0.3.0-1 +- Package created +