Compare commits

...

No commits in common. "c8" and "c9s" have entirely different histories.
c8 ... c9s

53 changed files with 3024 additions and 3417 deletions

3
.gitignore vendored
View File

@ -1 +1,2 @@
SOURCES/rasdaemon-0.6.1.tar.bz2
/rasdaemon-*.tar.bz2
/rasdaemon-0.6.7.tar.gz

View File

@ -1 +1 @@
742eda555cccb8ca8f9b6a18bab1f4a732c11318 SOURCES/rasdaemon-0.6.1.tar.bz2
8ae34f40b676a0843be6647854b950f45161e7d4 rasdaemon-0.6.7.tar.bz2

View File

@ -0,0 +1,163 @@
commit 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Fri Jun 30 10:36:53 2023 +0000
rasdaemon: Add new MA_LLC, USR_DP, and USR_CP bank types.
Add HWID and McaType values for new SMCA bank types
and error decoding for those new SMCA banks.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7c88a46..fc51b5a 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -61,6 +61,7 @@ enum smca_bank_types {
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_UMC_V2,
+ SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
SMCA_PSP_V2,
@@ -76,6 +77,8 @@ enum smca_bank_types {
SMCA_SHUB, /* System Hub Unit */
SMCA_SATA, /* SATA Unit */
SMCA_USB, /* USB Unit */
+ SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */
+ SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */
SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
@@ -325,6 +328,16 @@ static const char * const smca_umc2_mce_desc[] = {
"LM32 MP errors",
};
+static const char * const smca_mall_mce_desc[] = {
+ "Counter overflow error",
+ "Counter underflow error",
+ "Write Data Parity Error",
+ "Read Response Parity Error",
+ "Cache Tag ECC Error Macro 0",
+ "Cache Tag ECC Error Macro 1",
+ "Cache Data ECC Error"
+};
+
static const char * const smca_pb_mce_desc[] = {
"An ECC error in the Parameter Block RAM array"
};
@@ -524,6 +537,57 @@ static const char * const smca_usb_mce_desc[] = {
"AXI Slave Response error",
};
+static const char * const smca_usrdp_mce_desc[] = {
+ "Mst CMD Error",
+ "Mst Rx FIFO Error",
+ "Mst Deskew Error",
+ "Mst Detect Timeout Error",
+ "Mst FlowControl Error",
+ "Mst DataValid FIFO Error",
+ "Mac LinkState Error",
+ "Deskew Error",
+ "Init Timeout Error",
+ "Init Attempt Error",
+ "Recovery Timeout Error",
+ "Recovery Attempt Error",
+ "Eye Training Timeout Error",
+ "Data Startup Limit Error",
+ "LS0 Exit Error",
+ "PLL powerState Update Timeout Error",
+ "Rx FIFO Error",
+ "Lcu Error",
+ "Conv CECC Error",
+ "Conv UECC Error",
+ "Reserved",
+ "Rx DataLoss Error",
+ "Replay CECC Error",
+ "Replay UECC Error",
+ "CRC Error",
+ "BER Exceeded Error",
+ "FC Init Timeout Error",
+ "FC Init Attempt Error",
+ "Replay Timeout Error",
+ "Replay Attempt Error",
+ "Replay Underflow Error",
+ "Replay Overflow Error",
+};
+
+static const char * const smca_usrcp_mce_desc[] = {
+ "Packet Type Error",
+ "Rx FIFO Error",
+ "Deskew Error",
+ "Rx Detect Timeout Error",
+ "Data Parity Error",
+ "Data Loss Error",
+ "Lcu Error",
+ "HB1 Handshake Timeout Error",
+ "HB2 Handshake Timeout Error",
+ "Clk Sleep Rsp Timeout Error",
+ "Clk Wake Rsp Timeout Error",
+ "Reset Attack Error",
+ "Remote Link Fatal Error",
+};
+
static const char * const smca_gmipcs_mce_desc[] = {
"Data Loss Error",
"Training Error",
@@ -579,6 +643,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
+ [SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
[SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
@@ -595,6 +660,8 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
[SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
[SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
+ [SMCA_USR_DP] = { smca_usrdp_mce_desc, ARRAY_SIZE(smca_usrdp_mce_desc) },
+ [SMCA_USR_CP] = { smca_usrcp_mce_desc, ARRAY_SIZE(smca_usrcp_mce_desc) },
[SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
/* All the PHY bank types have the same error descriptions, for now. */
[SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
@@ -631,6 +698,8 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_UMC, 0x00000096 },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
{ SMCA_UMC_V2, 0x00010096 },
+ /* Memory Attached Last Level Cache */
+ { SMCA_MA_LLC, 0x0004002E },
/* Parameter Block MCA type */
{ SMCA_PB, 0x00000005 },
@@ -664,6 +733,11 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_SHUB, 0x00000080 },
{ SMCA_SATA, 0x000000A8 },
{ SMCA_USB, 0x000000AA },
+
+ /* Ultra Short Reach Data and Control Plane Controller */
+ { SMCA_USR_DP, 0x00000170 },
+ { SMCA_USR_CP, 0x00000180 },
+
{ SMCA_GMI_PCS, 0x00000241 },
/* Ext Global Memory Interconnect PHY MCA type */
@@ -692,6 +766,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PIE] = { "Power, Interrupts, etc." },
[SMCA_UMC] = { "Unified Memory Controller" },
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
+ [SMCA_MA_LLC] = { "Memory Attached Last Level Cache" },
[SMCA_PB] = { "Parameter Block" },
[SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
[SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
@@ -704,6 +779,8 @@ static struct smca_bank_name smca_names[] = {
[SMCA_SHUB] = { "System Hub Unit" },
[SMCA_SATA] = { "SATA Unit" },
[SMCA_USB] = { "USB Unit" },
+ [SMCA_USR_DP] = { "Ultra Short Reach Data Plane Controller" },
+ [SMCA_USR_CP] = { "Ultra Short Reach Control Plane Controller" },
[SMCA_GMI_PCS] = { "Global Memory Interconnect PCS Unit" },
[SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
[SMCA_WAFL_PHY] = { "WAFL PHY Unit" },

View File

@ -0,0 +1,32 @@
commit 1ff5f3d2a0fcd48add9462567c30fe0e14585fb4
Author: Matt Whitlock <whitslack@users.noreply.github.com>
Date: Wed Jun 9 10:25:18 2021 -0400
configure.ac: fix SYSCONFDEFDIR default value
configure.ac was using AC_ARG_WITH incorrectly, yielding a generated configure script like:
# Check whether --with-sysconfdefdir was given.
if test "${with_sysconfdefdir+set}" = set; then :
withval=$with_sysconfdefdir; SYSCONFDEFDIR=$withval
else
"/etc/sysconfig"
fi
This commit fixes the default case so that the SYSCONFDEFDIR variable is assigned the value "/etc/sysconfig" rather than trying to execute "/etc/sysconfig" as a command.
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/configure.ac b/configure.ac
index f7d1947..33b81fe 100644
--- a/configure.ac
+++ b/configure.ac
@@ -172,7 +172,7 @@ AC_SUBST([RASSTATEDIR])
AC_ARG_WITH(sysconfdefdir,
AC_HELP_STRING([--with-sysconfdefdir=DIR], [rasdaemon environment file dir]),
[SYSCONFDEFDIR=$withval],
- ["/etc/sysconfig"])
+ [SYSCONFDEFDIR=/etc/sysconfig])
AC_SUBST([SYSCONFDEFDIR])
AC_DEFINE([RAS_DB_FNAME], ["ras-mc_event.db"], [ras events database])

View File

@ -0,0 +1,63 @@
commit 2b37a26dcec389723f75d69d3da9c2f15f6c317d
Author: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed May 26 12:41:27 2021 +0200
ci.yml: Fix the job for it to run on a single arch
There were some issues with the previous content. Fix them, in
order to allow the job to build on a single architecture.
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5b3e757..747a844 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,34 +1,23 @@
name: CI
-# Should run only on branches and PR, as "on_tag.yml" will handle tags
on:
+ workflow_dispatch:
push:
- branches: master test
pull_request:
- branches: master
jobs:
-
-#
-# Linux
-#
Ubuntu:
name: Ubuntu
- runs-on: ubuntu-20.04
- strategy:
- matrix:
- arch: [x64_64, aarch64, armv7, ppc64le]
+ runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
- with:
- arch: ${{ matrix.arch }}
- - name: prepare
- run: |
- sudo apt-get update
- sudo apt-get install -y build-essential sqlite3
- - name: build
- run: |
- autoreconf -vfi
- ./configure --enable-all
- make
- sudo make install
+ - uses: actions/checkout@v2
+ - name: prepare
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y build-essential sqlite3
+ - name: build
+ run: |
+ autoreconf -vfi
+ ./configure --enable-all
+ make
+ sudo make install

View File

@ -0,0 +1,44 @@
commit 2b6a54b0d31e02e657171fd27f4e31d996756bc6
Author: DmNosachev <quartz64@gmail.com>
Date: Thu Jul 22 10:25:38 2021 +0300
labels/supermicro: added Supermicro X10DRL, X11SPM
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 1e7761f..990fc9e 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -88,6 +88,16 @@ Vendor: Supermicro
P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1;
P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1;
P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1;
+
+ Model: X10DRL-i
+ P1-DIMMA1: 0.0.0;
+ P1-DIMMB1: 0.1.0;
+ P1-DIMMC1: 0.2.0;
+ P1-DIMMD1: 0.3.0;
+ P2-DIMME1: 1.0.0;
+ P2-DIMMF1: 1.1.0;
+ P2-DIMMG1: 1.2.0;
+ P2-DIMMH1: 1.3.0;
Model: X11DDW-NT, X11DDW-L
P1-DIMMA1: 0.0.0;
@@ -102,6 +112,14 @@ Vendor: Supermicro
P2-DIMMD1: 3.0.0;
P2-DIMME1: 3.1.0;
P2-DIMMF1: 3.2.0;
+
+ Model: X11SPM-F, X11SPM-TF, X11SPM-TPF
+ DIMMA1: 0.0.0;
+ DIMMB1: 0.1.0;
+ DIMMC1: 0.2.0;
+ DIMMD1: 1.0.0;
+ DIMME1: 1.1.0;
+ DIMMF1: 1.2.0;
Model: B1DRi
P1_DIMMA1: 0.0.0;

View File

@ -0,0 +1,105 @@
commit 2d15882a0cbfce0b905039bebc811ac8311cd739
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Fri Jun 30 11:19:42 2023 +0000
rasdaemon: Handle reassigned bit definitions for UMC bank
On some AMD systems some of the existing bit definitions in the
CTL register of SMCA bank type are reassigned without defining
new HWID and McaType. Consequently, the errors whose bit
definitions have been reassigned in the CTL register are being
erroneously decoded.
Add a new error description structure to compensate for the
reassigned bit definitions, using a new software-defined SMCA bank
type that utilizes the hardware-reserved values for HWID.
The new SMCA bank type will only be employed for UMC error
decoding on affected models and the existing error description
structure for UMC bank type is still valid.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index fc51b5a..54060ee 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -60,6 +60,7 @@ enum smca_bank_types {
SMCA_CS_V2_QUIRK,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
+ SMCA_UMC_QUIRK,
SMCA_UMC_V2,
SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
@@ -313,6 +314,25 @@ static const char * const smca_umc_mce_desc[] = {
"Read CRC Error",
};
+static const char * const smca_umc_quirk_mce_desc[] = {
+ "DRAM On Die ECC error",
+ "Data poison error",
+ "SDP parity error",
+ "Reserved",
+ "Address/Command parity error",
+ "HBM Write data parity error",
+ "Consolidated SRAM ECC error",
+ "Reserved",
+ "Reserved",
+ "Rdb SRAM ECC error",
+ "Thermal throttling",
+ "HBM Read Data Parity error",
+ "Reserved",
+ "UMC FW Error",
+ "SRAM Parity Error",
+ "HBM CRC Error",
+};
+
static const char * const smca_umc2_mce_desc[] = {
"DRAM ECC error",
"Data poison error",
@@ -642,6 +662,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_CS_V2_QUIRK] = { smca_cs2_quirk_mce_desc, ARRAY_SIZE(smca_cs2_quirk_mce_desc)},
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
+ [SMCA_UMC_QUIRK] = { smca_umc_quirk_mce_desc, ARRAY_SIZE(smca_umc_quirk_mce_desc) },
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
[SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
@@ -696,6 +717,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Unified Memory Controller MCA type */
{ SMCA_UMC, 0x00000096 },
+ { SMCA_UMC_QUIRK, 0x00020000 },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
{ SMCA_UMC_V2, 0x00010096 },
/* Memory Attached Last Level Cache */
@@ -764,7 +786,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_L3_CACHE] = { "L3 Cache" },
[SMCA_CS ... SMCA_CS_V2_QUIRK] = { "Coherent Slave" },
[SMCA_PIE] = { "Power, Interrupts, etc." },
- [SMCA_UMC] = { "Unified Memory Controller" },
+ [SMCA_UMC ... SMCA_UMC_QUIRK] = { "Unified Memory Controller" },
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
[SMCA_MA_LLC] = { "Memory Attached Last Level Cache" },
[SMCA_PB] = { "Parameter Block" },
@@ -843,6 +865,10 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
if (*hwid_mcatype == 0x0002002E)
*hwid_mcatype = 0x00010000;
break;
+ case 0x90 ... 0x9F:
+ if ((*hwid_mcatype & 0xFF) == 0x00000096)
+ *hwid_mcatype = 0x00020000;
+ break;
default:
break;
}
@@ -908,7 +934,7 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
smca_mce_descs[bank_type].descs[xec],
xec);
- if (bank_type == SMCA_UMC && xec == 0) {
+ if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_QUIRK) && xec == 0) {
channel = find_umc_channel(e);
csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",

View File

@ -0,0 +1,524 @@
commit 30158ef8d7aebc3e5201bf39b73ce7644f8e419e
Author: Avadhut Naik <avadnaik@amd.com>
Date: Tue Apr 18 18:24:21 2023 +0000
rasdaemon: Update SMCA bank error descriptions
Update and reword some existing SMCA bank type error descriptions to extend
SMCA error decoding functionality for modern AMD processors. Additionally,
add new error descriptions for missing SMCA bank types.
Signed-off-by: Avadhut Naik <avadnaik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 27ca8aa..7ec787a 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -66,12 +66,19 @@ enum smca_bank_types {
SMCA_SMU, /* System Management Unit */
SMCA_SMU_V2,
SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_MPDMA, /* MPDMA Unit */
SMCA_NBIO, /* Northbridge IO Unit */
SMCA_PCIE, /* PCI Express Unit */
SMCA_PCIE_V2,
SMCA_XGMI_PCS, /* xGMI PCS Unit */
+ SMCA_NBIF, /*NBIF Unit */
+ SMCA_SHUB, /* System Hub Unit */
+ SMCA_SATA, /* SATA Unit */
+ SMCA_USB, /* USB Unit */
+ SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
+ SMCA_GMI_PHY, /* GMI PHY Unit */
N_SMCA_BANK_TYPES
};
@@ -85,7 +92,6 @@ enum smca_bank_types {
#define NONCPU_NODE_INDEX 8
/* SMCA Extended error strings */
-/* Load Store */
static const char * const smca_ls_mce_desc[] = {
"Load queue parity",
"Store queue parity",
@@ -109,6 +115,7 @@ static const char * const smca_ls_mce_desc[] = {
"DC tag error type 5",
"L2 fill data error",
};
+
static const char * const smca_ls2_mce_desc[] = {
"An ECC error was detected on a data cache read by a probe or victimization",
"An ECC error or L2 poison was detected on a data cache read by a load",
@@ -133,92 +140,104 @@ static const char * const smca_ls2_mce_desc[] = {
"A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
"A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
"A hardware assertion error was reported",
- "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
+ "A parity error was detected in an STLF, SCB EMEM entry, store data mask or SRB store data by any access",
};
-/* Instruction Fetch */
+
static const char * const smca_if_mce_desc[] = {
"microtag probe port parity error",
"IC microtag or full tag multi-hit error",
"IC full tag parity",
"IC data array parity",
- "Decoupling queue phys addr parity error",
+ "PRQ Parity Error",
"L0 ITLB parity error",
- "L1 ITLB parity error",
- "L2 ITLB parity error",
+ "L1-TLB parity error",
+ "L2-TLB parity error",
"BPQ snoop parity on Thread 0",
"BPQ snoop parity on Thread 1",
- "L1 BTB multi-match error",
- "L2 BTB multi-match error",
+ "BP L1-BTB Multi-Hit Error",
+ "BP L2-BTB Multi-Hit Error",
"L2 Cache Response Poison error",
- "System Read Data error",
+ "L2 Cache Error Response",
+ "Hardware Assertion Error",
+ "L1-TLB Multi-Hit",
+ "L2-TLB Multi-Hit",
+ "BSR Parity Error",
+ "CT MCE",
};
-/* L2 Cache */
+
static const char * const smca_l2_mce_desc[] = {
- "L2M tag multi-way-hit error",
- "L2M tag ECC error",
- "L2M data ECC error",
- "HW assert",
+ "L2M Tag Multiple-Way-Hit error",
+ "L2M Tag or State Array ECC Error",
+ "L2M Data Array ECC Error",
+ "Hardware Assert Error",
+ "SDP Read Response Parity Error",
};
-/* Decoder Unit */
+
static const char * const smca_de_mce_desc[] = {
- "uop cache tag parity error",
- "uop cache data parity error",
- "Insn buffer parity error",
- "uop queue parity error",
- "Insn dispatch queue parity error",
- "Fetch address FIFO parity",
- "Patch RAM data parity",
- "Patch RAM sequencer parity",
- "uop buffer parity"
-};
-/* Execution Unit */
+ "Micro-op cache tag array parity error",
+ "Micro-op cache data array parity error",
+ "IBB Register File parity error",
+ "Micro-op queue parity error",
+ "Instruction dispatch queue parity error",
+ "Fetch address FIFO parity error",
+ "Patch RAM data parity error",
+ "Patch RAM sequencer parity error",
+ "Micro-op buffer parity error",
+ "Hardware Assertion MCA Error",
+};
+
static const char * const smca_ex_mce_desc[] = {
"Watchdog timeout error",
- "Phy register file parity",
- "Flag register file parity",
- "Immediate displacement register file parity",
- "Address generator payload parity",
- "EX payload parity",
- "Checkpoint queue parity",
- "Retire dispatch queue parity",
+ "Physical register file parity error",
+ "Flag register file parity error",
+ "Immediate displacement register file parity error",
+ "Address generator payload parity error",
+ "EX payload parity error",
+ "Checkpoint queue parity error",
+ "Retire dispatch queue parity error",
"Retire status queue parity error",
- "Scheduling queue parity error",
+ "Scheduler queue parity error",
"Branch buffer queue parity error",
+ "Hardware Assertion error",
+ "Spec Map parity error",
+ "Retire Map parity error",
};
-/* Floating Point Unit */
+
static const char * const smca_fp_mce_desc[] = {
- "Physical register file parity",
- "Freelist parity error",
- "Schedule queue parity",
+ "Physical register file (PRF) parity error",
+ "Freelist (FL) parity error",
+ "Schedule queue parity error",
"NSQ parity error",
- "Retire queue parity",
- "Status register file parity",
+ "Retire queue (RQ) parity error",
+ "Status register file (SRF) parity error",
"Hardware assertion",
+ "Physical K mask register file (KRF) parity error",
};
-/* L3 Cache */
+
static const char * const smca_l3_mce_desc[] = {
"Shadow tag macro ECC error",
"Shadow tag macro multi-way-hit error",
"L3M tag ECC error",
"L3M tag multi-way-hit error",
"L3M data ECC error",
- "XI parity, L3 fill done channel error",
- "L3 victim queue parity",
- "L3 HW assert",
+ "SDP Parity Error from XI",
+ "L3 victim queue Data Fabric error",
+ "L3 Hardware Assertion",
+ "XI WCB Parity Poison Creation event",
};
-/* Coherent Slave Unit */
+
static const char * const smca_cs_mce_desc[] = {
- "Illegal request from transport layer",
+ "Illegal request",
"Address violation",
"Security violation",
- "Illegal response from transport layer",
+ "Illegal response",
"Unexpected response",
- "Parity error on incoming request or probe response data",
- "Parity error on incoming read response data",
- "Atomic request parity",
- "ECC error on probe filter access",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic request parity error",
+ "Probe Filter ECC Error",
};
-/* Coherent Slave Unit V2 */
+
static const char * const smca_cs2_mce_desc[] = {
"Illegal Request",
"Address Violation",
@@ -234,15 +253,22 @@ static const char * const smca_cs2_mce_desc[] = {
"SDP read response had an unexpected RETRY error",
"Counter overflow error",
"Counter underflow error",
+ "Illegal Request on the no data channel",
+ "Address Violation on the no data channel",
+ "Security Violation on the no data channel",
+ "Hardware Assert Error",
};
-/* Power, Interrupt, etc.. */
+
static const char * const smca_pie_mce_desc[] = {
- "HW assert",
- "Internal PIE register security violation",
- "Error on GMI link",
- "Poison data written to internal PIE register",
+ "Hardware assert",
+ "Register security violation",
+ "Link error",
+ "Poison data consumption",
+ "A deferred error was detected in the DF",
+ "Watch Dog Timer",
+ "An SRAM ECC error was detected in the CNLI block",
};
-/* Unified Memory Controller */
+
static const char * const smca_umc_mce_desc[] = {
"DRAM ECC error",
"Data poison error on DRAM",
@@ -250,6 +276,12 @@ static const char * const smca_umc_mce_desc[] = {
"Advanced peripheral bus error",
"Command/address parity error",
"Write data CRC error",
+ "DCQ SRAM ECC error",
+ "AES SRAM ECC error",
+ "ECS Row Error",
+ "ECS Error",
+ "UMC Throttling Error",
+ "Read CRC Error",
};
static const char * const smca_umc2_mce_desc[] = {
@@ -267,15 +299,14 @@ static const char * const smca_umc2_mce_desc[] = {
"LM32 MP errors",
};
-/* Parameter Block */
static const char * const smca_pb_mce_desc[] = {
- "Parameter Block RAM ECC error",
+ "An ECC error in the Parameter Block RAM array"
};
-/* Platform Security Processor */
+
static const char * const smca_psp_mce_desc[] = {
- "PSP RAM ECC or parity error",
+ "An ECC or parity error in a PSP RAM instance",
};
-/* Platform Security Processor V2 */
+
static const char * const smca_psp2_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
@@ -296,11 +327,11 @@ static const char * const smca_psp2_mce_desc[] = {
"TLB Bank 1 parity error",
"System Hub Read Buffer ECC or parity error",
};
-/* System Management Unit */
+
static const char * const smca_smu_mce_desc[] = {
- "SMU RAM ECC or parity error",
+ "An ECC or parity error in an SMU RAM instance",
};
-/* System Management Unit V2 */
+
static const char * const smca_smu2_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
@@ -314,7 +345,7 @@ static const char * const smca_smu2_mce_desc[] = {
"Instruction Tag Cache Bank B ECC or parity error",
"System Hub Read Buffer ECC or parity error",
};
-/* Microprocessor 5 Unit */
+
static const char * const smca_mp5_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
@@ -327,15 +358,68 @@ static const char * const smca_mp5_mce_desc[] = {
"Instruction Tag Cache Bank A ECC or parity error",
"Instruction Tag Cache Bank B ECC or parity error",
};
-/* Northbridge IO Unit */
+
+static const char * const smca_mpdma_mce_desc[] = {
+ "Main SRAM [31:0] bank ECC or parity error",
+ "Main SRAM [63:32] bank ECC or parity error",
+ "Main SRAM [95:64] bank ECC or parity error",
+ "Main SRAM [127:96] bank ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "System Hub Read Buffer ECC or parity error",
+ "MPDMA TVF DVSEC Memory ECC or parity error",
+ "MPDMA TVF MMIO Mailbox0 ECC or parity error",
+ "MPDMA TVF MMIO Mailbox1 ECC or parity error",
+ "MPDMA TVF Doorbell Memory ECC or parity error",
+ "MPDMA TVF SDP Slave Memory 0 ECC or parity error",
+ "MPDMA TVF SDP Slave Memory 1 ECC or parity error",
+ "MPDMA TVF SDP Slave Memory 2 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 0 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 1 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 2 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 3 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 4 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 5 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 6 ECC or parity error",
+ "SDP Watchdog Timer expired",
+ "MPDMA PTE Command FIFO ECC or parity error",
+ "MPDMA PTE Hub Data FIFO ECC or parity error",
+ "MPDMA PTE Internal Data FIFO ECC or parity error",
+ "MPDMA PTE Command Memory DMA ECC or parity error",
+ "MPDMA PTE Command Memory Internal ECC or parity error",
+};
+
static const char * const smca_nbio_mce_desc[] = {
"ECC or Parity error",
"PCIE error",
- "SDP ErrEvent error",
- "SDP Egress Poison Error",
- "IOHC Internal Poison Error",
+ "External SDP ErrEvent error",
+ "SDP Egress Poison error",
+ "Internal Poison error",
+ "Internal system fatal error event",
};
-/* PCI Express Unit */
+
static const char * const smca_pcie_mce_desc[] = {
"CCIX PER Message logging",
"CCIX Read Response with Status: Non-Data Error",
@@ -345,7 +429,7 @@ static const char * const smca_pcie_mce_desc[] = {
};
static const char * const smca_pcie2_mce_desc[] = {
- "SDP Parity Error logging",
+ "SDP Data Parity Error logging",
};
static const char * const smca_xgmipcs_mce_desc[] = {
@@ -387,11 +471,66 @@ static const char * const smca_xgmiphy_mce_desc[] = {
"PHY APB error",
};
-static const char * const smca_waflphy_mce_desc[] = {
- "RAM ECC Error",
- "ARC instruction buffer parity error",
- "ARC data buffer parity error",
- "PHY APB error",
+static const char * const smca_nbif_mce_desc[] = {
+ "Timeout error from GMI",
+ "SRAM ECC error",
+ "NTB Error Event",
+ "SDP Parity error",
+};
+
+static const char * const smca_sata_mce_desc[] = {
+ "Parity error for port 0",
+ "Parity error for port 1",
+ "Parity error for port 2",
+ "Parity error for port 3",
+ "Parity error for port 4",
+ "Parity error for port 5",
+ "Parity error for port 6",
+ "Parity error for port 7",
+};
+
+static const char * const smca_usb_mce_desc[] = {
+ "Parity error or ECC error for S0 RAM0",
+ "Parity error or ECC error for S0 RAM1",
+ "Parity error or ECC error for S0 RAM2",
+ "Parity error for PHY RAM0",
+ "Parity error for PHY RAM1",
+ "AXI Slave Response error",
+};
+
+static const char * const smca_gmipcs_mce_desc[] = {
+ "Data Loss Error",
+ "Training Error",
+ "Replay Parity Error",
+ "Rx Fifo Underflow Error",
+ "Rx Fifo Overflow Error",
+ "CRC Error",
+ "BER Exceeded Error",
+ "Tx Fifo Underflow Error",
+ "Replay Buffer Parity Error",
+ "Tx Overflow Error",
+ "Replay Fifo Overflow Error",
+ "Replay Fifo Underflow Error",
+ "Elastic Fifo Overflow Error",
+ "Deskew Error",
+ "Offline Error",
+ "Data Startup Limit Error",
+ "FC Init Timeout Error",
+ "Recovery Timeout Error",
+ "Ready Serial Timeout Error",
+ "Ready Serial Attempt Error",
+ "Recovery Attempt Error",
+ "Recovery Relock Attempt Error",
+ "Deskew Abort Error",
+ "Rx Buffer Error",
+ "Rx LFDS Fifo Overflow Error",
+ "Rx LFDS Fifo Underflow Error",
+ "LinkSub Tx Timeout Error",
+ "LinkSub Rx Timeout Error",
+ "Rx CMD Packet Error",
+ "LFDS Training Timeout Error",
+ "LFDS FC Init Timeout Error",
+ "Data Loss Error",
};
struct smca_mce_desc {
@@ -419,12 +558,21 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
[SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)},
[SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
+ [SMCA_MPDMA] = { smca_mpdma_mce_desc, ARRAY_SIZE(smca_mpdma_mce_desc) },
[SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)},
[SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)},
[SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) },
[SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) },
+ /* NBIF and SHUB have the same error descriptions, for now. */
+ [SMCA_NBIF] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
+ [SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
+ [SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
+ [SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
+ [SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
+ /* All the PHY bank types have the same error descriptions, for now. */
[SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
- [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) },
+ [SMCA_WAFL_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
+ [SMCA_GMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
};
struct smca_hwid {
@@ -470,6 +618,9 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Microprocessor 5 Unit MCA type */
{ SMCA_MP5, 0x00020001 },
+ /* MPDMA MCA Type */
+ { SMCA_MPDMA, 0x00030001 },
+
/* Northbridge IO Unit MCA type */
{ SMCA_NBIO, 0x00000018 },
@@ -480,11 +631,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Ext Global Memory Interconnect PCS MCA type */
{ SMCA_XGMI_PCS, 0x00000050 },
+ { SMCA_NBIF, 0x0000006C },
+
+ { SMCA_SHUB, 0x00000080 },
+ { SMCA_SATA, 0x000000A8 },
+ { SMCA_USB, 0x000000AA },
+ { SMCA_GMI_PCS, 0x00000241 },
+
/* Ext Global Memory Interconnect PHY MCA type */
{ SMCA_XGMI_PHY, 0x00000259 },
/* WAFL PHY MCA type */
{ SMCA_WAFL_PHY, 0x00000267 },
+
+ { SMCA_GMI_PHY, 0x00000269 },
};
struct smca_bank_name {
@@ -508,12 +668,18 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
[SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
[SMCA_MP5] = { "Microprocessor 5 Unit" },
+ [SMCA_MPDMA] = { "MPDMA Unit" },
[SMCA_NBIO] = { "Northbridge IO Unit" },
[SMCA_PCIE ... SMCA_PCIE_V2] = { "PCI Express Unit" },
[SMCA_XGMI_PCS] = { "Ext Global Memory Interconnect PCS Unit" },
+ [SMCA_NBIF] = { "NBIF Unit" },
+ [SMCA_SHUB] = { "System Hub Unit" },
+ [SMCA_SATA] = { "SATA Unit" },
+ [SMCA_USB] = { "USB Unit" },
+ [SMCA_GMI_PCS] = { "Global Memory Interconnect PCS Unit" },
[SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
[SMCA_WAFL_PHY] = { "WAFL PHY Unit" },
-
+ [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
static void amd_decode_errcode(struct mce_event *e)

View File

@ -0,0 +1,43 @@
commit 50565005b10fe909c66f1c90f2feb95712427c7d
Author: DmNosachev <quartz64@gmail.com>
Date: Tue Jun 29 14:07:54 2021 +0300
labels/supermicro: added Supermicro X11DDW-NT(-L)
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 86e4617..373de07 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -69,7 +69,7 @@ Vendor: Supermicro
P2_DIMM4B: 2.0.1;
P2_DIMM4B: 2.1.1;
- Model: X11DPH-i
+ Model: X11DPH-i, X11DPH-T, X11DPH-TQ
P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1;
P1-DIMMB1: 0.1.0;
P1-DIMMC1: 0.2.0;
@@ -91,4 +91,18 @@ Vendor: Supermicro
P2-DIMME1: 1.0.0; P2-DIMME2: 1.0.1;
P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1;
P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1;
- P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1;
\ No newline at end of file
+ P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1;
+
+ Model: X11DDW-NT, X11DDW-L
+ P1-DIMMA1: 0.0.0;
+ P1-DIMMB1: 0.1.0;
+ P1-DIMMC1: 0.2.0;
+ P1-DIMMD1: 1.0.0;
+ P1-DIMME1: 1.1.0;
+ P1-DIMMF1: 1.2.0;
+ P2-DIMMA1: 2.0.0;
+ P2-DIMMB1: 2.1.0;
+ P2-DIMMC1: 2.2.0;
+ P2-DIMMD1: 3.0.0;
+ P2-DIMME1: 3.1.0;
+ P2-DIMMF1: 3.2.0;
\ No newline at end of file

View File

@ -0,0 +1,37 @@
commit 6bc43db1b6b3d73805179c21d1dd5521e8dc0f74
Author: DmNosachev <quartz64@gmail.com>
Date: Fri Jul 2 13:13:46 2021 +0300
labels/supermicro: added Supermicro X11SCA(-F)
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index b924a32..1e7761f 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -10,11 +10,7 @@
#
Vendor: Supermicro
- Model: A2SDi-8C-HLN4F
- DIMMA1: 0.0.0; DIMMA2: 0.0.1;
- DIMMB1: 0.1.0; DIMMB2: 0.1.1;
-
- Model: A2SDi-8C+-HLN4F
+ Model: A2SDi-8C-HLN4F, A2SDi-8C+-HLN4F
DIMMA1: 0.0.0; DIMMA2: 0.0.1;
DIMMB1: 0.1.0; DIMMB2: 0.1.1;
@@ -115,4 +111,8 @@ Vendor: Supermicro
P2_DIMME1: 1.0.0;
P2_DIMMF1: 1.1.0;
P2_DIMMG1: 1.2.0;
- P2_DIMMH1: 1.3.0;
\ No newline at end of file
+ P2_DIMMH1: 1.3.0;
+
+ Model: X11SCA, X11SCA-F
+ DIMMA1: 0.0.0, 0.1.0; DIMMA2: 0.2.0, 0.3.0;
+ DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1;
\ No newline at end of file

View File

@ -0,0 +1,610 @@
commit 738bafafdcb2e8b0ced32fff31b13754d571090b
Author: Jason Tian <jason@os.amperecomputing.com>
Date: Fri May 28 11:35:43 2021 +0800
Add error handling for Ampere-specific errors.
Save the decoded Ampere-specific errors into the sqlite3
database and log the PCIe segment, bus/device/function
number into the BMC SEL.
Signed-off-by: Jason Tian <jason@os.amperecomputing.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/non-standard-ampere.c b/non-standard-ampere.c
index 8cceb26..05b5252 100644
--- a/non-standard-ampere.c
+++ b/non-standard-ampere.c
@@ -216,6 +216,13 @@ static const char * const err_bert_sub_type[] = {
"PMPRO Fatal",
};
+/*
+ * Table labels used in error messages, indexed by payload type.
+ * Read-only: both the pointers and the strings are const, like the
+ * other string tables in this file.
+ */
+static const char * const sqlite3_table_list[] = {
+	"amp_payload0_event_tab",
+	"amp_payload1_event_tab",
+	"amp_payload2_event_tab",
+	"amp_payload3_event_tab",
+};
+
struct amp_ras_type_info {
int id;
const char *name;
@@ -352,6 +359,359 @@ static const char *oem_subtype_name(const struct amp_ras_type_info *info,
return "unknown";
}
+#ifdef HAVE_SQLITE3
+/*key pair definition for ampere specific error payload type 0*/
+static const struct db_fields amp_payload0_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "type", .type = "TEXT" },
+ { .name = "subtype", .type = "TEXT" },
+ { .name = "instance", .type = "INTEGER" },
+ { .name = "socket_num", .type = "INTEGER" },
+ { .name = "status_reg", .type = "INTEGER" },
+ { .name = "addr_reg", .type = "INTEGER" },
+ { .name = "misc0", .type = "INTEGER" },
+ { .name = "misc1", .type = "INTEGER" },
+ { .name = "misc2", .type = "INTEGER" },
+ { .name = "misc3", .type = "INTEGER" },
+};
+
+static const struct db_table_descriptor amp_payload0_event_tab = {
+ .name = "amp_payload0_event",
+ .fields = amp_payload0_event_fields,
+ .num_fields = ARRAY_SIZE(amp_payload0_event_fields),
+};
+
+/*key pair definition for ampere specific error payload type 1*/
+static const struct db_fields amp_payload1_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "type", .type = "TEXT" },
+ { .name = "subtype", .type = "TEXT" },
+ { .name = "instance", .type = "INTEGER" },
+ { .name = "socket_num", .type = "INTEGER" },
+ { .name = "uncore_err_status", .type = "INTEGER" },
+ { .name = "uncore_err_mask", .type = "INTEGER" },
+ { .name = "uncore_err_sev", .type = "INTEGER" },
+ { .name = "core_err_status", .type = "INTEGER" },
+ { .name = "core_err_mask", .type = "INTEGER" },
+ { .name = "root_err_cmd", .type = "INTEGER" },
+ { .name = "root_err_status", .type = "INTEGER" },
+ { .name = "src_id", .type = "INTEGER" },
+ { .name = "reserved1", .type = "INTEGER" },
+ { .name = "reserverd2", .type = "INTEGER" },
+};
+
+static const struct db_table_descriptor amp_payload1_event_tab = {
+ .name = "amp_payload1_event",
+ .fields = amp_payload1_event_fields,
+ .num_fields = ARRAY_SIZE(amp_payload1_event_fields),
+};
+
+/*
+ * Column definitions for Ampere specific error payload type 2.
+ * The order must match the AMP_PAYLOAD2_FIELD_* enum, which is used as
+ * the bind index when a record is stored.
+ */
+static const struct db_fields amp_payload2_event_fields[] = {
+	{ .name = "id",			.type = "INTEGER PRIMARY KEY" },
+	{ .name = "timestamp",		.type = "TEXT" },
+	{ .name = "type",		.type = "TEXT" },
+	{ .name = "subtype",		.type = "TEXT" },
+	{ .name = "instance",		.type = "INTEGER" },
+	{ .name = "socket_num",		.type = "INTEGER" },
+	{ .name = "ce_report_reg",	.type = "INTEGER" },
+	{ .name = "ce_location",	.type = "INTEGER" },
+	{ .name = "ce_addr",		.type = "INTEGER" },
+	{ .name = "ue_report_reg",	.type = "INTEGER" },
+	{ .name = "ue_location",	.type = "INTEGER" },
+	{ .name = "ue_addr",		.type = "INTEGER" },
+	{ .name = "reserved1",		.type = "INTEGER" },
+	{ .name = "reserved2",		.type = "INTEGER" },
+	/* Was a second "reserved2"; a table cannot have two columns of the same name */
+	{ .name = "reserved3",		.type = "INTEGER" },
+};
+
+static const struct db_table_descriptor amp_payload2_event_tab = {
+ .name = "amp_payload2_event",
+ .fields = amp_payload2_event_fields,
+ .num_fields = ARRAY_SIZE(amp_payload2_event_fields),
+};
+
+/*key pair definition for ampere specific error payload type 3*/
+static const struct db_fields amp_payload3_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "type", .type = "TEXT" },
+ { .name = "subtype", .type = "TEXT" },
+ { .name = "instance", .type = "INTEGER" },
+ { .name = "socket_num", .type = "INTEGER" },
+ { .name = "fw_spec_data0", .type = "INTEGER" },
+ { .name = "fw_spec_data1", .type = "INTEGER" },
+ { .name = "fw_spec_data2", .type = "INTEGER" },
+ { .name = "fw_spec_data3", .type = "INTEGER" },
+ { .name = "fw_spec_data4", .type = "INTEGER" },
+ { .name = "fw_spec_data5", .type = "INTEGER" },
+};
+
+static const struct db_table_descriptor amp_payload3_event_tab = {
+ .name = "amp_payload3_event",
+ .fields = amp_payload3_event_fields,
+ .num_fields = ARRAY_SIZE(amp_payload3_event_fields),
+};
+
+/*
+ * Bind one value to parameter @id of the decoder's prepared statement.
+ * @data_type picks the sqlite3 bind call: @data is used for the two
+ * integer types and @text for AMP_OEM_DATA_TYPE_TEXT. Any other type is
+ * silently ignored.
+ */
+static void record_amp_data(struct ras_ns_ev_decoder *ev_decoder,
+			    enum amp_oem_data_type data_type,
+			    int id, int64_t data, const char *text)
+{
+	if (data_type == AMP_OEM_DATA_TYPE_INT) {
+		sqlite3_bind_int(ev_decoder->stmt_dec_record, id, data);
+	} else if (data_type == AMP_OEM_DATA_TYPE_INT64) {
+		sqlite3_bind_int64(ev_decoder->stmt_dec_record, id, data);
+	} else if (data_type == AMP_OEM_DATA_TYPE_TEXT) {
+		/* length -1: sqlite reads @text up to its NUL terminator */
+		sqlite3_bind_text(ev_decoder->stmt_dec_record, id,
+				  text, -1, NULL);
+	}
+}
+
+/*
+ * Run the decoder's prepared statement, then reset it and clear its
+ * bindings so it can be reused for the next record. Each failing stage
+ * is logged with @name as the label; all three stages are always
+ * attempted. Returns the result of sqlite3_clear_bindings().
+ */
+static int store_amp_err_data(struct ras_ns_ev_decoder *ev_decoder,
+			      const char *name)
+{
+	int rc = sqlite3_step(ev_decoder->stmt_dec_record);
+
+	if (!(rc == SQLITE_OK || rc == SQLITE_DONE))
+		log(TERM, LOG_ERR,
+		    "Failed to do %s step on sqlite: error = %d\n", name, rc);
+
+	rc = sqlite3_reset(ev_decoder->stmt_dec_record);
+	if (!(rc == SQLITE_OK || rc == SQLITE_DONE))
+		log(TERM, LOG_ERR,
+		    "Failed to reset %s on sqlite: error = %d\n", name, rc);
+
+	rc = sqlite3_clear_bindings(ev_decoder->stmt_dec_record);
+	if (!(rc == SQLITE_OK || rc == SQLITE_DONE))
+		log(TERM, LOG_ERR,
+		    "Failed to clear bindings %s on sqlite: error = %d\n",
+		    name, rc);
+
+	return rc;
+}
+
+/*save all Ampere Specific Error Payload type 0 to sqlite3 database*/
+static void record_amp_payload0_err(struct ras_ns_ev_decoder *ev_decoder,
+ const char *type_str, const char *subtype_str,
+ const struct amp_payload0_type_sec *err)
+{
+ if (ev_decoder != NULL) {
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+ AMP_PAYLOAD0_FIELD_TYPE, 0, type_str);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+ AMP_PAYLOAD0_FIELD_SUB_TYPE, 0, subtype_str);
+
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD0_FIELD_INS, INSTANCE(err->instance), NULL);
+
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD0_FIELD_SOCKET_NUM,
+ SOCKET_NUM(err->instance), NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD0_FIELD_STATUS_REG, err->err_status, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD0_FIELD_ADDR_REG,
+ err->err_addr, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD0_FIELD_MISC0,
+ err->err_misc_0, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD0_FIELD_MISC1,
+ err->err_misc_1, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD0_FIELD_MISC2,
+ err->err_misc_2, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD0_FIELD_MISC3,
+ err->err_misc_3, NULL);
+ store_amp_err_data(ev_decoder, "amp_payload0_event_tab");
+ }
+}
+
+/*save all Ampere Specific Error Payload type 1 to sqlite3 database*/
+static void record_amp_payload1_err(struct ras_ns_ev_decoder *ev_decoder,
+ const char *type_str, const char *subtype_str,
+ const struct amp_payload1_type_sec *err)
+{
+ if (ev_decoder != NULL) {
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+ AMP_PAYLOAD1_FIELD_TYPE, 0, type_str);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+ AMP_PAYLOAD1_FIELD_SUB_TYPE, 0, subtype_str);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD1_FIELD_INS,
+ INSTANCE(err->instance), NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD1_FIELD_SOCKET_NUM,
+ SOCKET_NUM(err->instance), NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD1_FIELD_UNCORE_ERR_STATUS,
+ err->uncore_status, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD1_FIELD_UNCORE_ERR_MASK,
+ err->uncore_mask, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD1_FIELD_UNCORE_ERR_SEV,
+ err->uncore_sev, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD1_FIELD_CORE_ERR_STATUS,
+ err->core_status, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD1_FIELD_CORE_ERR_MASK,
+ err->core_mask, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD1_FIELD_ROOT_ERR_CMD,
+ err->root_err_cmd, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD1_FIELD_ROOT_ERR_STATUS,
+ err->root_status, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD1_FIELD_SRC_ID,
+ err->src_id, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD1_FIELD_RESERVED1,
+ err->reserved1, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD1_FIELD_RESERVED2,
+ err->reserved2, NULL);
+ store_amp_err_data(ev_decoder, "amp_payload1_event_tab");
+ }
+}
+
+/*save all Ampere Specific Error Payload type 2 to sqlite3 database*/
+static void record_amp_payload2_err(struct ras_ns_ev_decoder *ev_decoder,
+ const char *type_str, const char *subtype_str,
+ const struct amp_payload2_type_sec *err)
+{
+ if (ev_decoder != NULL) {
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+ AMP_PAYLOAD2_FIELD_TYPE, 0, type_str);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+ AMP_PAYLOAD2_FIELD_SUB_TYPE, 0, subtype_str);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD2_FIELD_INS, INSTANCE(err->instance), NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD2_FIELD_SOCKET_NUM,
+ SOCKET_NUM(err->instance), NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD2_FIELD_CE_REPORT_REG,
+ err->ce_register, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD2_FIELD_CE_LOACATION,
+ err->ce_location, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD2_FIELD_CE_ADDR,
+ err->ce_addr, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD2_FIELD_UE_REPORT_REG,
+ err->ue_register, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD2_FIELD_UE_LOCATION,
+ err->ue_location, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD2_FIELD_UE_ADDR,
+ err->ue_addr, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD2_FIELD_RESERVED1,
+ err->reserved1, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD2_FIELD_RESERVED2,
+ err->reserved2, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD2_FIELD_RESERVED3,
+ err->reserved3, NULL);
+ store_amp_err_data(ev_decoder, "amp_payload2_event_tab");
+ }
+}
+
+/*save all Ampere Specific Error Payload type 3 to sqlite3 database*/
+static void record_amp_payload3_err(struct ras_ns_ev_decoder *ev_decoder,
+ const char *type_str, const char *subtype_str,
+ const struct amp_payload3_type_sec *err)
+{
+ if (ev_decoder != NULL) {
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+ AMP_PAYLOAD3_FIELD_TYPE, 0, type_str);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+ AMP_PAYLOAD3_FIELD_SUB_TYPE, 0, subtype_str);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD3_FIELD_INS, INSTANCE(err->instance), NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD3_FIELD_SOCKET_NUM,
+ SOCKET_NUM(err->instance), NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA0,
+ err->fw_speci_data0, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA1,
+ err->fw_speci_data1, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA2,
+ err->fw_speci_data2, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA3,
+ err->fw_speci_data3, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA4,
+ err->fw_speci_data4, NULL);
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_INT64,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA5,
+ err->fw_speci_data5, NULL);
+ store_amp_err_data(ev_decoder, "amp_payload3_event_tab");
+ }
+}
+
+#else
+/*
+ * Built without sqlite3: the recording helpers are no-ops so that the
+ * decode functions can call them unconditionally. The void stubs have
+ * empty bodies; a void function must not return a value.
+ */
+static void record_amp_data(struct ras_ns_ev_decoder *ev_decoder,
+			    enum amp_oem_data_type data_type,
+			    int id, int64_t data, const char *text)
+{
+}
+
+static void record_amp_payload0_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload0_type_sec *err)
+{
+}
+
+static void record_amp_payload1_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload1_type_sec *err)
+{
+}
+
+static void record_amp_payload2_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload2_type_sec *err)
+{
+}
+
+static void record_amp_payload3_err(struct ras_ns_ev_decoder *ev_decoder,
+				    const char *type_str, const char *subtype_str,
+				    const struct amp_payload3_type_sec *err)
+{
+}
+
+/* Same signature as the sqlite3 build (const name); always reports success. */
+static int store_amp_err_data(struct ras_ns_ev_decoder *ev_decoder,
+			      const char *name)
+{
+	return 0;
+}
+#endif
/*decode ampere specific error payload type 0, the CPU's data is save*/
/*to sqlite by ras-arm-handler, others are saved by this function.*/
@@ -434,6 +794,7 @@ void decode_amp_payload0_err_regs(struct ras_ns_ev_decoder *ev_decoder,
*p = '\0';
}
+ record_amp_payload0_err(ev_decoder, type_str, subtype_str, err);
i = 0;
p = NULL;
end = NULL;
@@ -517,6 +878,7 @@ static void decode_amp_payload1_err_regs(struct ras_ns_ev_decoder *ev_decoder,
*p = '\0';
}
+ record_amp_payload1_err(ev_decoder, type_str, subtype_str, err);
i = 0;
p = NULL;
end = NULL;
@@ -601,6 +963,7 @@ static void decode_amp_payload2_err_regs(struct ras_ns_ev_decoder *ev_decoder,
*p = '\0';
}
+ record_amp_payload2_err(ev_decoder, type_str, subtype_str, err);
i = 0;
p = NULL;
end = NULL;
@@ -673,6 +1036,7 @@ static void decode_amp_payload3_err_regs(struct ras_ns_ev_decoder *ev_decoder,
*p = '\0';
}
+ record_amp_payload3_err(ev_decoder, type_str, subtype_str, err);
i = 0;
p = NULL;
end = NULL;
@@ -687,6 +1051,38 @@ static int decode_amp_oem_type_error(struct ras_events *ras,
{
int payload_type = PAYLOAD_TYPE(event->error[0]);
+#ifdef HAVE_SQLITE3
+ struct db_table_descriptor db_tab;
+ int id = 0;
+
+ if (payload_type == PAYLOAD_TYPE_0) {
+ db_tab = amp_payload0_event_tab;
+ id = AMP_PAYLOAD0_FIELD_TIMESTAMP;
+ } else if (payload_type == PAYLOAD_TYPE_1) {
+ db_tab = amp_payload1_event_tab;
+ id = AMP_PAYLOAD1_FIELD_TIMESTAMP;
+ } else if (payload_type == PAYLOAD_TYPE_2) {
+ db_tab = amp_payload2_event_tab;
+ id = AMP_PAYLOAD2_FIELD_TIMESTAMP;
+ } else if (payload_type == PAYLOAD_TYPE_3) {
+ db_tab = amp_payload3_event_tab;
+ id = AMP_PAYLOAD3_FIELD_TIMESTAMP;
+ } else
+ return -1;
+
+ if (!ev_decoder->stmt_dec_record) {
+ if (ras_mc_add_vendor_table(ras, &ev_decoder->stmt_dec_record,
+ &db_tab) != SQLITE_OK) {
+ trace_seq_printf(s,
+ "create sql %s fail\n",
+ sqlite3_table_list[payload_type]);
+ return -1;
+ }
+ }
+ record_amp_data(ev_decoder, AMP_OEM_DATA_TYPE_TEXT,
+ id, 0, event->timestamp);
+#endif
+
if (payload_type == PAYLOAD_TYPE_0) {
const struct amp_payload0_type_sec *err =
(struct amp_payload0_type_sec *)event->error;
diff --git a/non-standard-ampere.h b/non-standard-ampere.h
index aacf3a8..f463c53 100644
--- a/non-standard-ampere.h
+++ b/non-standard-ampere.h
@@ -102,6 +102,79 @@ struct amp_payload3_type_sec {
uint64_t fw_speci_data5;
};
+enum amp_oem_data_type {
+ AMP_OEM_DATA_TYPE_INT,
+ AMP_OEM_DATA_TYPE_INT64,
+ AMP_OEM_DATA_TYPE_TEXT,
+};
+
+enum {
+ AMP_PAYLOAD0_FIELD_ID,
+ AMP_PAYLOAD0_FIELD_TIMESTAMP,
+ AMP_PAYLOAD0_FIELD_TYPE,
+ AMP_PAYLOAD0_FIELD_SUB_TYPE,
+ AMP_PAYLOAD0_FIELD_INS,
+ AMP_PAYLOAD0_FIELD_SOCKET_NUM,
+ AMP_PAYLOAD0_FIELD_STATUS_REG,
+ AMP_PAYLOAD0_FIELD_ADDR_REG,
+ AMP_PAYLOAD0_FIELD_MISC0,
+ AMP_PAYLOAD0_FIELD_MISC1,
+ AMP_PAYLOAD0_FIELD_MISC2,
+ AMP_PAYLOAD0_FIELD_MISC3,
+};
+
+enum {
+ AMP_PAYLOAD1_FIELD_ID,
+ AMP_PAYLOAD1_FIELD_TIMESTAMP,
+ AMP_PAYLOAD1_FIELD_TYPE,
+ AMP_PAYLOAD1_FIELD_SUB_TYPE,
+ AMP_PAYLOAD1_FIELD_INS,
+ AMP_PAYLOAD1_FIELD_SOCKET_NUM,
+ AMP_PAYLOAD1_FIELD_UNCORE_ERR_STATUS,
+ AMP_PAYLOAD1_FIELD_UNCORE_ERR_MASK,
+ AMP_PAYLOAD1_FIELD_UNCORE_ERR_SEV,
+ AMP_PAYLOAD1_FIELD_CORE_ERR_STATUS,
+ AMP_PAYLOAD1_FIELD_CORE_ERR_MASK,
+ AMP_PAYLOAD1_FIELD_ROOT_ERR_CMD,
+ AMP_PAYLOAD1_FIELD_ROOT_ERR_STATUS,
+ AMP_PAYLOAD1_FIELD_SRC_ID,
+ AMP_PAYLOAD1_FIELD_RESERVED1,
+ AMP_PAYLOAD1_FIELD_RESERVED2,
+};
+
+enum {
+ AMP_PAYLOAD2_FIELD_ID,
+ AMP_PAYLOAD2_FIELD_TIMESTAMP,
+ AMP_PAYLOAD2_FIELD_TYPE,
+ AMP_PAYLOAD2_FIELD_SUB_TYPE,
+ AMP_PAYLOAD2_FIELD_INS,
+ AMP_PAYLOAD2_FIELD_SOCKET_NUM,
+ AMP_PAYLOAD2_FIELD_CE_REPORT_REG,
+ AMP_PAYLOAD2_FIELD_CE_LOACATION,
+ AMP_PAYLOAD2_FIELD_CE_ADDR,
+ AMP_PAYLOAD2_FIELD_UE_REPORT_REG,
+ AMP_PAYLOAD2_FIELD_UE_LOCATION,
+ AMP_PAYLOAD2_FIELD_UE_ADDR,
+ AMP_PAYLOAD2_FIELD_RESERVED1,
+ AMP_PAYLOAD2_FIELD_RESERVED2,
+ AMP_PAYLOAD2_FIELD_RESERVED3,
+};
+
+enum {
+ AMP_PAYLOAD3_FIELD_ID,
+ AMP_PAYLOAD3_FIELD_TIMESTAMP,
+ AMP_PAYLOAD3_FIELD_TYPE,
+ AMP_PAYLOAD3_FIELD_SUB_TYPE,
+ AMP_PAYLOAD3_FIELD_INS,
+ AMP_PAYLOAD3_FIELD_SOCKET_NUM,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA0,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA1,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA2,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA3,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA4,
+ AMP_PAYLOAD3_FIELD_FW_SPEC_DATA5
+};
+
void decode_amp_payload0_err_regs(struct ras_ns_ev_decoder *ev_decoder,
struct trace_seq *s,
const struct amp_payload0_type_sec *err);
diff --git a/ras-aer-handler.c b/ras-aer-handler.c
index 8ddd439..6f4cb2b 100644
--- a/ras-aer-handler.c
+++ b/ras-aer-handler.c
@@ -67,6 +67,9 @@ int ras_aer_event_handler(struct trace_seq *s,
struct tm *tm;
struct ras_aer_event ev;
char buf[BUF_LEN];
+ char ipmi_add_sel[105];
+ uint8_t sel_data[5];
+ int seg, bus, dev, fn;
/*
* Newer kernels (3.10-rc1 or upper) provide an uptime clock.
@@ -129,15 +132,19 @@ int ras_aer_event_handler(struct trace_seq *s,
switch (severity_val) {
case HW_EVENT_AER_UNCORRECTED_NON_FATAL:
ev.error_type = "Uncorrected (Non-Fatal)";
+ sel_data[0] = 0xca;
break;
case HW_EVENT_AER_UNCORRECTED_FATAL:
ev.error_type = "Uncorrected (Fatal)";
+ sel_data[0] = 0xca;
break;
case HW_EVENT_AER_CORRECTED:
ev.error_type = "Corrected";
+ sel_data[0] = 0xbf;
break;
default:
ev.error_type = "Unknown severity";
+ sel_data[0] = 0xbf;
}
trace_seq_puts(s, ev.error_type);
@@ -151,5 +158,29 @@ int ras_aer_event_handler(struct trace_seq *s,
ras_report_aer_event(ras, &ev);
#endif
+#ifdef HAVE_AMP_NS_DECODE
+	/*
+	 * Get the PCIe AER error source seg/bus/dev/fn and save it into
+	 * the BMC OEM SEL. "ipmitool raw 0x0a 0x44" is the IPMI Add SEL
+	 * Entry command; see the IPMI specification, chapter 31.6. 0xcd3a
+	 * is the manufacturer ID (Ampere), byte 12 is the sensor number
+	 * (CE is 0xBF, UE is 0xCA), bytes 13~14 are the segment number,
+	 * byte 15 is the bus number, byte 16[7:3] is the device number
+	 * and byte 16[2:0] is the function number.
+	 */
+ sscanf(ev.dev_name, "%x:%x:%x.%x", &seg, &bus, &dev, &fn);
+
+ sel_data[1] = seg & 0xff;
+ sel_data[2] = (seg & 0xff00) >> 8;
+ sel_data[3] = bus;
+ sel_data[4] = (((dev & 0x1f) << 3) | (fn & 0x7));
+
+ sprintf(ipmi_add_sel,
+ "ipmitool raw 0x0a 0x44 0x00 0x00 0xc0 0x00 0x00 0x00 0x00 0x3a 0xcd 0x00 0xc0 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x",
+ sel_data[0], sel_data[1], sel_data[2], sel_data[3], sel_data[4]);
+
+ system(ipmi_add_sel);
+#endif
+
return 0;
}

View File

@ -0,0 +1,26 @@
commit 7ccf12f5ae26a055926d175d908c7930293438c4
Author: DmNosachev <quartz64@gmail.com>
Date: Fri Jul 23 17:28:33 2021 +0300
labels/supermicro: added Supermicro X11SCW
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 990fc9e..aea7c3c 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -133,4 +133,10 @@ Vendor: Supermicro
Model: X11SCA, X11SCA-F
DIMMA1: 0.0.0, 0.1.0; DIMMA2: 0.2.0, 0.3.0;
- DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1;
\ No newline at end of file
+ DIMMB1: 0.0.1, 0.1.1; DIMMB2: 0.2.1, 0.3.1;
+
+ Model: X11SCW-F
+ DIMMA1: 0.1.0;
+ DIMMA2: 0.0.0;
+ DIMMB1: 0.1.1;
+ DIMMB2: 0.0.1;
\ No newline at end of file

View File

@ -0,0 +1,411 @@
commit 932118b04a04104dfac6b8536419803f236e6118
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Mon May 22 22:13:17 2023 +0000
rasdaemon: Add support for post-processing MCA errors
Currently, the rasdaemon performs detailed error decoding of received
MCA errors on the system only while it is running, either as a daemon
or in the foreground.
As such, error decoding cannot be undertaken for any MCA errors received
while the rasdaemon wasn't running. Additionally, if the error decoding
modules like edac_mce_amd have not been loaded either, error records in the
dmesg buffer might correspond to raw values in associated MSRs, compelling
users to undertake decoding manually. The scenario seems more plausible on
AMD systems with Scalable MCA (SMCA) with plans in place to remove SMCA
Extended Error Descriptions from the edac_mce_amd module in an effort to
offload SMCA Error Decoding to the rasdaemon.
As such, add support to post-process and decode MCA Errors received on AMD
SMCA systems from raw MSR values. Support for post-processing and decoding
of MCA Errors received on CPUs of other vendors can be added in the future,
as needed.
Suggested-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
mce-amd-smca.c | 8 ++-
ras-events.h | 1
ras-mce-handler.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++----
ras-mce-handler.h | 4 +
ras-record.h | 10 ++++
rasdaemon.c | 94 +++++++++++++++++++++++++++++++++++++++++++++-
6 files changed, 216 insertions(+), 11 deletions(-)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/mce-amd-smca.c 2023-10-27 12:44:58.549049019 -0400
@@ -710,7 +710,7 @@ static struct smca_bank_name smca_names[
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
-static void amd_decode_errcode(struct mce_event *e)
+void amd_decode_errcode(struct mce_event *e)
{
decode_amd_errcode(e);
@@ -782,7 +782,7 @@ *hwid_mcatype = 0x00010000;
}
/* Decode extended errors according to Scalable MCA specification */
-static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
+void decode_smca_error(struct mce_event *e, struct mce_priv *m)
{
enum smca_bank_types bank_type;
const char *ip_name;
@@ -827,7 +827,9 @@ for (i = 0; i < ARRAY_SIZE(smca_hwid_mca
/* Only print the descriptor of valid extended error code */
if (xec < smca_mce_descs[bank_type].num_descs)
mce_snprintf(e->mcastatus_msg,
- " %s.\n", smca_mce_descs[bank_type].descs[xec]);
+ "%s. Ext Err Code: %d",
+ smca_mce_descs[bank_type].descs[xec],
+ xec);
if (bank_type == SMCA_UMC && xec == 0) {
channel = find_umc_channel(e);
--- rasdaemon-0.6.7.orig/ras-events.h 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-events.h 2023-10-27 12:44:58.549049019 -0400
@@ -100,6 +100,7 @@ enum ghes_severity {
/* Function prototypes */
int toggle_ras_mc_event(int enable);
+int ras_offline_mce_event(struct ras_mc_offline_event *event);
int handle_ras_events(int record_events);
#endif
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.c 2023-10-27 12:45:27.159776011 -0400
@@ -63,10 +63,8 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_SAPPHIRERAPIDS] = "Sapphirerapids server",
};
-static enum cputype select_intel_cputype(struct ras_events *ras)
+static enum cputype select_intel_cputype(struct mce_priv *mce)
{
- struct mce_priv *mce = ras->mce_priv;
-
if (mce->family == 15) {
if (mce->model == 6)
return CPU_TULSA;
@@ -140,9 +138,8 @@ if (mce->model > 0x1a) {
return mce->family == 6 ? CPU_P6OLD : CPU_GENERIC;
}
-static int detect_cpu(struct ras_events *ras)
+static int detect_cpu(struct mce_priv *mce)
{
- struct mce_priv *mce = ras->mce_priv;
FILE *f;
int ret = 0;
char *line = NULL;
@@ -221,7 +218,7 @@ ret = 0;
}
goto ret;
} else if (!strcmp(mce->vendor,"GenuineIntel")) {
- mce->cputype = select_intel_cputype(ras);
+ mce->cputype = select_intel_cputype(mce);
} else {
ret = EINVAL;
}
@@ -246,7 +243,7 @@ int register_mce_handler(struct ras_even
mce = ras->mce_priv;
- rc = detect_cpu(ras);
+ rc = detect_cpu(mce);
if (rc) {
if (mce->processor_flags)
free (mce->processor_flags);
@@ -383,6 +380,105 @@ #if 0
*/
}
+/*
+ * Format one offline-decoded MCE into @s: the current local time first,
+ * then each decoded field of @mce that is non-empty. The bank number is
+ * printed in hex when no bank name was decoded. @priv is accepted but
+ * not used. Always returns 0.
+ */
+static int report_mce_offline(struct trace_seq *s,
+			      struct mce_event *mce,
+			      struct mce_priv *priv)
+{
+	time_t cur = time(NULL);
+	struct tm *local = localtime(&cur);
+
+	/* when localtime() fails the timestamp buffer is printed as it is */
+	if (local != NULL) {
+		strftime(mce->timestamp, sizeof(mce->timestamp),
+			 "%Y-%m-%d %H:%M:%S %z", local);
+	}
+	trace_seq_printf(s, "%s,", mce->timestamp);
+
+	if (mce->bank_name[0] != '\0') {
+		trace_seq_printf(s, " %s,", mce->bank_name);
+	} else {
+		trace_seq_printf(s, " bank=%x,", mce->bank);
+	}
+
+	if (mce->mcastatus_msg[0] != '\0') {
+		trace_seq_printf(s, " mca: %s,", mce->mcastatus_msg);
+	}
+
+	if (mce->mcistatus_msg[0] != '\0') {
+		trace_seq_printf(s, " mci: %s,", mce->mcistatus_msg);
+	}
+
+	if (mce->mc_location[0] != '\0') {
+		trace_seq_printf(s, " Locn: %s,", mce->mc_location);
+	}
+
+	if (mce->error_msg[0] != '\0') {
+		trace_seq_printf(s, " Error Msg: %s\n", mce->error_msg);
+	}
+
+	return 0;
+}
+
+/*
+ * Decode one MCA error from the raw register values in @event and print
+ * the result to stdout.
+ *
+ * With event->smca set, family/model are taken from @event; otherwise
+ * the CPU is detected via detect_cpu(). Only CPU_AMD_SMCA is decoded;
+ * for any other CPU type just the timestamp and bank number are printed.
+ *
+ * Returns 0 on success, ENOMEM when an allocation fails, -EINVAL when
+ * the IPID or STATUS value is zero on SMCA, or the detect_cpu() error.
+ */
+int ras_offline_mce_event(struct ras_mc_offline_event *event)
+{
+	int rc = 0;
+	struct trace_seq s;
+	struct mce_event *mce;
+	struct mce_priv *priv;
+
+	mce = calloc(1, sizeof(*mce));
+	if (!mce) {
+		log(TERM, LOG_ERR, "Can't allocate memory for mce struct\n");
+		/* errno may have been overwritten by log(); report ENOMEM directly */
+		return ENOMEM;
+	}
+
+	priv = calloc(1, sizeof(*priv));
+	if (!priv) {
+		log(TERM, LOG_ERR, "Can't allocate memory for mce_priv struct\n");
+		free(mce);
+		return ENOMEM;
+	}
+
+	if (event->smca) {
+		priv->cputype = CPU_AMD_SMCA;
+		priv->family = event->family;
+		priv->model = event->model;
+	} else {
+		rc = detect_cpu(priv);
+		if (rc) {
+			log(TERM, LOG_ERR, "Failed to detect CPU\n");
+			goto free_mce;
+		}
+	}
+
+	mce->status = event->status;
+	mce->bank = event->bank;
+
+	switch (priv->cputype) {
+	case CPU_AMD_SMCA:
+		mce->synd = event->synd;
+		mce->ipid = event->ipid;
+		/* both registers are needed; name the one that is missing */
+		if (!mce->ipid || !mce->status) {
+			log(TERM, LOG_ERR, "%s MSR required.\n",
+			    mce->ipid ? "Status" : "Ipid");
+			rc = -EINVAL;
+			goto free_mce;
+		}
+		decode_smca_error(mce, priv);
+		amd_decode_errcode(mce);
+		break;
+	default:
+		break;
+	}
+
+	trace_seq_init(&s);
+	report_mce_offline(&s, mce, priv);
+	trace_seq_do_printf(&s);
+	fflush(stdout);
+	trace_seq_destroy(&s);
+
+free_mce:
+	/*
+	 * detect_cpu() may have allocated processor_flags (register_mce_handler()
+	 * frees it the same way); it is NULL from calloc() when unset.
+	 */
+	free(priv->processor_flags);
+	free(priv);
+	free(mce);
+	return rc;
+}
+
int ras_mce_event_handler(struct trace_seq *s,
struct pevent_record *record,
struct event_format *event, void *context)
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2023-10-27 12:44:58.550049010 -0400
@@ -118,6 +118,10 @@ int ras_mce_event_handler(struct trace_s
/* enables intel iMC logs */
int set_intel_imc_log(enum cputype cputype, unsigned ncpus);
+/* Undertake AMD SMCA Error Decoding */
+void decode_smca_error(struct mce_event *e, struct mce_priv *m);
+void amd_decode_errcode(struct mce_event *e);
+
/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);
void core2_decode_model(struct mce_event *e);
--- rasdaemon-0.6.7.orig/ras-record.h 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-record.h 2023-10-27 12:44:58.550049010 -0400
@@ -21,6 +21,7 @@ * Foundation, Inc., 51 Franklin Street,
#define __RAS_RECORD_H
#include <stdint.h>
+#include <stdbool.h>
#include "config.h"
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
@@ -39,6 +40,15 @@ struct ras_mc_event {
const char *driver_detail;
};
+struct ras_mc_offline_event {
+ unsigned int family, model;
+ bool smca;
+ uint8_t bank;
+ uint64_t ipid;
+ uint64_t synd;
+ uint64_t status;
+};
+
struct ras_aer_event {
char timestamp[64];
const char *error_type;
--- rasdaemon-0.6.7.orig/rasdaemon.c 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/rasdaemon.c 2023-10-27 12:44:58.550049010 -0400
@@ -41,8 +41,21 @@ struct arguments {
int record_events;
int enable_ras;
int foreground;
+ int offline;
};
+enum OFFLINE_ARG_KEYS {
+ SMCA = 0x100,
+ MODEL,
+ FAMILY,
+ BANK_NUM,
+ IPID_REG,
+ STATUS_REG,
+ SYNDROME_REG
+};
+
+struct ras_mc_offline_event event;
+
static error_t parse_opt(int k, char *arg, struct argp_state *state)
{
struct arguments *args = state->input;
@@ -62,18 +75,84 @@ static error_t parse_opt(int k, char *ar
case 'f':
args->foreground++;
break;
+#ifdef HAVE_MCE
+ case 'p':
+ if (state->argc < 4)
+ argp_state_help(state, stdout, ARGP_HELP_LONG | ARGP_HELP_EXIT_ERR);
+ args->offline++;
+ break;
+#endif
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}
+#ifdef HAVE_MCE
+/*
+ * argp parser for the post-processing (offline) options.
+ *
+ * These options are declared without an argp argument, so every key
+ * except SMCA takes its value from the argv word that follows the
+ * option (state->argv[state->next]); @arg is not used.
+ *
+ * Returns 0 when the key was handled, EINVAL when the value is missing,
+ * and ARGP_ERR_UNKNOWN for keys that belong to another parser.
+ */
+static error_t parse_opt_offline(int key, char *arg,
+				 struct argp_state *state)
+{
+	const char *value;
+
+	switch (key) {
+	case SMCA:
+		event.smca = true;
+		return 0;
+	case MODEL:
+	case FAMILY:
+	case BANK_NUM:
+	case IPID_REG:
+	case STATUS_REG:
+	case SYNDROME_REG:
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	/*
+	 * argv[argc] is NULL: without this check an option given as the
+	 * last word on the command line would hand NULL to strtoul()/atoi().
+	 */
+	if (state->next >= state->argc) {
+		argp_error(state, "%s requires a value",
+			   state->argv[state->next - 1]);
+		return EINVAL;
+	}
+	value = state->argv[state->next];
+
+	switch (key) {
+	case MODEL:
+		event.model = strtoul(value, NULL, 0);
+		break;
+	case FAMILY:
+		event.family = strtoul(value, NULL, 0);
+		break;
+	case BANK_NUM:
+		event.bank = atoi(value);
+		break;
+	case IPID_REG:
+		event.ipid = strtoull(value, NULL, 0);
+		break;
+	case STATUS_REG:
+		event.status = strtoull(value, NULL, 0);
+		break;
+	case SYNDROME_REG:
+		event.synd = strtoull(value, NULL, 0);
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+#endif
+
long user_hz;
int main(int argc, char *argv[])
{
struct arguments args;
int idx = -1;
+
+#ifdef HAVE_MCE
+ const struct argp_option offline_options[] = {
+ {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
+ {"model", MODEL, 0, 0, "CPU Model"},
+ {"family", FAMILY, 0, 0, "CPU Family"},
+ {"bank", BANK_NUM, 0, 0, "Bank Number"},
+ {"ipid", IPID_REG, 0, 0, "IPID Register (for SMCA systems only)"},
+ {"status", STATUS_REG, 0, 0, "Status Register"},
+ {"synd", SYNDROME_REG, 0, 0, "Syndrome Register"},
+ {0, 0, 0, 0, 0, 0},
+ };
+
+ struct argp offline_argp = {
+ .options = offline_options,
+ .parser = parse_opt_offline,
+ .doc = TOOL_DESCRIPTION,
+ .args_doc = ARGS_DOC,
+ };
+
+ struct argp_child offline_parser[] = {
+ {&offline_argp, 0, "Post-Processing Options:", 0},
+ {0, 0, 0, 0},
+ };
+#endif
+
const struct argp_option options[] = {
{"enable", 'e', 0, 0, "enable RAS events and exit", 0},
{"disable", 'd', 0, 0, "disable RAS events and exit", 0},
@@ -81,6 +160,10 @@ {"disable", 'd', 0, 0, "disable RAS even
{"record", 'r', 0, 0, "record events via sqlite3", 0},
#endif
{"foreground", 'f', 0, 0, "run foreground, not daemonize"},
+#ifdef HAVE_MCE
+ {"post-processing", 'p', 0, 0,
+ "Post-processing MCE's with raw register values"},
+#endif
{ 0, 0, 0, 0, 0, 0 }
};
@@ -89,7 +172,9 @@ { 0, 0, 0, 0, 0, 0 }
.parser = parse_opt,
.doc = TOOL_DESCRIPTION,
.args_doc = ARGS_DOC,
-
+#ifdef HAVE_MCE
+ .children = offline_parser,
+#endif
};
memset (&args, 0, sizeof(args));
@@ -111,6 +196,13 @@ enable = (args.enable_ras > 0) ? 1 : 0;
return 0;
}
+#ifdef HAVE_MCE
+ if (args.offline) {
+ ras_offline_mce_event(&event);
+ return 0;
+ }
+#endif
+
openlog(TOOL_NAME, 0, LOG_DAEMON);
if (!args.foreground)
if (daemon(0,0))

View File

@ -0,0 +1,51 @@
commit 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b
Author: Muralidhara M K <muralimk@amd.com>
Date: Tue Jul 27 06:36:45 2021 -0500
rasdaemon: ras-mc-ctl: Fix script to parse dimm sizes
Removes trailing spaces at the end of a line from
file location and fixes --layout option to parse dimm nodes
to get the size of each dimm from ras-mc-ctl.
Issue is reported https://github.com/mchehab/rasdaemon/issues/43
Where '> ras-mc-ctl --layout' reports all 0s
With this change the layout option prints the correct dimm sizes
> sudo ras-mc-ctl --layout
+-----------------------------------------------+
| mc0 |
| csrow0 | csrow1 | csrow2 | csrow3 |
----------+-----------------------------------------------+
...
channel7: | 16384 MB | 0 MB | 0 MB | 0 MB |
channel6: | 16384 MB | 0 MB | 0 MB | 0 MB |
...
----------+-----------------------------------------------+
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
Cc: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Link: https://lkml.kernel.org/r/20210810183855.129076-1-nchatrad@amd.com/
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 1e3aeb7..b22dd60 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -246,6 +246,7 @@ sub parse_dimm_nodes
if (($file =~ /max_location$/)) {
open IN, $file;
my $location = <IN>;
+ $location =~ s/\s+$//;
close IN;
my @temp = split(/ /, $location);
@@ -288,6 +289,7 @@ sub parse_dimm_nodes
open IN, $file;
my $location = <IN>;
+ $location =~ s/\s+$//;
close IN;
my @pos;

View File

@ -0,0 +1,40 @@
commit 9a5baed97b21af31064d9995ffcfaac0e9d7983e
Author: DmNosachev <quartz64@gmail.com>
Date: Tue Jun 29 13:37:48 2021 +0300
labels/supermicro: supermicro db syntax
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index bfaed93..47ea05f 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -18,17 +18,17 @@ Vendor: Supermicro
DIMMA1: 0.0.0; DIMMA2: 0.0.1;
DIMMB1: 0.1.0; DIMMB2: 0.1.1;
- Product: X10SRA-F
- DIMMA1: 0.0.0
- DIMMA2: 0.0.1
- DIMMB1: 0.1.0
- DIMMB2: 0.1.1
- DIMMC1: 1.0.0
- DIMMC2: 1.0.1
- DIMMD1: 1.1.0
- DIMMD2: 1.1.1
+ Model: X10SRA-F
+ DIMMA1: 0.0.0;
+ DIMMA2: 0.0.1;
+ DIMMB1: 0.1.0;
+ DIMMB2: 0.1.1;
+ DIMMC1: 1.0.0;
+ DIMMC2: 1.0.1;
+ DIMMD1: 1.1.0;
+ DIMMD2: 1.1.1;
- Product: H8DGU
+ Model: H8DGU
P1_DIMM1A: 0.2.0;
P1_DIMM1A: 0.3.0;
P2_DIMM1A: 3.2.0;

View File

@ -1,85 +0,0 @@
commit 0862a096c3a1d0f993703ab3299f1ddfadf53d7f
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Tue Aug 11 13:31:46 2020 +0100
rasdaemon: ras-mc-ctl: Add ARM processor error information
Add supporting ARM processor error in the ras-mc-ctl tool.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
util/ras-mc-ctl.in | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
--- rasdaemon-0.6.1.orig/util/ras-mc-ctl.in 2021-10-06 14:14:25.000440090 -0400
+++ rasdaemon-0.6.1/util/ras-mc-ctl.in 2021-10-06 14:15:59.995598590 -0400
@@ -1124,6 +1124,7 @@ sub summary
my ($query, $query_handle, $out);
my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg);
my ($etype, $severity, $etype_string, $severity_string);
+ my ($affinity, $mpidr);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1159,6 +1160,22 @@ sub summary
}
$query_handle->finish;
+ # ARM processor arm_event errors
+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($affinity, $mpidr, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count errors\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events summary:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
+
# extlog errors
$query = "select etype, severity, count(*) from extlog_event group by etype, severity";
$query_handle = $dbh->prepare($query);
@@ -1202,6 +1219,7 @@ sub errors
my ($query, $query_handle, $id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
+ my ($error_count, $affinity, $mpidr, $r_state, $psci_state);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1241,6 +1259,28 @@ sub errors
}
$query_handle->finish;
+ # ARM processor arm_event errors
+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "error_count=$error_count, " if ($error_count);
+ $out .= "affinity_level=$affinity, ";
+ $out .= sprintf "mpidr=0x%x, ", $mpidr;
+ $out .= sprintf "running_state=0x%x, ", $r_state;
+ $out .= sprintf "psci_state=0x%x", $psci_state;
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
+
# Extlog errors
$query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
$query_handle = $dbh->prepare($query);

View File

@ -1,32 +0,0 @@
commit 16d929b024c31d54a7f8a72eab094376c7be27f5
Author: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed May 26 10:20:39 2021 +0200
Makefile.am: fix build header rules
non-standard-hisilicon.h was added twice;
ras-memory-failure-handler.h is missing.
Due to that, the tarball becomes incomplete, causing build
errors.
While here, also adjust .travis.yml to use --enable-all.
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
Makefile.am | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/Makefile.am 2021-10-13 13:27:53.402685179 -0400
+++ b/Makefile.am 2021-10-13 13:28:11.664525173 -0400
@@ -54,7 +54,8 @@ rasdaemon_LDADD = -lpthread $(SQLITE3_LI
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
- ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h
+ ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
+ ras-memory-failure-handler.h
# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that

View File

@ -1,538 +0,0 @@
commit 2290d65b97311dd5736838f1e285355f7f357046
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Mar 8 16:57:26 2021 +0000
rasdaemon: add support for memory_failure events
Add support to log the memory_failure kernel trace
events.
Example rasdaemon log and SQLite DB output for the
memory_failure event,
=================================================
rasdaemon: memory_failure_event store: 0x126ce8f8
rasdaemon: register inserted at db
<...>-785 [000] 0.000024: memory_failure_event: 2020-10-02 13:27:13 -0400 pfn=0x204000000 page_type=free buddy page action_result=Delayed
CREATE TABLE memory_failure_event (id INTEGER PRIMARY KEY, timestamp TEXT, pfn TEXT, page_type TEXT, action_result TEXT);
INSERT INTO memory_failure_event VALUES(1,'2020-10-02 13:27:13 -0400','0x204000000','free buddy page','Delayed');
==================================================
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
Makefile.am | 4
ras-events.c | 15 +++
ras-memory-failure-handler.c | 179 +++++++++++++++++++++++++++++++++++++++++++
ras-memory-failure-handler.h | 25 ++++++
ras-record.c | 56 +++++++++++++
ras-record.h | 13 +++
ras-report.c | 68 ++++++++++++++++
ras-report.h | 5 -
8 files changed, 364 insertions(+), 1 deletion(-)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/ras-memory-failure-handler.c 2021-10-14 16:31:36.840657728 -0400
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libtrace/kbuffer.h"
+#include "ras-memory-failure-handler.h"
+#include "ras-record.h"
+#include "ras-logger.h"
+#include "ras-report.h"
+
+/* Memory failure - various types of pages */
+enum mf_action_page_type { /* values looked up by get_page_type() for the "type" field of the trace record */
+ MF_MSG_KERNEL, /* NOTE(review): numbering presumably has to mirror the kernel's enum of the same name - confirm */
+ MF_MSG_KERNEL_HIGH_ORDER,
+ MF_MSG_SLAB,
+ MF_MSG_DIFFERENT_COMPOUND,
+ MF_MSG_POISONED_HUGE,
+ MF_MSG_HUGE,
+ MF_MSG_FREE_HUGE,
+ MF_MSG_NON_PMD_HUGE,
+ MF_MSG_UNMAP_FAILED,
+ MF_MSG_DIRTY_SWAPCACHE,
+ MF_MSG_CLEAN_SWAPCACHE,
+ MF_MSG_DIRTY_MLOCKED_LRU,
+ MF_MSG_CLEAN_MLOCKED_LRU,
+ MF_MSG_DIRTY_UNEVICTABLE_LRU,
+ MF_MSG_CLEAN_UNEVICTABLE_LRU,
+ MF_MSG_DIRTY_LRU,
+ MF_MSG_CLEAN_LRU,
+ MF_MSG_TRUNCATED_LRU,
+ MF_MSG_BUDDY,
+ MF_MSG_BUDDY_2ND,
+ MF_MSG_DAX,
+ MF_MSG_UNSPLIT_THP,
+ MF_MSG_UNKNOWN,
+};
+
+/* Action results for various types of pages */
+enum mf_action_result { /* values looked up by get_action_result() for the "result" field of the trace record */
+ MF_IGNORED, /* Error: cannot be handled */
+ MF_FAILED, /* Error: handling failed */
+ MF_DELAYED, /* Will be handled later */
+ MF_RECOVERED, /* Successfully recovered */
+};
+
+/* memory failure page types */
+static const struct { /* page type value -> printable name; searched linearly by get_page_type() */
+ int type; /* an enum mf_action_page_type value */
+ const char *page_type; /* text printed as page_type= and stored in the event */
+} mf_page_type[] = {
+ { MF_MSG_KERNEL, "reserved kernel page" },
+ { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"},
+ { MF_MSG_SLAB, "kernel slab page"},
+ { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"},
+ { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"},
+ { MF_MSG_HUGE, "huge page"},
+ { MF_MSG_FREE_HUGE, "free huge page"},
+ { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"},
+ { MF_MSG_UNMAP_FAILED, "unmapping failed page"},
+ { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"},
+ { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"},
+ { MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page"},
+ { MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page"},
+ { MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page"},
+ { MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page"},
+ { MF_MSG_DIRTY_LRU, "dirty LRU page"},
+ { MF_MSG_CLEAN_LRU, "clean LRU page"},
+ { MF_MSG_TRUNCATED_LRU, "already truncated LRU page"},
+ { MF_MSG_BUDDY, "free buddy page"},
+ { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"},
+ { MF_MSG_DAX, "dax page"},
+ { MF_MSG_UNSPLIT_THP, "unsplit thp"},
+ { MF_MSG_UNKNOWN, "unknown page"},
+};
+
+/* memory failure action results */
+static const struct { /* action result value -> printable name; searched linearly by get_action_result() */
+ int result; /* an enum mf_action_result value */
+ const char *action_result; /* text printed as action_result= and stored in the event */
+} mf_action_result[] = {
+ { MF_IGNORED, "Ignored" },
+ { MF_FAILED, "Failed" },
+ { MF_DELAYED, "Delayed" },
+ { MF_RECOVERED, "Recovered" },
+};
+
+/*
+ * Translate a memory-failure page type value into its printable name.
+ *
+ * Walks mf_page_type[] from the start and returns the name of the first
+ * entry whose type equals @page_type, or "unknown page" when none matches.
+ */
+static const char *get_page_type(int page_type)
+{
+	size_t idx = 0;
+
+	while (idx < ARRAY_SIZE(mf_page_type)) {
+		if (mf_page_type[idx].type == page_type)
+			return mf_page_type[idx].page_type;
+		idx++;
+	}
+
+	return "unknown page";
+}
+
+/*
+ * Translate a memory-failure action result value into its printable name.
+ *
+ * Walks mf_action_result[] from the start and returns the name of the first
+ * entry whose result equals @result, or "unknown" when none matches.
+ */
+static const char *get_action_result(int result)
+{
+	size_t idx = 0;
+
+	while (idx < ARRAY_SIZE(mf_action_result)) {
+		if (mf_action_result[idx].result == result)
+			return mf_action_result[idx].action_result;
+		idx++;
+	}
+
+	return "unknown";
+}
+
+
+/*
+ * Trace handler for the ras:memory_failure_event event.
+ *
+ * Appends the event timestamp and the decoded "pfn", "type" and "result"
+ * fields of the record to the trace_seq, then hands the event to the SQLite
+ * store and to the ABRT reporter when those are compiled in.
+ *
+ * @s:       output sequence the human-readable line is appended to
+ * @record:  raw trace record; record->ts supplies the event time
+ * @event:   event format used to look the fields up
+ * @context: the struct ras_events instance registered with the handler
+ *
+ * Returns 0 on success, -1 if a field cannot be read from the record.
+ */
+int ras_memory_failure_event_handler(struct trace_seq *s,
+				     struct pevent_record *record,
+				     struct event_format *event, void *context)
+{
+	unsigned long long val;
+	struct ras_events *ras = context;
+	time_t now;
+	struct tm *tm;
+	struct ras_mf_event ev;
+
+	/*
+	 * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+	 * On previous kernels, the way to properly generate an event would
+	 * be to inject a fake one, measure its timestamp and diff it against
+	 * gettimeofday. We won't do it here. Instead, let's use uptime,
+	 * falling-back to the event report's time, if "uptime" clock is
+	 * not available (legacy kernels).
+	 */
+	if (ras->use_uptime)
+		now = record->ts/user_hz + ras->uptime_diff;
+	else
+		now = time(NULL);
+
+	/*
+	 * localtime() can fail, and strftime() returns 0 when the result
+	 * does not fit. Either way ev.timestamp would be printed and stored
+	 * uninitialised, so fall back to a fixed, well-formed timestamp.
+	 */
+	tm = localtime(&now);
+	if (!tm || !strftime(ev.timestamp, sizeof(ev.timestamp),
+			     "%Y-%m-%d %H:%M:%S %z", tm))
+		snprintf(ev.timestamp, sizeof(ev.timestamp), "%s",
+			 "1970-01-01 00:00:00 +0000");
+	trace_seq_printf(s, "%s ", ev.timestamp);
+
+	if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0)
+		return -1;
+	/* Bounded write: ev.pfn is a fixed-size array. */
+	snprintf(ev.pfn, sizeof(ev.pfn), "0x%llx", val);
+	trace_seq_printf(s, "pfn=0x%llx ", val);
+
+	if (pevent_get_field_val(s, event, "type", record, &val, 1) < 0)
+		return -1;
+	ev.page_type = get_page_type(val);
+	trace_seq_printf(s, "page_type=%s ", ev.page_type);
+
+	if (pevent_get_field_val(s, event, "result", record, &val, 1) < 0)
+		return -1;
+	ev.action_result = get_action_result(val);
+	trace_seq_printf(s, "action_result=%s ", ev.action_result);
+
+	/* Store data into the SQLite DB */
+#ifdef HAVE_SQLITE3
+	ras_store_mf_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+	/* Report event to ABRT */
+	ras_report_mf_event(ras, &ev);
+#endif
+
+	return 0;
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/ras-memory-failure-handler.h 2021-10-14 16:31:36.840657728 -0400
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+*/
+
+#ifndef __RAS_MEMORY_FAILURE_HANDLER_H
+#define __RAS_MEMORY_FAILURE_HANDLER_H
+
+#include "ras-events.h"
+#include "libtrace/event-parse.h"
+
+int ras_memory_failure_event_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context); /* trace handler for ras:memory_failure_event; context is the struct ras_events; returns 0, or -1 if a field cannot be read */
+
+#endif
--- a/ras-record.c 2018-04-25 06:19:03.000000000 -0400
+++ b/ras-record.c 2021-10-14 16:31:36.840657728 -0400
@@ -404,6 +404,55 @@ sqlite3_bind_text(priv->stmt_mce_record,
}
#endif
+/*
+ * Table and functions to handle ras:memory_failure
+ */
+
+#ifdef HAVE_MEMORY_FAILURE
+static const struct db_fields mf_event_fields[] = { /* columns of the memory_failure_event table */
+ { .name="id", .type="INTEGER PRIMARY KEY" },
+ { .name="timestamp", .type="TEXT" },
+ { .name="pfn", .type="TEXT" },
+ { .name="page_type", .type="TEXT" },
+ { .name="action_result", .type="TEXT" },
+};
+
+static const struct db_table_descriptor mf_event_tab = { /* passed to ras_mc_create_table() and ras_mc_prepare_stmt() when the database is opened */
+ .name = "memory_failure_event",
+ .fields = mf_event_fields,
+ .num_fields = ARRAY_SIZE(mf_event_fields),
+};
+
+/*
+ * Insert one memory_failure event into the SQLite database.
+ *
+ * Binds timestamp, pfn, page_type and action_result to parameters 1-4 of
+ * the prepared statement, steps it once and resets it for reuse. Step and
+ * reset failures are logged but not treated as fatal.
+ *
+ * Returns 0 when there is no database or no prepared statement, otherwise
+ * the result of sqlite3_reset().
+ */
+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
+{
+	struct sqlite3_priv *db = ras->db_priv;
+	sqlite3_stmt *stmt;
+	int status;
+
+	if (!db)
+		return 0;
+
+	stmt = db->stmt_mf_event;
+	if (!stmt)
+		return 0;
+
+	log(TERM, LOG_INFO, "memory_failure_event store: %p\n", stmt);
+
+	sqlite3_bind_text(stmt, 1, ev->timestamp, -1, NULL);
+	sqlite3_bind_text(stmt, 2, ev->pfn, -1, NULL);
+	sqlite3_bind_text(stmt, 3, ev->page_type, -1, NULL);
+	sqlite3_bind_text(stmt, 4, ev->action_result, -1, NULL);
+
+	status = sqlite3_step(stmt);
+	if (!(status == SQLITE_OK || status == SQLITE_DONE))
+		log(TERM, LOG_ERR,
+		    "Failed to do memory_failure_event step on sqlite: error = %d\n", status);
+
+	status = sqlite3_reset(stmt);
+	if (!(status == SQLITE_OK || status == SQLITE_DONE))
+		log(TERM, LOG_ERR,
+		    "Failed reset memory_failure_event on sqlite: error = %d\n",
+		    status);
+
+	log(TERM, LOG_INFO, "register inserted at db\n");
+
+	return status;
+}
+#endif
/*
* Generic code
@@ -567,6 +616,13 @@ usleep(10000);
rc = ras_mc_prepare_stmt(priv, &priv->stmt_arm_record,
&arm_event_tab);
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc = ras_mc_create_table(priv, &mf_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mf_event,
+ &mf_event_tab);
+ }
+#endif
ras->db_priv = priv;
return 0;
--- a/ras-record.h 2018-04-25 06:19:03.000000000 -0400
+++ b/ras-record.h 2021-10-14 16:31:36.840657728 -0400
@@ -75,12 +75,20 @@ struct ras_arm_event {
int32_t psci_state;
};
+struct ras_mf_event { /* one decoded memory_failure trace event */
+ char timestamp[64]; /* event time formatted as "%Y-%m-%d %H:%M:%S %z" */
+ char pfn[30]; /* page frame number formatted as "0x%llx" */
+ const char *page_type; /* name returned by get_page_type() */
+ const char *action_result; /* name returned by get_action_result() */
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
struct ras_non_standard_event;
struct ras_arm_event;
struct mce_event;
+struct ras_mf_event;
#ifdef HAVE_SQLITE3
@@ -104,6 +112,9 @@ struct sqlite3_priv {
#ifdef HAVE_ARM
sqlite3_stmt *stmt_arm_record;
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ sqlite3_stmt *stmt_mf_event;
+#endif
};
int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras);
@@ -113,6 +124,7 @@ int ras_store_mce_record(struct ras_even
int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev);
int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev);
int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev);
+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -122,6 +134,7 @@ static inline int ras_store_mce_record(s
static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; };
static inline int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
+static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
#endif
--- a/ras-report.c 2017-10-14 05:11:34.000000000 -0400
+++ b/ras-report.c 2021-10-14 16:31:36.840657728 -0400
@@ -255,6 +255,28 @@ "midr=0x%lx\n" \
return 0;
}
+/*
+ * Append the BACKTRACE item describing a memory_failure event to @buf.
+ *
+ * @buf: NUL-terminated destination the item is appended to.
+ *       NOTE(review): assumed to have MAX_BACKTRACE_SIZE bytes of room, as
+ *       the zeroed array in commit_report_backtrace() does - confirm for
+ *       any other caller.
+ * @ev:  event whose fields are formatted
+ *
+ * Returns 0 on success, -1 if either argument is NULL.
+ */
+static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+
+	if (!buf || !ev)
+		return -1;
+
+	/* Bounded formatting so the stack buffer can never be overrun. */
+	snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+		 "timestamp=%s\n"
+		 "pfn=%s\n"
+		 "page_type=%s\n"
+		 "action_result=%s\n",
+		 ev->timestamp,
+		 ev->pfn,
+		 ev->page_type,
+		 ev->action_result);
+
+	strcat(buf, bt_buf);
+
+	return 0;
+}
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -283,6 +305,9 @@ memset(buf, 0, MAX_BACKTRACE_SIZE);
case ARM_EVENT:
rc = set_arm_event_backtrace(buf, (struct ras_arm_event *)ev);
break;
+ case MF_EVENT:
+ rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev);
+ break;
default:
return -1;
}
@@ -549,3 +574,46 @@ return 0;
return -1;
}
}
+
+/*
+ * Report a memory_failure event over the report socket.
+ *
+ * Sends the basic report, the event backtrace and then the ANALYZER and
+ * REASON items, each written including its terminating NUL.
+ *
+ * @ras: not used by this function
+ * @ev:  event to report
+ *
+ * Returns 0 when every item was written completely, -1 otherwise.
+ */
+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	size_t len;
+	ssize_t written;
+	int sockfd;
+	int rc = -1;
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	if (commit_report_basic(sockfd) < 0)
+		goto mf_fail;
+
+	if (commit_report_backtrace(sockfd, MF_EVENT, ev) < 0)
+		goto mf_fail;
+
+	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-memory_failure");
+	len = strlen(buf) + 1;
+	written = write(sockfd, buf, len);
+	/*
+	 * Test the sign before comparing with the unsigned length: a bare
+	 * "rc < strlen(buf) + 1" converts -1 to a huge unsigned value and
+	 * lets a failed write() pass as success.
+	 */
+	if (written < 0 || (size_t)written < len)
+		goto mf_fail;
+
+	snprintf(buf, sizeof(buf), "REASON=%s", "memory failure problem");
+	len = strlen(buf) + 1;
+	written = write(sockfd, buf, len);
+	if (written < 0 || (size_t)written < len)
+		goto mf_fail;
+
+	rc = 0;
+
+mf_fail:
+	/* sockfd is >= 0 here, and 0 is a valid descriptor that must be closed. */
+	close(sockfd);
+
+	return rc;
+}
--- a/ras-report.h 2017-10-14 05:11:34.000000000 -0400
+++ b/ras-report.h 2021-10-14 16:31:36.840657728 -0400
@@ -34,7 +34,8 @@ enum {
MCE_EVENT,
AER_EVENT,
NON_STANDARD_EVENT,
- ARM_EVENT
+ ARM_EVENT,
+ MF_EVENT,
};
#ifdef HAVE_ABRT_REPORT
@@ -44,6 +45,7 @@ int ras_report_aer_event(struct ras_even
int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev);
int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev);
int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev);
+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
#else
@@ -52,6 +54,7 @@ static inline int ras_report_aer_event(s
static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; };
static inline int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
+static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
#endif
--- a/Makefile.am 2018-04-25 06:21:56.000000000 -0400
+++ b/Makefile.am 2021-10-14 16:37:42.423639762 -0400
@@ -41,12 +41,16 @@ endif
if WITH_EXTLOG
rasdaemon_SOURCES += ras-extlog-handler.c
endif
+if WITH_MEMORY_FAILURE
+ rasdaemon_SOURCES += ras-memory-failure-handler.c
+endif
if WITH_ABRT_REPORT
rasdaemon_SOURCES += ras-report.c
endif
if WITH_HISI_NS_DECODE
rasdaemon_SOURCES += non-standard-hisi_hip07.c
endif
+
rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
--- a/ras-events.c 2021-10-14 16:31:36.730658636 -0400
+++ b/ras-events.c 2021-10-14 16:37:11.043898809 -0400
@@ -33,6 +33,7 @@ * Foundation, Inc., 51 Franklin Street,
#include "ras-arm-handler.h"
#include "ras-mce-handler.h"
#include "ras-extlog-handler.h"
+#include "ras-memory-failure-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
@@ -218,6 +219,10 @@ if (rc < 0) {
rc |= __toggle_ras_mc_event(ras, "ras", "arm_event", enable);
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable);
+#endif
+
free_ras:
free(ras);
return rc;
@@ -736,6 +741,16 @@ (void)open("/sys/kernel/debug/ras/daemon
"ras", "aer_event");
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc = add_event_handler(ras, pevent, page_size, "ras", "memory_failure_event",
+ ras_memory_failure_event_handler);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "ras", "memory_failure_event");
+#endif
+
if (!num_events) {
log(ALL, LOG_INFO,
"Failed to trace all supported RAS events. Aborting.\n");

View File

@ -1,66 +0,0 @@
commit 2a1d217660351c08eb2f8bccebf939abba2f7e69
Author: Brian WoodsGhannam, Yazen <brian.woods@amd.comYazen.Ghannam@amd.com>
Date: Fri Nov 1 15:48:13 2019 +0100
rasdaemon: rename CPU_NAPLES cputype
Change CPU_NAPLES to CPU_AMD_SMCA to reflect that it isn't just NAPLES
that is supported, but AMD's Scalable Machine Check Architecture (SMCA).
[ Yazen: change family check to feature check, and change CPU name. ]
CC: "mchehab+samsung@kernel.org" <mchehab+samsung@kernel.org>, "Namburu, Chandu-babu" <chandu@amd.com> # Thread-Topic: [PATCH 1/2] rasdaemon: rename CPU_NAPLES cputype
Signed-off-by: Brian Woods <brian.woods@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Cc: Chandu-babu Namburu <chandu@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
ras-mce-handler.c | 10 ++++++----
ras-mce-handler.h | 2 +-
2 files changed, 7 insertions(+), 5 deletions(-)
--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-05-26 15:16:24.699096556 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-05-26 15:18:06.543162745 -0400
@@ -55,7 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_KNIGHTS_LANDING] = "Knights Landing",
[CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_SKYLAKE_XEON] = "Skylake server",
- [CPU_NAPLES] = "AMD Family 17h Zen1"
+ [CPU_AMD_SMCA] = "AMD Scalable MCA",
};
static enum cputype select_intel_cputype(struct ras_events *ras)
@@ -191,8 +191,10 @@ ret = 0;
if (!strcmp(mce->vendor, "AuthenticAMD")) {
if (mce->family == 15)
mce->cputype = CPU_K8;
- if (mce->family == 23)
- mce->cputype = CPU_NAPLES;
+ if (strstr(mce->processor_flags, "smca")) {
+ mce->cputype = CPU_AMD_SMCA;
+ goto ret;
+ }
if (mce->family > 23) {
log(ALL, LOG_INFO,
"Can't parse MCE for this AMD CPU yet %d\n",
@@ -435,7 +437,7 @@ if (pevent_get_field_val(s, event, "ipid
case CPU_K8:
rc = parse_amd_k8_event(ras, &e);
break;
- case CPU_NAPLES:
+ case CPU_AMD_SMCA:
rc = parse_amd_smca_event(ras, &e);
break;
default: /* All other CPU types are Intel */
--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-05-26 15:17:15.409631590 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-05-26 15:18:20.102038424 -0400
@@ -50,7 +50,7 @@ enum cputype {
CPU_KNIGHTS_LANDING,
CPU_KNIGHTS_MILL,
CPU_SKYLAKE_XEON,
- CPU_NAPLES,
+ CPU_AMD_SMCA,
};
struct mce_event {

View File

@ -1,372 +0,0 @@
commit 546cf713f667437fb6e283cc3dc090679eb47d08
Author: Subhendu Saha <subhends@akamai.com>
Date: Tue Jan 12 03:29:55 2021 -0500
Fix ras-mc-ctl script.
When rasdaemon is compiled without enabling aer, mce, devlink,
etc., those tables are not created in the database file. Then
ras-mc-ctl script breaks trying to query data from non-existent
tables.
Signed-off-by: Subhendu Saha subhends@akamai.com
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
util/ras-mc-ctl.in | 310 ++++++++++++++++++++++++++++-------------------------
1 file changed, 168 insertions(+), 142 deletions(-)
--- a/util/ras-mc-ctl.in 2021-10-12 13:45:43.260646935 -0400
+++ b/util/ras-mc-ctl.in 2021-10-12 13:46:38.610158949 -0400
@@ -41,6 +41,16 @@ my $sysconfdir = "@sysconfdir@";
my $dmidecode = find_prog ("dmidecode");
my $modprobe = find_prog ("modprobe") or exit (1);
+my $has_aer = 0;
+my $has_arm = 0;
+my $has_extlog = 0;
+my $has_mce = 0;
+
+@WITH_AER_TRUE@$has_aer = 1;
+@WITH_ARM_TRUE@$has_arm = 1;
+@WITH_EXTLOG_TRUE@$has_extlog = 1;
+@WITH_MCE_TRUE@$has_mce = 1;
+
my %conf = ();
my %bus = ();
my %dimm_size = ();
@@ -1145,70 +1155,78 @@ sub summary
$query_handle->finish;
# PCIe AER aer_event errors
- $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($err_type, $msg, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$count $err_type errors: $msg\n";
- }
- if ($out ne "") {
- print "PCIe AER events summary:\n$out\n";
- } else {
- print "No PCIe AER errors.\n\n";
+ if ($has_aer == 1) {
+ $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($err_type, $msg, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count $err_type errors: $msg\n";
+ }
+ if ($out ne "") {
+ print "PCIe AER events summary:\n$out\n";
+ } else {
+ print "No PCIe AER errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# ARM processor arm_event errors
- $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($affinity, $mpidr, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$count errors\n";
- }
- if ($out ne "") {
- print "ARM processor events summary:\n$out\n";
- } else {
- print "No ARM processor errors.\n\n";
+ if ($has_arm == 1) {
+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($affinity, $mpidr, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count errors\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events summary:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# extlog errors
- $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($etype, $severity, $count));
- $out = "";
- while($query_handle->fetch()) {
- $etype_string = get_extlog_type($etype);
- $severity_string = get_extlog_severity($severity);
- $out .= "\t$count $etype_string $severity_string errors\n";
- }
- if ($out ne "") {
- print "Extlog records summary:\n$out";
- } else {
- print "No Extlog errors.\n";
+ if ($has_extlog == 1) {
+ $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($etype, $severity, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $etype_string = get_extlog_type($etype);
+ $severity_string = get_extlog_severity($severity);
+ $out .= "\t$count $etype_string $severity_string errors\n";
+ }
+ if ($out ne "") {
+ print "Extlog records summary:\n$out";
+ } else {
+ print "No Extlog errors.\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# MCE mce_record errors
- $query = "select error_msg, count(*) from mce_record group by error_msg";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($msg, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$count $msg errors\n";
- }
- if ($out ne "") {
- print "MCE records summary:\n$out";
- } else {
- print "No MCE errors.\n";
+ if ($has_mce == 1) {
+ $query = "select error_msg, count(*) from mce_record group by error_msg";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($msg, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count $msg errors\n";
+ }
+ if ($out ne "") {
+ print "MCE records summary:\n$out";
+ } else {
+ print "No MCE errors.\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
undef($dbh);
}
@@ -1244,105 +1262,113 @@ sub errors
$query_handle->finish;
# PCIe AER aer_event errors
- $query = "select id, timestamp, err_type, err_msg from aer_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $type, $msg));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $time $type error: $msg\n";
- }
- if ($out ne "") {
- print "PCIe AER events:\n$out\n";
- } else {
- print "No PCIe AER errors.\n\n";
+ if ($has_aer == 1) {
+ $query = "select id, timestamp, err_type, err_msg from aer_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $time, $type, $msg));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $time $type error: $msg\n";
+ }
+ if ($out ne "") {
+ print "PCIe AER events:\n$out\n";
+ } else {
+ print "No PCIe AER errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# ARM processor arm_event errors
- $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $timestamp error: ";
- $out .= "error_count=$error_count, " if ($error_count);
- $out .= "affinity_level=$affinity, ";
- $out .= sprintf "mpidr=0x%x, ", $mpidr;
- $out .= sprintf "running_state=0x%x, ", $r_state;
- $out .= sprintf "psci_state=0x%x", $psci_state;
- $out .= "\n";
- }
- if ($out ne "") {
- print "ARM processor events:\n$out\n";
- } else {
- print "No ARM processor errors.\n\n";
+ if ($has_arm == 1) {
+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "error_count=$error_count, " if ($error_count);
+ $out .= "affinity_level=$affinity, ";
+ $out .= sprintf "mpidr=0x%x, ", $mpidr;
+ $out .= sprintf "running_state=0x%x, ", $r_state;
+ $out .= sprintf "psci_state=0x%x", $psci_state;
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# Extlog errors
- $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
- $out = "";
- while($query_handle->fetch()) {
- $etype_string = get_extlog_type($etype);
- $severity_string = get_extlog_severity($severity);
- $out .= "$id $timestamp error: ";
- $out .= "type=$etype_string, ";
- $out .= "severity=$severity_string, ";
- $out .= sprintf "address=0x%08x, ", $addr;
- $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id);
- $out .= "fru_text='$fru_text', ";
- $out .= get_cper_data_text($cper_data) if ($cper_data);
- $out .= "\n";
- }
- if ($out ne "") {
- print "Extlog events:\n$out\n";
- } else {
- print "No Extlog errors.\n\n";
+ if ($has_extlog) {
+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
+ $out = "";
+ while($query_handle->fetch()) {
+ $etype_string = get_extlog_type($etype);
+ $severity_string = get_extlog_severity($severity);
+ $out .= "$id $timestamp error: ";
+ $out .= "type=$etype_string, ";
+ $out .= "severity=$severity_string, ";
+ $out .= sprintf "address=0x%08x, ", $addr;
+ $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id);
+ $out .= "fru_text='$fru_text', ";
+ $out .= get_cper_data_text($cper_data) if ($cper_data);
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "Extlog events:\n$out\n";
+ } else {
+ print "No Extlog errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# MCE mce_record errors
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $time error: $msg";
- $out .= ", CPU $cpuvendor" if ($cpuvendor);
- $out .= ", bank $bank_name" if ($bank_name);
- $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
- $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
- $out .= ", $mc_location" if ($mc_location);
- $out .= ", $user_action" if ($user_action);
- $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
- $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus);
- $out .= sprintf ", status=0x%08x", $status if ($status);
- $out .= sprintf ", addr=0x%08x", $addr if ($addr);
- $out .= sprintf ", misc=0x%08x", $misc if ($misc);
- $out .= sprintf ", ip=0x%08x", $ip if ($ip);
- $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
- $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
- $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
- $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
- $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
- $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
- $out .= sprintf ", cs=0x%08x", $cs if ($cs);
- $out .= sprintf ", bank=0x%08x", $bank if ($bank);
+ if ($has_mce == 1) {
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $time error: $msg";
+ $out .= ", CPU $cpuvendor" if ($cpuvendor);
+ $out .= ", bank $bank_name" if ($bank_name);
+ $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
+ $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
+ $out .= ", $mc_location" if ($mc_location);
+ $out .= ", $user_action" if ($user_action);
+ $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
+ $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus);
+ $out .= sprintf ", status=0x%08x", $status if ($status);
+ $out .= sprintf ", addr=0x%08x", $addr if ($addr);
+ $out .= sprintf ", misc=0x%08x", $misc if ($misc);
+ $out .= sprintf ", ip=0x%08x", $ip if ($ip);
+ $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
+ $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
+ $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
+ $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
+ $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
+ $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
+ $out .= sprintf ", cs=0x%08x", $cs if ($cs);
+ $out .= sprintf ", bank=0x%08x", $bank if ($bank);
- $out .= "\n";
- }
- if ($out ne "") {
- print "MCE events:\n$out\n";
- } else {
- print "No MCE errors.\n\n";
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "MCE events:\n$out\n";
+ } else {
+ print "No MCE errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
undef($dbh);
}

View File

@ -1,149 +0,0 @@
commit 60a91e4da4f2daf2b10143fc148a8043312b61e5
Author: Aristeu Rozanski <aris@redhat.com>
Date: Wed Aug 1 16:29:58 2018 -0400
rasdaemon: ras-mc-ctl: add option to show error counts
In some scenarios it might not be desirable to have a daemon running
to parse and store the errors provided by EDAC and only having the
number of CEs and UEs is enough. This patch implements this feature
as an ras-mc-ctl option.
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 38b7824..aee431a 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -50,6 +50,8 @@ my %dimm_location = ();
my %csrow_size = ();
my %rank_size = ();
my %csrow_ranks = ();
+my %dimm_ce_count = ();
+my %dimm_ue_count = ();
my @layers;
my @max_pos;
@@ -76,6 +78,7 @@ Usage: $prog [OPTIONS...]
--layout Display the memory layout.
--summary Presents a summary of the logged errors.
--errors Shows the errors stored at the error database.
+ --error-count Shows the corrected and uncorrected error counts using sysfs.
--help This help message.
EOF
@@ -83,7 +86,7 @@ parse_cmdline();
if ( $conf{opt}{mainboard} || $conf{opt}{print_labels}
|| $conf{opt}{register_labels} || $conf{opt}{display_memory_layout}
- || $conf{opt}{guess_dimm_label}) {
+ || $conf{opt}{guess_dimm_label} || $conf{opt}{error_count}) {
get_mainboard_info();
@@ -105,6 +108,9 @@ if ( $conf{opt}{mainboard} || $conf{opt}{print_labels}
if ($conf{opt}{guess_dimm_label}) {
guess_dimm_label ();
}
+ if ($conf{opt}{error_count}) {
+ display_error_count ();
+ }
}
if ($conf{opt}{status}) {
@@ -134,6 +140,7 @@ sub parse_cmdline
$conf{opt}{guess_dimm_label} = 0;
$conf{opt}{summary} = 0;
$conf{opt}{errors} = 0;
+ $conf{opt}{error_count} = 0;
my $rref = \$conf{opt}{report};
my $mref = \$conf{opt}{mainboard};
@@ -150,7 +157,8 @@ sub parse_cmdline
"status" => \$conf{opt}{status},
"layout" => \$conf{opt}{display_memory_layout},
"summary" => \$conf{opt}{summary},
- "errors" => \$conf{opt}{errors}
+ "errors" => \$conf{opt}{errors},
+ "error-count" => \$conf{opt}{error_count}
);
usage(1) if !$rc;
@@ -284,6 +292,30 @@ sub parse_dimm_nodes
$dimm_label_file{$str_loc} = $file;
$dimm_location{$str_loc} = $location;
+ my $count;
+
+ $file =~s/dimm_label/dimm_ce_count/;
+ if (-e $file) {
+ open IN, $file;
+ chomp($count = <IN>);
+ close IN;
+ } else {
+ log_error ("dimm_ce_count not found in sysfs. Old kernel?\n");
+ exit -1;
+ }
+ $dimm_ce_count{$str_loc} = $count;
+
+ $file =~s/dimm_ce_count/dimm_ue_count/;
+ if (-e $file) {
+ open IN, $file;
+ chomp($count = <IN>);
+ close IN;
+ } else {
+ log_error ("dimm_ue_count not found in sysfs. Old kernel?\n");
+ exit -1;
+ }
+ $dimm_ue_count{$str_loc} = $count;
+
return;
}
}
@@ -906,6 +938,45 @@ sub display_memory_layout
dimm_display_mem();
}
+sub display_error_count
+{
+ my $sysfs_dir = "/sys/devices/system/edac/mc";
+ my $key;
+ my $max_width = 0;
+ my %dimm_labels = ();
+
+ find ({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir);
+
+ if (!scalar(keys %dimm_node)) {
+ log_error ("No DIMMs found in /sys or new sysfs EDAC interface not found.\n");
+ exit -1;
+ }
+
+ foreach $key (keys %dimm_node) {
+ my $label_width;
+
+ open IN, $dimm_label_file{$key};
+ chomp(my $label = <IN>);
+ close IN;
+ $label_width = length $label;
+
+ if ($label_width > $max_width) {
+ $max_width = $label_width;
+ }
+ $dimm_labels{$key} = $label;
+ }
+ my $string = "Label";
+ $string .= " " x ($max_width - length $string);
+ print($string . "\tCE\tUE\n");
+
+ foreach $key (keys %dimm_node) {
+ my $ce_count = $dimm_ce_count{$key};
+ my $ue_count = $dimm_ue_count{$key};
+
+ print("$dimm_labels{$key}\t$ce_count\t$ue_count\n");
+ }
+}
+
sub find_prog
{
my ($file) = @_;

View File

@ -1,38 +0,0 @@
commit 854364ba44aee9bc5646f6537fc744b0b54aff37
Author: Muralidhara M K <muralimk@amd.com>
Date: Thu Aug 20 21:00:57 2020 +0530
rasdaemon: Add 8 channel decoding for SMCA systems
Current Scalable Machine Check Architecture (SMCA) systems support up
to 8 UMC channels.
To find the UMC channel represented by a bank, look at the 6th nibble
in the MCA_IPID[InstanceId] field.
Signed-off-by: Muralidhara M K <muralimk@amd.com>
[ Adjust commit message. ]
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index d0b6cb6..7c619fd 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -438,15 +438,7 @@ static void amd_decode_errcode(struct mce_event *e)
*/
static int find_umc_channel(struct mce_event *e)
{
- uint32_t umc_instance_id[] = {0x50f00, 0x150f00};
- uint32_t instance_id = EXTRACT(e->ipid, 0, 31);
- int i, channel = -1;
-
- for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++)
- if (umc_instance_id[i] == instance_id)
- channel = i;
-
- return channel;
+ return EXTRACT(e->ipid, 0, 31) >> 20;
}
/* Decode extended errors according to Scalable MCA specification */
static void decode_smca_error(struct mce_event *e)

View File

@ -1,207 +0,0 @@
commit 8704a85d8dc3483423ec2934fee8132f85f8fdb6
Author: Brian WoodsGhannam, Yazen <brian.woods@amd.comYazen.Ghannam@amd.com>
Date: Fri Nov 1 15:48:14 2019 +0100
rasdaemon: add support for new AMD SMCA bank types
Going forward, the Scalable Machine Check Architecture (SMCA) has some
updated and additional bank types which show up in Zen2. The differing
bank types include: CS_V2, PSP_V2, SMU_V2, MP5, NBIO, and PCIE. The V2
bank types replace the original bank types but have unique HWID/MCAtype
IDs from the originals so there's no conflicts between different
versions or other bank types. All of the differing bank types have new
MCE descriptions which have been added as well.
CC: "mchehab+samsung@kernel.org" <mchehab+samsung@kernel.org>, "Namburu, Chandu-babu" <chandu@amd.com> # Thread-Topic: [PATCH 2/2] rasdaemon: add support for new AMD SMCA bank types
Signed-off-by: Brian Woods <brian.woods@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Cc: Chandu-babu Namburu <chandu@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 6c3e8a5..114e786 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -49,11 +49,17 @@ enum smca_bank_types {
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
+ SMCA_CS_V2, /* Coherent Slave V2 */
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
+ SMCA_PSP_V2, /* Platform Security Processor V2 */
SMCA_SMU, /* System Management Unit */
+ SMCA_SMU_V2, /* System Management Unit V2 */
+ SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_NBIO, /* Northbridge IO Unit */
+ SMCA_PCIE, /* PCI Express Unit */
N_SMCA_BANK_TYPES
};
@@ -165,6 +171,23 @@ static const char * const smca_cs_mce_desc[] = {
"Atomic request parity",
"ECC error on probe filter access",
};
+/* Coherent Slave Unit V2 */
+static const char * const smca_cs2_mce_desc[] = {
+ "Illegal Request",
+ "Address Violation",
+ "Security Violation",
+ "Illegal Response",
+ "Unexpected Response",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic Request Parity Error",
+ "SDP read response had no match in the CS queue",
+ "Probe Filter Protocol Error",
+ "Probe Filter ECC Error",
+ "SDP read response had an unexpected RETRY error",
+ "Counter overflow error",
+ "Counter underflow error",
+};
/* Power, Interrupt, etc.. */
static const char * const smca_pie_mce_desc[] = {
"HW assert",
@@ -189,10 +212,75 @@ static const char * const smca_pb_mce_desc[] = {
static const char * const smca_psp_mce_desc[] = {
"PSP RAM ECC or parity error",
};
+/* Platform Security Processor V2 */
+static const char * const smca_psp2_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Instruction Cache Bank 0 ECC or parity error",
+ "Instruction Cache Bank 1 ECC or parity error",
+ "Instruction Tag Ram 0 parity error",
+ "Instruction Tag Ram 1 parity error",
+ "Data Cache Bank 0 ECC or parity error",
+ "Data Cache Bank 1 ECC or parity error",
+ "Data Cache Bank 2 ECC or parity error",
+ "Data Cache Bank 3 ECC or parity error",
+ "Data Tag Bank 0 parity error",
+ "Data Tag Bank 1 parity error",
+ "Data Tag Bank 2 parity error",
+ "Data Tag Bank 3 parity error",
+ "Dirty Data Ram parity error",
+ "TLB Bank 0 parity error",
+ "TLB Bank 1 parity error",
+ "System Hub Read Buffer ECC or parity error",
+};
/* System Management Unit */
static const char * const smca_smu_mce_desc[] = {
"SMU RAM ECC or parity error",
};
+/* System Management Unit V2 */
+static const char * const smca_smu2_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "System Hub Read Buffer ECC or parity error",
+};
+/* Microprocessor 5 Unit */
+static const char * const smca_mp5_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+};
+/* Northbridge IO Unit */
+static const char * const smca_nbio_mce_desc[] = {
+ "ECC or Parity error",
+ "PCIE error",
+ "SDP ErrEvent error",
+ "SDP Egress Poison Error",
+ "IOHC Internal Poison Error",
+};
+/* PCI Express Unit */
+static const char * const smca_pcie_mce_desc[] = {
+ "CCIX PER Message logging",
+ "CCIX Read Response with Status: Non-Data Error",
+ "CCIX Write Response with Status: Non-Data Error",
+ "CCIX Read Response with Status: Data Error",
+ "CCIX Non-okay write response with data error",
+};
+
struct smca_mce_desc {
const char * const *descs;
@@ -208,11 +296,17 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
+ [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
+ [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
+ [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)},
+ [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
+ [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)},
+ [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)},
};
struct smca_hwid {
@@ -235,6 +329,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Data Fabric MCA types */
{ SMCA_CS, 0x0000002E },
+ { SMCA_CS_V2, 0x0002002E },
{ SMCA_PIE, 0x0001002E },
/* Unified Memory Controller MCA type */
@@ -245,9 +340,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Platform Security Processor MCA type */
{ SMCA_PSP, 0x000000FF },
+ { SMCA_PSP_V2, 0x000100FF },
/* System Management Unit MCA type */
{ SMCA_SMU, 0x00000001 },
+ { SMCA_SMU_V2, 0x00010001 },
+
+ /* Microprocessor 5 Unit MCA type */
+ { SMCA_MP5, 0x00020001 },
+
+ /* Northbridge IO Unit MCA type */
+ { SMCA_NBIO, 0x00000018 },
+
+ /* PCI Express Unit MCA type */
+ { SMCA_PCIE, 0x00000046 },
};
struct smca_bank_name {
@@ -264,11 +370,17 @@ static struct smca_bank_name smca_names[] = {
[SMCA_FP] = { "Floating Point Unit" },
[SMCA_L3_CACHE] = { "L3 Cache" },
[SMCA_CS] = { "Coherent Slave" },
+ [SMCA_CS_V2] = { "Coherent Slave" },
[SMCA_PIE] = { "Power, Interrupts, etc." },
[SMCA_UMC] = { "Unified Memory Controller" },
[SMCA_PB] = { "Parameter Block" },
[SMCA_PSP] = { "Platform Security Processor" },
+ [SMCA_PSP_V2] = { "Platform Security Processor" },
[SMCA_SMU] = { "System Management Unit" },
+ [SMCA_SMU_V2] = { "System Management Unit" },
+ [SMCA_MP5] = { "Microprocessor 5 Unit" },
+ [SMCA_NBIO] = { "Northbridge IO Unit" },
+ [SMCA_PCIE] = { "PCI Express Unit" },
};
static void amd_decode_errcode(struct mce_event *e)

View File

@ -1,71 +0,0 @@
commit 899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d
Author: Aristeu Rozanski <arozansk@redhat.com>
Date: Thu Jan 19 08:45:57 2023 -0500
rasdaemon: ras-report: fix possible but unlikely file descriptor leak
Found with covscan.
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
ras-report.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
--- rasdaemon-0.6.1.orig/ras-report.c 2023-01-23 11:36:20.972368760 -0500
+++ rasdaemon-0.6.1/ras-report.c 2023-01-23 11:36:23.236343267 -0500
@@ -374,7 +374,7 @@ if(rc < 0){
mc_fail:
- if(sockfd > 0){
+ if(sockfd >= 0){
close(sockfd);
}
@@ -424,7 +424,7 @@ if(rc < 0){
aer_fail:
- if(sockfd > 0){
+ if(sockfd >= 0){
close(sockfd);
}
@@ -473,7 +473,7 @@ rc = 0;
non_standard_fail:
- if(sockfd > 0){
+ if(sockfd >= 0){
close(sockfd);
}
@@ -518,7 +518,7 @@ rc = 0;
arm_fail:
- if(sockfd > 0){
+ if(sockfd >= 0){
close(sockfd);
}
@@ -564,7 +564,7 @@ if(rc < 0){
mce_fail:
- if(sockfd > 0){
+ if(sockfd >= 0){
close(sockfd);
}
@@ -609,7 +609,7 @@ if (rc < 0)
done = 1;
mf_fail:
- if (sockfd > 0)
+ if (sockfd >= 0)
close(sockfd);
if (done)

View File

@ -1,670 +0,0 @@
commit a16ca0711001957ee98f2c124abce0fa1f801529
Author: Chandu-babu Namburu <chandu@amd.com>
Date: Wed Jan 30 20:36:45 2019 +0530
rasdaemon: add support for AMD Scalable MCA
Add logic here to decode errors from all known IP blocks for
AMD Scalable MCA supported processors
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Chandu-babu Namburu <chandu@amd.com>
---
mce-amd-smca.c | 371 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
mce-amd.c | 122 +++++++++++++++++
ras-mce-handler.c | 24 +++
ras-mce-handler.h | 15 ++
4 files changed, 530 insertions(+), 2 deletions(-)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/mce-amd-smca.c 2019-07-12 11:35:04.836470461 -0400
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2018, AMD, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "ras-mce-handler.h"
+#include "bitfield.h"
+
+/* MCA_STATUS REGISTER FOR FAMILY 17H
+ *********************** Higher 32-bits *****************************
+ * 63: VALIDERROR, 62: OVERFLOW, 61: UC, 60: Err ENABLE,
+ * 59: Misc Valid, 58: Addr Valid, 57: PCC, 56: ErrCoreID Valid,
+ * 55: TCC, 54: RES, 53: Syndrome Valid, 52: Transparent,
+ * 51: RES, 50: RES, 49: RES, 48: RES,
+ * 47: RES, 46: CECC, 45: UECC, 44: Deferred,
+ * 43: Poison, 42: RES, 41: RES, 40: RES,
+ * 39: RES, 38: RES, 37: ErrCoreID[5], 36: ErrCoreID[4],
+ * 35: ErrCoreID[3], 34: ErrCoreID[2], 33: ErrCoreID[1], 32: ErrCoreID[0]
+ *********************** Lower 32-bits ******************************
+ * 31: RES, 30: RES, 29: RES, 28: RES,
+ * 27: RES, 26: RES, 25: RES, 24: RES
+ * 23: RES, 22: RES, 21: XEC[5], 20: XEC[4],
+ * 19: XEC[3], 18: XEC[2], 17: XEC[1], 16: XEC[0]
+ * 15: EC[15], 14: EC[14], 13: EC[13], 12: EC[12],
+ * 11: EC[11], 10: EC[10], 09: EC[9], 08: EC[8],
+ * 07: EC[7], 06: EC[6], 05: EC[5], 04: EC[4],
+ * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0]
+ */
+
+/* These may be used by multiple smca_hwid_mcatypes */
+enum smca_bank_types {
+ SMCA_LS = 0, /* Load Store */
+ SMCA_IF, /* Instruction Fetch */
+ SMCA_L2_CACHE, /* L2 Cache */
+ SMCA_DE, /* Decoder Unit */
+ SMCA_RESERVED, /* Reserved */
+ SMCA_EX, /* Execution Unit */
+ SMCA_FP, /* Floating Point */
+ SMCA_L3_CACHE, /* L3 Cache */
+ SMCA_CS, /* Coherent Slave */
+ SMCA_PIE, /* Power, Interrupts, etc. */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_SMCA_BANK_TYPES
+};
+
+/* SMCA Extended error strings */
+/* Load Store */
+static const char * const smca_ls_mce_desc[] = {
+ "Load queue parity",
+ "Store queue parity",
+ "Miss address buffer payload parity",
+ "L1 TLB parity",
+ "Reserved",
+ "DC tag error type 6",
+ "DC tag error type 1",
+ "Internal error type 1",
+ "Internal error type 2",
+ "Sys Read data error thread 0",
+ "Sys read data error thread 1",
+ "DC tag error type 2",
+ "DC data error type 1 (poison consumption)",
+ "DC data error type 2",
+ "DC data error type 3",
+ "DC tag error type 4",
+ "L2 TLB parity",
+ "PDC parity error",
+ "DC tag error type 3",
+ "DC tag error type 5",
+ "L2 fill data error",
+};
+/* Instruction Fetch */
+static const char * const smca_if_mce_desc[] = {
+ "microtag probe port parity error",
+ "IC microtag or full tag multi-hit error",
+ "IC full tag parity",
+ "IC data array parity",
+ "Decoupling queue phys addr parity error",
+ "L0 ITLB parity error",
+ "L1 ITLB parity error",
+ "L2 ITLB parity error",
+ "BPQ snoop parity on Thread 0",
+ "BPQ snoop parity on Thread 1",
+ "L1 BTB multi-match error",
+ "L2 BTB multi-match error",
+ "L2 Cache Response Poison error",
+ "System Read Data error",
+};
+/* L2 Cache */
+static const char * const smca_l2_mce_desc[] = {
+ "L2M tag multi-way-hit error",
+ "L2M tag ECC error",
+ "L2M data ECC error",
+ "HW assert",
+};
+/* Decoder Unit */
+static const char * const smca_de_mce_desc[] = {
+ "uop cache tag parity error",
+ "uop cache data parity error",
+ "Insn buffer parity error",
+ "uop queue parity error",
+ "Insn dispatch queue parity error",
+ "Fetch address FIFO parity",
+ "Patch RAM data parity",
+ "Patch RAM sequencer parity",
+ "uop buffer parity"
+};
+/* Execution Unit */
+static const char * const smca_ex_mce_desc[] = {
+ "Watchdog timeout error",
+ "Phy register file parity",
+ "Flag register file parity",
+ "Immediate displacement register file parity",
+ "Address generator payload parity",
+ "EX payload parity",
+ "Checkpoint queue parity",
+ "Retire dispatch queue parity",
+ "Retire status queue parity error",
+ "Scheduling queue parity error",
+ "Branch buffer queue parity error",
+};
+/* Floating Point Unit */
+static const char * const smca_fp_mce_desc[] = {
+ "Physical register file parity",
+ "Freelist parity error",
+ "Schedule queue parity",
+ "NSQ parity error",
+ "Retire queue parity",
+ "Status register file parity",
+ "Hardware assertion",
+};
+/* L3 Cache */
+static const char * const smca_l3_mce_desc[] = {
+ "Shadow tag macro ECC error",
+ "Shadow tag macro multi-way-hit error",
+ "L3M tag ECC error",
+ "L3M tag multi-way-hit error",
+ "L3M data ECC error",
+ "XI parity, L3 fill done channel error",
+ "L3 victim queue parity",
+ "L3 HW assert",
+};
+/* Coherent Slave Unit */
+static const char * const smca_cs_mce_desc[] = {
+ "Illegal request from transport layer",
+ "Address violation",
+ "Security violation",
+ "Illegal response from transport layer",
+ "Unexpected response",
+ "Parity error on incoming request or probe response data",
+ "Parity error on incoming read response data",
+ "Atomic request parity",
+ "ECC error on probe filter access",
+};
+/* Power, Interrupt, etc.. */
+static const char * const smca_pie_mce_desc[] = {
+ "HW assert",
+ "Internal PIE register security violation",
+ "Error on GMI link",
+ "Poison data written to internal PIE register",
+};
+/* Unified Memory Controller */
+static const char * const smca_umc_mce_desc[] = {
+ "DRAM ECC error",
+ "Data poison error on DRAM",
+ "SDP parity error",
+ "Advanced peripheral bus error",
+ "Command/address parity error",
+ "Write data CRC error",
+};
+/* Parameter Block */
+static const char * const smca_pb_mce_desc[] = {
+ "Parameter Block RAM ECC error",
+};
+/* Platform Security Processor */
+static const char * const smca_psp_mce_desc[] = {
+ "PSP RAM ECC or parity error",
+};
+/* System Management Unit */
+static const char * const smca_smu_mce_desc[] = {
+ "SMU RAM ECC or parity error",
+};
+
+struct smca_mce_desc {
+ const char * const *descs;
+ unsigned int num_descs;
+};
+
+static struct smca_mce_desc smca_mce_descs[] = {
+ [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
+ [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
+ [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
+ [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
+ [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) },
+ [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
+ [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
+ [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
+ [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
+ [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
+ [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
+ [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
+ [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
+};
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/
+ uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/
+};
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
+ /* { bank_type, mcatype_hwid } */
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, 0x000000B0 },
+ { SMCA_IF, 0x000100B0 },
+ { SMCA_L2_CACHE, 0x000200B0 },
+ { SMCA_DE, 0x000300B0 },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, 0x000500B0 },
+ { SMCA_FP, 0x000600B0 },
+ { SMCA_L3_CACHE, 0x000700B0 },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, 0x0000002E },
+ { SMCA_PIE, 0x0001002E },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, 0x00000096 },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, 0x00000005 },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, 0x000000FF },
+
+ /* System Management Unit MCA type */
+ { SMCA_SMU, 0x00000001 },
+};
+
+struct smca_bank_name {
+ const char *name;
+};
+
+static struct smca_bank_name smca_names[] = {
+ [SMCA_LS] = { "Load Store Unit" },
+ [SMCA_IF] = { "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "L2 Cache" },
+ [SMCA_DE] = { "Decode Unit" },
+ [SMCA_RESERVED] = { "Reserved" },
+ [SMCA_EX] = { "Execution Unit" },
+ [SMCA_FP] = { "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "L3 Cache" },
+ [SMCA_CS] = { "Coherent Slave" },
+ [SMCA_PIE] = { "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "Unified Memory Controller" },
+ [SMCA_PB] = { "Parameter Block" },
+ [SMCA_PSP] = { "Platform Security Processor" },
+ [SMCA_SMU] = { "System Management Unit" },
+};
+
+static void amd_decode_errcode(struct mce_event *e)
+{
+
+ decode_amd_errcode(e);
+
+ if (e->status & MCI_STATUS_POISON)
+ mce_snprintf(e->mcistatus_msg, "Poison consumed");
+
+ if (e->status & MCI_STATUS_TCC)
+ mce_snprintf(e->mcistatus_msg, "Task_context_corrupt");
+
+}
+/*
+ * To find the UMC channel represented by this bank we need to match on its
+ * instance_id. The instance_id of a bank is held in the lower 32 bits of its
+ * IPID.
+ */
+static int find_umc_channel(struct mce_event *e)
+{
+ uint32_t umc_instance_id[] = {0x50f00, 0x150f00};
+ uint32_t instance_id = EXTRACT(e->ipid, 0, 31);
+ int i, channel = -1;
+
+ for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++)
+ if (umc_instance_id[i] == instance_id)
+ channel = i;
+
+ return channel;
+}
+/* Decode extended errors according to Scalable MCA specification */
+static void decode_smca_error(struct mce_event *e)
+{
+ enum smca_bank_types bank_type;
+ const char *ip_name;
+ unsigned short xec = (e->status >> 16) & 0x3f;
+ const struct smca_hwid *s_hwid;
+ uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
+ unsigned int csrow = -1, channel = -1;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ s_hwid = &smca_hwid_mcatypes[i];
+ if (mcatype_hwid == s_hwid->mcatype_hwid) {
+ bank_type = s_hwid->bank_type;
+ break;
+ }
+ }
+
+ if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
+ strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
+ return;
+ }
+
+ if (bank_type >= N_SMCA_BANK_TYPES) {
+ strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
+ return;
+ }
+
+ if (bank_type == SMCA_RESERVED) {
+ strcpy(e->mcastatus_msg, "Bank 4 is reserved.\n");
+ return;
+ }
+
+ ip_name = smca_names[bank_type].name;
+
+ mce_snprintf(e->bank_name, "%s (bank=%d)", ip_name, e->bank);
+
+ /* Only print the descriptor of valid extended error code */
+ if (xec < smca_mce_descs[bank_type].num_descs)
+ mce_snprintf(e->mcastatus_msg,
+ " %s.\n", smca_mce_descs[bank_type].descs[xec]);
+
+ if (bank_type == SMCA_UMC && xec == 0) {
+ channel = find_umc_channel(e);
+ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
+ channel, csrow);
+ }
+}
+
+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
+{
+ uint64_t mcgstatus = e->mcgstatus;
+
+ mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld",
+ (long long)e->mcgstatus);
+
+ if (mcgstatus & MCG_STATUS_RIPV)
+ mce_snprintf(e->mcgstatus_msg, "RIPV");
+ if (mcgstatus & MCG_STATUS_EIPV)
+ mce_snprintf(e->mcgstatus_msg, "EIPV");
+ if (mcgstatus & MCG_STATUS_MCIP)
+ mce_snprintf(e->mcgstatus_msg, "MCIP");
+
+ decode_smca_error(e);
+ amd_decode_errcode(e);
+ return 0;
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/mce-amd.c 2019-07-12 11:35:04.836470461 -0400
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, The AMD, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "ras-mce-handler.h"
+
+/*
+ * Error code types.
+ *
+ * Each macro takes the 16-bit error code, masks out the variable
+ * sub-fields and compares the remaining fixed bits against the pattern
+ * of one error class.
+ */
+#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010)
+#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100)
+#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800)
+#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400)
+
+/*
+ * Message tables indexed by the sub-field extractors defined below.
+ * They are read-only, so both the pointers and the strings are const.
+ */
+
+/* Error code: transaction type (TT) */
+static const char * const transaction[] = {
+	"instruction", "data", "generic", "reserved"
+};
+/* Error codes: cache level (LL) */
+static const char * const cachelevel[] = {
+	"reserved", "L1", "L2", "L3/generic"
+};
+/* Error codes: memory transaction type (RRRR) */
+static const char * const memtrans[] = {
+	"generic", "generic read", "generic write", "data read",
+	"data write", "instruction fetch", "prefetch", "evict", "snoop",
+	"?", "?", "?", "?", "?", "?", "?"
+};
+/* Participation Processor */
+static const char * const partproc[] = {
+	"local node origin", "local node response",
+	"local node observed", "generic participation"
+};
+/* Timeout */
+static const char * const timeout[] = {
+	"request didn't time out",
+	"request timed out"
+};
+/* internal unclassified error code */
+static const char * const internal[] = {
+	"reserved",
+	"reserved",
+	"hardware assert",
+	"reserved"
+};
+
+/*
+ * Sub-field extractors and their message lookups. Every extractor masks
+ * its result to the size of the matching table, so the lookups cannot
+ * index out of bounds.
+ */
+#define TT(x) (((x) >> 2) & 0x3) /*bit 2, bit 3*/
+#define TT_MSG(x) transaction[TT(x)]
+#define LL(x) ((x) & 0x3) /*bit 0, bit 1*/
+#define LL_MSG(x) cachelevel[LL(x)]
+
+/* RRRR values of 9 and above are reported as "Wrong R4!" */
+#define R4(x) (((x) >> 4) & 0xF) /*bit 4, bit 5, bit 6, bit 7 */
+#define R4_MSG(x) ((R4(x) < 9) ? memtrans[R4(x)] : "Wrong R4!")
+
+#define TO(x) (((x) >> 8) & 0x1) /*bit 8*/
+#define TO_MSG(x) timeout[TO(x)]
+#define PP(x) (((x) >> 9) & 0x3) /*bit 9, bit 10*/
+#define PP_MSG(x) partproc[PP(x)]
+
+#define UU(x) (((x) >> 8) & 0x3) /*bit 8, bit 9*/
+#define UU_MSG(x) internal[UU(x)]
+
+/*
+ * Decode the generic part of an AMD machine check status.
+ *
+ * Fills in:
+ *  - e->error_msg:     one severity string selected from the UC, PCC and
+ *                      DEFERRED bits of e->status and the RIPV bit of
+ *                      e->mcgstatus;
+ *  - e->mcistatus_msg: names of the status flags found (invalid,
+ *                      overflow, processor context corrupt, CECC/UECC);
+ *  - e->mcastatus_msg: the decoded error code (low 16 bits of e->status)
+ *                      for internal, TLB, memory and bus errors.
+ */
+void decode_amd_errcode(struct mce_event *e)
+{
+	uint16_t ec = e->status & 0xffff;
+	uint16_t ecc = (e->status >> 45) & 0x3;
+
+	/*
+	 * Exactly one severity message must be chosen. The uncorrected
+	 * cases have to be mutually exclusive: without the else branches
+	 * the last strcpy() would always overwrite the more specific
+	 * "fatal" and "restartable" messages.
+	 */
+	if (e->status & MCI_STATUS_UC) {
+		if (e->status & MCI_STATUS_PCC)
+			strcpy(e->error_msg, "System Fatal error.");
+		else if (e->mcgstatus & MCG_STATUS_RIPV)
+			strcpy(e->error_msg,
+			       "Uncorrected, software restartable error.");
+		else
+			strcpy(e->error_msg,
+			       "Uncorrected, software containable error.");
+	} else if (e->status & MCI_STATUS_DEFERRED)
+		strcpy(e->error_msg, "Deferred error, no action required.");
+	else
+		strcpy(e->error_msg, "Corrected error, no action required.");
+
+	if (!(e->status & MCI_STATUS_VAL))
+		mce_snprintf(e->mcistatus_msg, "MCE_INVALID");
+
+	if (e->status & MCI_STATUS_OVER)
+		mce_snprintf(e->mcistatus_msg, "Error_overflow");
+
+	if (e->status & MCI_STATUS_PCC)
+		mce_snprintf(e->mcistatus_msg, "Processor_context_corrupt");
+
+	/* Bits 45-46 of the status: 2 is reported as CECC, other non-zero values as UECC */
+	if (ecc)
+		mce_snprintf(e->mcistatus_msg,
+			     "%sECC", ((ecc == 2) ? "C" : "U"));
+
+	/* Internal errors carry no TT/LL/RRRR fields; nothing more to decode */
+	if (INT_ERROR(ec)) {
+		mce_snprintf(e->mcastatus_msg, "Internal '%s'", UU_MSG(ec));
+		return;
+	}
+
+	if (TLB_ERROR(ec))
+		mce_snprintf(e->mcastatus_msg,
+			     "TLB Error 'tx: %s, level: %s'",
+			     TT_MSG(ec), LL_MSG(ec));
+	else if (MEM_ERROR(ec))
+		mce_snprintf(e->mcastatus_msg,
+			     "Memory Error 'mem-tx: %s, tx: %s, level: %s'",
+			     R4_MSG(ec), TT_MSG(ec), LL_MSG(ec));
+	else if (BUS_ERROR(ec))
+		mce_snprintf(e->mcastatus_msg,
+			     "Bus Error '%s, %s, mem-tx: %s, level: %s'",
+			     PP_MSG(ec), TO_MSG(ec),
+			     R4_MSG(ec), LL_MSG(ec));
+}
--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2019-07-12 11:35:01.585502811 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.c 2019-07-12 11:35:04.836470461 -0400
@@ -55,6 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_KNIGHTS_LANDING] = "Knights Landing",
[CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_SKYLAKE_XEON] = "Skylake server",
+ [CPU_NAPLES] = "AMD Family 17h Zen1"
};
static enum cputype select_intel_cputype(struct ras_events *ras)
@@ -190,9 +191,12 @@ ret = 0;
if (!strcmp(mce->vendor, "AuthenticAMD")) {
if (mce->family == 15)
mce->cputype = CPU_K8;
- if (mce->family > 15) {
+ if (mce->family == 23)
+ mce->cputype = CPU_NAPLES;
+ if (mce->family > 23) {
log(ALL, LOG_INFO,
- "Can't parse MCE for this AMD CPU yet\n");
+ "Can't parse MCE for this AMD CPU yet %d\n",
+ mce->family);
ret = EINVAL;
}
goto ret;
@@ -331,6 +335,12 @@ #if 0
if (e->status & MCI_STATUS_ADDRV)
trace_seq_printf(s, ", addr= %llx", (long long)e->addr);
+ if (e->status & MCI_STATUS_SYNDV)
+ trace_seq_printf(s, ", synd= %llx", (long long)e->synd);
+
+ if (e->ipid)
+ trace_seq_printf(s, ", ipid= %llx", (long long)e->ipid);
+
if (e->mcgstatus_msg)
trace_seq_printf(s, ", %s", e->mcgstatus_msg);
else
@@ -411,6 +421,13 @@ if (pevent_get_field_val(s, event, "bank
if (pevent_get_field_val(s, event, "cpuvendor", record, &val, 1) < 0)
return -1;
e.cpuvendor = val;
+ /* Get New entries */
+ if (pevent_get_field_val(s, event, "synd", record, &val, 1) < 0)
+ return -1;
+ e.synd = val;
+ if (pevent_get_field_val(s, event, "ipid", record, &val, 1) < 0)
+ return -1;
+ e.ipid = val;
switch (mce->cputype) {
case CPU_GENERIC:
@@ -418,6 +435,9 @@ if (pevent_get_field_val(s, event, "cpuv
case CPU_K8:
rc = parse_amd_k8_event(ras, &e);
break;
+ case CPU_NAPLES:
+ rc = parse_amd_smca_event(ras, &e);
+ break;
default: /* All other CPU types are Intel */
rc = parse_intel_event(ras, &e);
}
--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2019-07-12 11:35:01.585502811 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.h 2019-07-12 11:35:04.836470461 -0400
@@ -50,6 +50,7 @@ enum cputype {
CPU_KNIGHTS_LANDING,
CPU_KNIGHTS_MILL,
CPU_SKYLAKE_XEON,
+ CPU_NAPLES,
};
struct mce_event {
@@ -69,6 +70,8 @@ struct mce_event {
uint8_t cs;
uint8_t bank;
uint8_t cpuvendor;
+ uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */
+ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
/* Parsed data */
char timestamp[64];
@@ -129,6 +132,9 @@ void broadwell_de_decode_model(struct ra
void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e);
void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e);
+/* AMD error code decode function */
+void decode_amd_errcode(struct mce_event *e);
+
/* Software defined banks */
#define MCE_EXTENDED_BANK 128
@@ -144,6 +150,13 @@ #define MCI_STATUS_EN (1ULL<<60) /*
#define MCI_STATUS_S (1ULL<<56) /* signalled */
#define MCI_STATUS_AR (1ULL<<55) /* action-required */
+/* AMD-specific bits */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */
+/* uncorrected error,deferred exception */
+#define MCI_STATUS_DEFERRED (1ULL<<44)
+#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
+
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
@@ -154,4 +167,6 @@ int parse_intel_event(struct ras_events
int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e);
+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e);
+
#endif
--- rasdaemon-0.6.1.orig/Makefile.in 2018-04-25 06:29:05.000000000 -0400
+++ rasdaemon-0.6.1/Makefile.in 2019-07-15 14:41:22.308278851 -0400
@@ -100,7 +100,7 @@ sbin_PROGRAMS = rasdaemon$(EXEEXT)
@WITH_MCE_TRUE@ mce-intel-dunnington.c mce-intel-tulsa.c \
@WITH_MCE_TRUE@ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
@WITH_MCE_TRUE@ mce-intel-knl.c mce-intel-broadwell-de.c \
-@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c
+@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c mce-amd.c mce-amd-smca.c
@WITH_EXTLOG_TRUE@am__append_6 = ras-extlog-handler.c
@WITH_ABRT_REPORT_TRUE@am__append_7 = ras-report.c
@@ -132,7 +132,7 @@ am__rasdaemon_SOURCES_DIST = rasdaemon.c
mce-intel-ivb.c mce-intel-haswell.c mce-intel-knl.c \
mce-intel-broadwell-de.c mce-intel-broadwell-epex.c \
mce-intel-skylake-xeon.c ras-extlog-handler.c ras-report.c \
- non-standard-hisi_hip07.c
+ non-standard-hisi_hip07.c mce-amd-smca.c mce-amd.c
@WITH_SQLITE3_TRUE@am__objects_1 = ras-record.$(OBJEXT)
@WITH_AER_TRUE@am__objects_2 = ras-aer-handler.$(OBJEXT)
@WITH_NON_STANDARD_TRUE@am__objects_3 = \
@@ -149,7 +149,9 @@ non-standard-hisi_hip07.c
@WITH_MCE_TRUE@ mce-intel-knl.$(OBJEXT) \
@WITH_MCE_TRUE@ mce-intel-broadwell-de.$(OBJEXT) \
@WITH_MCE_TRUE@ mce-intel-broadwell-epex.$(OBJEXT) \
-@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT)
+@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) \
+@WITH_MCE_TRUE@ mce-amd-smca.$(OBJEXT) \
+@WITH_MCE_TRUE@ mce-amd.$(OBJEXT)
@WITH_EXTLOG_TRUE@am__objects_6 = ras-extlog-handler.$(OBJEXT)
@WITH_ABRT_REPORT_TRUE@am__objects_7 = ras-report.$(OBJEXT)
@WITH_HISI_NS_DECODE_TRUE@am__objects_8 = \
@@ -595,6 +597,8 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bitfield.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-k8.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-scma.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-de.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-epex.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-dunnington.Po@am__quote@

View File

@ -1,138 +0,0 @@
commit a8c776ed94f68ae31d7b5f74e19545698898c13c
Author: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Tue Aug 14 13:06:27 2018 -0300
mce-intel-*: fix a warning when using FIELD(<num>, NULL)
Internally, FIELD() macro checks the size of an array, by
using ARRAY_SIZE. Well, this macro causes a division by zero
if NULL is used, as its type is void, as warned:
mce-intel-dunnington.c:30:2: note: in expansion of macro FIELD
FIELD(17, NULL),
^~~~~
ras-mce-handler.h:28:33: warning: division sizeof (void *) / sizeof (void) does not compute the number of array elements [-Wsizeof-pointer-div]
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
^
bitfield.h:37:51: note: in expansion of macro ARRAY_SIZE
#define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) }
^~~~~~~~~~
While this warning is harmless, it may prevent seeing more serious
warnings. So, add a FIELD_NULL(<num>) macro to avoid that.
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/bitfield.h b/bitfield.h
index c7dfeb1..fccbb36 100644
--- a/bitfield.h
+++ b/bitfield.h
@@ -35,6 +35,7 @@ struct numfield {
};
#define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) }
+#define FIELD_NULL(start_bit) { start_bit, NULL, 0 }
#define SBITFIELD(start_bit, string) { start_bit, ((char * [2]) { NULL, string }), 2 }
#define NUMBER(start, end, name) { start, end, name, "%Lu", 0 }
diff --git a/mce-intel-dunnington.c b/mce-intel-dunnington.c
index 4b1c7e3..c695c62 100644
--- a/mce-intel-dunnington.c
+++ b/mce-intel-dunnington.c
@@ -27,14 +27,14 @@
static struct field dunnington_bus_status[] = {
SBITFIELD(16, "Parity error detected during FSB request phase"),
- FIELD(17, NULL),
+ FIELD_NULL(17),
SBITFIELD(20, "Hard Failure response received for a local transaction"),
SBITFIELD(21, "Parity error on FSB response field detected"),
SBITFIELD(22, "Parity data error on inbound data detected"),
- FIELD(23, NULL),
- FIELD(25, NULL),
- FIELD(28, NULL),
- FIELD(31, NULL),
+ FIELD_NULL(23),
+ FIELD_NULL(25),
+ FIELD_NULL(28),
+ FIELD_NULL(31),
{}
};
diff --git a/mce-intel-p4-p6.c b/mce-intel-p4-p6.c
index 4615e1a..5c6c3ff 100644
--- a/mce-intel-p4-p6.c
+++ b/mce-intel-p4-p6.c
@@ -60,7 +60,7 @@ static char *bus_queue_error_type[] = {
};
static struct field p6_shared_status[] = {
- FIELD(16, NULL),
+ FIELD_NULL(16),
FIELD(19, bus_queue_req_type),
FIELD(25, bus_queue_error_type),
FIELD(25, bus_queue_error_type),
@@ -68,7 +68,7 @@ static struct field p6_shared_status[] = {
SBITFIELD(36, "received parity error on response transaction"),
SBITFIELD(38, "timeout BINIT (ROB timeout)."
" No micro-instruction retired for some time"),
- FIELD(39, NULL),
+ FIELD_NULL(39),
SBITFIELD(42, "bus transaction received hard error response"),
SBITFIELD(43, "failure that caused IERR"),
/* The following are reserved for Core in the SDM. Let's keep them here anyways*/
@@ -76,15 +76,15 @@ static struct field p6_shared_status[] = {
SBITFIELD(45, "uncorrectable ECC error"),
SBITFIELD(46, "correctable ECC error"),
/* [47..54]: ECC syndrome */
- FIELD(55, NULL),
+ FIELD_NULL(55),
{},
};
static struct field p6old_status[] = {
SBITFIELD(28, "FRC error"),
SBITFIELD(29, "BERR on this CPU"),
- FIELD(31, NULL),
- FIELD(32, NULL),
+ FIELD_NULL(31),
+ FIELD_NULL(32),
SBITFIELD(35, "BINIT received from external bus"),
SBITFIELD(37, "Received hard error reponse on split transaction (Bus BINIT)"),
{}
@@ -94,9 +94,9 @@ static struct field core2_status[] = {
SBITFIELD(28, "MCE driven"),
SBITFIELD(29, "MCE is observed"),
SBITFIELD(31, "BINIT observed"),
- FIELD(32, NULL),
+ FIELD_NULL(32),
SBITFIELD(34, "PIC or FSB data parity error"),
- FIELD(35, NULL),
+ FIELD_NULL(35),
SBITFIELD(37, "FSB address parity error detected"),
{}
};
diff --git a/mce-intel-tulsa.c b/mce-intel-tulsa.c
index 6cea421..e59bf06 100644
--- a/mce-intel-tulsa.c
+++ b/mce-intel-tulsa.c
@@ -39,7 +39,7 @@ static struct field tls_bus_status[] = {
SBITFIELD(16, "Parity error detected during FSB request phase"),
SBITFIELD(17, "Partity error detected on Core 0 request's address field"),
SBITFIELD(18, "Partity error detected on Core 1 request's address field"),
- FIELD(19, NULL),
+ FIELD_NULL(19),
SBITFIELD(20, "Parity error on FSB response field detected"),
SBITFIELD(21, "FSB data parity error on inbound date detected"),
SBITFIELD(22, "Data parity error on data received from Core 0 detected"),
@@ -48,8 +48,8 @@ static struct field tls_bus_status[] = {
SBITFIELD(25, "Data ECC event to error on inbound data correctable or uncorrectable"),
SBITFIELD(26, "Pad logic detected a data strobe glitch or sequencing error"),
SBITFIELD(27, "Pad logic detected a request strobe glitch or sequencing error"),
- FIELD(28, NULL),
- FIELD(31, NULL),
+ FIELD_NULL(28),
+ FIELD_NULL(31),
{}
};

View File

@ -1,37 +0,0 @@
commit b22be68453b2497e86cbd273b9cd56fadc5859e3
Author: Ying Lv <lvying6@huawei.com>
Date: Wed May 15 11:15:42 2019 +0800
fix rasdaemon high CPU usage when part of CPUs offline
When we set part of CPU core offline, such as by setting the kernel cmdline
maxcpus = N(N is less than the total number of system CPU cores).
And then, we will observe that the CPU usage of some rasdaemon threads
is very close to 100.
This is because when part of CPU offline, poll in read_ras_event_all_cpus func
will fallback to pthread way.
The thread of an offline CPU gets a negative value when reading trace_pipe_raw,
and that negative return value is converted to a positive one because of 'unsigned size'.
So the code always goes into the 'size > 0' branch, and the CPU usage is too high.
Declaring the variable size as int makes the code take the right branch.
Fixes: eff7c9e0("ras-events: Only use pthreads for collect if poll() not available")
Reported-by: Zhipeng Xie <xiezhipeng1@huawei.com>
Signed-off-by: Ying Lv <lvying6@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/ras-events.c b/ras-events.c
index 4e7b815..38ebe1e 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -426,7 +426,7 @@ static int read_ras_event(int fd,
struct kbuffer *kbuf,
void *page)
{
- unsigned size;
+ int size;
unsigned long long time_stamp;
void *data;

View File

@ -1,148 +0,0 @@
commit b497a3d6a39d402c41065e9284d49114b97e3bfe
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Mar 8 16:57:28 2021 +0000
rasdaemon: ras-mc-ctl: Add memory failure events
Add supporting memory failure errors (memory_failure_event)
to the ras-mc-ctl tool.
Sample Log,
ras-mc-ctl --summary
...
Memory failure events summary:
Delayed errors: 4
Failed errors: 1
...
ras-mc-ctl --errors
...
Memory failure events:
1 2020-10-28 23:20:41 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
2 2020-10-28 23:31:38 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
3 2020-10-28 23:54:54 -0800 error: pfn=0x205000000, page_type=free buddy page, action_result=Delayed
4 2020-10-29 00:12:25 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
5 2020-10-29 00:26:36 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Failed
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
configure.ac | 11 +++++++++++
util/ras-mc-ctl.in | 46 +++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 54 insertions(+), 3 deletions(-)
--- a/util/ras-mc-ctl.in 2021-10-13 13:51:00.887292563 -0400
+++ b/util/ras-mc-ctl.in 2021-10-13 13:51:27.536061894 -0400
@@ -44,11 +44,13 @@ my $modprobe = find_prog ("modprobe")
my $has_aer = 0;
my $has_arm = 0;
my $has_extlog = 0;
+my $has_mem_failure = 0;
my $has_mce = 0;
@WITH_AER_TRUE@$has_aer = 1;
@WITH_ARM_TRUE@$has_arm = 1;
@WITH_EXTLOG_TRUE@$has_extlog = 1;
+@WITH_MEMORY_FAILURE_TRUE@$has_mem_failure = 1;
@WITH_MCE_TRUE@$has_mce = 1;
my %conf = ();
@@ -1132,7 +1134,7 @@ sub summary
{
require DBI;
my ($query, $query_handle, $out);
- my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg);
+ my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result);
my ($etype, $severity, $etype_string, $severity_string);
my ($affinity, $mpidr);
@@ -1203,9 +1205,27 @@ sub summary
$out .= "\t$count $etype_string $severity_string errors\n";
}
if ($out ne "") {
- print "Extlog records summary:\n$out";
+ print "Extlog records summary:\n$out\n";
} else {
- print "No Extlog errors.\n";
+ print "No Extlog errors.\n\n";
+ }
+ $query_handle->finish;
+ }
+
+ # Memory failure errors
+ if ($has_mem_failure == 1) {
+ $query = "select action_result, count(*) from memory_failure_event group by action_result";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($action_result, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$action_result errors: $count\n";
+ }
+ if ($out ne "") {
+ print "Memory failure events summary:\n$out\n";
+ } else {
+ print "No Memory failure errors.\n\n";
}
$query_handle->finish;
}
@@ -1238,6 +1258,7 @@ sub errors
my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
my ($error_count, $affinity, $mpidr, $r_state, $psci_state);
+ my ($pfn, $page_type, $action_result);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1329,6 +1350,25 @@ $out .= sprintf "address=0x%08x, ", $add
}
$query_handle->finish;
}
+
+ # Memory failure errors
+ if ($has_mem_failure == 1) {
+ $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $pfn, $page_type, $action_result));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "pfn=$pfn, page_type=$page_type, action_result=$action_result\n";
+ }
+ if ($out ne "") {
+ print "Memory failure events:\n$out\n";
+ } else {
+ print "No Memory failure errors.\n\n";
+ }
+ $query_handle->finish;
+ }
# MCE mce_record errors
if ($has_mce == 1) {
--- a/configure.ac 2018-04-25 06:28:51.000000000 -0400
+++ b/configure.ac 2021-10-13 13:51:00.916292312 -0400
@@ -80,6 +80,16 @@ AS_IF([test "x$enable_extlog" = "xyes"],
])
AM_CONDITIONAL([WITH_EXTLOG], [test x$enable_extlog = xyes])
+dnl Memory failure events: enabled by --enable-memory-failure or --enable-all.
+dnl The shell tests use "=" because "==" is not a portable test operator.
+AC_ARG_ENABLE([memory_failure],
+	AS_HELP_STRING([--enable-memory-failure], [enable memory failure events (currently experimental)]))
+
+AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" = "xyes"], [
+	AC_DEFINE(HAVE_MEMORY_FAILURE,1,"have memory failure events collect")
+	AC_SUBST([WITH_MEMORY_FAILURE])
+])
+AM_CONDITIONAL([WITH_MEMORY_FAILURE], [test x$enable_memory_failure = xyes || test x$enable_all = xyes])
+AM_COND_IF([WITH_MEMORY_FAILURE], [USE_MEMORY_FAILURE="yes"], [USE_MEMORY_FAILURE="no"])
+
AC_ARG_ENABLE([abrt_report],
AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)]))
@@ -127,4 +137,5 @@ compile time options summary
ABRT report : $enable_abrt_report
HIP07 SAS HW errors : $enable_hisi_ns_decode
ARM events : $enable_arm
+ Memory Failure : $USE_MEMORY_FAILURE
EOF

View File

@ -1,94 +0,0 @@
commit cc2ce5c65ed5a42eaa97aa3659854add6d808da5
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Mon Jan 13 19:12:06 2020 +0530
rasdaemon: Add error decoding for new SMCA Load Store bank type
Future Scalable Machine Check Architecture (SMCA) systems will have a
new Load Store bank type.
Add the new type's (HWID, McaType) ID and error decoding.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
[ Adjust commit message. ]
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 114e786..d0b6cb6 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -38,9 +38,16 @@
* 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0]
*/
+/* MCA_STATUS REGISTER FOR FAMILY 19H
+ * The bits 24 ~ 29 contains AddressLsb
+ * 29: ADDRLS[5], 28: ADDRLS[4], 27: ADDRLS[3],
+ * 26: ADDRLS[2], 25: ADDRLS[1], 24: ADDRLS[0]
+ */
+
/* These may be used by multiple smca_hwid_mcatypes */
enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
+ SMCA_LS_V2, /* Load Store */
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
@@ -88,6 +95,32 @@ static const char * const smca_ls_mce_desc[] = {
"DC tag error type 5",
"L2 fill data error",
};
+/*
+ * Load Store, version 2: error descriptions for the SMCA_LS_V2 bank type.
+ * The entry position is the extended error code used to look it up.
+ */
+static const char * const smca_ls2_mce_desc[] = {
+ "An ECC error was detected on a data cache read by a probe or victimization",
+ "An ECC error or L2 poison was detected on a data cache read by a load",
+ "An ECC error was detected on a data cache read-modify-write by a store",
+ "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
+ "An ECC error or poison bit mismatch was detected on a tag read by a load",
+ "An ECC error or poison bit mismatch was detected on a tag read by a store",
+ "An ECC error was detected on an EMEM read by a load",
+ "An ECC error was detected on an EMEM read-modify-write by a store",
+ "A parity error was detected in an L1 TLB entry by any access",
+ "A parity error was detected in an L2 TLB entry by any access",
+ "A parity error was detected in a PWC entry by any access",
+ "A parity error was detected in an STQ entry by any access",
+ "A parity error was detected in an LDQ entry by any access",
+ "A parity error was detected in a MAB entry by any access",
+ "A parity error was detected in an SCB entry state field by any access",
+ "A parity error was detected in an SCB entry address field by any access",
+ "A parity error was detected in an SCB entry data field by any access",
+ "A parity error was detected in a WCB entry by any access",
+ "A poisoned line was detected in an SCB entry by any access",
+ "A SystemReadDataError error was reported on read data returned from L2 for a load",
+ "A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
+ "A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
+ "A hardware assertion error was reported",
+ "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
+};
/* Instruction Fetch */
static const char * const smca_if_mce_desc[] = {
"microtag probe port parity error",
@@ -289,6 +322,7 @@ struct smca_mce_desc {
static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
+ [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) },
[SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
[SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
[SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
@@ -319,6 +353,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* ZN Core (HWID=0xB0) MCA types */
{ SMCA_LS, 0x000000B0 },
+ { SMCA_LS_V2, 0x001000B0 },
{ SMCA_IF, 0x000100B0 },
{ SMCA_L2_CACHE, 0x000200B0 },
{ SMCA_DE, 0x000300B0 },
@@ -362,6 +397,7 @@ struct smca_bank_name {
static struct smca_bank_name smca_names[] = {
[SMCA_LS] = { "Load Store Unit" },
+ [SMCA_LS_V2] = { "Load Store Unit" },
[SMCA_IF] = { "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "L2 Cache" },
[SMCA_DE] = { "Decode Unit" },

View File

@ -1,28 +0,0 @@
commit ce33041e0abfa20054ff5d6874ffbd1ab592558d
Author: Aristeu Rozanski <arozansk@redhat.com>
Date: Thu Jan 19 08:45:57 2023 -0500
rasdaemon: ras-memory-failure-handler: handle localtime() failure correctly
We could just have an empty string but keeping the format could prevent
issues if someone is actually parsing this.
Found with covscan.
v2: fixed the timestamp as pointed by Robert Elliott
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
index 9941e68..1951456 100644
--- a/ras-memory-failure-handler.c
+++ b/ras-memory-failure-handler.c
@@ -148,6 +148,8 @@ int ras_memory_failure_event_handler(struct trace_seq *s,
if (tm)
strftime(ev.timestamp, sizeof(ev.timestamp),
"%Y-%m-%d %H:%M:%S %z", tm);
+ else
+ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp));
trace_seq_printf(s, "%s ", ev.timestamp);
if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0)

View File

@ -1,611 +0,0 @@
commit ce6e7864f11f709c4f803828fbc8e507d115d03b
Author: Greg Edwards <gedwards@ddn.com>
Date: Thu Apr 8 15:03:30 2021 -0600
rasdaemon: Add Ice Lake and Sapphire Rapids MSCOD values
Based on mcelog commits:
ee90ff20ce6a ("mcelog: Add support for Icelake server, Icelake-D, and Snow Ridge")
391abaac9bdf ("mcelog: Add decode for MCi_MISC from 10nm memory controller")
59cb7ad4bc72 ("mcelog: i10nm: Fix mapping from bank number to functional unit")
c0acd0e6a639 ("mcelog: Add support for Sapphirerapids server.")
Signed-off-by: Greg Edwards <gedwards@ddn.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
Makefile.am | 3
mce-intel-i10nm.c | 509 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
mce-intel.c | 5
ras-mce-handler.c | 12 +
ras-mce-handler.h | 5
5 files changed, 533 insertions(+), 1 deletion(-)
--- rasdaemon-0.6.1.orig/Makefile.am 2021-09-17 15:29:45.977790658 -0400
+++ rasdaemon-0.6.1/Makefile.am 2021-09-17 15:29:57.439698580 -0400
@@ -36,7 +36,8 @@ if WITH_MCE
mce-intel-dunnington.c mce-intel-tulsa.c \
mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
mce-intel-knl.c mce-intel-broadwell-de.c \
- mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c
+ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c \
+ mce-amd.c mce-amd-smca.c mce-intel-i10nm.c
endif
if WITH_EXTLOG
rasdaemon_SOURCES += ras-extlog-handler.c
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/mce-intel-i10nm.c 2021-09-17 15:29:45.977790658 -0400
@@ -0,0 +1,509 @@
+/*
+ * The code below came from Tony Luck's mcelog code,
+ * released under GNU Public General License, v.2
+ *
+ * Copyright (C) 2019 Intel Corporation
+ * Decode Intel 10nm specific machine check errors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "ras-mce-handler.h"
+#include "bitfield.h"
+
+static char *pcu_1[] = { /* PCU bank: names for the value EXTRACT(status, 24, 31) in i10nm_decode_model(); wrapped by pcu1[]; unlisted indices are NULL */
+ [0x0D] = "MCA_LLC_BIST_ACTIVE_TIMEOUT",
+ [0x0E] = "MCA_DMI_TRAINING_TIMEOUT",
+ [0x0F] = "MCA_DMI_STRAP_SET_ARRIVAL_TIMEOUT",
+ [0x10] = "MCA_DMI_CPU_RESET_ACK_TIMEOUT",
+ [0x11] = "MCA_MORE_THAN_ONE_LT_AGENT",
+ [0x14] = "MCA_INCOMPATIBLE_PCH_TYPE",
+ [0x1E] = "MCA_BIOS_RST_CPL_INVALID_SEQ",
+ [0x1F] = "MCA_BIOS_INVALID_PKG_STATE_CONFIG",
+ [0x2D] = "MCA_PCU_PMAX_CALIB_ERROR",
+ [0x2E] = "MCA_TSC100_SYNC_TIMEOUT",
+ [0x3A] = "MCA_GPSB_TIMEOUT",
+ [0x3B] = "MCA_PMSB_TIMEOUT",
+ [0x3E] = "MCA_IOSFSB_PMREQ_CMP_TIMEOUT",
+ [0x40] = "MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE",
+ [0x42] = "MCA_SVID_VCCIN_VR_VOUT_FAILURE",
+ [0x43] = "MCA_SVID_CPU_VR_CAPABILITY_ERROR",
+ [0x44] = "MCA_SVID_CRITICAL_VR_FAILED",
+ [0x45] = "MCA_SVID_SA_ITD_ERROR",
+ [0x46] = "MCA_SVID_READ_REG_FAILED",
+ [0x47] = "MCA_SVID_WRITE_REG_FAILED",
+ [0x4A] = "MCA_SVID_PKGC_REQUEST_FAILED",
+ [0x4B] = "MCA_SVID_IMON_REQUEST_FAILED",
+ [0x4C] = "MCA_SVID_ALERT_REQUEST_FAILED",
+ [0x4D] = "MCA_SVID_MCP_VR_RAMP_ERROR",
+ [0x56] = "MCA_FIVR_PD_HARDERR",
+ [0x58] = "MCA_WATCHDOG_TIMEOUT_PKGC_SLAVE",
+ [0x59] = "MCA_WATCHDOG_TIMEOUT_PKGC_MASTER",
+ [0x5A] = "MCA_WATCHDOG_TIMEOUT_PKGS_MASTER",
+ [0x5B] = "MCA_WATCHDOG_TIMEOUT_MSG_CH_FSM",
+ [0x5C] = "MCA_WATCHDOG_TIMEOUT_BULK_CR_FSM",
+ [0x5D] = "MCA_WATCHDOG_TIMEOUT_IOSFSB_FSM",
+ [0x60] = "MCA_PKGS_SAFE_WP_TIMEOUT",
+ [0x61] = "MCA_PKGS_CPD_UNCPD_TIMEOUT",
+ [0x62] = "MCA_PKGS_INVALID_REQ_PCH",
+ [0x63] = "MCA_PKGS_INVALID_REQ_INTERNAL",
+ [0x64] = "MCA_PKGS_INVALID_RSP_INTERNAL",
+ [0x65 ... 0x7A] = "MCA_PKGS_RESET_PREP_TIMEOUT", /* one message shared by every index in this range */
+ [0x7B] = "MCA_PKGS_SMBUS_VPP_PAUSE_TIMEOUT",
+ [0x7C] = "MCA_PKGS_SMBUS_MCP_PAUSE_TIMEOUT",
+ [0x7D] = "MCA_PKGS_SMBUS_SPD_PAUSE_TIMEOUT",
+ [0x80] = "MCA_PKGC_DISP_BUSY_TIMEOUT",
+ [0x81] = "MCA_PKGC_INVALID_RSP_PCH",
+ [0x83] = "MCA_PKGC_WATCHDOG_HANG_CBZ_DOWN",
+ [0x84] = "MCA_PKGC_WATCHDOG_HANG_CBZ_UP",
+ [0x87] = "MCA_PKGC_WATCHDOG_HANG_C2_BLKMASTER",
+ [0x88] = "MCA_PKGC_WATCHDOG_HANG_C2_PSLIMIT",
+ [0x89] = "MCA_PKGC_WATCHDOG_HANG_SETDISP",
+ [0x8B] = "MCA_PKGC_ALLOW_L1_ERROR",
+ [0x90] = "MCA_RECOVERABLE_DIE_THERMAL_TOO_HOT",
+ [0xA0] = "MCA_ADR_SIGNAL_TIMEOUT",
+ [0xA1] = "MCA_BCLK_FREQ_OC_ABOVE_THRESHOLD",
+ [0xB0] = "MCA_DISPATCHER_RUN_BUSY_TIMEOUT",
+};
+
+static char *pcu_2[] = { /* PCU bank: names for the value EXTRACT(status, 20, 23) in i10nm_decode_model(); wrapped by pcu2[] */
+ [0x04] = "Clock/power IP response timeout",
+ [0x05] = "SMBus controller raised SMI",
+ [0x09] = "PM controller received invalid transaction",
+};
+
+static char *pcu_3[] = { /* PCU bank: names for the value EXTRACT(status, 16, 19) in i10nm_decode_model(); wrapped by pcu3[] */
+ [0x01] = "Instruction address out of valid space",
+ [0x02] = "Double bit RAM error on Instruction Fetch",
+ [0x03] = "Invalid OpCode seen",
+ [0x04] = "Stack Underflow",
+ [0x05] = "Stack Overflow",
+ [0x06] = "Data address out of valid space",
+ [0x07] = "Double bit RAM error on Data Fetch",
+};
+
+static struct field pcu1[] = { /* descriptor list handed to decode_bitfield() for the already-extracted status bits 24-31 value */
+ FIELD(0, pcu_1),
+ {} /* empty entry; presumably the end-of-list marker for decode_bitfield() -- confirm in bitfield.c */
+};
+
+static struct field pcu2[] = { /* descriptor list handed to decode_bitfield() for the already-extracted status bits 20-23 value */
+ FIELD(0, pcu_2),
+ {}
+};
+
+static struct field pcu3[] = { /* descriptor list handed to decode_bitfield() for the already-extracted status bits 16-19 value */
+ FIELD(0, pcu_3),
+ {}
+};
+
+static struct field upi1[] = { /* UPI bank: decoded against the whole status word, so the numbers below are status bit positions */
+ SBITFIELD(22, "Phy Control Error"),
+ SBITFIELD(23, "Unexpected Retry.Ack flit"),
+ SBITFIELD(24, "Unexpected Retry.Req flit"),
+ SBITFIELD(25, "RF parity error"),
+ SBITFIELD(26, "Routeback Table error"),
+ SBITFIELD(27, "Unexpected Tx Protocol flit (EOP, Header or Data)"),
+ SBITFIELD(28, "Rx Header-or-Credit BGF credit overflow/underflow"),
+ SBITFIELD(29, "Link Layer Reset still in progress when Phy enters L0"),
+ SBITFIELD(30, "Link Layer reset initiated while protocol traffic not idle"),
+ SBITFIELD(31, "Link Layer Tx Parity Error"),
+ {} /* empty entry; presumably the end-of-list marker for decode_bitfield() -- confirm in bitfield.c */
+};
+
+static char *upi_2[] = { /* UPI bank: names for the value EXTRACT(status, 16, 21) in i10nm_decode_model(); wrapped by upi2[] */
+ [0x00] = "Phy Initialization Failure (NumInit)",
+ [0x01] = "Phy Detected Drift Buffer Alarm",
+ [0x02] = "Phy Detected Latency Buffer Rollover",
+ [0x10] = "LL Rx detected CRC error: unsuccessful LLR (entered Abort state)",
+ [0x11] = "LL Rx Unsupported/Undefined packet",
+ [0x12] = "LL or Phy Control Error",
+ [0x13] = "LL Rx Parameter Exception",
+ [0x1F] = "LL Detected Control Error",
+ [0x20] = "Phy Initialization Abort",
+ [0x21] = "Phy Inband Reset",
+ [0x22] = "Phy Lane failure, recovery in x8 width",
+ [0x23] = "Phy L0c error corrected without Phy reset",
+ [0x24] = "Phy L0c error triggering Phy reset",
+ [0x25] = "Phy L0p exit error corrected with reset",
+ [0x30] = "LL Rx detected CRC error: successful LLR without Phy Reinit",
+ [0x31] = "LL Rx detected CRC error: successful LLR with Phy Reinit",
+ [0x32] = "Tx received LLR",
+};
+
+static struct field upi2[] = { /* descriptor list handed to decode_bitfield() for the already-extracted status bits 16-21 value */
+ FIELD(0, upi_2),
+ {}
+};
+
+static struct field m2m[] = { /* M2M bank: decoded against the whole status word, so the numbers below are status bit positions */
+ SBITFIELD(16, "MC read data error"),
+ SBITFIELD(17, "Reserved"),
+ SBITFIELD(18, "MC partial write data error"),
+ SBITFIELD(19, "Full write data error"),
+ SBITFIELD(20, "M2M clock-domain-crossing buffer (BGF) error"),
+ SBITFIELD(21, "M2M time out"),
+ SBITFIELD(22, "M2M tracker parity error"),
+ SBITFIELD(23, "fatal Bucket1 error"),
+ {} /* empty entry; presumably the end-of-list marker for decode_bitfield() -- confirm in bitfield.c */
+};
+
+static char *imc_0[] = { /* IMC bank: names for EXTRACT(status, 16, 23) when EXTRACT(status, 24, 31) == 0; wrapped by imc0[] */
+ [0x01] = "Address parity error",
+ [0x02] = "Data parity error",
+ [0x03] = "Data ECC error",
+ [0x04] = "Data byte enable parity error",
+ [0x07] = "Transaction ID parity error",
+ [0x08] = "Corrected patrol scrub error",
+ [0x10] = "Uncorrected patrol scrub error",
+ [0x20] = "Corrected spare error",
+ [0x40] = "Uncorrected spare error",
+ [0x80] = "Corrected read error",
+ [0xA0] = "Uncorrected read error",
+ [0xC0] = "Uncorrected metadata",
+};
+
+static char *imc_1[] = { /* IMC bank: names for EXTRACT(status, 16, 23) when EXTRACT(status, 24, 31) == 1; wrapped by imc1[] */
+ [0x00] = "WDB read parity error",
+ [0x03] = "RPA parity error",
+ [0x06] = "DDR_T_DPPP data BE error",
+ [0x07] = "DDR_T_DPPP data error",
+ [0x08] = "DDR link failure",
+ [0x11] = "PCLS CAM error",
+ [0x12] = "PCLS data error",
+};
+
+static char *imc_2[] = { /* IMC bank: names for EXTRACT(status, 16, 23) when EXTRACT(status, 24, 31) == 2; wrapped by imc2[] */
+ [0x00] = "DDR4 command / address parity error",
+ [0x20] = "HBM command / address parity error",
+ [0x21] = "HBM data parity error",
+};
+
+static char *imc_4[] = { /* IMC bank: names for EXTRACT(status, 16, 23) when EXTRACT(status, 24, 31) == 4; wrapped by imc4[] */
+ [0x00] = "RPQ parity (primary) error",
+};
+
+static char *imc_8[] = { /* IMC bank: names for EXTRACT(status, 16, 23) when EXTRACT(status, 24, 31) == 8; wrapped by imc8[] */
+ [0x00] = "DDR-T bad request",
+ [0x01] = "DDR Data response to an invalid entry",
+ [0x02] = "DDR data response to an entry not expecting data",
+ [0x03] = "DDR4 completion to an invalid entry",
+ [0x04] = "DDR-T completion to an invalid entry",
+ [0x05] = "DDR data/completion FIFO overflow",
+ [0x06] = "DDR-T ERID correctable parity error",
+ [0x07] = "DDR-T ERID uncorrectable error",
+ [0x08] = "DDR-T interrupt received while outstanding interrupt was not ACKed",
+ [0x09] = "ERID FIFO overflow", /* was "FI FO": stray space removed, matching the FIFO spelling of the other entries */
+ [0x0A] = "DDR-T error on FNV write credits",
+ [0x0B] = "DDR-T error on FNV read credits",
+ [0x0C] = "DDR-T scheduler error",
+ [0x0D] = "DDR-T FNV error event",
+ [0x0E] = "DDR-T FNV thermal event",
+ [0x0F] = "CMI packet while idle",
+ [0x10] = "DDR_T_RPQ_REQ_PARITY_ERR",
+ [0x11] = "DDR_T_WPQ_REQ_PARITY_ERR",
+ [0x12] = "2LM_NMFILLWR_CAM_ERR",
+ [0x13] = "CMI_CREDIT_OVERSUB_ERR",
+ [0x14] = "CMI_CREDIT_TOTAL_ERR",
+ [0x15] = "CMI_CREDIT_RSVD_POOL_ERR",
+ [0x16] = "DDR_T_RD_ERROR",
+ [0x17] = "WDB_FIFO_ERR",
+ [0x18] = "CMI_REQ_FIFO_OVERFLOW",
+ [0x19] = "CMI_REQ_FIFO_UNDERFLOW",
+ [0x1A] = "CMI_RSP_FIFO_OVERFLOW",
+ [0x1B] = "CMI_RSP_FIFO_UNDERFLOW",
+ [0x1C] = "CMI_MISC_MC_CRDT_ERRORS", /* was "CMI _MISC...": stray space removed, matching entry 0x1D */
+ [0x1D] = "CMI_MISC_MC_ARB_ERRORS",
+ [0x1E] = "DDR_T_WR_CMPL_FIFO_OVERFLOW", /* was "FI FO": stray space removed */
+ [0x1F] = "DDR_T_WR_CMPL_FIFO_UNDERFLOW", /* was "FI FO": stray space removed */
+ [0x20] = "CMI_RD_CPL_FIFO_OVERFLOW",
+ [0x21] = "CMI_RD_CPL_FIFO_UNDERFLOW",
+ [0x22] = "TME_KEY_PAR_ERR",
+ [0x23] = "TME_CMI_MISC_ERR",
+ [0x24] = "TME_CMI_OVFL_ERR",
+ [0x25] = "TME_CMI_UFL_ERR",
+ [0x26] = "TME_TEM_SECURE_ERR",
+ [0x27] = "TME_UFILL_PAR_ERR",
+ [0x29] = "INTERNAL_ERR",
+ [0x2A] = "TME_INTEGRITY_ERR",
+ [0x2B] = "TME_TDX_ERR",
+ [0x2C] = "TME_UFILL_TEM_SECURE_ERR",
+ [0x2D] = "TME_KEY_POISON_ERR",
+ [0x2E] = "TME_SECURITY_ENGINE_ERR",
+};
+
+static char *imc_10[] = { /* IMC bank: names for EXTRACT(status, 16, 23) when EXTRACT(status, 24, 31) == 0x10; wrapped by imc10[] */
+ [0x08] = "CORR_PATSCRUB_MIRR2ND_ERR",
+ [0x10] = "UC_PATSCRUB_MIRR2ND_ERR",
+ [0x20] = "COR_SPARE_MIRR2ND_ERR",
+ [0x40] = "UC_SPARE_MIRR2ND_ERR",
+ [0x80] = "HA_RD_MIRR2ND_ERR",
+ [0xA0] = "HA_UNCORR_RD_MIRR2ND_ERR",
+};
+
+static struct field imc0[] = { /* imc0..imc10: one descriptor list per value of EXTRACT(status, 24, 31) handled in i10nm_decode_model() */
+ FIELD(0, imc_0),
+ {} /* empty entry; presumably the end-of-list marker for decode_bitfield() -- confirm in bitfield.c */
+};
+
+static struct field imc1[] = { /* selected when EXTRACT(status, 24, 31) == 1 */
+ FIELD(0, imc_1),
+ {}
+};
+
+static struct field imc2[] = { /* selected when EXTRACT(status, 24, 31) == 2 */
+ FIELD(0, imc_2),
+ {}
+};
+
+static struct field imc4[] = { /* selected when EXTRACT(status, 24, 31) == 4 */
+ FIELD(0, imc_4),
+ {}
+};
+
+static struct field imc8[] = { /* selected when EXTRACT(status, 24, 31) == 8 */
+ FIELD(0, imc_8),
+ {}
+};
+
+static struct field imc10[] = { /* selected when EXTRACT(status, 24, 31) == 0x10 */
+ FIELD(0, imc_10),
+ {}
+};
+
+static void i10nm_imc_misc(struct mce_event *e) /* reports address, rank and ECC-mode details taken from e->misc via mce_snprintf() into e->error_msg */
+{
+ uint32_t column = EXTRACT(e->misc, 9, 18) << 2; /* extracted field value shifted left by 2 */
+ uint32_t row = EXTRACT(e->misc, 19, 39);
+ uint32_t bank = EXTRACT(e->misc, 42, 43);
+ uint32_t bankgroup = EXTRACT(e->misc, 40, 41) | (EXTRACT(e->misc, 44, 44) << 2); /* field 40-41 is the low part, field 44 becomes bit 2 */
+ uint32_t fdevice = EXTRACT(e->misc, 46, 51);
+ uint32_t subrank = EXTRACT(e->misc, 52, 55);
+ uint32_t rank = EXTRACT(e->misc, 56, 58);
+ uint32_t eccmode = EXTRACT(e->misc, 59, 62);
+ uint32_t transient = EXTRACT(e->misc, 63, 63);
+
+ mce_snprintf(e->error_msg, "bank: 0x%x bankgroup: 0x%x row: 0x%x column: 0x%x", bank, bankgroup, row, column);
+ if (!transient && !EXTRACT(e->status, 61, 61)) /* failed device is printed only for non-transient events with status field 61 clear (presumably the UC flag -- confirm) */
+ mce_snprintf(e->error_msg, "failed device: 0x%x", fdevice);
+ mce_snprintf(e->error_msg, "rank: 0x%x subrank: 0x%x", rank, subrank);
+ mce_snprintf(e->error_msg, "ecc mode: ");
+ switch (eccmode) { /* values without a case below are reported as "unknown" */
+ case 0: mce_snprintf(e->error_msg, "SDDC memory mode"); break;
+ case 1: mce_snprintf(e->error_msg, "SDDC"); break;
+ case 4: mce_snprintf(e->error_msg, "ADDDC memory mode"); break;
+ case 5: mce_snprintf(e->error_msg, "ADDDC"); break;
+ case 8: mce_snprintf(e->error_msg, "DDRT read"); break;
+ default: mce_snprintf(e->error_msg, "unknown"); break;
+ }
+ if (transient)
+ mce_snprintf(e->error_msg, "transient");
+}
+
+enum banktype { /* kind of unit behind an MCA bank; selects the decoder branch in i10nm_decode_model() */
+ BT_UNKNOWN, /* first enumerator (value 0): what unlisted entries of the per-model bank tables evaluate to */
+ BT_PCU,
+ BT_UPI,
+ BT_M2M,
+ BT_IMC,
+};
+
+static enum banktype icelake[32] = { /* bank number -> unit type, consulted for CPU_ICELAKE_XEON; unlisted banks are BT_UNKNOWN */
+ [4] = BT_PCU,
+ [5] = BT_UPI,
+ [7 ... 8] = BT_UPI,
+ [12] = BT_M2M,
+ [16] = BT_M2M,
+ [20] = BT_M2M,
+ [24] = BT_M2M,
+ [13 ... 15] = BT_IMC,
+ [17 ... 19] = BT_IMC,
+ [21 ... 23] = BT_IMC,
+ [25 ... 27] = BT_IMC,
+};
+
+static enum banktype icelake_de[32] = { /* bank number -> unit type, consulted for CPU_ICELAKE_DE; unlisted banks are BT_UNKNOWN */
+ [4] = BT_PCU,
+ [12] = BT_M2M,
+ [16] = BT_M2M,
+ [13 ... 15] = BT_IMC,
+ [17 ... 19] = BT_IMC,
+};
+
+static enum banktype tremont[32] = { /* bank number -> unit type, consulted for CPU_TREMONT_D; unlisted banks are BT_UNKNOWN */
+ [4] = BT_PCU,
+ [12] = BT_M2M,
+ [13 ... 15] = BT_IMC,
+};
+
+static enum banktype sapphire[32] = { /* bank number -> unit type, consulted for CPU_SAPPHIRERAPIDS; unlisted banks are BT_UNKNOWN */
+ [4] = BT_PCU,
+ [5] = BT_UPI,
+ [12] = BT_M2M,
+ [13 ... 20] = BT_IMC, /* NOTE(review): i10nm_memerr_misc() treats banks 16 and 20 as M2M slots -- confirm the channel math for this model */
+};
+
+void i10nm_memerr_misc(struct mce_event *e, int *channel);
+
+void i10nm_decode_model(enum cputype cputype, struct ras_events *ras, /* ras is not referenced in this function */
+ struct mce_event *e)
+{
+ enum banktype banktype;
+ uint64_t f, status = e->status;
+ uint32_t mca = status & 0xffff; /* low 16 bits of the status word */
+ int channel = -1; /* stays -1 unless i10nm_memerr_misc() derives a channel */
+ /* Decode the model-specific bits of e->status into e->error_msg; the bank's unit type selects the layout. */
+ switch (cputype) {
+ case CPU_ICELAKE_XEON: /* every lookup is bounds-checked: a bank beyond the table decodes as BT_UNKNOWN instead of reading out of bounds */
+ banktype = e->bank < sizeof(icelake) / sizeof(icelake[0]) ? icelake[e->bank] : BT_UNKNOWN;
+ break;
+ case CPU_ICELAKE_DE:
+ banktype = e->bank < sizeof(icelake_de) / sizeof(icelake_de[0]) ? icelake_de[e->bank] : BT_UNKNOWN;
+ break;
+ case CPU_TREMONT_D:
+ banktype = e->bank < sizeof(tremont) / sizeof(tremont[0]) ? tremont[e->bank] : BT_UNKNOWN;
+ break;
+ case CPU_SAPPHIRERAPIDS:
+ banktype = e->bank < sizeof(sapphire) / sizeof(sapphire[0]) ? sapphire[e->bank] : BT_UNKNOWN;
+ break;
+ default:
+ return; /* CPU type not handled by this decoder */
+ }
+
+ switch (banktype) {
+ case BT_UNKNOWN:
+ break;
+
+ case BT_PCU:
+ mce_snprintf(e->error_msg, "PCU: ");
+ f = EXTRACT(status, 24, 31);
+ if (f)
+ decode_bitfield(e, f, pcu1);
+ f = EXTRACT(status, 20, 23);
+ if (f)
+ decode_bitfield(e, f, pcu2);
+ f = EXTRACT(status, 16, 19);
+ if (f)
+ decode_bitfield(e, f, pcu3);
+ break;
+
+ case BT_UPI:
+ mce_snprintf(e->error_msg, "UPI: ");
+ f = EXTRACT(status, 22, 31);
+ if (f)
+ decode_bitfield(e, status, upi1); /* whole status word: upi1 entries name status bit positions */
+ f = EXTRACT(status, 16, 21);
+ decode_bitfield(e, f, upi2); /* unconditional: value 0 has an entry in upi_2 */
+ break;
+
+ case BT_M2M:
+ mce_snprintf(e->error_msg, "M2M: ");
+ f = EXTRACT(status, 24, 25);
+ mce_snprintf(e->error_msg, "MscodDDRType=0x%" PRIx64, f);
+ f = EXTRACT(status, 26, 31);
+ mce_snprintf(e->error_msg, "MscodMiscErrs=0x%" PRIx64, f);
+ decode_bitfield(e, status, m2m); /* whole status word: m2m entries name status bit positions */
+ break;
+
+ case BT_IMC:
+ mce_snprintf(e->error_msg, "MemCtrl: ");
+ f = EXTRACT(status, 16, 23);
+ switch (EXTRACT(status, 24, 31)) { /* selects which name table applies to f; other values decode nothing */
+ case 0: decode_bitfield(e, f, imc0); break;
+ case 1: decode_bitfield(e, f, imc1); break;
+ case 2: decode_bitfield(e, f, imc2); break;
+ case 4: decode_bitfield(e, f, imc4); break;
+ case 8: decode_bitfield(e, f, imc8); break;
+ case 0x10: decode_bitfield(e, f, imc10); break;
+ }
+ i10nm_imc_misc(e);
+ break;
+ }
+
+ /*
+ * Memory error specific code. Returns if the error is not a MC one
+ */
+
+ /* Check if the error is at the memory controller */
+ if ((mca >> 7) != 1)
+ return;
+
+ /* Ignore unless this is a corrected extended error from an iMC bank */
+ if (banktype != BT_IMC || (status & MCI_STATUS_UC))
+ return;
+
+ /*
+ * Parse the reported channel
+ */
+
+ i10nm_memerr_misc(e, &channel);
+ if (channel == -1)
+ return;
+ mce_snprintf(e->mc_location, "memory_channel=%d", channel);
+}
+
+/*
+ * There isn't enough information to identify the DIMM. But
+ * we can derive the channel from the bank number.
+ * Up to four memory controllers, three channels each (imc * 3 + chan below).
+ */
+void i10nm_memerr_misc(struct mce_event *e, int *channel)
+{
+ uint64_t status = e->status;
+ unsigned int chan, imc;
+
+ /* Check this is a memory error */
+ if (!test_prefix(7, status & 0xefff))
+ return; /* *channel is left untouched on every early return */
+
+ chan = EXTRACT(status, 0, 3);
+ if (chan == 0xf) /* 0xf is treated as "no channel reported" */
+ return;
+
+ switch (e->bank) { /* NOTE(review): this layout matches icelake[]; sapphire[] marks banks 13-20 as IMC -- confirm for that model */
+ case 12: /* M2M 0 */
+ case 13: /* IMC 0, Channel 0 */
+ case 14: /* IMC 0, Channel 1 */
+ case 15: /* IMC 0, Channel 2 */
+ imc = 0;
+ break;
+ case 16: /* M2M 1 */
+ case 17: /* IMC 1, Channel 0 */
+ case 18: /* IMC 1, Channel 1 */
+ case 19: /* IMC 1, Channel 2 */
+ imc = 1;
+ break;
+ case 20: /* M2M 2 */
+ case 21: /* IMC 2, Channel 0 */
+ case 22: /* IMC 2, Channel 1 */
+ case 23: /* IMC 2, Channel 2 */
+ imc = 2;
+ break;
+ case 24: /* M2M 3 */
+ case 25: /* IMC 3, Channel 0 */
+ case 26: /* IMC 3, Channel 1 */
+ case 27: /* IMC 3, Channel 2 */
+ imc = 3;
+ break;
+ default:
+ return; /* bank is not one of the memory banks listed above */
+ }
+
+ channel[0] = imc * 3 + chan; /* result: controller index times 3 plus the channel value taken from status */
+}
--- rasdaemon-0.6.1.orig/mce-intel.c 2021-09-17 15:29:39.189845188 -0400
+++ rasdaemon-0.6.1/mce-intel.c 2021-09-17 15:29:45.977790658 -0400
@@ -411,6 +411,11 @@ if (test_prefix(11, (e->status & 0xffffL
case CPU_SKYLAKE_XEON:
skylake_s_decode_model(ras, e);
break;
+ case CPU_ICELAKE_XEON:
+ case CPU_ICELAKE_DE:
+ case CPU_TREMONT_D:
+ case CPU_SAPPHIRERAPIDS:
+ i10nm_decode_model(mce->cputype, ras, e);
default:
break;
}
--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-09-17 15:29:39.189845188 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-09-17 15:29:45.977790658 -0400
@@ -56,6 +56,10 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_SKYLAKE_XEON] = "Skylake server",
[CPU_AMD_SMCA] = "AMD Scalable MCA",
+ [CPU_ICELAKE_XEON] = "Icelake server",
+ [CPU_ICELAKE_DE] = "Icelake server D Family",
+ [CPU_TREMONT_D] = "Tremont microserver",
+ [CPU_SAPPHIRERAPIDS] = "Sapphirerapids server",
};
static enum cputype select_intel_cputype(struct ras_events *ras)
@@ -107,6 +111,14 @@ else if (mce->model == 0x85)
return CPU_KNIGHTS_MILL;
else if (mce->model == 0x55)
return CPU_SKYLAKE_XEON;
+ else if (mce->model == 0x6a)
+ return CPU_ICELAKE_XEON;
+ else if (mce->model == 0x6c)
+ return CPU_ICELAKE_DE;
+ else if (mce->model == 0x86)
+ return CPU_TREMONT_D;
+ else if (mce->model == 0x8f)
+ return CPU_SAPPHIRERAPIDS;
if (mce->model > 0x1a) {
log(ALL, LOG_INFO,
--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-09-17 15:29:39.189845188 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-09-17 15:29:45.977790658 -0400
@@ -51,6 +51,10 @@ enum cputype {
CPU_KNIGHTS_MILL,
CPU_SKYLAKE_XEON,
CPU_AMD_SMCA,
+ CPU_ICELAKE_XEON,
+ CPU_ICELAKE_DE,
+ CPU_TREMONT_D,
+ CPU_SAPPHIRERAPIDS,
};
struct mce_event {
@@ -131,6 +135,7 @@ void tulsa_decode_model(struct mce_event
void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e);
void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e);
void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e);
+void i10nm_decode_model(enum cputype cputype, struct ras_events *ras, struct mce_event *e);
/* AMD error code decode function */
void decode_amd_errcode(struct mce_event *e);

View File

@ -1,24 +0,0 @@
commit e8b97ec14a11764fedfea50bd4d96ddda43c7fc1
Author: Aristeu Rozanski <arozansk@redhat.com>
Date: Thu Jan 19 08:45:57 2023 -0500
rasdaemon: mce-amd-smca: properly limit bank types
Found with covscan.
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index f3379fc..27ca8aa 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -584,7 +584,7 @@ static void decode_smca_error(struct mce_event *e)
return;
}
- if (bank_type >= MAX_NR_BANKS) {
+ if (bank_type >= N_SMCA_BANK_TYPES) {
strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
return;
}

View File

@ -1,47 +0,0 @@
From: Muralidhara M K <muralimk@amd.com>
This patch removes trailing spaces at the end of a line from
file location and fixes --layout option to parse dimm nodes
to get the size from ras-mc-ctl.
Issue is reported https://github.com/mchehab/rasdaemon/issues/43
Where '> ras-mc-ctl --layout' reports all 0s
With this change the layout prints the correct dimm sizes
> sudo ras-mc-ctl --layout
+-----------------------------------------------+
| mc0 |
| csrow0 | csrow1 | csrow2 | csrow3 |
----------+-----------------------------------------------+
...
channel7: | 16384 MB | 0 MB | 0 MB | 0 MB |
channel6: | 16384 MB | 0 MB | 0 MB | 0 MB |
...
----------+-----------------------------------------------+
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
util/ras-mc-ctl.in | 2 ++
1 file changed, 2 insertions(+)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 1e3aeb7..b22dd60 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -246,6 +246,7 @@ sub parse_dimm_nodes
if (($file =~ /max_location$/)) {
open IN, $file;
my $location = <IN>;
+ $location =~ s/\s+$//;
close IN;
my @temp = split(/ /, $location);
@@ -288,6 +289,7 @@ sub parse_dimm_nodes
open IN, $file;
my $location = <IN>;
+ $location =~ s/\s+$//;
close IN;
my @pos;

View File

@ -0,0 +1,159 @@
commit aa36c96cd52d775570dae989dd95a060f1149077
Author: Avadhut Naik <avadnaik@amd.com>
Date: Mon Apr 24 20:35:56 2023 +0000
rasdaemon: Handle reassigned bit definitions for CS SMCA
Currently, on AMD systems with Scalable MCA (SMCA), each machine check
error of a SMCA bank type has an associated bit position in the bank's
control (CTL) register used for enabling / disabling reporting of the
very error. An error's bit position in the CTL register is also used
during error decoding for offsetting into the corresponding bank's error
description structure. As new errors are being added in newer AMD systems
for existing SMCA bank types, the underlying SMCA architecture guarantees
that the bit positions of existing errors are not altered.
However, on some AMD systems viz. Genoa, some of the existing bit
definitions in the CTL register of the Coherent Slave (CS) SMCA bank type
are reassigned without defining new HWID and McaType. Consequently, the
very errors whose bit definitions have been reassigned in the CTL register
are being erroneously decoded.
As a solution, create a new software defined SMCA bank type by utilizing
one of the hardware-reserved values for HWID. The new SMCA bank type will
only be employed for CS error decoding on affected CPU models.
Additionally, since the existing error description structure for the CS
SMCA bank type is still valid, add new error description structure to
compensate for the reassigned bit definitions.
Signed-off-by: Avadhut Naik <avadnaik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7ec787a..e81f732 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -57,6 +57,7 @@ enum smca_bank_types {
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
SMCA_CS_V2,
+ SMCA_CS_V2_QUIRK,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_UMC_V2,
@@ -259,6 +260,31 @@ static const char * const smca_cs2_mce_desc[] = {
"Hardware Assert Error",
};
+/*
+ * Per Genoa's revision guide, erratum 1384, existing bit definitions
+ * are reassigned for SMCA CS bank type. This table is registered as
+ * smca_mce_descs[SMCA_CS_V2_QUIRK].
+ */
+static const char * const smca_cs2_quirk_mce_desc[] = {
+ "Illegal Request",
+ "Address Violation",
+ "Security Violation",
+ "Illegal Response",
+ "Unexpected Response",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic Request Parity Error",
+ "SDP read response had no match in the CS queue",
+ "SDP read response had an unexpected RETRY error",
+ "Counter overflow error",
+ "Counter underflow error",
+ "Probe Filter Protocol Error",
+ "Probe Filter ECC Error",
+ "Illegal Request on the no data channel",
+ "Address Violation on the no data channel",
+ "Security Violation on the no data channel",
+ "Hardware Assert Error",
+};
+
static const char * const smca_pie_mce_desc[] = {
"Hardware assert",
"Register security violation",
@@ -549,6 +575,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
[SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
+ [SMCA_CS_V2_QUIRK] = { smca_cs2_quirk_mce_desc, ARRAY_SIZE(smca_cs2_quirk_mce_desc)},
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
@@ -597,6 +624,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Data Fabric MCA types */
{ SMCA_CS, 0x0000002E },
{ SMCA_CS_V2, 0x0002002E },
+ {SMCA_CS_V2_QUIRK, 0x00010000 },
{ SMCA_PIE, 0x0001002E },
/* Unified Memory Controller MCA type */
@@ -660,7 +688,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_EX] = { "Execution Unit" },
[SMCA_FP] = { "Floating Point Unit" },
[SMCA_L3_CACHE] = { "L3 Cache" },
- [SMCA_CS ... SMCA_CS_V2] = { "Coherent Slave" },
+ [SMCA_CS ... SMCA_CS_V2_QUIRK] = { "Coherent Slave" },
[SMCA_PIE] = { "Power, Interrupts, etc." },
[SMCA_UMC] = { "Unified Memory Controller" },
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
@@ -723,8 +751,38 @@ static int find_hbm_channel(struct mce_event *e)
return (umc % 2) ? tmp + 4 : tmp;
}
+static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype) /* rewrites *hwid_mcatype in place for the CPU family/model ranges listed below; other values pass through unchanged */
+{
+ if (m->family == 0x19) {
+ switch (m->model) {
+ /*
+ * Per Genoa's revision guide, erratum 1384, some SMCA Extended
+ * Error Codes and SMCA Control bits are incorrect for SMCA CS
+ * bank type.
+ */
+ case 0x10 ... 0x1F:
+ case 0x60 ... 0x7B:
+ case 0xA0 ... 0xAF:
+ if (*hwid_mcatype == 0x0002002E) /* value listed for SMCA_CS_V2 in smca_hwid_mcatypes[] */
+ *hwid_mcatype = 0x00010000; /* value listed for SMCA_CS_V2_QUIRK in smca_hwid_mcatypes[] */
+ break;
+ default:
+ break;
+ }
+ } else if (m->family == 0x1A) {
+ switch (m->model) {
+ case 0x40 ... 0x4F: /* same SMCA_CS_V2 -> SMCA_CS_V2_QUIRK remap as the family 0x19 ranges */
+ if (*hwid_mcatype == 0x0002002E)
+ *hwid_mcatype = 0x00010000;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
/* Decode extended errors according to Scalable MCA specification */
-static void decode_smca_error(struct mce_event *e)
+static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
{
enum smca_bank_types bank_type;
const char *ip_name;
@@ -735,6 +793,8 @@ static void decode_smca_error(struct mce_event *e)
unsigned int csrow = -1, channel = -1;
unsigned int i;
+ fixup_hwid(m, &mcatype_hwid);
+
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
s_hwid = &smca_hwid_mcatypes[i];
if (mcatype_hwid == s_hwid->mcatype_hwid) {
@@ -801,7 +861,7 @@ int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
if (mcgstatus & MCG_STATUS_MCIP)
mce_snprintf(e->mcgstatus_msg, "MCIP");
- decode_smca_error(e);
+ decode_smca_error(e, ras->mce_priv);
amd_decode_errcode(e);
return 0;
}

View File

@ -0,0 +1,30 @@
commit b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4
Author: DmNosachev <quartz64@gmail.com>
Date: Tue Jun 29 13:48:55 2021 +0300
labels/supermicro: added Supermicro X10DRI(-T)
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 47ea05f..86e4617 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -81,4 +81,14 @@ Vendor: Supermicro
P2-DIMMC1: 2.2.0;
P2-DIMMD1: 3.0.0; P2-DIMMD2: 3.0.1;
P2-DIMME1: 3.1.0;
- P2-DIMMF1: 3.2.0;
\ No newline at end of file
+ P2-DIMMF1: 3.2.0;
+
+ Model: X10DRI, X10DRI-T
+ P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1;
+ P1-DIMMB1: 0.1.0; P1-DIMMB2: 0.1.1;
+ P1-DIMMC1: 0.2.0; P1-DIMMC2: 0.2.1;
+ P1-DIMMD1: 0.3.0; P1-DIMMD2: 0.3.1;
+ P2-DIMME1: 1.0.0; P2-DIMME2: 1.0.1;
+ P2-DIMMF1: 1.1.0; P2-DIMMF2: 1.1.1;
+ P2-DIMMG1: 1.2.0; P2-DIMMG2: 1.2.1;
+ P2-DIMMH1: 1.3.0; P2-DIMMH2: 1.3.1;
\ No newline at end of file

View File

@ -0,0 +1,208 @@
commit b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Thu Aug 31 02:23:48 2023 -0500
rasdaemon: Fix SMCA bank type decoding
On AMD systems with Scalable MCA (SMCA), the (HWID, MCATYPE) tuple from
the MCA_IPID MSR, bits 43:32 and 63:48 respectively, are used for SMCA
bank type decoding. On occurrence of an SMCA error, the cached tuples are
compared against the tuple read from the MCA_IPID MSR to determine the
SMCA bank type.
Currently however, all high 32 bits of the MCA_IPID register are cached in
the rasdaemon for all SMCA bank types. Bits 47:44 which do not play a part
in bank type decoding are zeroed out. Likewise, when an SMCA error occurs,
all high 32 bits of the MCA_IPID register are read and compared against
the cached values in smca_hwid_mcatypes array.
This can lead to erroneous bank type decoding since the bits 47:44 are
not guaranteed to be zero. They are either reserved or, on some modern
AMD systems viz. Genoa, denote the InstanceIdHi value. The bits therefore,
should not be associated with SMCA bank type decoding.
Import the HWID_MCATYPE macro from the kernel to ensure that only the
relevant fields i.e. (HWID, MCATYPE) tuples are used for SMCA bank type
decoding on occurrence of an SMCA error.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index a20f03c..55620e2 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -90,6 +90,12 @@ enum smca_bank_types {
/* Maximum number of MCA banks per CPU. */
#define MAX_NR_BANKS 64
+#define MCI_IPID_MCATYPE 0xFFFF0000 /* mask for the McaType field within the high 32 bits of MCA_IPID (register bits 63:48) */
+#define MCI_IPID_HWID 0xFFF /* mask for the HWID field within the high 32 bits of MCA_IPID (register bits 43:32) */
+
+/* Pack a (HWID, MCATYPE) pair into one comparable value: hwid shifted left by 16, mcatype in the low bits */
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
/*
* On Newer heterogeneous systems from AMD with CPU and GPU nodes connected
* via xGMI links, the NON CPU Nodes are enumerated from index 8
@@ -699,76 +705,76 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, mcatype_hwid } */
/* ZN Core (HWID=0xB0) MCA types */
- { SMCA_LS, 0x000000B0 },
- { SMCA_LS_V2, 0x001000B0 },
- { SMCA_IF, 0x000100B0 },
- { SMCA_L2_CACHE, 0x000200B0 },
- { SMCA_DE, 0x000300B0 },
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) },
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) },
/* HWID 0xB0 MCATYPE 0x4 is Reserved */
- { SMCA_EX, 0x000500B0 },
- { SMCA_FP, 0x000600B0 },
- { SMCA_L3_CACHE, 0x000700B0 },
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) },
/* Data Fabric MCA types */
- { SMCA_CS, 0x0000002E },
- { SMCA_CS_V2, 0x0002002E },
- {SMCA_CS_V2_QUIRK, 0x00010000 },
- { SMCA_PIE, 0x0001002E },
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
+ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_CS_V2_QUIRK, HWID_MCATYPE(0x0, 0x1) },
/* Unified Memory Controller MCA type */
- { SMCA_UMC, 0x00000096 },
- { SMCA_UMC_QUIRK, 0x00020000 },
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
+ { SMCA_UMC_QUIRK, HWID_MCATYPE(0x0, 0x2) },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
- { SMCA_UMC_V2, 0x00010096 },
+ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) },
/* Memory Attached Last Level Cache */
- { SMCA_MA_LLC, 0x0004002E },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
/* Parameter Block MCA type */
- { SMCA_PB, 0x00000005 },
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
/* Platform Security Processor MCA type */
- { SMCA_PSP, 0x000000FF },
- { SMCA_PSP_V2, 0x000100FF },
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) },
+ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) },
/* System Management Unit MCA type */
- { SMCA_SMU, 0x00000001 },
- { SMCA_SMU_V2, 0x00010001 },
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) },
+ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) },
/* Microprocessor 5 Unit MCA type */
- { SMCA_MP5, 0x00020001 },
+ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
/* MPDMA MCA Type */
- { SMCA_MPDMA, 0x00030001 },
+ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
/* Northbridge IO Unit MCA type */
- { SMCA_NBIO, 0x00000018 },
+ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
/* PCI Express Unit MCA type */
- { SMCA_PCIE, 0x00000046 },
- { SMCA_PCIE_V2, 0x00010046 },
+ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
+ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
/* Ext Global Memory Interconnect PCS MCA type */
- { SMCA_XGMI_PCS, 0x00000050 },
+ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
- { SMCA_NBIF, 0x0000006C },
+ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
- { SMCA_SHUB, 0x00000080 },
- { SMCA_SATA, 0x000000A8 },
- { SMCA_USB, 0x000000AA },
+ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
+ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
+ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
/* Ultra Short Reach Data and Control Plane Controller */
- { SMCA_USR_DP, 0x00000170 },
- { SMCA_USR_CP, 0x00000180 },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
- { SMCA_GMI_PCS, 0x00000241 },
+ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
/* Ext Global Memory Interconnect PHY MCA type */
- { SMCA_XGMI_PHY, 0x00000259 },
+ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
/* WAFL PHY MCA type */
- { SMCA_WAFL_PHY, 0x00000267 },
+ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
- { SMCA_GMI_PHY, 0x00000269 },
+ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
};
struct smca_bank_name {
@@ -862,12 +868,12 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
case 0x10 ... 0x1F:
case 0x60 ... 0x7B:
case 0xA0 ... 0xAF:
- if (*hwid_mcatype == 0x0002002E)
- *hwid_mcatype = 0x00010000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1);
break;
case 0x90 ... 0x9F:
- if ((*hwid_mcatype & 0xFF) == 0x00000096)
- *hwid_mcatype = 0x00020000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x96, 0x0))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x2);
break;
default:
break;
@@ -875,8 +881,8 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
} else if (m->family == 0x1A) {
switch (m->model) {
case 0x40 ... 0x4F:
- if (*hwid_mcatype == 0x0002002E)
- *hwid_mcatype = 0x00010000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1);
break;
default:
break;
@@ -889,13 +895,17 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
{
enum smca_bank_types bank_type;
const char *ip_name;
+ uint32_t mcatype_hwid = 0;
unsigned short xec = (e->status >> 16) & 0x3f;
const struct smca_hwid *s_hwid;
- uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
+ uint32_t ipid_high = EXTRACT(e->ipid, 32, 63);
uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
unsigned int csrow = -1, channel = -1;
unsigned int i;
+ mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
+ (ipid_high & MCI_IPID_MCATYPE) >> 16);
+
fixup_hwid(m, &mcatype_hwid);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {

View File

@ -0,0 +1,37 @@
commit c785d309dcbdeb7ecd219975244f3944a8d047e9
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Thu Jul 27 10:18:12 2023 +0000
rasdaemon: Identify the Die Number in multi-die system
Some AMD systems have 4 dies in each socket, and the Die ID indicates
whether the error occurred on a CPU die or a GPU die.
The respective Die ID is also used for FRU identification.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 54060ee..a20f03c 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -935,10 +935,15 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
xec);
if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_QUIRK) && xec == 0) {
- channel = find_umc_channel(e);
- csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
- mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
- channel, csrow);
+ if ((m->family == 0x19) && (m->model >= 0x90 && m->model <= 0x9f)) {
+ /* MCA_IPID[InstanceIdHi] give the AMD Node Die ID */
+ mce_snprintf(e->mc_location, "memory_die_id=%d", mcatype_instancehi / 4);
+ } else {
+ channel = find_umc_channel(e);
+ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
+ channel, csrow);
+ }
}
if (bank_type == SMCA_UMC_V2 && xec == 0) {

View File

@ -0,0 +1,42 @@
commit d0e0bb3d73c4bc5060da20270a089857bba2a64c
Author: Justin Vreeland <vreeland.justin@gmail.com>
Date: Tue Nov 2 19:51:50 2021 -0700
Update ras-mc-ctl manpage to match current options
Signed-off-by: Justin Vreeland <vreeland.justin@gmail.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/man/ras-mc-ctl.8.in b/man/ras-mc-ctl.8.in
index 26230e0..a605122 100644
--- a/man/ras-mc-ctl.8.in
+++ b/man/ras-mc-ctl.8.in
@@ -79,9 +79,27 @@ Specify an alternate location for the labels database.
Specify a delay of \fBtime\fR seconds before registering DIMM labels.
Only meaninful if used together with --register-labels.
.TP
-.BI "--layout
+.BI "--layout"
Prints the memory layout as detected by the EDAC driver. Useful to check
if the EDAC driver is properly detecting the memory controller architecture.
+.TP
+.BI "--summary"
+Presents a summary of the logged errors.
+.TP
+.BI "--errors"
+Shows the errors stored at the error database.
+.TP
+.BI "--error-count"
+Shows the corrected and uncorrected error counts using sysfs.
+.TP
+.BI "--vendor-errors-summary="platform-id
+Presents a summary of the vendor-specific logged errors.
+.TP
+.BI "--vendor-errors="platform-id
+Shows the vendor-specific errors stored in the error database.
+.TP
+.BI "--vendor-platforms"
+Shows the supported platforms with platform-ids for the vendor-specific errors.
.SH MAINBOARD CONFIGURATION
.PP

View File

@ -0,0 +1,27 @@
commit dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b
Author: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed May 26 12:55:54 2021 +0200
Add support for multi-arch builds
Allow building rasdaemon on several architectures:
- x86_64
- arm 64
- ppc 64 LE
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 747a844..898687c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,6 +9,9 @@ jobs:
Ubuntu:
name: Ubuntu
runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ arch: [x64_64, aarch64, ppc64le]
steps:
- uses: actions/checkout@v2
- name: prepare

View File

@ -0,0 +1,31 @@
commit ec443ec0add059fa897f844349e1a2345d81713c
Author: DmNosachev <quartz64@gmail.com>
Date: Tue Jun 29 11:33:10 2021 +0300
labels/supermicro: added x11dph-i labels
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 3fd6fee..bfaed93 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -68,3 +68,17 @@ Vendor: Supermicro
P1_DIMM4B: 1.1.1;
P2_DIMM4B: 2.0.1;
P2_DIMM4B: 2.1.1;
+
+ Model: X11DPH-i
+ P1-DIMMA1: 0.0.0; P1-DIMMA2: 0.0.1;
+ P1-DIMMB1: 0.1.0;
+ P1-DIMMC1: 0.2.0;
+ P1-DIMMD1: 1.0.0; P1-DIMMD2: 1.0.1;
+ P1-DIMME1: 1.1.0;
+ P1-DIMMF1: 1.2.0;
+ P2-DIMMA1: 2.0.0; P2-DIMMA2: 2.0.1;
+ P2-DIMMB1: 2.1.0;
+ P2-DIMMC1: 2.2.0;
+ P2-DIMMD1: 3.0.0; P2-DIMMD2: 3.0.1;
+ P2-DIMME1: 3.1.0;
+ P2-DIMMF1: 3.2.0;
\ No newline at end of file

View File

@ -0,0 +1,48 @@
commit f7cdd720297cd17e405a7170c04df89d1d9536f8
Author: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed May 26 12:35:55 2021 +0200
Add a github workflow for CI automation
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..5b3e757
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,34 @@
+name: CI
+
+# Should run only on branches and PR, as "on_tag.yml" will handle tags
+on:
+ push:
+ branches: master test
+ pull_request:
+ branches: master
+
+jobs:
+
+#
+# Linux
+#
+ Ubuntu:
+ name: Ubuntu
+ runs-on: ubuntu-20.04
+ strategy:
+ matrix:
+ arch: [x64_64, aarch64, armv7, ppc64le]
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ arch: ${{ matrix.arch }}
+ - name: prepare
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y build-essential sqlite3
+ - name: build
+ run: |
+ autoreconf -vfi
+ ./configure --enable-all
+ make
+ sudo make install

View File

@ -0,0 +1,30 @@
commit fc1dd37d422fc907416afd028514fff59b63ae12
Author: DmNosachev <quartz64@gmail.com>
Date: Wed Jun 30 16:49:18 2021 +0300
labels/supermicro: added Supermicro B1DRi
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/labels/supermicro b/labels/supermicro
index 373de07..b924a32 100644
--- a/labels/supermicro
+++ b/labels/supermicro
@@ -105,4 +105,14 @@ Vendor: Supermicro
P2-DIMMC1: 2.2.0;
P2-DIMMD1: 3.0.0;
P2-DIMME1: 3.1.0;
- P2-DIMMF1: 3.2.0;
\ No newline at end of file
+ P2-DIMMF1: 3.2.0;
+
+ Model: B1DRi
+ P1_DIMMA1: 0.0.0;
+ P1_DIMMB1: 0.1.0;
+ P1_DIMMC1: 0.2.0;
+ P1_DIMMD1: 0.3.0;
+ P2_DIMME1: 1.0.0;
+ P2_DIMMF1: 1.1.0;
+ P2_DIMMG1: 1.2.0;
+ P2_DIMMH1: 1.3.0;
\ No newline at end of file

View File

@ -0,0 +1,28 @@
commit fcdffdcb28ece67ed78e3575a3dce45d9dd4f015
Author: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed May 26 10:37:52 2021 +0200
rasdaemon.spec.in: Fix the description on this example file
While this is used just to test if building it is OK, better
to keep the logs nice ;-)
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/misc/rasdaemon.spec.in b/misc/rasdaemon.spec.in
index 6ef223f..afa4359 100644
--- a/misc/rasdaemon.spec.in
+++ b/misc/rasdaemon.spec.in
@@ -61,10 +61,10 @@ rm INSTALL %{buildroot}/usr/include/*.h
%changelog
* Wed May 26 2021 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> 0.6.7-1
-- Bump to version 0.6.5 with several fixes and additions
+- Bump to version 0.6.7 with several fixes and additions
* Tue Jul 21 2020 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> 0.6.6-1
-- Bump to version 0.6.5 with several fixes, new hip08 events and memory prediction analysis
+- Bump to version 0.6.6 with several fixes, new hip08 events and memory prediction analysis
* Wed Nov 20 2019 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> 0.6.5-1
- Bump to version 0.6.5 with several fixes and improves PCIe events record

6
gating.yaml Normal file
View File

@ -0,0 +1,6 @@
--- !Policy
product_versions:
- rhel-9
decision_context: osci_compose_gate
rules:
- !PassingTestCaseRule {test_case_name: osci.brew-build.tier0.functional}

View File

@ -1,9 +1,40 @@
Add labels directory from upstream
Labels directory doesn't get exported by tarball releases.
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
---
labels/dell | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 152 insertions(+)
labels/asus | 20 +++++++
labels/dell | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
labels/supermicro | 70 ++++++++++++++++++++++++
3 files changed, 242 insertions(+)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/labels/dell 2020-02-20 11:53:39.574579258 -0500
+++ rasdaemon-0.6.7/labels/asus 2022-02-08 15:44:53.563362010 -0500
@@ -0,0 +1,20 @@
+# RASDAEMON Motherboard DIMM labels Database file.
+#
+# Vendor-name and model-name are found from the program 'dmidecode'
+# labels are found from the silk screen on the motherboard.
+#
+#Vendor: <vendor-name>
+# Product: <product-name>
+# Model: <model-name>
+# <label>: <mc>.<top>.<mid>.<low>
+#
+#
+#Vendor: <vendor-name>
+# Model: <model-name>
+# <label>: <mc>.<row>.<channel>
+#
+
+Vendor: ASUSTeK COMPUTER INC.
+ Model: PRIME X570-PRO
+ DIMM_A1: 0.0.1, 0.1.1; DIMM_A2: 0.2.1, 0.3.1;
+ DIMM_B1: 0.0.0, 0.1.0; DIMM_B2: 0.2.0, 0.3.0;
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.7/labels/dell 2022-02-08 15:44:53.564361999 -0500
@@ -0,0 +1,152 @@
+# RASDAEMON Motherboard DIMM labels Database file.
+#
@ -157,3 +188,76 @@
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 3.0.1;
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.7/labels/supermicro 2022-02-08 15:44:53.564361999 -0500
@@ -0,0 +1,70 @@
+# RASDAEMON Motherboard DIMM labels Database file.
+#
+# Vendor-name and model-name are found from the program 'dmidecode'
+# labels are found from the silk screen on the motherboard.
+#
+#Vendor: <vendor-name>
+# Product: <product-name>
+# Model: <model-name>
+# <label>: <mc>.<top>.<mid>.<low>
+#
+
+Vendor: Supermicro
+ Model: A2SDi-8C-HLN4F
+ DIMMA1: 0.0.0; DIMMA2: 0.0.1;
+ DIMMB1: 0.1.0; DIMMB2: 0.1.1;
+
+ Model: A2SDi-8C+-HLN4F
+ DIMMA1: 0.0.0; DIMMA2: 0.0.1;
+ DIMMB1: 0.1.0; DIMMB2: 0.1.1;
+
+ Product: X10SRA-F
+ DIMMA1: 0.0.0
+ DIMMA2: 0.0.1
+ DIMMB1: 0.1.0
+ DIMMB2: 0.1.1
+ DIMMC1: 1.0.0
+ DIMMC2: 1.0.1
+ DIMMD1: 1.1.0
+ DIMMD2: 1.1.1
+
+ Product: H8DGU
+ P1_DIMM1A: 0.2.0;
+ P1_DIMM1A: 0.3.0;
+ P2_DIMM1A: 3.2.0;
+ P2_DIMM1A: 3.3.0;
+
+ P1_DIMM2A: 0.2.1;
+ P1_DIMM2A: 0.3.1;
+ P2_DIMM2A: 3.2.1;
+ P2_DIMM2A: 3.3.1;
+
+ P1_DIMM3A: 1.2.0;
+ P1_DIMM3A: 1.3.0;
+ P2_DIMM3A: 2.2.0;
+ P2_DIMM3A: 2.3.0;
+
+ P1_DIMM4A: 1.2.1;
+ P1_DIMM4A: 1.3.1;
+ P2_DIMM4A: 2.2.1;
+ P2_DIMM4A: 2.3.1;
+
+ P1_DIMM1B: 0.0.0;
+ P1_DIMM1B: 0.2.0;
+ P2_DIMM1B: 3.0.0;
+ P2_DIMM1B: 3.1.0;
+
+ P1_DIMM2B: 0.0.1;
+ P1_DIMM2B: 0.1.1;
+ P2_DIMM2B: 3.0.1;
+ P2_DIMM2B: 3.1.1;
+
+ P1_DIMM3B: 1.0.0;
+ P1_DIMM3B: 1.1.0;
+ P2_DIMM3B: 2.0.0;
+ P2_DIMM3B: 2.1.0;
+
+ P1_DIMM4B: 1.0.1;
+ P1_DIMM4B: 1.1.1;
+ P2_DIMM4B: 2.0.1;
+ P2_DIMM4B: 2.1.1;

View File

@ -1,17 +1,48 @@
Name: rasdaemon
Version: 0.6.1
Release: 13%{?dist}
Version: 0.6.7
Release: 10%{?dist}
Summary: Utility to receive RAS error tracings
Group: Applications/System
License: GPLv2
License: GPL-2.0-only
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2
Patch0: labels.patch
Patch1: fcdffdcb28ece67ed78e3575a3dce45d9dd4f015.patch
Patch2: f7cdd720297cd17e405a7170c04df89d1d9536f8.patch
Patch3: 2b37a26dcec389723f75d69d3da9c2f15f6c317d.patch
Patch4: dda7d95bcbbb95e0db557a7a9325ee9815ab4e9b.patch
Patch5: 738bafafdcb2e8b0ced32fff31b13754d571090b.patch
Patch6: 1ff5f3d2a0fcd48add9462567c30fe0e14585fb4.patch
Patch7: 9acef39f13833f7d53ef96abc5a72e79384260f4.patch
Patch8: 28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch
Patch9: aecf33aa70331670c06db6b652712b476e24051c.patch
Patch10: 7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch
Patch11: ec443ec0add059fa897f844349e1a2345d81713c.patch
Patch12: 9a5baed97b21af31064d9995ffcfaac0e9d7983e.patch
Patch13: b4402d36e1b42fb7b0d8ddccc83463a6e622dbc4.patch
Patch14: 50565005b10fe909c66f1c90f2feb95712427c7d.patch
Patch15: fc1dd37d422fc907416afd028514fff59b63ae12.patch
Patch16: 6bc43db1b6b3d73805179c21d1dd5521e8dc0f74.patch
Patch17: 2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch
Patch18: 7ccf12f5ae26a055926d175d908c7930293438c4.patch
Patch19: 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch
Patch20: d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch
Patch21: 30158ef8d7aebc3e5201bf39b73ce7644f8e419e.patch
Patch22: aa36c96cd52d775570dae989dd95a060f1149077.patch
Patch23: 932118b04a04104dfac6b8536419803f236e6118.patch
Patch24: 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch
Patch25: 2d15882a0cbfce0b905039bebc811ac8311cd739.patch
Patch26: c785d309dcbdeb7ecd219975244f3944a8d047e9.patch
Patch27: b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch
ExcludeArch: s390 s390x
BuildRequires: make
BuildRequires: gcc
BuildRequires: gettext-devel
BuildRequires: perl-generators
BuildRequires: sqlite-devel
BuildRequires: systemd
BuildRequires: autoconf
BuildRequires: automake
BuildRequires: libtool
Provides: bundled(kernel-event-lib)
Requires: hwdata
@ -24,30 +55,6 @@ Requires(post): systemd
Requires(preun): systemd
Requires(postun): systemd
Patch1: 60a91e4da4f2daf2b10143fc148a8043312b61e5.patch
Patch2: a16ca0711001957ee98f2c124abce0fa1f801529.patch
Patch3: add_upstream_labels.patch
Patch4: b22be68453b2497e86cbd273b9cd56fadc5859e3.patch
Patch5: 2a1d217660351c08eb2f8bccebf939abba2f7e69.patch
Patch6: 8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch
Patch7: cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch
Patch8: 854364ba44aee9bc5646f6537fc744b0b54aff37.patch
Patch9: 9acef39f13833f7d53ef96abc5a72e79384260f4.patch
Patch10: 28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch
Patch11: aecf33aa70331670c06db6b652712b476e24051c.patch
Patch12: 7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch
Patch13: rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch
Patch14: 0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch
Patch15: 546cf713f667437fb6e283cc3dc090679eb47d08.patch
Patch16: 2290d65b97311dd5736838f1e285355f7f357046.patch
Patch17: 16d929b024c31d54a7f8a72eab094376c7be27f5.patch
Patch18: b497a3d6a39d402c41065e9284d49114b97e3bfe.patch
Patch19: ce6e7864f11f709c4f803828fbc8e507d115d03b.patch
Patch20: a8c776ed94f68ae31d7b5f74e19545698898c13c.patch
Patch21: 899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d.patch
Patch22: e8b97ec14a11764fedfea50bd4d96ddda43c7fc1.patch
Patch23: ce33041e0abfa20054ff5d6874ffbd1ab592558d.patch
%description
%{name} is a RAS (Reliability, Availability and Serviceability) logging tool.
It currently records memory errors, using the EDAC tracing events.
@ -60,6 +67,7 @@ an utility for reporting current error counts from the EDAC sysfs files.
%prep
%setup -q
%patch0 -p1
%patch1 -p1
%patch2 -p1
%patch3 -p1
@ -83,6 +91,10 @@ an utility for reporting current error counts from the EDAC sysfs files.
%patch21 -p1
%patch22 -p1
%patch23 -p1
%patch24 -p1
%patch25 -p1
%patch26 -p1
%patch27 -p1
# The tarball is locked in time the first time aclocal was run and will keep
# requiring an older version of automake
@ -90,9 +102,9 @@ autoreconf -vfi
%build
%ifarch %{arm} aarch64
%configure --enable-aer --enable-sqlite3 --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm
%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-non-standard --enable-arm --enable-hisi-ns-decode
%else
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-memory-failure
%configure --enable-sqlite3 --enable-aer --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report
%endif
make %{?_smp_mflags}
@ -100,8 +112,12 @@ make %{?_smp_mflags}
make install DESTDIR=%{buildroot}
install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service
install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service
install -D -p -m 0655 labels/* %{buildroot}%{_sysconfdir}/ras/dimm_labels.d
rm INSTALL %{buildroot}/usr/include/*.h
mkdir -p %{buildroot}/%{_sharedstatedir}/rasdaemon
install -d -p -m 0755 %{buildroot}/%{_sharedstatedir}/rasdaemon
mkdir -p %{buildroot}/%{_sysconfdir}/sysconfig
install -D -p -m 0644 misc/rasdaemon.env %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon
sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir}/sysconfig/rasdaemon
%files
%doc AUTHORS ChangeLog COPYING README TODO
@ -111,43 +127,75 @@ rm INSTALL %{buildroot}/usr/include/*.h
%{_unitdir}/*.service
%{_sharedstatedir}/rasdaemon
%{_sysconfdir}/ras/dimm_labels.d
%{_sysconfdir}/sysconfig/rasdaemon
%changelog
* Mon Jan 23 2023 Aristeu Rozanski <aris@redhat.com> 0.6.1-13
- Fixing covscan issues [2073516]
* Wed Jan 10 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-10
- Update License string to use SPDX [RHELMISC-1262]
* Tue Oct 12 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-12
- Adding missing bits from b497a3d6a39d402c41065e9284d49114b97e3bfe [1923254]
* Thu Oct 26 2023 Aristeu Rozanski <aris@redhat.com> 0.6.7-9
- Update SMCA support for AMD processors [RHEL-11092]
* Tue Oct 12 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-11
- Removed bits from devlink and diskerrors that aren't used yet [1923254]
* Tue May 03 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-8
- Update ras-mc-ctl manpage to match current options [2079132]
* Tue Oct 12 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-10
- Add miscellaneous patches required by customer [1923254]
* Mon May 02 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-7
- Fix issue printing memory module sizes [2080596]
* Wed Oct 06 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-9
- Prevent ras-mc-ctl trying to access extlog and mce tables if rasdaemon was built without support for them [2011404]
* Thu Mar 31 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-6
- Merging 2065729 fixes into 9.1 branch [2067499]
* Thu Aug 26 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-8
- Disable MCE and extlog in arm packages [2009499]
* Thu Mar 24 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-5
- Trying to guess what's going on on the testing side [2065729]
* Thu Aug 26 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-7
- Add support for AMD SMCA banks for family 19 [1991955]
* Thu Mar 24 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-4
- Adding simple test to stop being gated [2065729]
* Wed May 26 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-6
- Add support for AMD SMCA [1965011]
* Thu Mar 24 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-3
- Adding gating.yaml [2065729]
* Wed Apr 08 2020 Aristeu Rozanski <aris@redhat.com> 0.6.1-5
- Fix high CPU usage when CPUs are offline [1683420]
* Fri Mar 18 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-2
- Adding missing rasdaemon environment configuration to /etc/sysconfig/rasdaemon [2065729]
* Wed Apr 08 2020 Aristeu Rozanski <aris@redhat.com> 0.6.1-4
- Include upstream labels [1665418]
* Tue Feb 08 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-1
- Bumped to 0.6.7
- Backported patches that sit on top of 0.6.7 without being released
Related: rhbz#2052190
* Thu Jul 11 2019 Aristeu Rozanski <aris@redhat.com> 0.6.1-3
- Add support for AMD scalable MCA [1725488]
* Tue Aug 10 2021 Mohan Boddu <mboddu@redhat.com> - 0.6.4-6
- Rebuilt for IMA sigs, glibc 2.34, aarch64 flags
Related: rhbz#1991688
* Mon Aug 20 2018 Aristeu Rozanski <aris@redhat.com> 0.6.1-2
- Add support for error count display [1573685]
* Fri Apr 16 2021 Mohan Boddu <mboddu@redhat.com> - 0.6.4-5
- Rebuilt for RHEL 9 BETA on Apr 15th 2021. Related: rhbz#1947937
* Wed Jan 27 2021 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.4-4
- Rebuilt for https://fedoraproject.org/wiki/Fedora_34_Mass_Rebuild
* Wed Jul 29 2020 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.4-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_33_Mass_Rebuild
* Thu Jan 30 2020 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.4-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild
* Thu Oct 10 2019 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.4-1
- Bump to version 0.6.4 with some DB changes for hip08 and some fixes
* Fri Aug 23 2019 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.3-1
- Bump to version 0.6.3 with new ARM events, plus disk I/O and netlink support
* Fri Jul 26 2019 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.2-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_31_Mass_Rebuild
* Sat Feb 02 2019 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.2-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild
* Tue Aug 14 2018 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.2-1
- Bump to version 0.6.2 with improvements for PCIe AER parsing and at ras-mc-ctl tool
* Sat Jul 14 2018 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.1-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_29_Mass_Rebuild
* Wed Apr 25 2018 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.1-1
- Bump to version 0.6.1 adding support for Skylake Xeon MSCOD, a bug fix and some new DELL labels
@ -213,7 +261,7 @@ rm INSTALL %{buildroot}/usr/include/*.h
* Tue Sep 10 2013 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.4.2-1
- Fix ras-mc-ctl layout filling
* Sun Aug 04 2013 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.4.1-5
- Rebuilt for https://fedoraproject.org/wiki/Fedora_20_Mass_Rebuild
@ -225,7 +273,7 @@ rm INSTALL %{buildroot}/usr/include/*.h
* Wed May 29 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-2
- Fix the name of perl-DBD-SQLite package
* Wed May 29 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-1
- Updated to version 0.4.1 which contains some bug fixes
@ -234,3 +282,4 @@ rm INSTALL %{buildroot}/usr/include/*.h
* Mon May 20 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.3.0-1
- Package created

1
sources Normal file
View File

@ -0,0 +1 @@
SHA512 (rasdaemon-0.6.7.tar.bz2) = 15beae5d4964c49b7b7f9e731948b5def9622fba5d7d17ce52a282d7834d256366cdf3cf427b82b2a6a8fd0c99f202f545000bdb06064fbae7ae0296aef0946c

3
tests/basic-test.sh Normal file
View File

@ -0,0 +1,3 @@
#!/bin/sh
# Smoke test for the rasdaemon package: start the service, query its status
# through systemctl, and ask ras-mc-ctl for its summary of logged errors.
# Prints "PASS" and exits 0 only when every step succeeds; otherwise prints
# "FAIL" and exits 1.
if systemctl start rasdaemon &&
   systemctl status rasdaemon &&
   ras-mc-ctl --summary &&
   echo "PASS"
then
    exit 0
fi

echo "FAIL"
exit 1

9
tests/tests.yml Normal file
View File

@ -0,0 +1,9 @@
- hosts: localhost
roles:
- role: standard-test-basic # this is a standard test role, it takes care of the test environment, logging, archiving results..
tags:
- classic
tests:
- simple:
dir: .
run: "./basic-test.sh" # this is your test command, its exit code is the outcome of the test