Update SMCA support for AMD processors
This commit is contained in:
parent
6d9c9b23dc
commit
5931fdef9f
1
.rasdaemon.metadata
Normal file
1
.rasdaemon.metadata
Normal file
@ -0,0 +1 @@
|
||||
8ae34f40b676a0843be6647854b950f45161e7d4 rasdaemon-0.6.7.tar.bz2
|
163
1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch
Normal file
163
1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch
Normal file
@ -0,0 +1,163 @@
|
||||
commit 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c
|
||||
Author: Muralidhara M K <muralidhara.mk@amd.com>
|
||||
Date: Fri Jun 30 10:36:53 2023 +0000
|
||||
|
||||
rasdaemon: Add new MA_LLC, USR_DP, and USR_CP bank types.
|
||||
|
||||
Add HWID and McaType values for new SMCA bank types
|
||||
and error decoding for those new SMCA banks.
|
||||
|
||||
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
||||
|
||||
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
|
||||
index 7c88a46..fc51b5a 100644
|
||||
--- a/mce-amd-smca.c
|
||||
+++ b/mce-amd-smca.c
|
||||
@@ -61,6 +61,7 @@ enum smca_bank_types {
|
||||
SMCA_PIE, /* Power, Interrupts, etc. */
|
||||
SMCA_UMC, /* Unified Memory Controller */
|
||||
SMCA_UMC_V2,
|
||||
+ SMCA_MA_LLC, /* Memory Attached Last Level Cache */
|
||||
SMCA_PB, /* Parameter Block */
|
||||
SMCA_PSP, /* Platform Security Processor */
|
||||
SMCA_PSP_V2,
|
||||
@@ -76,6 +77,8 @@ enum smca_bank_types {
|
||||
SMCA_SHUB, /* System Hub Unit */
|
||||
SMCA_SATA, /* SATA Unit */
|
||||
SMCA_USB, /* USB Unit */
|
||||
+ SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */
|
||||
+ SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */
|
||||
SMCA_GMI_PCS, /* GMI PCS Unit */
|
||||
SMCA_XGMI_PHY, /* xGMI PHY Unit */
|
||||
SMCA_WAFL_PHY, /* WAFL PHY Unit */
|
||||
@@ -325,6 +328,16 @@ static const char * const smca_umc2_mce_desc[] = {
|
||||
"LM32 MP errors",
|
||||
};
|
||||
|
||||
+static const char * const smca_mall_mce_desc[] = {
|
||||
+ "Counter overflow error",
|
||||
+ "Counter underflow error",
|
||||
+ "Write Data Parity Error",
|
||||
+ "Read Response Parity Error",
|
||||
+ "Cache Tag ECC Error Macro 0",
|
||||
+ "Cache Tag ECC Error Macro 1",
|
||||
+ "Cache Data ECC Error"
|
||||
+};
|
||||
+
|
||||
static const char * const smca_pb_mce_desc[] = {
|
||||
"An ECC error in the Parameter Block RAM array"
|
||||
};
|
||||
@@ -524,6 +537,57 @@ static const char * const smca_usb_mce_desc[] = {
|
||||
"AXI Slave Response error",
|
||||
};
|
||||
|
||||
+static const char * const smca_usrdp_mce_desc[] = {
|
||||
+ "Mst CMD Error",
|
||||
+ "Mst Rx FIFO Error",
|
||||
+ "Mst Deskew Error",
|
||||
+ "Mst Detect Timeout Error",
|
||||
+ "Mst FlowControl Error",
|
||||
+ "Mst DataValid FIFO Error",
|
||||
+ "Mac LinkState Error",
|
||||
+ "Deskew Error",
|
||||
+ "Init Timeout Error",
|
||||
+ "Init Attempt Error",
|
||||
+ "Recovery Timeout Error",
|
||||
+ "Recovery Attempt Error",
|
||||
+ "Eye Training Timeout Error",
|
||||
+ "Data Startup Limit Error",
|
||||
+ "LS0 Exit Error",
|
||||
+ "PLL powerState Update Timeout Error",
|
||||
+ "Rx FIFO Error",
|
||||
+ "Lcu Error",
|
||||
+ "Conv CECC Error",
|
||||
+ "Conv UECC Error",
|
||||
+ "Reserved",
|
||||
+ "Rx DataLoss Error",
|
||||
+ "Replay CECC Error",
|
||||
+ "Replay UECC Error",
|
||||
+ "CRC Error",
|
||||
+ "BER Exceeded Error",
|
||||
+ "FC Init Timeout Error",
|
||||
+ "FC Init Attempt Error",
|
||||
+ "Replay Timeout Error",
|
||||
+ "Replay Attempt Error",
|
||||
+ "Replay Underflow Error",
|
||||
+ "Replay Overflow Error",
|
||||
+};
|
||||
+
|
||||
+static const char * const smca_usrcp_mce_desc[] = {
|
||||
+ "Packet Type Error",
|
||||
+ "Rx FIFO Error",
|
||||
+ "Deskew Error",
|
||||
+ "Rx Detect Timeout Error",
|
||||
+ "Data Parity Error",
|
||||
+ "Data Loss Error",
|
||||
+ "Lcu Error",
|
||||
+ "HB1 Handshake Timeout Error",
|
||||
+ "HB2 Handshake Timeout Error",
|
||||
+ "Clk Sleep Rsp Timeout Error",
|
||||
+ "Clk Wake Rsp Timeout Error",
|
||||
+ "Reset Attack Error",
|
||||
+ "Remote Link Fatal Error",
|
||||
+};
|
||||
+
|
||||
static const char * const smca_gmipcs_mce_desc[] = {
|
||||
"Data Loss Error",
|
||||
"Training Error",
|
||||
@@ -579,6 +643,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
|
||||
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
|
||||
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
|
||||
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
|
||||
+ [SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) },
|
||||
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
|
||||
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
|
||||
[SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
|
||||
@@ -595,6 +660,8 @@ static struct smca_mce_desc smca_mce_descs[] = {
|
||||
[SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
|
||||
[SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
|
||||
[SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
|
||||
+ [SMCA_USR_DP] = { smca_usrdp_mce_desc, ARRAY_SIZE(smca_usrdp_mce_desc) },
|
||||
+ [SMCA_USR_CP] = { smca_usrcp_mce_desc, ARRAY_SIZE(smca_usrcp_mce_desc) },
|
||||
[SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
|
||||
/* All the PHY bank types have the same error descriptions, for now. */
|
||||
[SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
|
||||
@@ -631,6 +698,8 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
{ SMCA_UMC, 0x00000096 },
|
||||
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
|
||||
{ SMCA_UMC_V2, 0x00010096 },
|
||||
+ /* Memory Attached Last Level Cache */
|
||||
+ { SMCA_MA_LLC, 0x0004002E },
|
||||
|
||||
/* Parameter Block MCA type */
|
||||
{ SMCA_PB, 0x00000005 },
|
||||
@@ -664,6 +733,11 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
{ SMCA_SHUB, 0x00000080 },
|
||||
{ SMCA_SATA, 0x000000A8 },
|
||||
{ SMCA_USB, 0x000000AA },
|
||||
+
|
||||
+ /* Ultra Short Reach Data and Control Plane Controller */
|
||||
+ { SMCA_USR_DP, 0x00000170 },
|
||||
+ { SMCA_USR_CP, 0x00000180 },
|
||||
+
|
||||
{ SMCA_GMI_PCS, 0x00000241 },
|
||||
|
||||
/* Ext Global Memory Interconnect PHY MCA type */
|
||||
@@ -692,6 +766,7 @@ static struct smca_bank_name smca_names[] = {
|
||||
[SMCA_PIE] = { "Power, Interrupts, etc." },
|
||||
[SMCA_UMC] = { "Unified Memory Controller" },
|
||||
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
|
||||
+ [SMCA_MA_LLC] = { "Memory Attached Last Level Cache" },
|
||||
[SMCA_PB] = { "Parameter Block" },
|
||||
[SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
|
||||
[SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
|
||||
@@ -704,6 +779,8 @@ static struct smca_bank_name smca_names[] = {
|
||||
[SMCA_SHUB] = { "System Hub Unit" },
|
||||
[SMCA_SATA] = { "SATA Unit" },
|
||||
[SMCA_USB] = { "USB Unit" },
|
||||
+ [SMCA_USR_DP] = { "Ultra Short Reach Data Plane Controller" },
|
||||
+ [SMCA_USR_CP] = { "Ultra Short Reach Control Plane Controller" },
|
||||
[SMCA_GMI_PCS] = { "Global Memory Interconnect PCS Unit" },
|
||||
[SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
|
||||
[SMCA_WAFL_PHY] = { "WAFL PHY Unit" },
|
105
2d15882a0cbfce0b905039bebc811ac8311cd739.patch
Normal file
105
2d15882a0cbfce0b905039bebc811ac8311cd739.patch
Normal file
@ -0,0 +1,105 @@
|
||||
commit 2d15882a0cbfce0b905039bebc811ac8311cd739
|
||||
Author: Muralidhara M K <muralidhara.mk@amd.com>
|
||||
Date: Fri Jun 30 11:19:42 2023 +0000
|
||||
|
||||
rasdaemon: Handle reassigned bit definitions for UMC bank
|
||||
|
||||
On some AMD systems some of the existing bit definitions in the
|
||||
CTL register of SMCA bank type are reassigned without defining
|
||||
new HWID and McaType. Consequently, the errors whose bit
|
||||
definitions have been reassigned in the CTL register are being
|
||||
erroneously decoded.
|
||||
|
||||
Add new error description structure to compensate for the
|
||||
reassigned bit definitions, using a new software-defined SMCA bank
|
||||
type that utilizes the hardware-reserved values for HWID.
|
||||
The new SMCA bank type will only be employed for UMC error
|
||||
decoding on affected models and the existing error description
|
||||
structure for UMC bank type is still valid.
|
||||
|
||||
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
||||
|
||||
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
|
||||
index fc51b5a..54060ee 100644
|
||||
--- a/mce-amd-smca.c
|
||||
+++ b/mce-amd-smca.c
|
||||
@@ -60,6 +60,7 @@ enum smca_bank_types {
|
||||
SMCA_CS_V2_QUIRK,
|
||||
SMCA_PIE, /* Power, Interrupts, etc. */
|
||||
SMCA_UMC, /* Unified Memory Controller */
|
||||
+ SMCA_UMC_QUIRK,
|
||||
SMCA_UMC_V2,
|
||||
SMCA_MA_LLC, /* Memory Attached Last Level Cache */
|
||||
SMCA_PB, /* Parameter Block */
|
||||
@@ -313,6 +314,25 @@ static const char * const smca_umc_mce_desc[] = {
|
||||
"Read CRC Error",
|
||||
};
|
||||
|
||||
+static const char * const smca_umc_quirk_mce_desc[] = {
|
||||
+ "DRAM On Die ECC error",
|
||||
+ "Data poison error",
|
||||
+ "SDP parity error",
|
||||
+ "Reserved",
|
||||
+ "Address/Command parity error",
|
||||
+ "HBM Write data parity error",
|
||||
+ "Consolidated SRAM ECC error",
|
||||
+ "Reserved",
|
||||
+ "Reserved",
|
||||
+ "Rdb SRAM ECC error",
|
||||
+ "Thermal throttling",
|
||||
+ "HBM Read Data Parity error",
|
||||
+ "Reserved",
|
||||
+ "UMC FW Error",
|
||||
+ "SRAM Parity Error",
|
||||
+ "HBM CRC Error",
|
||||
+};
|
||||
+
|
||||
static const char * const smca_umc2_mce_desc[] = {
|
||||
"DRAM ECC error",
|
||||
"Data poison error",
|
||||
@@ -642,6 +662,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
|
||||
[SMCA_CS_V2_QUIRK] = { smca_cs2_quirk_mce_desc, ARRAY_SIZE(smca_cs2_quirk_mce_desc)},
|
||||
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
|
||||
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
|
||||
+ [SMCA_UMC_QUIRK] = { smca_umc_quirk_mce_desc, ARRAY_SIZE(smca_umc_quirk_mce_desc) },
|
||||
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
|
||||
[SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) },
|
||||
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
|
||||
@@ -696,6 +717,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
|
||||
/* Unified Memory Controller MCA type */
|
||||
{ SMCA_UMC, 0x00000096 },
|
||||
+ { SMCA_UMC_QUIRK, 0x00020000 },
|
||||
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
|
||||
{ SMCA_UMC_V2, 0x00010096 },
|
||||
/* Memory Attached Last Level Cache */
|
||||
@@ -764,7 +786,7 @@ static struct smca_bank_name smca_names[] = {
|
||||
[SMCA_L3_CACHE] = { "L3 Cache" },
|
||||
[SMCA_CS ... SMCA_CS_V2_QUIRK] = { "Coherent Slave" },
|
||||
[SMCA_PIE] = { "Power, Interrupts, etc." },
|
||||
- [SMCA_UMC] = { "Unified Memory Controller" },
|
||||
+ [SMCA_UMC ... SMCA_UMC_QUIRK] = { "Unified Memory Controller" },
|
||||
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
|
||||
[SMCA_MA_LLC] = { "Memory Attached Last Level Cache" },
|
||||
[SMCA_PB] = { "Parameter Block" },
|
||||
@@ -843,6 +865,10 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
|
||||
if (*hwid_mcatype == 0x0002002E)
|
||||
*hwid_mcatype = 0x00010000;
|
||||
break;
|
||||
+ case 0x90 ... 0x9F:
|
||||
+ if ((*hwid_mcatype & 0xFF) == 0x00000096)
|
||||
+ *hwid_mcatype = 0x00020000;
|
||||
+ break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -908,7 +934,7 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
|
||||
smca_mce_descs[bank_type].descs[xec],
|
||||
xec);
|
||||
|
||||
- if (bank_type == SMCA_UMC && xec == 0) {
|
||||
+ if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_QUIRK) && xec == 0) {
|
||||
channel = find_umc_channel(e);
|
||||
csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
|
||||
mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
|
524
30158ef8d7aebc3e5201bf39b73ce7644f8e419e.patch
Normal file
524
30158ef8d7aebc3e5201bf39b73ce7644f8e419e.patch
Normal file
@ -0,0 +1,524 @@
|
||||
commit 30158ef8d7aebc3e5201bf39b73ce7644f8e419e
|
||||
Author: Avadhut Naik <avadnaik@amd.com>
|
||||
Date: Tue Apr 18 18:24:21 2023 +0000
|
||||
|
||||
rasdaemon: Update SMCA bank error descriptions
|
||||
|
||||
Update and reword some existing SMCA bank type error descriptions to extend
|
||||
SMCA error decoding functionality for modern AMD processors. Additionally,
|
||||
add new error descriptions for missing SMCA bank types.
|
||||
|
||||
Signed-off-by: Avadhut Naik <avadnaik@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
||||
|
||||
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
|
||||
index 27ca8aa..7ec787a 100644
|
||||
--- a/mce-amd-smca.c
|
||||
+++ b/mce-amd-smca.c
|
||||
@@ -66,12 +66,19 @@ enum smca_bank_types {
|
||||
SMCA_SMU, /* System Management Unit */
|
||||
SMCA_SMU_V2,
|
||||
SMCA_MP5, /* Microprocessor 5 Unit */
|
||||
+ SMCA_MPDMA, /* MPDMA Unit */
|
||||
SMCA_NBIO, /* Northbridge IO Unit */
|
||||
SMCA_PCIE, /* PCI Express Unit */
|
||||
SMCA_PCIE_V2,
|
||||
SMCA_XGMI_PCS, /* xGMI PCS Unit */
|
||||
+ SMCA_NBIF, /*NBIF Unit */
|
||||
+ SMCA_SHUB, /* System Hub Unit */
|
||||
+ SMCA_SATA, /* SATA Unit */
|
||||
+ SMCA_USB, /* USB Unit */
|
||||
+ SMCA_GMI_PCS, /* GMI PCS Unit */
|
||||
SMCA_XGMI_PHY, /* xGMI PHY Unit */
|
||||
SMCA_WAFL_PHY, /* WAFL PHY Unit */
|
||||
+ SMCA_GMI_PHY, /* GMI PHY Unit */
|
||||
N_SMCA_BANK_TYPES
|
||||
};
|
||||
|
||||
@@ -85,7 +92,6 @@ enum smca_bank_types {
|
||||
#define NONCPU_NODE_INDEX 8
|
||||
|
||||
/* SMCA Extended error strings */
|
||||
-/* Load Store */
|
||||
static const char * const smca_ls_mce_desc[] = {
|
||||
"Load queue parity",
|
||||
"Store queue parity",
|
||||
@@ -109,6 +115,7 @@ static const char * const smca_ls_mce_desc[] = {
|
||||
"DC tag error type 5",
|
||||
"L2 fill data error",
|
||||
};
|
||||
+
|
||||
static const char * const smca_ls2_mce_desc[] = {
|
||||
"An ECC error was detected on a data cache read by a probe or victimization",
|
||||
"An ECC error or L2 poison was detected on a data cache read by a load",
|
||||
@@ -133,92 +140,104 @@ static const char * const smca_ls2_mce_desc[] = {
|
||||
"A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
|
||||
"A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
|
||||
"A hardware assertion error was reported",
|
||||
- "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
|
||||
+ "A parity error was detected in an STLF, SCB EMEM entry, store data mask or SRB store data by any access",
|
||||
};
|
||||
-/* Instruction Fetch */
|
||||
+
|
||||
static const char * const smca_if_mce_desc[] = {
|
||||
"microtag probe port parity error",
|
||||
"IC microtag or full tag multi-hit error",
|
||||
"IC full tag parity",
|
||||
"IC data array parity",
|
||||
- "Decoupling queue phys addr parity error",
|
||||
+ "PRQ Parity Error",
|
||||
"L0 ITLB parity error",
|
||||
- "L1 ITLB parity error",
|
||||
- "L2 ITLB parity error",
|
||||
+ "L1-TLB parity error",
|
||||
+ "L2-TLB parity error",
|
||||
"BPQ snoop parity on Thread 0",
|
||||
"BPQ snoop parity on Thread 1",
|
||||
- "L1 BTB multi-match error",
|
||||
- "L2 BTB multi-match error",
|
||||
+ "BP L1-BTB Multi-Hit Error",
|
||||
+ "BP L2-BTB Multi-Hit Error",
|
||||
"L2 Cache Response Poison error",
|
||||
- "System Read Data error",
|
||||
+ "L2 Cache Error Response",
|
||||
+ "Hardware Assertion Error",
|
||||
+ "L1-TLB Multi-Hit",
|
||||
+ "L2-TLB Multi-Hit",
|
||||
+ "BSR Parity Error",
|
||||
+ "CT MCE",
|
||||
};
|
||||
-/* L2 Cache */
|
||||
+
|
||||
static const char * const smca_l2_mce_desc[] = {
|
||||
- "L2M tag multi-way-hit error",
|
||||
- "L2M tag ECC error",
|
||||
- "L2M data ECC error",
|
||||
- "HW assert",
|
||||
+ "L2M Tag Multiple-Way-Hit error",
|
||||
+ "L2M Tag or State Array ECC Error",
|
||||
+ "L2M Data Array ECC Error",
|
||||
+ "Hardware Assert Error",
|
||||
+ "SDP Read Response Parity Error",
|
||||
};
|
||||
-/* Decoder Unit */
|
||||
+
|
||||
static const char * const smca_de_mce_desc[] = {
|
||||
- "uop cache tag parity error",
|
||||
- "uop cache data parity error",
|
||||
- "Insn buffer parity error",
|
||||
- "uop queue parity error",
|
||||
- "Insn dispatch queue parity error",
|
||||
- "Fetch address FIFO parity",
|
||||
- "Patch RAM data parity",
|
||||
- "Patch RAM sequencer parity",
|
||||
- "uop buffer parity"
|
||||
-};
|
||||
-/* Execution Unit */
|
||||
+ "Micro-op cache tag array parity error",
|
||||
+ "Micro-op cache data array parity error",
|
||||
+ "IBB Register File parity error",
|
||||
+ "Micro-op queue parity error",
|
||||
+ "Instruction dispatch queue parity error",
|
||||
+ "Fetch address FIFO parity error",
|
||||
+ "Patch RAM data parity error",
|
||||
+ "Patch RAM sequencer parity error",
|
||||
+ "Micro-op buffer parity error",
|
||||
+ "Hardware Assertion MCA Error",
|
||||
+};
|
||||
+
|
||||
static const char * const smca_ex_mce_desc[] = {
|
||||
"Watchdog timeout error",
|
||||
- "Phy register file parity",
|
||||
- "Flag register file parity",
|
||||
- "Immediate displacement register file parity",
|
||||
- "Address generator payload parity",
|
||||
- "EX payload parity",
|
||||
- "Checkpoint queue parity",
|
||||
- "Retire dispatch queue parity",
|
||||
+ "Physical register file parity error",
|
||||
+ "Flag register file parity error",
|
||||
+ "Immediate displacement register file parity error",
|
||||
+ "Address generator payload parity error",
|
||||
+ "EX payload parity error",
|
||||
+ "Checkpoint queue parity error",
|
||||
+ "Retire dispatch queue parity error",
|
||||
"Retire status queue parity error",
|
||||
- "Scheduling queue parity error",
|
||||
+ "Scheduler queue parity error",
|
||||
"Branch buffer queue parity error",
|
||||
+ "Hardware Assertion error",
|
||||
+ "Spec Map parity error",
|
||||
+ "Retire Map parity error",
|
||||
};
|
||||
-/* Floating Point Unit */
|
||||
+
|
||||
static const char * const smca_fp_mce_desc[] = {
|
||||
- "Physical register file parity",
|
||||
- "Freelist parity error",
|
||||
- "Schedule queue parity",
|
||||
+ "Physical register file (PRF) parity error",
|
||||
+ "Freelist (FL) parity error",
|
||||
+ "Schedule queue parity error",
|
||||
"NSQ parity error",
|
||||
- "Retire queue parity",
|
||||
- "Status register file parity",
|
||||
+ "Retire queue (RQ) parity error",
|
||||
+ "Status register file (SRF) parity error",
|
||||
"Hardware assertion",
|
||||
+ "Physical K mask register file (KRF) parity error",
|
||||
};
|
||||
-/* L3 Cache */
|
||||
+
|
||||
static const char * const smca_l3_mce_desc[] = {
|
||||
"Shadow tag macro ECC error",
|
||||
"Shadow tag macro multi-way-hit error",
|
||||
"L3M tag ECC error",
|
||||
"L3M tag multi-way-hit error",
|
||||
"L3M data ECC error",
|
||||
- "XI parity, L3 fill done channel error",
|
||||
- "L3 victim queue parity",
|
||||
- "L3 HW assert",
|
||||
+ "SDP Parity Error from XI",
|
||||
+ "L3 victim queue Data Fabric error",
|
||||
+ "L3 Hardware Assertion",
|
||||
+ "XI WCB Parity Poison Creation event",
|
||||
};
|
||||
-/* Coherent Slave Unit */
|
||||
+
|
||||
static const char * const smca_cs_mce_desc[] = {
|
||||
- "Illegal request from transport layer",
|
||||
+ "Illegal request",
|
||||
"Address violation",
|
||||
"Security violation",
|
||||
- "Illegal response from transport layer",
|
||||
+ "Illegal response",
|
||||
"Unexpected response",
|
||||
- "Parity error on incoming request or probe response data",
|
||||
- "Parity error on incoming read response data",
|
||||
- "Atomic request parity",
|
||||
- "ECC error on probe filter access",
|
||||
+ "Request or Probe Parity Error",
|
||||
+ "Read Response Parity Error",
|
||||
+ "Atomic request parity error",
|
||||
+ "Probe Filter ECC Error",
|
||||
};
|
||||
-/* Coherent Slave Unit V2 */
|
||||
+
|
||||
static const char * const smca_cs2_mce_desc[] = {
|
||||
"Illegal Request",
|
||||
"Address Violation",
|
||||
@@ -234,15 +253,22 @@ static const char * const smca_cs2_mce_desc[] = {
|
||||
"SDP read response had an unexpected RETRY error",
|
||||
"Counter overflow error",
|
||||
"Counter underflow error",
|
||||
+ "Illegal Request on the no data channel",
|
||||
+ "Address Violation on the no data channel",
|
||||
+ "Security Violation on the no data channel",
|
||||
+ "Hardware Assert Error",
|
||||
};
|
||||
-/* Power, Interrupt, etc.. */
|
||||
+
|
||||
static const char * const smca_pie_mce_desc[] = {
|
||||
- "HW assert",
|
||||
- "Internal PIE register security violation",
|
||||
- "Error on GMI link",
|
||||
- "Poison data written to internal PIE register",
|
||||
+ "Hardware assert",
|
||||
+ "Register security violation",
|
||||
+ "Link error",
|
||||
+ "Poison data consumption",
|
||||
+ "A deferred error was detected in the DF",
|
||||
+ "Watch Dog Timer",
|
||||
+ "An SRAM ECC error was detected in the CNLI block",
|
||||
};
|
||||
-/* Unified Memory Controller */
|
||||
+
|
||||
static const char * const smca_umc_mce_desc[] = {
|
||||
"DRAM ECC error",
|
||||
"Data poison error on DRAM",
|
||||
@@ -250,6 +276,12 @@ static const char * const smca_umc_mce_desc[] = {
|
||||
"Advanced peripheral bus error",
|
||||
"Command/address parity error",
|
||||
"Write data CRC error",
|
||||
+ "DCQ SRAM ECC error",
|
||||
+ "AES SRAM ECC error",
|
||||
+ "ECS Row Error",
|
||||
+ "ECS Error",
|
||||
+ "UMC Throttling Error",
|
||||
+ "Read CRC Error",
|
||||
};
|
||||
|
||||
static const char * const smca_umc2_mce_desc[] = {
|
||||
@@ -267,15 +299,14 @@ static const char * const smca_umc2_mce_desc[] = {
|
||||
"LM32 MP errors",
|
||||
};
|
||||
|
||||
-/* Parameter Block */
|
||||
static const char * const smca_pb_mce_desc[] = {
|
||||
- "Parameter Block RAM ECC error",
|
||||
+ "An ECC error in the Parameter Block RAM array"
|
||||
};
|
||||
-/* Platform Security Processor */
|
||||
+
|
||||
static const char * const smca_psp_mce_desc[] = {
|
||||
- "PSP RAM ECC or parity error",
|
||||
+ "An ECC or parity error in a PSP RAM instance",
|
||||
};
|
||||
-/* Platform Security Processor V2 */
|
||||
+
|
||||
static const char * const smca_psp2_mce_desc[] = {
|
||||
"High SRAM ECC or parity error",
|
||||
"Low SRAM ECC or parity error",
|
||||
@@ -296,11 +327,11 @@ static const char * const smca_psp2_mce_desc[] = {
|
||||
"TLB Bank 1 parity error",
|
||||
"System Hub Read Buffer ECC or parity error",
|
||||
};
|
||||
-/* System Management Unit */
|
||||
+
|
||||
static const char * const smca_smu_mce_desc[] = {
|
||||
- "SMU RAM ECC or parity error",
|
||||
+ "An ECC or parity error in an SMU RAM instance",
|
||||
};
|
||||
-/* System Management Unit V2 */
|
||||
+
|
||||
static const char * const smca_smu2_mce_desc[] = {
|
||||
"High SRAM ECC or parity error",
|
||||
"Low SRAM ECC or parity error",
|
||||
@@ -314,7 +345,7 @@ static const char * const smca_smu2_mce_desc[] = {
|
||||
"Instruction Tag Cache Bank B ECC or parity error",
|
||||
"System Hub Read Buffer ECC or parity error",
|
||||
};
|
||||
-/* Microprocessor 5 Unit */
|
||||
+
|
||||
static const char * const smca_mp5_mce_desc[] = {
|
||||
"High SRAM ECC or parity error",
|
||||
"Low SRAM ECC or parity error",
|
||||
@@ -327,15 +358,68 @@ static const char * const smca_mp5_mce_desc[] = {
|
||||
"Instruction Tag Cache Bank A ECC or parity error",
|
||||
"Instruction Tag Cache Bank B ECC or parity error",
|
||||
};
|
||||
-/* Northbridge IO Unit */
|
||||
+
|
||||
+static const char * const smca_mpdma_mce_desc[] = {
|
||||
+ "Main SRAM [31:0] bank ECC or parity error",
|
||||
+ "Main SRAM [63:32] bank ECC or parity error",
|
||||
+ "Main SRAM [95:64] bank ECC or parity error",
|
||||
+ "Main SRAM [127:96] bank ECC or parity error",
|
||||
+ "Data Cache Bank A ECC or parity error",
|
||||
+ "Data Cache Bank B ECC or parity error",
|
||||
+ "Data Tag Cache Bank A ECC or parity error",
|
||||
+ "Data Tag Cache Bank B ECC or parity error",
|
||||
+ "Instruction Cache Bank A ECC or parity error",
|
||||
+ "Instruction Cache Bank B ECC or parity error",
|
||||
+ "Instruction Tag Cache Bank A ECC or parity error",
|
||||
+ "Instruction Tag Cache Bank B ECC or parity error",
|
||||
+ "Data Cache Bank A ECC or parity error",
|
||||
+ "Data Cache Bank B ECC or parity error",
|
||||
+ "Data Tag Cache Bank A ECC or parity error",
|
||||
+ "Data Tag Cache Bank B ECC or parity error",
|
||||
+ "Instruction Cache Bank A ECC or parity error",
|
||||
+ "Instruction Cache Bank B ECC or parity error",
|
||||
+ "Instruction Tag Cache Bank A ECC or parity error",
|
||||
+ "Instruction Tag Cache Bank B ECC or parity error",
|
||||
+ "Data Cache Bank A ECC or parity error",
|
||||
+ "Data Cache Bank B ECC or parity error",
|
||||
+ "Data Tag Cache Bank A ECC or parity error",
|
||||
+ "Data Tag Cache Bank B ECC or parity error",
|
||||
+ "Instruction Cache Bank A ECC or parity error",
|
||||
+ "Instruction Cache Bank B ECC or parity error",
|
||||
+ "Instruction Tag Cache Bank A ECC or parity error",
|
||||
+ "Instruction Tag Cache Bank B ECC or parity error",
|
||||
+ "System Hub Read Buffer ECC or parity error",
|
||||
+ "MPDMA TVF DVSEC Memory ECC or parity error",
|
||||
+ "MPDMA TVF MMIO Mailbox0 ECC or parity error",
|
||||
+ "MPDMA TVF MMIO Mailbox1 ECC or parity error",
|
||||
+ "MPDMA TVF Doorbell Memory ECC or parity error",
|
||||
+ "MPDMA TVF SDP Slave Memory 0 ECC or parity error",
|
||||
+ "MPDMA TVF SDP Slave Memory 1 ECC or parity error",
|
||||
+ "MPDMA TVF SDP Slave Memory 2 ECC or parity error",
|
||||
+ "MPDMA TVF SDP Master Memory 0 ECC or parity error",
|
||||
+ "MPDMA TVF SDP Master Memory 1 ECC or parity error",
|
||||
+ "MPDMA TVF SDP Master Memory 2 ECC or parity error",
|
||||
+ "MPDMA TVF SDP Master Memory 3 ECC or parity error",
|
||||
+ "MPDMA TVF SDP Master Memory 4 ECC or parity error",
|
||||
+ "MPDMA TVF SDP Master Memory 5 ECC or parity error",
|
||||
+ "MPDMA TVF SDP Master Memory 6 ECC or parity error",
|
||||
+ "SDP Watchdog Timer expired",
|
||||
+ "MPDMA PTE Command FIFO ECC or parity error",
|
||||
+ "MPDMA PTE Hub Data FIFO ECC or parity error",
|
||||
+ "MPDMA PTE Internal Data FIFO ECC or parity error",
|
||||
+ "MPDMA PTE Command Memory DMA ECC or parity error",
|
||||
+ "MPDMA PTE Command Memory Internal ECC or parity error",
|
||||
+};
|
||||
+
|
||||
static const char * const smca_nbio_mce_desc[] = {
|
||||
"ECC or Parity error",
|
||||
"PCIE error",
|
||||
- "SDP ErrEvent error",
|
||||
- "SDP Egress Poison Error",
|
||||
- "IOHC Internal Poison Error",
|
||||
+ "External SDP ErrEvent error",
|
||||
+ "SDP Egress Poison error",
|
||||
+ "Internal Poison error",
|
||||
+ "Internal system fatal error event",
|
||||
};
|
||||
-/* PCI Express Unit */
|
||||
+
|
||||
static const char * const smca_pcie_mce_desc[] = {
|
||||
"CCIX PER Message logging",
|
||||
"CCIX Read Response with Status: Non-Data Error",
|
||||
@@ -345,7 +429,7 @@ static const char * const smca_pcie_mce_desc[] = {
|
||||
};
|
||||
|
||||
static const char * const smca_pcie2_mce_desc[] = {
|
||||
- "SDP Parity Error logging",
|
||||
+ "SDP Data Parity Error logging",
|
||||
};
|
||||
|
||||
static const char * const smca_xgmipcs_mce_desc[] = {
|
||||
@@ -387,11 +471,66 @@ static const char * const smca_xgmiphy_mce_desc[] = {
|
||||
"PHY APB error",
|
||||
};
|
||||
|
||||
-static const char * const smca_waflphy_mce_desc[] = {
|
||||
- "RAM ECC Error",
|
||||
- "ARC instruction buffer parity error",
|
||||
- "ARC data buffer parity error",
|
||||
- "PHY APB error",
|
||||
+static const char * const smca_nbif_mce_desc[] = {
|
||||
+ "Timeout error from GMI",
|
||||
+ "SRAM ECC error",
|
||||
+ "NTB Error Event",
|
||||
+ "SDP Parity error",
|
||||
+};
|
||||
+
|
||||
+static const char * const smca_sata_mce_desc[] = {
|
||||
+ "Parity error for port 0",
|
||||
+ "Parity error for port 1",
|
||||
+ "Parity error for port 2",
|
||||
+ "Parity error for port 3",
|
||||
+ "Parity error for port 4",
|
||||
+ "Parity error for port 5",
|
||||
+ "Parity error for port 6",
|
||||
+ "Parity error for port 7",
|
||||
+};
|
||||
+
|
||||
+static const char * const smca_usb_mce_desc[] = {
|
||||
+ "Parity error or ECC error for S0 RAM0",
|
||||
+ "Parity error or ECC error for S0 RAM1",
|
||||
+ "Parity error or ECC error for S0 RAM2",
|
||||
+ "Parity error for PHY RAM0",
|
||||
+ "Parity error for PHY RAM1",
|
||||
+ "AXI Slave Response error",
|
||||
+};
|
||||
+
|
||||
+static const char * const smca_gmipcs_mce_desc[] = {
|
||||
+ "Data Loss Error",
|
||||
+ "Training Error",
|
||||
+ "Replay Parity Error",
|
||||
+ "Rx Fifo Underflow Error",
|
||||
+ "Rx Fifo Overflow Error",
|
||||
+ "CRC Error",
|
||||
+ "BER Exceeded Error",
|
||||
+ "Tx Fifo Underflow Error",
|
||||
+ "Replay Buffer Parity Error",
|
||||
+ "Tx Overflow Error",
|
||||
+ "Replay Fifo Overflow Error",
|
||||
+ "Replay Fifo Underflow Error",
|
||||
+ "Elastic Fifo Overflow Error",
|
||||
+ "Deskew Error",
|
||||
+ "Offline Error",
|
||||
+ "Data Startup Limit Error",
|
||||
+ "FC Init Timeout Error",
|
||||
+ "Recovery Timeout Error",
|
||||
+ "Ready Serial Timeout Error",
|
||||
+ "Ready Serial Attempt Error",
|
||||
+ "Recovery Attempt Error",
|
||||
+ "Recovery Relock Attempt Error",
|
||||
+ "Deskew Abort Error",
|
||||
+ "Rx Buffer Error",
|
||||
+ "Rx LFDS Fifo Overflow Error",
|
||||
+ "Rx LFDS Fifo Underflow Error",
|
||||
+ "LinkSub Tx Timeout Error",
|
||||
+ "LinkSub Rx Timeout Error",
|
||||
+ "Rx CMD Packet Error",
|
||||
+ "LFDS Training Timeout Error",
|
||||
+ "LFDS FC Init Timeout Error",
|
||||
+ "Data Loss Error",
|
||||
};
|
||||
|
||||
struct smca_mce_desc {
|
||||
@@ -419,12 +558,21 @@ static struct smca_mce_desc smca_mce_descs[] = {
|
||||
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
|
||||
[SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)},
|
||||
[SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
|
||||
+ [SMCA_MPDMA] = { smca_mpdma_mce_desc, ARRAY_SIZE(smca_mpdma_mce_desc) },
|
||||
[SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)},
|
||||
[SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)},
|
||||
[SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) },
|
||||
[SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) },
|
||||
+ /* NBIF and SHUB have the same error descriptions, for now. */
|
||||
+ [SMCA_NBIF] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
|
||||
+ [SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
|
||||
+ [SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
|
||||
+ [SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
|
||||
+ [SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
|
||||
+ /* All the PHY bank types have the same error descriptions, for now. */
|
||||
[SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
|
||||
- [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) },
|
||||
+ [SMCA_WAFL_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
|
||||
+ [SMCA_GMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
|
||||
};
|
||||
|
||||
struct smca_hwid {
|
||||
@@ -470,6 +618,9 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
/* Microprocessor 5 Unit MCA type */
|
||||
{ SMCA_MP5, 0x00020001 },
|
||||
|
||||
+ /* MPDMA MCA Type */
|
||||
+ { SMCA_MPDMA, 0x00030001 },
|
||||
+
|
||||
/* Northbridge IO Unit MCA type */
|
||||
{ SMCA_NBIO, 0x00000018 },
|
||||
|
||||
@@ -480,11 +631,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
/* Ext Global Memory Interconnect PCS MCA type */
|
||||
{ SMCA_XGMI_PCS, 0x00000050 },
|
||||
|
||||
+ { SMCA_NBIF, 0x0000006C },
|
||||
+
|
||||
+ { SMCA_SHUB, 0x00000080 },
|
||||
+ { SMCA_SATA, 0x000000A8 },
|
||||
+ { SMCA_USB, 0x000000AA },
|
||||
+ { SMCA_GMI_PCS, 0x00000241 },
|
||||
+
|
||||
/* Ext Global Memory Interconnect PHY MCA type */
|
||||
{ SMCA_XGMI_PHY, 0x00000259 },
|
||||
|
||||
/* WAFL PHY MCA type */
|
||||
{ SMCA_WAFL_PHY, 0x00000267 },
|
||||
+
|
||||
+ { SMCA_GMI_PHY, 0x00000269 },
|
||||
};
|
||||
|
||||
struct smca_bank_name {
|
||||
@@ -508,12 +668,18 @@ static struct smca_bank_name smca_names[] = {
|
||||
[SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
|
||||
[SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
|
||||
[SMCA_MP5] = { "Microprocessor 5 Unit" },
|
||||
+ [SMCA_MPDMA] = { "MPDMA Unit" },
|
||||
[SMCA_NBIO] = { "Northbridge IO Unit" },
|
||||
[SMCA_PCIE ... SMCA_PCIE_V2] = { "PCI Express Unit" },
|
||||
[SMCA_XGMI_PCS] = { "Ext Global Memory Interconnect PCS Unit" },
|
||||
+ [SMCA_NBIF] = { "NBIF Unit" },
|
||||
+ [SMCA_SHUB] = { "System Hub Unit" },
|
||||
+ [SMCA_SATA] = { "SATA Unit" },
|
||||
+ [SMCA_USB] = { "USB Unit" },
|
||||
+ [SMCA_GMI_PCS] = { "Global Memory Interconnect PCS Unit" },
|
||||
[SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
|
||||
[SMCA_WAFL_PHY] = { "WAFL PHY Unit" },
|
||||
-
|
||||
+ [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
|
||||
};
|
||||
|
||||
static void amd_decode_errcode(struct mce_event *e)
|
411
932118b04a04104dfac6b8536419803f236e6118.patch
Normal file
411
932118b04a04104dfac6b8536419803f236e6118.patch
Normal file
@ -0,0 +1,411 @@
|
||||
commit 932118b04a04104dfac6b8536419803f236e6118
|
||||
Author: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Date: Mon May 22 22:13:17 2023 +0000
|
||||
|
||||
rasdaemon: Add support for post-processing MCA errors
|
||||
|
||||
Currently, the rasdaemon performs detailed error decoding of received
|
||||
MCA errors on the system only while it is running, either as a daemon
|
||||
or in the foreground.
|
||||
|
||||
As such, error decoding cannot be undertaken for any MCA errors received
|
||||
while the rasdaemon wasn't running. Additionally, if the error decoding
|
||||
modules like edac_mce_amd too have not been loaded, error records in the
|
||||
dmesg buffer might correspond to raw values in associated MSRs, compelling
|
||||
users to undertake decoding manually. The scenario seems more plausible on
|
||||
AMD systems with Scalable MCA (SMCA) with plans in place to remove SMCA
|
||||
Extended Error Descriptions from the edac_mce_amd module in an effort to
|
||||
offload SMCA Error Decoding to the rasdaemon.
|
||||
|
||||
As such, add support to post-process and decode MCA Errors received on AMD
|
||||
SMCA systems from raw MSR values. Support for post-processing and decoding
|
||||
of MCA Errors received on CPUs of other vendors can be added in the future,
|
||||
as needed.
|
||||
|
||||
Suggested-by: Yazen Ghannam <yazen.ghannam@amd.com>
|
||||
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
||||
|
||||
---
|
||||
mce-amd-smca.c | 8 ++-
|
||||
ras-events.h | 1
|
||||
ras-mce-handler.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++----
|
||||
ras-mce-handler.h | 4 +
|
||||
ras-record.h | 10 ++++
|
||||
rasdaemon.c | 94 +++++++++++++++++++++++++++++++++++++++++++++-
|
||||
6 files changed, 216 insertions(+), 11 deletions(-)
|
||||
|
||||
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2023-10-27 12:44:55.541077722 -0400
|
||||
+++ rasdaemon-0.6.7/mce-amd-smca.c 2023-10-27 12:44:58.549049019 -0400
|
||||
@@ -710,7 +710,7 @@ static struct smca_bank_name smca_names[
|
||||
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
|
||||
};
|
||||
|
||||
-static void amd_decode_errcode(struct mce_event *e)
|
||||
+void amd_decode_errcode(struct mce_event *e)
|
||||
{
|
||||
|
||||
decode_amd_errcode(e);
|
||||
@@ -782,7 +782,7 @@ *hwid_mcatype = 0x00010000;
|
||||
}
|
||||
|
||||
/* Decode extended errors according to Scalable MCA specification */
|
||||
-static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
|
||||
+void decode_smca_error(struct mce_event *e, struct mce_priv *m)
|
||||
{
|
||||
enum smca_bank_types bank_type;
|
||||
const char *ip_name;
|
||||
@@ -827,7 +827,9 @@ for (i = 0; i < ARRAY_SIZE(smca_hwid_mca
|
||||
/* Only print the descriptor of valid extended error code */
|
||||
if (xec < smca_mce_descs[bank_type].num_descs)
|
||||
mce_snprintf(e->mcastatus_msg,
|
||||
- " %s.\n", smca_mce_descs[bank_type].descs[xec]);
|
||||
+ "%s. Ext Err Code: %d",
|
||||
+ smca_mce_descs[bank_type].descs[xec],
|
||||
+ xec);
|
||||
|
||||
if (bank_type == SMCA_UMC && xec == 0) {
|
||||
channel = find_umc_channel(e);
|
||||
--- rasdaemon-0.6.7.orig/ras-events.h 2023-10-27 12:44:55.541077722 -0400
|
||||
+++ rasdaemon-0.6.7/ras-events.h 2023-10-27 12:44:58.549049019 -0400
|
||||
@@ -100,6 +100,7 @@ enum ghes_severity {
|
||||
|
||||
/* Function prototypes */
|
||||
int toggle_ras_mc_event(int enable);
|
||||
+int ras_offline_mce_event(struct ras_mc_offline_event *event);
|
||||
int handle_ras_events(int record_events);
|
||||
|
||||
#endif
|
||||
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2023-10-27 12:44:55.541077722 -0400
|
||||
+++ rasdaemon-0.6.7/ras-mce-handler.c 2023-10-27 12:45:27.159776011 -0400
|
||||
@@ -63,10 +63,8 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
|
||||
[CPU_SAPPHIRERAPIDS] = "Sapphirerapids server",
|
||||
};
|
||||
|
||||
-static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
+static enum cputype select_intel_cputype(struct mce_priv *mce)
|
||||
{
|
||||
- struct mce_priv *mce = ras->mce_priv;
|
||||
-
|
||||
if (mce->family == 15) {
|
||||
if (mce->model == 6)
|
||||
return CPU_TULSA;
|
||||
@@ -140,9 +138,8 @@ if (mce->model > 0x1a) {
|
||||
return mce->family == 6 ? CPU_P6OLD : CPU_GENERIC;
|
||||
}
|
||||
|
||||
-static int detect_cpu(struct ras_events *ras)
|
||||
+static int detect_cpu(struct mce_priv *mce)
|
||||
{
|
||||
- struct mce_priv *mce = ras->mce_priv;
|
||||
FILE *f;
|
||||
int ret = 0;
|
||||
char *line = NULL;
|
||||
@@ -221,7 +218,7 @@ ret = 0;
|
||||
}
|
||||
goto ret;
|
||||
} else if (!strcmp(mce->vendor,"GenuineIntel")) {
|
||||
- mce->cputype = select_intel_cputype(ras);
|
||||
+ mce->cputype = select_intel_cputype(mce);
|
||||
} else {
|
||||
ret = EINVAL;
|
||||
}
|
||||
@@ -246,7 +243,7 @@ int register_mce_handler(struct ras_even
|
||||
|
||||
mce = ras->mce_priv;
|
||||
|
||||
- rc = detect_cpu(ras);
|
||||
+ rc = detect_cpu(mce);
|
||||
if (rc) {
|
||||
if (mce->processor_flags)
|
||||
free (mce->processor_flags);
|
||||
@@ -383,6 +380,105 @@ #if 0
|
||||
*/
|
||||
}
|
||||
|
||||
+static int report_mce_offline(struct trace_seq *s,
|
||||
+ struct mce_event *mce,
|
||||
+ struct mce_priv *priv)
|
||||
+{
|
||||
+ time_t now;
|
||||
+ struct tm *tm;
|
||||
+
|
||||
+ time(&now);
|
||||
+ tm = localtime(&now);
|
||||
+
|
||||
+ if (tm)
|
||||
+ strftime(mce->timestamp, sizeof(mce->timestamp),
|
||||
+ "%Y-%m-%d %H:%M:%S %z", tm);
|
||||
+ trace_seq_printf(s, "%s,", mce->timestamp);
|
||||
+
|
||||
+ if (*mce->bank_name)
|
||||
+ trace_seq_printf(s, " %s,", mce->bank_name);
|
||||
+ else
|
||||
+ trace_seq_printf(s, " bank=%x,", mce->bank);
|
||||
+
|
||||
+ if (*mce->mcastatus_msg)
|
||||
+ trace_seq_printf(s, " mca: %s,", mce->mcastatus_msg);
|
||||
+
|
||||
+ if (*mce->mcistatus_msg)
|
||||
+ trace_seq_printf(s, " mci: %s,", mce->mcistatus_msg);
|
||||
+
|
||||
+ if (*mce->mc_location)
|
||||
+ trace_seq_printf(s, " Locn: %s,", mce->mc_location);
|
||||
+
|
||||
+ if (*mce->error_msg)
|
||||
+ trace_seq_printf(s, " Error Msg: %s\n", mce->error_msg);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+int ras_offline_mce_event(struct ras_mc_offline_event *event)
|
||||
+{
|
||||
+ int rc = 0;
|
||||
+ struct trace_seq s;
|
||||
+ struct mce_event *mce = NULL;
|
||||
+ struct mce_priv *priv = NULL;
|
||||
+
|
||||
+ mce = (struct mce_event *)calloc(1, sizeof(struct mce_event));
|
||||
+ if (!mce) {
|
||||
+ log(TERM, LOG_ERR, "Can't allocate memory for mce struct\n");
|
||||
+ return errno;
|
||||
+ }
|
||||
+
|
||||
+ priv = (struct mce_priv *)calloc(1, sizeof(struct mce_priv));
|
||||
+ if (!priv) {
|
||||
+ log(TERM, LOG_ERR, "Can't allocate memory for mce_priv struct\n");
|
||||
+ free(mce);
|
||||
+ return errno;
|
||||
+ }
|
||||
+
|
||||
+ if (event->smca) {
|
||||
+ priv->cputype = CPU_AMD_SMCA;
|
||||
+ priv->family = event->family;
|
||||
+ priv->model = event->model;
|
||||
+ } else {
|
||||
+ rc = detect_cpu(priv);
|
||||
+ if (rc) {
|
||||
+ log(TERM, LOG_ERR, "Failed to detect CPU\n");
|
||||
+ goto free_mce;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ mce->status = event->status;
|
||||
+ mce->bank = event->bank;
|
||||
+
|
||||
+ switch (priv->cputype) {
|
||||
+ case CPU_AMD_SMCA:
|
||||
+ mce->synd = event->synd;
|
||||
+ mce->ipid = event->ipid;
|
||||
+ if (!mce->ipid || !mce->status) {
|
||||
+ log(TERM, LOG_ERR, "%s MSR required.\n",
|
||||
+ mce->ipid ? "Status" : "Ipid");
|
||||
+ rc = -EINVAL;
|
||||
+ goto free_mce;
|
||||
+ }
|
||||
+ decode_smca_error(mce, priv);
|
||||
+ amd_decode_errcode(mce);
|
||||
+ break;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ trace_seq_init(&s);
|
||||
+ report_mce_offline(&s, mce, priv);
|
||||
+ trace_seq_do_printf(&s);
|
||||
+ fflush(stdout);
|
||||
+ trace_seq_destroy(&s);
|
||||
+
|
||||
+free_mce:
|
||||
+ free(priv);
|
||||
+ free(mce);
|
||||
+ return rc;
|
||||
+}
|
||||
+
|
||||
int ras_mce_event_handler(struct trace_seq *s,
|
||||
struct pevent_record *record,
|
||||
struct event_format *event, void *context)
|
||||
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2023-10-27 12:44:55.541077722 -0400
|
||||
+++ rasdaemon-0.6.7/ras-mce-handler.h 2023-10-27 12:44:58.550049010 -0400
|
||||
@@ -118,6 +118,10 @@ int ras_mce_event_handler(struct trace_s
|
||||
/* enables intel iMC logs */
|
||||
int set_intel_imc_log(enum cputype cputype, unsigned ncpus);
|
||||
|
||||
+/* Undertake AMD SMCA Error Decoding */
|
||||
+void decode_smca_error(struct mce_event *e, struct mce_priv *m);
|
||||
+void amd_decode_errcode(struct mce_event *e);
|
||||
+
|
||||
/* Per-CPU-type decoders for Intel CPUs */
|
||||
void p4_decode_model(struct mce_event *e);
|
||||
void core2_decode_model(struct mce_event *e);
|
||||
--- rasdaemon-0.6.7.orig/ras-record.h 2023-10-27 12:44:55.541077722 -0400
|
||||
+++ rasdaemon-0.6.7/ras-record.h 2023-10-27 12:44:58.550049010 -0400
|
||||
@@ -21,6 +21,7 @@ * Foundation, Inc., 51 Franklin Street,
|
||||
#define __RAS_RECORD_H
|
||||
|
||||
#include <stdint.h>
|
||||
+#include <stdbool.h>
|
||||
#include "config.h"
|
||||
|
||||
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
|
||||
@@ -39,6 +40,15 @@ struct ras_mc_event {
|
||||
const char *driver_detail;
|
||||
};
|
||||
|
||||
+struct ras_mc_offline_event {
|
||||
+ unsigned int family, model;
|
||||
+ bool smca;
|
||||
+ uint8_t bank;
|
||||
+ uint64_t ipid;
|
||||
+ uint64_t synd;
|
||||
+ uint64_t status;
|
||||
+};
|
||||
+
|
||||
struct ras_aer_event {
|
||||
char timestamp[64];
|
||||
const char *error_type;
|
||||
--- rasdaemon-0.6.7.orig/rasdaemon.c 2023-10-27 12:44:55.541077722 -0400
|
||||
+++ rasdaemon-0.6.7/rasdaemon.c 2023-10-27 12:44:58.550049010 -0400
|
||||
@@ -41,8 +41,21 @@ struct arguments {
|
||||
int record_events;
|
||||
int enable_ras;
|
||||
int foreground;
|
||||
+ int offline;
|
||||
};
|
||||
|
||||
+enum OFFLINE_ARG_KEYS {
|
||||
+ SMCA = 0x100,
|
||||
+ MODEL,
|
||||
+ FAMILY,
|
||||
+ BANK_NUM,
|
||||
+ IPID_REG,
|
||||
+ STATUS_REG,
|
||||
+ SYNDROME_REG
|
||||
+};
|
||||
+
|
||||
+struct ras_mc_offline_event event;
|
||||
+
|
||||
static error_t parse_opt(int k, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct arguments *args = state->input;
|
||||
@@ -62,18 +75,84 @@ static error_t parse_opt(int k, char *ar
|
||||
case 'f':
|
||||
args->foreground++;
|
||||
break;
|
||||
+#ifdef HAVE_MCE
|
||||
+ case 'p':
|
||||
+ if (state->argc < 4)
|
||||
+ argp_state_help(state, stdout, ARGP_HELP_LONG | ARGP_HELP_EXIT_ERR);
|
||||
+ args->offline++;
|
||||
+ break;
|
||||
+#endif
|
||||
default:
|
||||
return ARGP_ERR_UNKNOWN;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
+#ifdef HAVE_MCE
|
||||
+static error_t parse_opt_offline(int key, char *arg,
|
||||
+ struct argp_state *state)
|
||||
+{
|
||||
+ switch (key) {
|
||||
+ case SMCA:
|
||||
+ event.smca = true;
|
||||
+ break;
|
||||
+ case MODEL:
|
||||
+ event.model = strtoul(state->argv[state->next], NULL, 0);
|
||||
+ break;
|
||||
+ case FAMILY:
|
||||
+ event.family = strtoul(state->argv[state->next], NULL, 0);
|
||||
+ break;
|
||||
+ case BANK_NUM:
|
||||
+ event.bank = atoi(state->argv[state->next]);
|
||||
+ break;
|
||||
+ case IPID_REG:
|
||||
+ event.ipid = strtoull(state->argv[state->next], NULL, 0);
|
||||
+ break;
|
||||
+ case STATUS_REG:
|
||||
+ event.status = strtoull(state->argv[state->next], NULL, 0);
|
||||
+ break;
|
||||
+ case SYNDROME_REG:
|
||||
+ event.synd = strtoull(state->argv[state->next], NULL, 0);
|
||||
+ break;
|
||||
+ default:
|
||||
+ return ARGP_ERR_UNKNOWN;
|
||||
+ }
|
||||
+ return 0;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
long user_hz;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
struct arguments args;
|
||||
int idx = -1;
|
||||
+
|
||||
+#ifdef HAVE_MCE
|
||||
+ const struct argp_option offline_options[] = {
|
||||
+ {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
|
||||
+ {"model", MODEL, 0, 0, "CPU Model"},
|
||||
+ {"family", FAMILY, 0, 0, "CPU Family"},
|
||||
+ {"bank", BANK_NUM, 0, 0, "Bank Number"},
|
||||
+ {"ipid", IPID_REG, 0, 0, "IPID Register (for SMCA systems only)"},
|
||||
+ {"status", STATUS_REG, 0, 0, "Status Register"},
|
||||
+ {"synd", SYNDROME_REG, 0, 0, "Syndrome Register"},
|
||||
+ {0, 0, 0, 0, 0, 0},
|
||||
+ };
|
||||
+
|
||||
+ struct argp offline_argp = {
|
||||
+ .options = offline_options,
|
||||
+ .parser = parse_opt_offline,
|
||||
+ .doc = TOOL_DESCRIPTION,
|
||||
+ .args_doc = ARGS_DOC,
|
||||
+ };
|
||||
+
|
||||
+ struct argp_child offline_parser[] = {
|
||||
+ {&offline_argp, 0, "Post-Processing Options:", 0},
|
||||
+ {0, 0, 0, 0},
|
||||
+ };
|
||||
+#endif
|
||||
+
|
||||
const struct argp_option options[] = {
|
||||
{"enable", 'e', 0, 0, "enable RAS events and exit", 0},
|
||||
{"disable", 'd', 0, 0, "disable RAS events and exit", 0},
|
||||
@@ -81,6 +160,10 @@ {"disable", 'd', 0, 0, "disable RAS even
|
||||
{"record", 'r', 0, 0, "record events via sqlite3", 0},
|
||||
#endif
|
||||
{"foreground", 'f', 0, 0, "run foreground, not daemonize"},
|
||||
+#ifdef HAVE_MCE
|
||||
+ {"post-processing", 'p', 0, 0,
|
||||
+ "Post-processing MCE's with raw register values"},
|
||||
+#endif
|
||||
|
||||
{ 0, 0, 0, 0, 0, 0 }
|
||||
};
|
||||
@@ -89,7 +172,9 @@ { 0, 0, 0, 0, 0, 0 }
|
||||
.parser = parse_opt,
|
||||
.doc = TOOL_DESCRIPTION,
|
||||
.args_doc = ARGS_DOC,
|
||||
-
|
||||
+#ifdef HAVE_MCE
|
||||
+ .children = offline_parser,
|
||||
+#endif
|
||||
};
|
||||
memset (&args, 0, sizeof(args));
|
||||
|
||||
@@ -111,6 +196,13 @@ enable = (args.enable_ras > 0) ? 1 : 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
+#ifdef HAVE_MCE
|
||||
+ if (args.offline) {
|
||||
+ ras_offline_mce_event(&event);
|
||||
+ return 0;
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
openlog(TOOL_NAME, 0, LOG_DAEMON);
|
||||
if (!args.foreground)
|
||||
if (daemon(0,0))
|
159
aa36c96cd52d775570dae989dd95a060f1149077.patch
Normal file
159
aa36c96cd52d775570dae989dd95a060f1149077.patch
Normal file
@ -0,0 +1,159 @@
|
||||
commit aa36c96cd52d775570dae989dd95a060f1149077
|
||||
Author: Avadhut Naik <avadnaik@amd.com>
|
||||
Date: Mon Apr 24 20:35:56 2023 +0000
|
||||
|
||||
rasdaemon: Handle reassigned bit definitions for CS SMCA
|
||||
|
||||
Currently, on AMD systems with Scalable MCA (SMCA), each machine check
|
||||
error of a SMCA bank type has an associated bit position in the bank's
|
||||
control (CTL) register used for enabling / disabling reporting of the
|
||||
very error. An error's bit position in the CTL register is also used
|
||||
during error decoding for offsetting into the corresponding bank's error
|
||||
description structure. As new errors are being added in newer AMD systems
|
||||
for existing SMCA bank types, the underlying SMCA architecture guarantees
|
||||
that the bit positions of existing errors are not altered.
|
||||
|
||||
However, on some AMD systems viz. Genoa, some of the existing bit
|
||||
definitions in the CTL register of the Coherent Slave (CS) SMCA bank type
|
||||
are reassigned without defining new HWID and McaType. Consequently, the
|
||||
very errors whose bit definitions have been reassigned in the CTL register
|
||||
are being erroneously decoded.
|
||||
|
||||
As a solution, create a new software defined SMCA bank type by utilizing
|
||||
one of the hardware-reserved values for HWID. The new SMCA bank type will
|
||||
only be employed for CS error decoding on affected CPU models.
|
||||
|
||||
Additionally, since the existing error description structure for the CS
|
||||
SMCA bank type is still valid, add new error description structure to
|
||||
compensate for the reassigned bit definitions.
|
||||
|
||||
Signed-off-by: Avadhut Naik <avadnaik@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
||||
|
||||
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
|
||||
index 7ec787a..e81f732 100644
|
||||
--- a/mce-amd-smca.c
|
||||
+++ b/mce-amd-smca.c
|
||||
@@ -57,6 +57,7 @@ enum smca_bank_types {
|
||||
SMCA_L3_CACHE, /* L3 Cache */
|
||||
SMCA_CS, /* Coherent Slave */
|
||||
SMCA_CS_V2,
|
||||
+ SMCA_CS_V2_QUIRK,
|
||||
SMCA_PIE, /* Power, Interrupts, etc. */
|
||||
SMCA_UMC, /* Unified Memory Controller */
|
||||
SMCA_UMC_V2,
|
||||
@@ -259,6 +260,31 @@ static const char * const smca_cs2_mce_desc[] = {
|
||||
"Hardware Assert Error",
|
||||
};
|
||||
|
||||
+/*
|
||||
+ * Per Genoa's revision guide, erratum 1384, existing bit definitions
|
||||
+ * are reassigned for SMCA CS bank type.
|
||||
+ */
|
||||
+static const char * const smca_cs2_quirk_mce_desc[] = {
|
||||
+ "Illegal Request",
|
||||
+ "Address Violation",
|
||||
+ "Security Violation",
|
||||
+ "Illegal Response",
|
||||
+ "Unexpected Response",
|
||||
+ "Request or Probe Parity Error",
|
||||
+ "Read Response Parity Error",
|
||||
+ "Atomic Request Parity Error",
|
||||
+ "SDP read response had no match in the CS queue",
|
||||
+ "SDP read response had an unexpected RETRY error",
|
||||
+ "Counter overflow error",
|
||||
+ "Counter underflow error",
|
||||
+ "Probe Filter Protocol Error",
|
||||
+ "Probe Filter ECC Error",
|
||||
+ "Illegal Request on the no data channel",
|
||||
+ "Address Violation on the no data channel",
|
||||
+ "Security Violation on the no data channel",
|
||||
+ "Hardware Assert Error",
|
||||
+};
|
||||
+
|
||||
static const char * const smca_pie_mce_desc[] = {
|
||||
"Hardware assert",
|
||||
"Register security violation",
|
||||
@@ -549,6 +575,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
|
||||
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
|
||||
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
|
||||
[SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
|
||||
+ [SMCA_CS_V2_QUIRK] = { smca_cs2_quirk_mce_desc, ARRAY_SIZE(smca_cs2_quirk_mce_desc)},
|
||||
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
|
||||
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
|
||||
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
|
||||
@@ -597,6 +624,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
/* Data Fabric MCA types */
|
||||
{ SMCA_CS, 0x0000002E },
|
||||
{ SMCA_CS_V2, 0x0002002E },
|
||||
+ {SMCA_CS_V2_QUIRK, 0x00010000 },
|
||||
{ SMCA_PIE, 0x0001002E },
|
||||
|
||||
/* Unified Memory Controller MCA type */
|
||||
@@ -660,7 +688,7 @@ static struct smca_bank_name smca_names[] = {
|
||||
[SMCA_EX] = { "Execution Unit" },
|
||||
[SMCA_FP] = { "Floating Point Unit" },
|
||||
[SMCA_L3_CACHE] = { "L3 Cache" },
|
||||
- [SMCA_CS ... SMCA_CS_V2] = { "Coherent Slave" },
|
||||
+ [SMCA_CS ... SMCA_CS_V2_QUIRK] = { "Coherent Slave" },
|
||||
[SMCA_PIE] = { "Power, Interrupts, etc." },
|
||||
[SMCA_UMC] = { "Unified Memory Controller" },
|
||||
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
|
||||
@@ -723,8 +751,38 @@ static int find_hbm_channel(struct mce_event *e)
|
||||
return (umc % 2) ? tmp + 4 : tmp;
|
||||
}
|
||||
|
||||
+static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
|
||||
+{
|
||||
+ if (m->family == 0x19) {
|
||||
+ switch (m->model) {
|
||||
+ /*
|
||||
+ * Per Genoa's revision guide, erratum 1384, some SMCA Extended
|
||||
+ * Error Codes and SMCA Control bits are incorrect for SMCA CS
|
||||
+ * bank type.
|
||||
+ */
|
||||
+ case 0x10 ... 0x1F:
|
||||
+ case 0x60 ... 0x7B:
|
||||
+ case 0xA0 ... 0xAF:
|
||||
+ if (*hwid_mcatype == 0x0002002E)
|
||||
+ *hwid_mcatype = 0x00010000;
|
||||
+ break;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+ } else if (m->family == 0x1A) {
|
||||
+ switch (m->model) {
|
||||
+ case 0x40 ... 0x4F:
|
||||
+ if (*hwid_mcatype == 0x0002002E)
|
||||
+ *hwid_mcatype = 0x00010000;
|
||||
+ break;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
/* Decode extended errors according to Scalable MCA specification */
|
||||
-static void decode_smca_error(struct mce_event *e)
|
||||
+static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
|
||||
{
|
||||
enum smca_bank_types bank_type;
|
||||
const char *ip_name;
|
||||
@@ -735,6 +793,8 @@ static void decode_smca_error(struct mce_event *e)
|
||||
unsigned int csrow = -1, channel = -1;
|
||||
unsigned int i;
|
||||
|
||||
+ fixup_hwid(m, &mcatype_hwid);
|
||||
+
|
||||
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
|
||||
s_hwid = &smca_hwid_mcatypes[i];
|
||||
if (mcatype_hwid == s_hwid->mcatype_hwid) {
|
||||
@@ -801,7 +861,7 @@ int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
|
||||
if (mcgstatus & MCG_STATUS_MCIP)
|
||||
mce_snprintf(e->mcgstatus_msg, "MCIP");
|
||||
|
||||
- decode_smca_error(e);
|
||||
+ decode_smca_error(e, ras->mce_priv);
|
||||
amd_decode_errcode(e);
|
||||
return 0;
|
||||
}
|
208
b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch
Normal file
208
b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch
Normal file
@ -0,0 +1,208 @@
|
||||
commit b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87
|
||||
Author: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Date: Thu Aug 31 02:23:48 2023 -0500
|
||||
|
||||
rasdaemon: Fix SMCA bank type decoding
|
||||
|
||||
On AMD systems with Scalable MCA (SMCA), the (HWID, MCATYPE) tuple from
|
||||
the MCA_IPID MSR, bits 43:32 and 63:48 respectively, are used for SMCA
|
||||
bank type decoding. On occurrence of an SMCA error, the cached tuples are
|
||||
compared against the tuple read from the MCA_IPID MSR to determine the
|
||||
SMCA bank type.
|
||||
|
||||
Currently however, all high 32 bits of the MCA_IPID register are cached in
|
||||
the rasdaemon for all SMCA bank types. Bits 47:44 which do not play a part
|
||||
in bank type decoding are zeroed out. Likewise, when an SMCA error occurs,
|
||||
all high 32 bits of the MCA_IPID register are read and compared against
|
||||
the cached values in smca_hwid_mcatypes array.
|
||||
|
||||
This can lead to erroneous bank type decoding since the bits 47:44 are
|
||||
not guaranteed to be zero. They are either reserved or, on some modern
|
||||
AMD systems viz. Genoa, denote the InstanceIdHi value. The bits therefore,
|
||||
should not be associated with SMCA bank type decoding.
|
||||
|
||||
Import the HWID_MCATYPE macro from the kernel to ensure that only the
|
||||
relevant fields i.e. (HWID, MCATYPE) tuples are used for SMCA bank type
|
||||
decoding on occurrence of an SMCA error.
|
||||
|
||||
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
||||
|
||||
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
|
||||
index a20f03c..55620e2 100644
|
||||
--- a/mce-amd-smca.c
|
||||
+++ b/mce-amd-smca.c
|
||||
@@ -90,6 +90,12 @@ enum smca_bank_types {
|
||||
/* Maximum number of MCA banks per CPU. */
|
||||
#define MAX_NR_BANKS 64
|
||||
|
||||
+#define MCI_IPID_MCATYPE 0xFFFF0000
|
||||
+#define MCI_IPID_HWID 0xFFF
|
||||
+
|
||||
+/* Obtain HWID_MCATYPE Tuple on SMCA Systems */
|
||||
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
|
||||
+
|
||||
/*
|
||||
* On Newer heterogeneous systems from AMD with CPU and GPU nodes connected
|
||||
* via xGMI links, the NON CPU Nodes are enumerated from index 8
|
||||
@@ -699,76 +705,76 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
/* { bank_type, mcatype_hwid } */
|
||||
|
||||
/* ZN Core (HWID=0xB0) MCA types */
|
||||
- { SMCA_LS, 0x000000B0 },
|
||||
- { SMCA_LS_V2, 0x001000B0 },
|
||||
- { SMCA_IF, 0x000100B0 },
|
||||
- { SMCA_L2_CACHE, 0x000200B0 },
|
||||
- { SMCA_DE, 0x000300B0 },
|
||||
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) },
|
||||
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) },
|
||||
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) },
|
||||
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) },
|
||||
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) },
|
||||
/* HWID 0xB0 MCATYPE 0x4 is Reserved */
|
||||
- { SMCA_EX, 0x000500B0 },
|
||||
- { SMCA_FP, 0x000600B0 },
|
||||
- { SMCA_L3_CACHE, 0x000700B0 },
|
||||
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) },
|
||||
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) },
|
||||
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) },
|
||||
|
||||
/* Data Fabric MCA types */
|
||||
- { SMCA_CS, 0x0000002E },
|
||||
- { SMCA_CS_V2, 0x0002002E },
|
||||
- {SMCA_CS_V2_QUIRK, 0x00010000 },
|
||||
- { SMCA_PIE, 0x0001002E },
|
||||
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
|
||||
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
|
||||
+ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
|
||||
+ { SMCA_CS_V2_QUIRK, HWID_MCATYPE(0x0, 0x1) },
|
||||
|
||||
/* Unified Memory Controller MCA type */
|
||||
- { SMCA_UMC, 0x00000096 },
|
||||
- { SMCA_UMC_QUIRK, 0x00020000 },
|
||||
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
|
||||
+ { SMCA_UMC_QUIRK, HWID_MCATYPE(0x0, 0x2) },
|
||||
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
|
||||
- { SMCA_UMC_V2, 0x00010096 },
|
||||
+ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) },
|
||||
/* Memory Attached Last Level Cache */
|
||||
- { SMCA_MA_LLC, 0x0004002E },
|
||||
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
|
||||
|
||||
/* Parameter Block MCA type */
|
||||
- { SMCA_PB, 0x00000005 },
|
||||
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
|
||||
|
||||
/* Platform Security Processor MCA type */
|
||||
- { SMCA_PSP, 0x000000FF },
|
||||
- { SMCA_PSP_V2, 0x000100FF },
|
||||
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) },
|
||||
+ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) },
|
||||
|
||||
/* System Management Unit MCA type */
|
||||
- { SMCA_SMU, 0x00000001 },
|
||||
- { SMCA_SMU_V2, 0x00010001 },
|
||||
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) },
|
||||
+ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) },
|
||||
|
||||
/* Microprocessor 5 Unit MCA type */
|
||||
- { SMCA_MP5, 0x00020001 },
|
||||
+ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
|
||||
|
||||
/* MPDMA MCA Type */
|
||||
- { SMCA_MPDMA, 0x00030001 },
|
||||
+ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
|
||||
|
||||
/* Northbridge IO Unit MCA type */
|
||||
- { SMCA_NBIO, 0x00000018 },
|
||||
+ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
|
||||
|
||||
/* PCI Express Unit MCA type */
|
||||
- { SMCA_PCIE, 0x00000046 },
|
||||
- { SMCA_PCIE_V2, 0x00010046 },
|
||||
+ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
|
||||
+ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
|
||||
|
||||
/* Ext Global Memory Interconnect PCS MCA type */
|
||||
- { SMCA_XGMI_PCS, 0x00000050 },
|
||||
+ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
|
||||
|
||||
- { SMCA_NBIF, 0x0000006C },
|
||||
+ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
|
||||
|
||||
- { SMCA_SHUB, 0x00000080 },
|
||||
- { SMCA_SATA, 0x000000A8 },
|
||||
- { SMCA_USB, 0x000000AA },
|
||||
+ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
|
||||
+ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
|
||||
+ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
|
||||
|
||||
/* Ultra Short Reach Data and Control Plane Controller */
|
||||
- { SMCA_USR_DP, 0x00000170 },
|
||||
- { SMCA_USR_CP, 0x00000180 },
|
||||
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
|
||||
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
|
||||
|
||||
- { SMCA_GMI_PCS, 0x00000241 },
|
||||
+ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
|
||||
|
||||
/* Ext Global Memory Interconnect PHY MCA type */
|
||||
- { SMCA_XGMI_PHY, 0x00000259 },
|
||||
+ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
|
||||
|
||||
/* WAFL PHY MCA type */
|
||||
- { SMCA_WAFL_PHY, 0x00000267 },
|
||||
+ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
|
||||
|
||||
- { SMCA_GMI_PHY, 0x00000269 },
|
||||
+ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
|
||||
};
|
||||
|
||||
struct smca_bank_name {
|
||||
@@ -862,12 +868,12 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
|
||||
case 0x10 ... 0x1F:
|
||||
case 0x60 ... 0x7B:
|
||||
case 0xA0 ... 0xAF:
|
||||
- if (*hwid_mcatype == 0x0002002E)
|
||||
- *hwid_mcatype = 0x00010000;
|
||||
+ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2))
|
||||
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1);
|
||||
break;
|
||||
case 0x90 ... 0x9F:
|
||||
- if ((*hwid_mcatype & 0xFF) == 0x00000096)
|
||||
- *hwid_mcatype = 0x00020000;
|
||||
+ if (*hwid_mcatype == HWID_MCATYPE(0x96, 0x0))
|
||||
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x2);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -875,8 +881,8 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
|
||||
} else if (m->family == 0x1A) {
|
||||
switch (m->model) {
|
||||
case 0x40 ... 0x4F:
|
||||
- if (*hwid_mcatype == 0x0002002E)
|
||||
- *hwid_mcatype = 0x00010000;
|
||||
+ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2))
|
||||
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -889,13 +895,17 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
|
||||
{
|
||||
enum smca_bank_types bank_type;
|
||||
const char *ip_name;
|
||||
+ uint32_t mcatype_hwid = 0;
|
||||
unsigned short xec = (e->status >> 16) & 0x3f;
|
||||
const struct smca_hwid *s_hwid;
|
||||
- uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
|
||||
+ uint32_t ipid_high = EXTRACT(e->ipid, 32, 63);
|
||||
uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
|
||||
unsigned int csrow = -1, channel = -1;
|
||||
unsigned int i;
|
||||
|
||||
+ mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
|
||||
+ (ipid_high & MCI_IPID_MCATYPE) >> 16);
|
||||
+
|
||||
fixup_hwid(m, &mcatype_hwid);
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
|
37
c785d309dcbdeb7ecd219975244f3944a8d047e9.patch
Normal file
37
c785d309dcbdeb7ecd219975244f3944a8d047e9.patch
Normal file
@ -0,0 +1,37 @@
|
||||
commit c785d309dcbdeb7ecd219975244f3944a8d047e9
|
||||
Author: Muralidhara M K <muralidhara.mk@amd.com>
|
||||
Date: Thu Jul 27 10:18:12 2023 +0000
|
||||
|
||||
rasdaemon: Identify the Die number in multi-die systems
|
||||
|
||||
Some AMD systems have 4 dies in each socket and Die ID represents
|
||||
whether the error occurred on a CPU die or a GPU die.
|
||||
Also, the respective die is used for FRU identification.
|
||||
|
||||
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
|
||||
|
||||
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
|
||||
index 54060ee..a20f03c 100644
|
||||
--- a/mce-amd-smca.c
|
||||
+++ b/mce-amd-smca.c
|
||||
@@ -935,10 +935,15 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
|
||||
xec);
|
||||
|
||||
if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_QUIRK) && xec == 0) {
|
||||
- channel = find_umc_channel(e);
|
||||
- csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
|
||||
- mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
|
||||
- channel, csrow);
|
||||
+ if ((m->family == 0x19) && (m->model >= 0x90 && m->model <= 0x9f)) {
|
||||
+ /* MCA_IPID[InstanceIdHi] give the AMD Node Die ID */
|
||||
+ mce_snprintf(e->mc_location, "memory_die_id=%d", mcatype_instancehi / 4);
|
||||
+ } else {
|
||||
+ channel = find_umc_channel(e);
|
||||
+ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
|
||||
+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
|
||||
+ channel, csrow);
|
||||
+ }
|
||||
}
|
||||
|
||||
if (bank_type == SMCA_UMC_V2 && xec == 0) {
|
@ -1,6 +1,6 @@
|
||||
Name: rasdaemon
|
||||
Version: 0.6.7
|
||||
Release: 8%{?dist}
|
||||
Release: 9%{?dist}
|
||||
Summary: Utility to receive RAS error tracings
|
||||
License: GPLv2
|
||||
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
|
||||
@ -26,6 +26,13 @@ Patch17: 2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch
|
||||
Patch18: 7ccf12f5ae26a055926d175d908c7930293438c4.patch
|
||||
Patch19: 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch
|
||||
Patch20: d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch
|
||||
Patch21: 30158ef8d7aebc3e5201bf39b73ce7644f8e419e.patch
|
||||
Patch22: aa36c96cd52d775570dae989dd95a060f1149077.patch
|
||||
Patch23: 932118b04a04104dfac6b8536419803f236e6118.patch
|
||||
Patch24: 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch
|
||||
Patch25: 2d15882a0cbfce0b905039bebc811ac8311cd739.patch
|
||||
Patch26: c785d309dcbdeb7ecd219975244f3944a8d047e9.patch
|
||||
Patch27: b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch
|
||||
|
||||
ExcludeArch: s390 s390x
|
||||
BuildRequires: make
|
||||
@ -81,6 +88,13 @@ an utility for reporting current error counts from the EDAC sysfs files.
|
||||
%patch18 -p1
|
||||
%patch19 -p1
|
||||
%patch20 -p1
|
||||
%patch21 -p1
|
||||
%patch22 -p1
|
||||
%patch23 -p1
|
||||
%patch24 -p1
|
||||
%patch25 -p1
|
||||
%patch26 -p1
|
||||
%patch27 -p1
|
||||
|
||||
# The tarball is locked in time the first time aclocal was run and will keep
|
||||
# requiring an older version of automake
|
||||
@ -116,6 +130,9 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir
|
||||
%{_sysconfdir}/sysconfig/rasdaemon
|
||||
|
||||
%changelog
|
||||
* Thu Oct 26 2023 Aristeu Rozanski <aris@redhat.com> 0.6.7-9
|
||||
- Update SMCA support for AMD processors [RHEL-11092]
|
||||
|
||||
* Tue May 03 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-8
|
||||
- Update ras-mc-ctl manpage to match current options [2079132]
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user