Import from CS git

This commit is contained in:
eabdullin 2024-04-04 10:07:36 +03:00
parent 48cb57c06c
commit 2468c69d7d
8 changed files with 1625 additions and 1 deletions

View File

@ -0,0 +1,163 @@
commit 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Fri Jun 30 10:36:53 2023 +0000
rasdaemon: Add new MA_LLC, USR_DP, and USR_CP bank types.
Add HWID and McaType values for new SMCA bank types
and error decoding for those new SMCA banks.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7c88a46..fc51b5a 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -61,6 +61,7 @@ enum smca_bank_types {
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_UMC_V2,
+ SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
SMCA_PSP_V2,
@@ -76,6 +77,8 @@ enum smca_bank_types {
SMCA_SHUB, /* System Hub Unit */
SMCA_SATA, /* SATA Unit */
SMCA_USB, /* USB Unit */
+ SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */
+ SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */
SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
@@ -325,6 +328,16 @@ static const char * const smca_umc2_mce_desc[] = {
"LM32 MP errors",
};
+static const char * const smca_mall_mce_desc[] = {
+ "Counter overflow error",
+ "Counter underflow error",
+ "Write Data Parity Error",
+ "Read Response Parity Error",
+ "Cache Tag ECC Error Macro 0",
+ "Cache Tag ECC Error Macro 1",
+ "Cache Data ECC Error"
+};
+
static const char * const smca_pb_mce_desc[] = {
"An ECC error in the Parameter Block RAM array"
};
@@ -524,6 +537,57 @@ static const char * const smca_usb_mce_desc[] = {
"AXI Slave Response error",
};
+static const char * const smca_usrdp_mce_desc[] = {
+ "Mst CMD Error",
+ "Mst Rx FIFO Error",
+ "Mst Deskew Error",
+ "Mst Detect Timeout Error",
+ "Mst FlowControl Error",
+ "Mst DataValid FIFO Error",
+ "Mac LinkState Error",
+ "Deskew Error",
+ "Init Timeout Error",
+ "Init Attempt Error",
+ "Recovery Timeout Error",
+ "Recovery Attempt Error",
+ "Eye Training Timeout Error",
+ "Data Startup Limit Error",
+ "LS0 Exit Error",
+ "PLL powerState Update Timeout Error",
+ "Rx FIFO Error",
+ "Lcu Error",
+ "Conv CECC Error",
+ "Conv UECC Error",
+ "Reserved",
+ "Rx DataLoss Error",
+ "Replay CECC Error",
+ "Replay UECC Error",
+ "CRC Error",
+ "BER Exceeded Error",
+ "FC Init Timeout Error",
+ "FC Init Attempt Error",
+ "Replay Timeout Error",
+ "Replay Attempt Error",
+ "Replay Underflow Error",
+ "Replay Overflow Error",
+};
+
+static const char * const smca_usrcp_mce_desc[] = {
+ "Packet Type Error",
+ "Rx FIFO Error",
+ "Deskew Error",
+ "Rx Detect Timeout Error",
+ "Data Parity Error",
+ "Data Loss Error",
+ "Lcu Error",
+ "HB1 Handshake Timeout Error",
+ "HB2 Handshake Timeout Error",
+ "Clk Sleep Rsp Timeout Error",
+ "Clk Wake Rsp Timeout Error",
+ "Reset Attack Error",
+ "Remote Link Fatal Error",
+};
+
static const char * const smca_gmipcs_mce_desc[] = {
"Data Loss Error",
"Training Error",
@@ -579,6 +643,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
+ [SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
[SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
@@ -595,6 +660,8 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
[SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
[SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
+ [SMCA_USR_DP] = { smca_usrdp_mce_desc, ARRAY_SIZE(smca_usrdp_mce_desc) },
+ [SMCA_USR_CP] = { smca_usrcp_mce_desc, ARRAY_SIZE(smca_usrcp_mce_desc) },
[SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
/* All the PHY bank types have the same error descriptions, for now. */
[SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
@@ -631,6 +698,8 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_UMC, 0x00000096 },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
{ SMCA_UMC_V2, 0x00010096 },
+ /* Memory Attached Last Level Cache */
+ { SMCA_MA_LLC, 0x0004002E },
/* Parameter Block MCA type */
{ SMCA_PB, 0x00000005 },
@@ -664,6 +733,11 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_SHUB, 0x00000080 },
{ SMCA_SATA, 0x000000A8 },
{ SMCA_USB, 0x000000AA },
+
+ /* Ultra Short Reach Data and Control Plane Controller */
+ { SMCA_USR_DP, 0x00000170 },
+ { SMCA_USR_CP, 0x00000180 },
+
{ SMCA_GMI_PCS, 0x00000241 },
/* Ext Global Memory Interconnect PHY MCA type */
@@ -692,6 +766,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PIE] = { "Power, Interrupts, etc." },
[SMCA_UMC] = { "Unified Memory Controller" },
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
+ [SMCA_MA_LLC] = { "Memory Attached Last Level Cache" },
[SMCA_PB] = { "Parameter Block" },
[SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
[SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
@@ -704,6 +779,8 @@ static struct smca_bank_name smca_names[] = {
[SMCA_SHUB] = { "System Hub Unit" },
[SMCA_SATA] = { "SATA Unit" },
[SMCA_USB] = { "USB Unit" },
+ [SMCA_USR_DP] = { "Ultra Short Reach Data Plane Controller" },
+ [SMCA_USR_CP] = { "Ultra Short Reach Control Plane Controller" },
[SMCA_GMI_PCS] = { "Global Memory Interconnect PCS Unit" },
[SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
[SMCA_WAFL_PHY] = { "WAFL PHY Unit" },

View File

@ -0,0 +1,105 @@
commit 2d15882a0cbfce0b905039bebc811ac8311cd739
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Fri Jun 30 11:19:42 2023 +0000
rasdaemon: Handle reassigned bit definitions for UMC bank
On some AMD systems some of the existing bit definitions in the
CTL register of SMCA bank type are reassigned without defining
new HWID and McaType. Consequently, the errors whose bit
definitions have been reassigned in the CTL register are being
erroneously decoded.
Add a new error description structure to compensate for the
reassigned bit definitions, using a new software-defined SMCA bank
type that utilizes the hardware-reserved values for HWID.
The new SMCA bank type will only be employed for UMC error
decoding on affected models and the existing error description
structure for UMC bank type is still valid.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index fc51b5a..54060ee 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -60,6 +60,7 @@ enum smca_bank_types {
SMCA_CS_V2_QUIRK,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
+ SMCA_UMC_QUIRK,
SMCA_UMC_V2,
SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
@@ -313,6 +314,25 @@ static const char * const smca_umc_mce_desc[] = {
"Read CRC Error",
};
+static const char * const smca_umc_quirk_mce_desc[] = {
+ "DRAM On Die ECC error",
+ "Data poison error",
+ "SDP parity error",
+ "Reserved",
+ "Address/Command parity error",
+ "HBM Write data parity error",
+ "Consolidated SRAM ECC error",
+ "Reserved",
+ "Reserved",
+ "Rdb SRAM ECC error",
+ "Thermal throttling",
+ "HBM Read Data Parity error",
+ "Reserved",
+ "UMC FW Error",
+ "SRAM Parity Error",
+ "HBM CRC Error",
+};
+
static const char * const smca_umc2_mce_desc[] = {
"DRAM ECC error",
"Data poison error",
@@ -642,6 +662,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_CS_V2_QUIRK] = { smca_cs2_quirk_mce_desc, ARRAY_SIZE(smca_cs2_quirk_mce_desc)},
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
+ [SMCA_UMC_QUIRK] = { smca_umc_quirk_mce_desc, ARRAY_SIZE(smca_umc_quirk_mce_desc) },
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
[SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
@@ -696,6 +717,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Unified Memory Controller MCA type */
{ SMCA_UMC, 0x00000096 },
+ { SMCA_UMC_QUIRK, 0x00020000 },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
{ SMCA_UMC_V2, 0x00010096 },
/* Memory Attached Last Level Cache */
@@ -764,7 +786,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_L3_CACHE] = { "L3 Cache" },
[SMCA_CS ... SMCA_CS_V2_QUIRK] = { "Coherent Slave" },
[SMCA_PIE] = { "Power, Interrupts, etc." },
- [SMCA_UMC] = { "Unified Memory Controller" },
+ [SMCA_UMC ... SMCA_UMC_QUIRK] = { "Unified Memory Controller" },
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
[SMCA_MA_LLC] = { "Memory Attached Last Level Cache" },
[SMCA_PB] = { "Parameter Block" },
@@ -843,6 +865,10 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
if (*hwid_mcatype == 0x0002002E)
*hwid_mcatype = 0x00010000;
break;
+ case 0x90 ... 0x9F:
+ if ((*hwid_mcatype & 0xFF) == 0x00000096)
+ *hwid_mcatype = 0x00020000;
+ break;
default:
break;
}
@@ -908,7 +934,7 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
smca_mce_descs[bank_type].descs[xec],
xec);
- if (bank_type == SMCA_UMC && xec == 0) {
+ if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_QUIRK) && xec == 0) {
channel = find_umc_channel(e);
csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",

View File

@ -0,0 +1,524 @@
commit 30158ef8d7aebc3e5201bf39b73ce7644f8e419e
Author: Avadhut Naik <avadnaik@amd.com>
Date: Tue Apr 18 18:24:21 2023 +0000
rasdaemon: Update SMCA bank error descriptions
Update and reword some existing SMCA bank type error descriptions to extend
SMCA error decoding functionality for modern AMD processors. Additionally,
add new error descriptions for missing SMCA bank types.
Signed-off-by: Avadhut Naik <avadnaik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 27ca8aa..7ec787a 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -66,12 +66,19 @@ enum smca_bank_types {
SMCA_SMU, /* System Management Unit */
SMCA_SMU_V2,
SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_MPDMA, /* MPDMA Unit */
SMCA_NBIO, /* Northbridge IO Unit */
SMCA_PCIE, /* PCI Express Unit */
SMCA_PCIE_V2,
SMCA_XGMI_PCS, /* xGMI PCS Unit */
+ SMCA_NBIF, /*NBIF Unit */
+ SMCA_SHUB, /* System Hub Unit */
+ SMCA_SATA, /* SATA Unit */
+ SMCA_USB, /* USB Unit */
+ SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
+ SMCA_GMI_PHY, /* GMI PHY Unit */
N_SMCA_BANK_TYPES
};
@@ -85,7 +92,6 @@ enum smca_bank_types {
#define NONCPU_NODE_INDEX 8
/* SMCA Extended error strings */
-/* Load Store */
static const char * const smca_ls_mce_desc[] = {
"Load queue parity",
"Store queue parity",
@@ -109,6 +115,7 @@ static const char * const smca_ls_mce_desc[] = {
"DC tag error type 5",
"L2 fill data error",
};
+
static const char * const smca_ls2_mce_desc[] = {
"An ECC error was detected on a data cache read by a probe or victimization",
"An ECC error or L2 poison was detected on a data cache read by a load",
@@ -133,92 +140,104 @@ static const char * const smca_ls2_mce_desc[] = {
"A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
"A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
"A hardware assertion error was reported",
- "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
+ "A parity error was detected in an STLF, SCB EMEM entry, store data mask or SRB store data by any access",
};
-/* Instruction Fetch */
+
static const char * const smca_if_mce_desc[] = {
"microtag probe port parity error",
"IC microtag or full tag multi-hit error",
"IC full tag parity",
"IC data array parity",
- "Decoupling queue phys addr parity error",
+ "PRQ Parity Error",
"L0 ITLB parity error",
- "L1 ITLB parity error",
- "L2 ITLB parity error",
+ "L1-TLB parity error",
+ "L2-TLB parity error",
"BPQ snoop parity on Thread 0",
"BPQ snoop parity on Thread 1",
- "L1 BTB multi-match error",
- "L2 BTB multi-match error",
+ "BP L1-BTB Multi-Hit Error",
+ "BP L2-BTB Multi-Hit Error",
"L2 Cache Response Poison error",
- "System Read Data error",
+ "L2 Cache Error Response",
+ "Hardware Assertion Error",
+ "L1-TLB Multi-Hit",
+ "L2-TLB Multi-Hit",
+ "BSR Parity Error",
+ "CT MCE",
};
-/* L2 Cache */
+
static const char * const smca_l2_mce_desc[] = {
- "L2M tag multi-way-hit error",
- "L2M tag ECC error",
- "L2M data ECC error",
- "HW assert",
+ "L2M Tag Multiple-Way-Hit error",
+ "L2M Tag or State Array ECC Error",
+ "L2M Data Array ECC Error",
+ "Hardware Assert Error",
+ "SDP Read Response Parity Error",
};
-/* Decoder Unit */
+
static const char * const smca_de_mce_desc[] = {
- "uop cache tag parity error",
- "uop cache data parity error",
- "Insn buffer parity error",
- "uop queue parity error",
- "Insn dispatch queue parity error",
- "Fetch address FIFO parity",
- "Patch RAM data parity",
- "Patch RAM sequencer parity",
- "uop buffer parity"
-};
-/* Execution Unit */
+ "Micro-op cache tag array parity error",
+ "Micro-op cache data array parity error",
+ "IBB Register File parity error",
+ "Micro-op queue parity error",
+ "Instruction dispatch queue parity error",
+ "Fetch address FIFO parity error",
+ "Patch RAM data parity error",
+ "Patch RAM sequencer parity error",
+ "Micro-op buffer parity error",
+ "Hardware Assertion MCA Error",
+};
+
static const char * const smca_ex_mce_desc[] = {
"Watchdog timeout error",
- "Phy register file parity",
- "Flag register file parity",
- "Immediate displacement register file parity",
- "Address generator payload parity",
- "EX payload parity",
- "Checkpoint queue parity",
- "Retire dispatch queue parity",
+ "Physical register file parity error",
+ "Flag register file parity error",
+ "Immediate displacement register file parity error",
+ "Address generator payload parity error",
+ "EX payload parity error",
+ "Checkpoint queue parity error",
+ "Retire dispatch queue parity error",
"Retire status queue parity error",
- "Scheduling queue parity error",
+ "Scheduler queue parity error",
"Branch buffer queue parity error",
+ "Hardware Assertion error",
+ "Spec Map parity error",
+ "Retire Map parity error",
};
-/* Floating Point Unit */
+
static const char * const smca_fp_mce_desc[] = {
- "Physical register file parity",
- "Freelist parity error",
- "Schedule queue parity",
+ "Physical register file (PRF) parity error",
+ "Freelist (FL) parity error",
+ "Schedule queue parity error",
"NSQ parity error",
- "Retire queue parity",
- "Status register file parity",
+ "Retire queue (RQ) parity error",
+ "Status register file (SRF) parity error",
"Hardware assertion",
+ "Physical K mask register file (KRF) parity error",
};
-/* L3 Cache */
+
static const char * const smca_l3_mce_desc[] = {
"Shadow tag macro ECC error",
"Shadow tag macro multi-way-hit error",
"L3M tag ECC error",
"L3M tag multi-way-hit error",
"L3M data ECC error",
- "XI parity, L3 fill done channel error",
- "L3 victim queue parity",
- "L3 HW assert",
+ "SDP Parity Error from XI",
+ "L3 victim queue Data Fabric error",
+ "L3 Hardware Assertion",
+ "XI WCB Parity Poison Creation event",
};
-/* Coherent Slave Unit */
+
static const char * const smca_cs_mce_desc[] = {
- "Illegal request from transport layer",
+ "Illegal request",
"Address violation",
"Security violation",
- "Illegal response from transport layer",
+ "Illegal response",
"Unexpected response",
- "Parity error on incoming request or probe response data",
- "Parity error on incoming read response data",
- "Atomic request parity",
- "ECC error on probe filter access",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic request parity error",
+ "Probe Filter ECC Error",
};
-/* Coherent Slave Unit V2 */
+
static const char * const smca_cs2_mce_desc[] = {
"Illegal Request",
"Address Violation",
@@ -234,15 +253,22 @@ static const char * const smca_cs2_mce_desc[] = {
"SDP read response had an unexpected RETRY error",
"Counter overflow error",
"Counter underflow error",
+ "Illegal Request on the no data channel",
+ "Address Violation on the no data channel",
+ "Security Violation on the no data channel",
+ "Hardware Assert Error",
};
-/* Power, Interrupt, etc.. */
+
static const char * const smca_pie_mce_desc[] = {
- "HW assert",
- "Internal PIE register security violation",
- "Error on GMI link",
- "Poison data written to internal PIE register",
+ "Hardware assert",
+ "Register security violation",
+ "Link error",
+ "Poison data consumption",
+ "A deferred error was detected in the DF",
+ "Watch Dog Timer",
+ "An SRAM ECC error was detected in the CNLI block",
};
-/* Unified Memory Controller */
+
static const char * const smca_umc_mce_desc[] = {
"DRAM ECC error",
"Data poison error on DRAM",
@@ -250,6 +276,12 @@ static const char * const smca_umc_mce_desc[] = {
"Advanced peripheral bus error",
"Command/address parity error",
"Write data CRC error",
+ "DCQ SRAM ECC error",
+ "AES SRAM ECC error",
+ "ECS Row Error",
+ "ECS Error",
+ "UMC Throttling Error",
+ "Read CRC Error",
};
static const char * const smca_umc2_mce_desc[] = {
@@ -267,15 +299,14 @@ static const char * const smca_umc2_mce_desc[] = {
"LM32 MP errors",
};
-/* Parameter Block */
static const char * const smca_pb_mce_desc[] = {
- "Parameter Block RAM ECC error",
+ "An ECC error in the Parameter Block RAM array"
};
-/* Platform Security Processor */
+
static const char * const smca_psp_mce_desc[] = {
- "PSP RAM ECC or parity error",
+ "An ECC or parity error in a PSP RAM instance",
};
-/* Platform Security Processor V2 */
+
static const char * const smca_psp2_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
@@ -296,11 +327,11 @@ static const char * const smca_psp2_mce_desc[] = {
"TLB Bank 1 parity error",
"System Hub Read Buffer ECC or parity error",
};
-/* System Management Unit */
+
static const char * const smca_smu_mce_desc[] = {
- "SMU RAM ECC or parity error",
+ "An ECC or parity error in an SMU RAM instance",
};
-/* System Management Unit V2 */
+
static const char * const smca_smu2_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
@@ -314,7 +345,7 @@ static const char * const smca_smu2_mce_desc[] = {
"Instruction Tag Cache Bank B ECC or parity error",
"System Hub Read Buffer ECC or parity error",
};
-/* Microprocessor 5 Unit */
+
static const char * const smca_mp5_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
@@ -327,15 +358,68 @@ static const char * const smca_mp5_mce_desc[] = {
"Instruction Tag Cache Bank A ECC or parity error",
"Instruction Tag Cache Bank B ECC or parity error",
};
-/* Northbridge IO Unit */
+
+static const char * const smca_mpdma_mce_desc[] = {
+ "Main SRAM [31:0] bank ECC or parity error",
+ "Main SRAM [63:32] bank ECC or parity error",
+ "Main SRAM [95:64] bank ECC or parity error",
+ "Main SRAM [127:96] bank ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "System Hub Read Buffer ECC or parity error",
+ "MPDMA TVF DVSEC Memory ECC or parity error",
+ "MPDMA TVF MMIO Mailbox0 ECC or parity error",
+ "MPDMA TVF MMIO Mailbox1 ECC or parity error",
+ "MPDMA TVF Doorbell Memory ECC or parity error",
+ "MPDMA TVF SDP Slave Memory 0 ECC or parity error",
+ "MPDMA TVF SDP Slave Memory 1 ECC or parity error",
+ "MPDMA TVF SDP Slave Memory 2 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 0 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 1 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 2 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 3 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 4 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 5 ECC or parity error",
+ "MPDMA TVF SDP Master Memory 6 ECC or parity error",
+ "SDP Watchdog Timer expired",
+ "MPDMA PTE Command FIFO ECC or parity error",
+ "MPDMA PTE Hub Data FIFO ECC or parity error",
+ "MPDMA PTE Internal Data FIFO ECC or parity error",
+ "MPDMA PTE Command Memory DMA ECC or parity error",
+ "MPDMA PTE Command Memory Internal ECC or parity error",
+};
+
static const char * const smca_nbio_mce_desc[] = {
"ECC or Parity error",
"PCIE error",
- "SDP ErrEvent error",
- "SDP Egress Poison Error",
- "IOHC Internal Poison Error",
+ "External SDP ErrEvent error",
+ "SDP Egress Poison error",
+ "Internal Poison error",
+ "Internal system fatal error event",
};
-/* PCI Express Unit */
+
static const char * const smca_pcie_mce_desc[] = {
"CCIX PER Message logging",
"CCIX Read Response with Status: Non-Data Error",
@@ -345,7 +429,7 @@ static const char * const smca_pcie_mce_desc[] = {
};
static const char * const smca_pcie2_mce_desc[] = {
- "SDP Parity Error logging",
+ "SDP Data Parity Error logging",
};
static const char * const smca_xgmipcs_mce_desc[] = {
@@ -387,11 +471,66 @@ static const char * const smca_xgmiphy_mce_desc[] = {
"PHY APB error",
};
-static const char * const smca_waflphy_mce_desc[] = {
- "RAM ECC Error",
- "ARC instruction buffer parity error",
- "ARC data buffer parity error",
- "PHY APB error",
+static const char * const smca_nbif_mce_desc[] = {
+ "Timeout error from GMI",
+ "SRAM ECC error",
+ "NTB Error Event",
+ "SDP Parity error",
+};
+
+static const char * const smca_sata_mce_desc[] = {
+ "Parity error for port 0",
+ "Parity error for port 1",
+ "Parity error for port 2",
+ "Parity error for port 3",
+ "Parity error for port 4",
+ "Parity error for port 5",
+ "Parity error for port 6",
+ "Parity error for port 7",
+};
+
+static const char * const smca_usb_mce_desc[] = {
+ "Parity error or ECC error for S0 RAM0",
+ "Parity error or ECC error for S0 RAM1",
+ "Parity error or ECC error for S0 RAM2",
+ "Parity error for PHY RAM0",
+ "Parity error for PHY RAM1",
+ "AXI Slave Response error",
+};
+
+static const char * const smca_gmipcs_mce_desc[] = {
+ "Data Loss Error",
+ "Training Error",
+ "Replay Parity Error",
+ "Rx Fifo Underflow Error",
+ "Rx Fifo Overflow Error",
+ "CRC Error",
+ "BER Exceeded Error",
+ "Tx Fifo Underflow Error",
+ "Replay Buffer Parity Error",
+ "Tx Overflow Error",
+ "Replay Fifo Overflow Error",
+ "Replay Fifo Underflow Error",
+ "Elastic Fifo Overflow Error",
+ "Deskew Error",
+ "Offline Error",
+ "Data Startup Limit Error",
+ "FC Init Timeout Error",
+ "Recovery Timeout Error",
+ "Ready Serial Timeout Error",
+ "Ready Serial Attempt Error",
+ "Recovery Attempt Error",
+ "Recovery Relock Attempt Error",
+ "Deskew Abort Error",
+ "Rx Buffer Error",
+ "Rx LFDS Fifo Overflow Error",
+ "Rx LFDS Fifo Underflow Error",
+ "LinkSub Tx Timeout Error",
+ "LinkSub Rx Timeout Error",
+ "Rx CMD Packet Error",
+ "LFDS Training Timeout Error",
+ "LFDS FC Init Timeout Error",
+ "Data Loss Error",
};
struct smca_mce_desc {
@@ -419,12 +558,21 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
[SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)},
[SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
+ [SMCA_MPDMA] = { smca_mpdma_mce_desc, ARRAY_SIZE(smca_mpdma_mce_desc) },
[SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)},
[SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)},
[SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) },
[SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) },
+ /* NBIF and SHUB have the same error descriptions, for now. */
+ [SMCA_NBIF] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
+ [SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
+ [SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
+ [SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
+ [SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
+ /* All the PHY bank types have the same error descriptions, for now. */
[SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
- [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) },
+ [SMCA_WAFL_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
+ [SMCA_GMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
};
struct smca_hwid {
@@ -470,6 +618,9 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Microprocessor 5 Unit MCA type */
{ SMCA_MP5, 0x00020001 },
+ /* MPDMA MCA Type */
+ { SMCA_MPDMA, 0x00030001 },
+
/* Northbridge IO Unit MCA type */
{ SMCA_NBIO, 0x00000018 },
@@ -480,11 +631,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Ext Global Memory Interconnect PCS MCA type */
{ SMCA_XGMI_PCS, 0x00000050 },
+ { SMCA_NBIF, 0x0000006C },
+
+ { SMCA_SHUB, 0x00000080 },
+ { SMCA_SATA, 0x000000A8 },
+ { SMCA_USB, 0x000000AA },
+ { SMCA_GMI_PCS, 0x00000241 },
+
/* Ext Global Memory Interconnect PHY MCA type */
{ SMCA_XGMI_PHY, 0x00000259 },
/* WAFL PHY MCA type */
{ SMCA_WAFL_PHY, 0x00000267 },
+
+ { SMCA_GMI_PHY, 0x00000269 },
};
struct smca_bank_name {
@@ -508,12 +668,18 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
[SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
[SMCA_MP5] = { "Microprocessor 5 Unit" },
+ [SMCA_MPDMA] = { "MPDMA Unit" },
[SMCA_NBIO] = { "Northbridge IO Unit" },
[SMCA_PCIE ... SMCA_PCIE_V2] = { "PCI Express Unit" },
[SMCA_XGMI_PCS] = { "Ext Global Memory Interconnect PCS Unit" },
+ [SMCA_NBIF] = { "NBIF Unit" },
+ [SMCA_SHUB] = { "System Hub Unit" },
+ [SMCA_SATA] = { "SATA Unit" },
+ [SMCA_USB] = { "USB Unit" },
+ [SMCA_GMI_PCS] = { "Global Memory Interconnect PCS Unit" },
[SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
[SMCA_WAFL_PHY] = { "WAFL PHY Unit" },
-
+ [SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
static void amd_decode_errcode(struct mce_event *e)

View File

@ -0,0 +1,411 @@
commit 932118b04a04104dfac6b8536419803f236e6118
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Mon May 22 22:13:17 2023 +0000
rasdaemon: Add support for post-processing MCA errors
Currently, the rasdaemon performs detailed error decoding of received
MCA errors on the system only while it is running, either as a daemon
or in the foreground.
As such, error decoding cannot be undertaken for any MCA errors received
while the rasdaemon wasn't running. Additionally, if the error decoding
modules like edac_mce_amd too have not been loaded, error records in the
dmesg buffer might correspond to raw values in associated MSRs, compelling
users to undertake decoding manually. The scenario seems more plausible on
AMD systems with Scalable MCA (SMCA) with plans in place to remove SMCA
Extended Error Descriptions from the edac_mce_amd module in an effort to
offload SMCA Error Decoding to the rasdaemon.
As such, add support to post-process and decode MCA Errors received on AMD
SMCA systems from raw MSR values. Support for post-processing and decoding
of MCA Errors received on CPUs of other vendors can be added in the future,
as needed.
Suggested-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
mce-amd-smca.c | 8 ++-
ras-events.h | 1
ras-mce-handler.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++----
ras-mce-handler.h | 4 +
ras-record.h | 10 ++++
rasdaemon.c | 94 +++++++++++++++++++++++++++++++++++++++++++++-
6 files changed, 216 insertions(+), 11 deletions(-)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/mce-amd-smca.c 2023-10-27 12:44:58.549049019 -0400
@@ -710,7 +710,7 @@ static struct smca_bank_name smca_names[
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
-static void amd_decode_errcode(struct mce_event *e)
+void amd_decode_errcode(struct mce_event *e)
{
decode_amd_errcode(e);
@@ -782,7 +782,7 @@ *hwid_mcatype = 0x00010000;
}
/* Decode extended errors according to Scalable MCA specification */
-static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
+void decode_smca_error(struct mce_event *e, struct mce_priv *m)
{
enum smca_bank_types bank_type;
const char *ip_name;
@@ -827,7 +827,9 @@ for (i = 0; i < ARRAY_SIZE(smca_hwid_mca
/* Only print the descriptor of valid extended error code */
if (xec < smca_mce_descs[bank_type].num_descs)
mce_snprintf(e->mcastatus_msg,
- " %s.\n", smca_mce_descs[bank_type].descs[xec]);
+ "%s. Ext Err Code: %d",
+ smca_mce_descs[bank_type].descs[xec],
+ xec);
if (bank_type == SMCA_UMC && xec == 0) {
channel = find_umc_channel(e);
--- rasdaemon-0.6.7.orig/ras-events.h 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-events.h 2023-10-27 12:44:58.549049019 -0400
@@ -100,6 +100,7 @@ enum ghes_severity {
/* Function prototypes */
int toggle_ras_mc_event(int enable);
+int ras_offline_mce_event(struct ras_mc_offline_event *event);
int handle_ras_events(int record_events);
#endif
--- rasdaemon-0.6.7.orig/ras-mce-handler.c 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.c 2023-10-27 12:45:27.159776011 -0400
@@ -63,10 +63,8 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_SAPPHIRERAPIDS] = "Sapphirerapids server",
};
-static enum cputype select_intel_cputype(struct ras_events *ras)
+static enum cputype select_intel_cputype(struct mce_priv *mce)
{
- struct mce_priv *mce = ras->mce_priv;
-
if (mce->family == 15) {
if (mce->model == 6)
return CPU_TULSA;
@@ -140,9 +138,8 @@ if (mce->model > 0x1a) {
return mce->family == 6 ? CPU_P6OLD : CPU_GENERIC;
}
-static int detect_cpu(struct ras_events *ras)
+static int detect_cpu(struct mce_priv *mce)
{
- struct mce_priv *mce = ras->mce_priv;
FILE *f;
int ret = 0;
char *line = NULL;
@@ -221,7 +218,7 @@ ret = 0;
}
goto ret;
} else if (!strcmp(mce->vendor,"GenuineIntel")) {
- mce->cputype = select_intel_cputype(ras);
+ mce->cputype = select_intel_cputype(mce);
} else {
ret = EINVAL;
}
@@ -246,7 +243,7 @@ int register_mce_handler(struct ras_even
mce = ras->mce_priv;
- rc = detect_cpu(ras);
+ rc = detect_cpu(mce);
if (rc) {
if (mce->processor_flags)
free (mce->processor_flags);
@@ -383,6 +380,105 @@ #if 0
*/
}
+static int report_mce_offline(struct trace_seq *s,
+ struct mce_event *mce,
+ struct mce_priv *priv)
+{
+ time_t now;
+ struct tm *tm;
+
+ time(&now);
+ tm = localtime(&now);
+
+ if (tm)
+ strftime(mce->timestamp, sizeof(mce->timestamp),
+ "%Y-%m-%d %H:%M:%S %z", tm);
+ trace_seq_printf(s, "%s,", mce->timestamp);
+
+ if (*mce->bank_name)
+ trace_seq_printf(s, " %s,", mce->bank_name);
+ else
+ trace_seq_printf(s, " bank=%x,", mce->bank);
+
+ if (*mce->mcastatus_msg)
+ trace_seq_printf(s, " mca: %s,", mce->mcastatus_msg);
+
+ if (*mce->mcistatus_msg)
+ trace_seq_printf(s, " mci: %s,", mce->mcistatus_msg);
+
+ if (*mce->mc_location)
+ trace_seq_printf(s, " Locn: %s,", mce->mc_location);
+
+ if (*mce->error_msg)
+ trace_seq_printf(s, " Error Msg: %s\n", mce->error_msg);
+
+ return 0;
+}
+
+int ras_offline_mce_event(struct ras_mc_offline_event *event)
+{
+ int rc = 0;
+ struct trace_seq s;
+ struct mce_event *mce = NULL;
+ struct mce_priv *priv = NULL;
+
+ mce = (struct mce_event *)calloc(1, sizeof(struct mce_event));
+ if (!mce) {
+ log(TERM, LOG_ERR, "Can't allocate memory for mce struct\n");
+ return errno;
+ }
+
+ priv = (struct mce_priv *)calloc(1, sizeof(struct mce_priv));
+ if (!priv) {
+ log(TERM, LOG_ERR, "Can't allocate memory for mce_priv struct\n");
+ free(mce);
+ return errno;
+ }
+
+ if (event->smca) {
+ priv->cputype = CPU_AMD_SMCA;
+ priv->family = event->family;
+ priv->model = event->model;
+ } else {
+ rc = detect_cpu(priv);
+ if (rc) {
+ log(TERM, LOG_ERR, "Failed to detect CPU\n");
+ goto free_mce;
+ }
+ }
+
+ mce->status = event->status;
+ mce->bank = event->bank;
+
+ switch (priv->cputype) {
+ case CPU_AMD_SMCA:
+ mce->synd = event->synd;
+ mce->ipid = event->ipid;
+ if (!mce->ipid || !mce->status) {
+ log(TERM, LOG_ERR, "%s MSR required.\n",
+ mce->ipid ? "Status" : "Ipid");
+ rc = -EINVAL;
+ goto free_mce;
+ }
+ decode_smca_error(mce, priv);
+ amd_decode_errcode(mce);
+ break;
+ default:
+ break;
+ }
+
+ trace_seq_init(&s);
+ report_mce_offline(&s, mce, priv);
+ trace_seq_do_printf(&s);
+ fflush(stdout);
+ trace_seq_destroy(&s);
+
+free_mce:
+ free(priv);
+ free(mce);
+ return rc;
+}
+
int ras_mce_event_handler(struct trace_seq *s,
struct pevent_record *record,
struct event_format *event, void *context)
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2023-10-27 12:44:58.550049010 -0400
@@ -118,6 +118,10 @@ int ras_mce_event_handler(struct trace_s
/* enables intel iMC logs */
int set_intel_imc_log(enum cputype cputype, unsigned ncpus);
+/* Undertake AMD SMCA Error Decoding */
+void decode_smca_error(struct mce_event *e, struct mce_priv *m);
+void amd_decode_errcode(struct mce_event *e);
+
/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);
void core2_decode_model(struct mce_event *e);
--- rasdaemon-0.6.7.orig/ras-record.h 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/ras-record.h 2023-10-27 12:44:58.550049010 -0400
@@ -21,6 +21,7 @@ * Foundation, Inc., 51 Franklin Street,
#define __RAS_RECORD_H
#include <stdint.h>
+#include <stdbool.h>
#include "config.h"
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
@@ -39,6 +40,15 @@ struct ras_mc_event {
const char *driver_detail;
};
+struct ras_mc_offline_event {
+ unsigned int family, model;
+ bool smca;
+ uint8_t bank;
+ uint64_t ipid;
+ uint64_t synd;
+ uint64_t status;
+};
+
struct ras_aer_event {
char timestamp[64];
const char *error_type;
--- rasdaemon-0.6.7.orig/rasdaemon.c 2023-10-27 12:44:55.541077722 -0400
+++ rasdaemon-0.6.7/rasdaemon.c 2023-10-27 12:44:58.550049010 -0400
@@ -41,8 +41,21 @@ struct arguments {
int record_events;
int enable_ras;
int foreground;
+ int offline;
};
+enum OFFLINE_ARG_KEYS {
+ SMCA = 0x100,
+ MODEL,
+ FAMILY,
+ BANK_NUM,
+ IPID_REG,
+ STATUS_REG,
+ SYNDROME_REG
+};
+
+struct ras_mc_offline_event event;
+
static error_t parse_opt(int k, char *arg, struct argp_state *state)
{
struct arguments *args = state->input;
@@ -62,18 +75,84 @@ static error_t parse_opt(int k, char *ar
case 'f':
args->foreground++;
break;
+#ifdef HAVE_MCE
+ case 'p':
+ if (state->argc < 4)
+ argp_state_help(state, stdout, ARGP_HELP_LONG | ARGP_HELP_EXIT_ERR);
+ args->offline++;
+ break;
+#endif
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}
+#ifdef HAVE_MCE
+static error_t parse_opt_offline(int key, char *arg,
+ struct argp_state *state)
+{
+ switch (key) {
+ case SMCA:
+ event.smca = true;
+ break;
+ case MODEL:
+ event.model = strtoul(state->argv[state->next], NULL, 0);
+ break;
+ case FAMILY:
+ event.family = strtoul(state->argv[state->next], NULL, 0);
+ break;
+ case BANK_NUM:
+ event.bank = atoi(state->argv[state->next]);
+ break;
+ case IPID_REG:
+ event.ipid = strtoull(state->argv[state->next], NULL, 0);
+ break;
+ case STATUS_REG:
+ event.status = strtoull(state->argv[state->next], NULL, 0);
+ break;
+ case SYNDROME_REG:
+ event.synd = strtoull(state->argv[state->next], NULL, 0);
+ break;
+ default:
+ return ARGP_ERR_UNKNOWN;
+ }
+ return 0;
+}
+#endif
+
long user_hz;
int main(int argc, char *argv[])
{
struct arguments args;
int idx = -1;
+
+#ifdef HAVE_MCE
+ const struct argp_option offline_options[] = {
+ {"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
+ {"model", MODEL, 0, 0, "CPU Model"},
+ {"family", FAMILY, 0, 0, "CPU Family"},
+ {"bank", BANK_NUM, 0, 0, "Bank Number"},
+ {"ipid", IPID_REG, 0, 0, "IPID Register (for SMCA systems only)"},
+ {"status", STATUS_REG, 0, 0, "Status Register"},
+ {"synd", SYNDROME_REG, 0, 0, "Syndrome Register"},
+ {0, 0, 0, 0, 0, 0},
+ };
+
+ struct argp offline_argp = {
+ .options = offline_options,
+ .parser = parse_opt_offline,
+ .doc = TOOL_DESCRIPTION,
+ .args_doc = ARGS_DOC,
+ };
+
+ struct argp_child offline_parser[] = {
+ {&offline_argp, 0, "Post-Processing Options:", 0},
+ {0, 0, 0, 0},
+ };
+#endif
+
const struct argp_option options[] = {
{"enable", 'e', 0, 0, "enable RAS events and exit", 0},
{"disable", 'd', 0, 0, "disable RAS events and exit", 0},
@@ -81,6 +160,10 @@ {"disable", 'd', 0, 0, "disable RAS even
{"record", 'r', 0, 0, "record events via sqlite3", 0},
#endif
{"foreground", 'f', 0, 0, "run foreground, not daemonize"},
+#ifdef HAVE_MCE
+ {"post-processing", 'p', 0, 0,
+ "Post-processing MCE's with raw register values"},
+#endif
{ 0, 0, 0, 0, 0, 0 }
};
@@ -89,7 +172,9 @@ { 0, 0, 0, 0, 0, 0 }
.parser = parse_opt,
.doc = TOOL_DESCRIPTION,
.args_doc = ARGS_DOC,
-
+#ifdef HAVE_MCE
+ .children = offline_parser,
+#endif
};
memset (&args, 0, sizeof(args));
@@ -111,6 +196,13 @@ enable = (args.enable_ras > 0) ? 1 : 0;
return 0;
}
+#ifdef HAVE_MCE
+ if (args.offline) {
+ ras_offline_mce_event(&event);
+ return 0;
+ }
+#endif
+
openlog(TOOL_NAME, 0, LOG_DAEMON);
if (!args.foreground)
if (daemon(0,0))

View File

@ -0,0 +1,159 @@
commit aa36c96cd52d775570dae989dd95a060f1149077
Author: Avadhut Naik <avadnaik@amd.com>
Date: Mon Apr 24 20:35:56 2023 +0000
rasdaemon: Handle reassigned bit definitions for CS SMCA
Currently, on AMD systems with Scalable MCA (SMCA), each machine check
error of a SMCA bank type has an associated bit position in the bank's
control (CTL) register used for enabling / disabling reporting of that
error. An error's bit position in the CTL register is also used
during error decoding for offsetting into the corresponding bank's error
description structure. As new errors are being added in newer AMD systems
for existing SMCA bank types, the underlying SMCA architecture guarantees
that the bit positions of existing errors are not altered.
However, on some AMD systems viz. Genoa, some of the existing bit
definitions in the CTL register of the Coherent Slave (CS) SMCA bank type
are reassigned without defining new HWID and McaType. Consequently, the
very errors whose bit definitions have been reassigned in the CTL register
are being erroneously decoded.
As a solution, create a new software defined SMCA bank type by utilizing
one of the hardware-reserved values for HWID. The new SMCA bank type will
only be employed for CS error decoding on affected CPU models.
Additionally, since the existing error description structure for the CS
SMCA bank type is still valid, add a new error description structure to
compensate for the reassigned bit definitions.
Signed-off-by: Avadhut Naik <avadnaik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7ec787a..e81f732 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -57,6 +57,7 @@ enum smca_bank_types {
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
SMCA_CS_V2,
+ SMCA_CS_V2_QUIRK,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_UMC_V2,
@@ -259,6 +260,31 @@ static const char * const smca_cs2_mce_desc[] = {
"Hardware Assert Error",
};
+/*
+ * Per Genoa's revision guide, erratum 1384, existing bit definitions
+ * are reassigned for SMCA CS bank type.
+ */
+static const char * const smca_cs2_quirk_mce_desc[] = {
+ "Illegal Request",
+ "Address Violation",
+ "Security Violation",
+ "Illegal Response",
+ "Unexpected Response",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic Request Parity Error",
+ "SDP read response had no match in the CS queue",
+ "SDP read response had an unexpected RETRY error",
+ "Counter overflow error",
+ "Counter underflow error",
+ "Probe Filter Protocol Error",
+ "Probe Filter ECC Error",
+ "Illegal Request on the no data channel",
+ "Address Violation on the no data channel",
+ "Security Violation on the no data channel",
+ "Hardware Assert Error",
+};
+
static const char * const smca_pie_mce_desc[] = {
"Hardware assert",
"Register security violation",
@@ -549,6 +575,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
[SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
+ [SMCA_CS_V2_QUIRK] = { smca_cs2_quirk_mce_desc, ARRAY_SIZE(smca_cs2_quirk_mce_desc)},
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
@@ -597,6 +624,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Data Fabric MCA types */
{ SMCA_CS, 0x0000002E },
{ SMCA_CS_V2, 0x0002002E },
+ {SMCA_CS_V2_QUIRK, 0x00010000 },
{ SMCA_PIE, 0x0001002E },
/* Unified Memory Controller MCA type */
@@ -660,7 +688,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_EX] = { "Execution Unit" },
[SMCA_FP] = { "Floating Point Unit" },
[SMCA_L3_CACHE] = { "L3 Cache" },
- [SMCA_CS ... SMCA_CS_V2] = { "Coherent Slave" },
+ [SMCA_CS ... SMCA_CS_V2_QUIRK] = { "Coherent Slave" },
[SMCA_PIE] = { "Power, Interrupts, etc." },
[SMCA_UMC] = { "Unified Memory Controller" },
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
@@ -723,8 +751,38 @@ static int find_hbm_channel(struct mce_event *e)
return (umc % 2) ? tmp + 4 : tmp;
}
+static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
+{
+ if (m->family == 0x19) {
+ switch (m->model) {
+ /*
+ * Per Genoa's revision guide, erratum 1384, some SMCA Extended
+ * Error Codes and SMCA Control bits are incorrect for SMCA CS
+ * bank type.
+ */
+ case 0x10 ... 0x1F:
+ case 0x60 ... 0x7B:
+ case 0xA0 ... 0xAF:
+ if (*hwid_mcatype == 0x0002002E)
+ *hwid_mcatype = 0x00010000;
+ break;
+ default:
+ break;
+ }
+ } else if (m->family == 0x1A) {
+ switch (m->model) {
+ case 0x40 ... 0x4F:
+ if (*hwid_mcatype == 0x0002002E)
+ *hwid_mcatype = 0x00010000;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
/* Decode extended errors according to Scalable MCA specification */
-static void decode_smca_error(struct mce_event *e)
+static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
{
enum smca_bank_types bank_type;
const char *ip_name;
@@ -735,6 +793,8 @@ static void decode_smca_error(struct mce_event *e)
unsigned int csrow = -1, channel = -1;
unsigned int i;
+ fixup_hwid(m, &mcatype_hwid);
+
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
s_hwid = &smca_hwid_mcatypes[i];
if (mcatype_hwid == s_hwid->mcatype_hwid) {
@@ -801,7 +861,7 @@ int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
if (mcgstatus & MCG_STATUS_MCIP)
mce_snprintf(e->mcgstatus_msg, "MCIP");
- decode_smca_error(e);
+ decode_smca_error(e, ras->mce_priv);
amd_decode_errcode(e);
return 0;
}

View File

@ -0,0 +1,208 @@
commit b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Thu Aug 31 02:23:48 2023 -0500
rasdaemon: Fix SMCA bank type decoding
On AMD systems with Scalable MCA (SMCA), the (HWID, MCATYPE) tuple from
the MCA_IPID MSR, bits 43:32 and 63:48 respectively, are used for SMCA
bank type decoding. On occurrence of an SMCA error, the cached tuples are
compared against the tuple read from the MCA_IPID MSR to determine the
SMCA bank type.
Currently however, all high 32 bits of the MCA_IPID register are cached in
the rasdaemon for all SMCA bank types. Bits 47:44 which do not play a part
in bank type decoding are zeroed out. Likewise, when an SMCA error occurs,
all high 32 bits of the MCA_IPID register are read and compared against
the cached values in smca_hwid_mcatypes array.
This can lead to erroneous bank type decoding since the bits 47:44 are
not guaranteed to be zero. They are either reserved or, on some modern
AMD systems viz. Genoa, denote the InstanceIdHi value. The bits therefore,
should not be associated with SMCA bank type decoding.
Import the HWID_MCATYPE macro from the kernel to ensure that only the
relevant fields i.e. (HWID, MCATYPE) tuples are used for SMCA bank type
decoding on occurrence of an SMCA error.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index a20f03c..55620e2 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -90,6 +90,12 @@ enum smca_bank_types {
/* Maximum number of MCA banks per CPU. */
#define MAX_NR_BANKS 64
+#define MCI_IPID_MCATYPE 0xFFFF0000
+#define MCI_IPID_HWID 0xFFF
+
+/* Obtain HWID_MCATYPE Tuple on SMCA Systems */
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
/*
* On Newer heterogeneous systems from AMD with CPU and GPU nodes connected
* via xGMI links, the NON CPU Nodes are enumerated from index 8
@@ -699,76 +705,76 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, mcatype_hwid } */
/* ZN Core (HWID=0xB0) MCA types */
- { SMCA_LS, 0x000000B0 },
- { SMCA_LS_V2, 0x001000B0 },
- { SMCA_IF, 0x000100B0 },
- { SMCA_L2_CACHE, 0x000200B0 },
- { SMCA_DE, 0x000300B0 },
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) },
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) },
/* HWID 0xB0 MCATYPE 0x4 is Reserved */
- { SMCA_EX, 0x000500B0 },
- { SMCA_FP, 0x000600B0 },
- { SMCA_L3_CACHE, 0x000700B0 },
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) },
/* Data Fabric MCA types */
- { SMCA_CS, 0x0000002E },
- { SMCA_CS_V2, 0x0002002E },
- {SMCA_CS_V2_QUIRK, 0x00010000 },
- { SMCA_PIE, 0x0001002E },
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
+ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_CS_V2_QUIRK, HWID_MCATYPE(0x0, 0x1) },
/* Unified Memory Controller MCA type */
- { SMCA_UMC, 0x00000096 },
- { SMCA_UMC_QUIRK, 0x00020000 },
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
+ { SMCA_UMC_QUIRK, HWID_MCATYPE(0x0, 0x2) },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
- { SMCA_UMC_V2, 0x00010096 },
+ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) },
/* Memory Attached Last Level Cache */
- { SMCA_MA_LLC, 0x0004002E },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
/* Parameter Block MCA type */
- { SMCA_PB, 0x00000005 },
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
/* Platform Security Processor MCA type */
- { SMCA_PSP, 0x000000FF },
- { SMCA_PSP_V2, 0x000100FF },
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) },
+ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) },
/* System Management Unit MCA type */
- { SMCA_SMU, 0x00000001 },
- { SMCA_SMU_V2, 0x00010001 },
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) },
+ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) },
/* Microprocessor 5 Unit MCA type */
- { SMCA_MP5, 0x00020001 },
+ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
/* MPDMA MCA Type */
- { SMCA_MPDMA, 0x00030001 },
+ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
/* Northbridge IO Unit MCA type */
- { SMCA_NBIO, 0x00000018 },
+ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
/* PCI Express Unit MCA type */
- { SMCA_PCIE, 0x00000046 },
- { SMCA_PCIE_V2, 0x00010046 },
+ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
+ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
/* Ext Global Memory Interconnect PCS MCA type */
- { SMCA_XGMI_PCS, 0x00000050 },
+ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
- { SMCA_NBIF, 0x0000006C },
+ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
- { SMCA_SHUB, 0x00000080 },
- { SMCA_SATA, 0x000000A8 },
- { SMCA_USB, 0x000000AA },
+ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
+ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
+ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
/* Ultra Short Reach Data and Control Plane Controller */
- { SMCA_USR_DP, 0x00000170 },
- { SMCA_USR_CP, 0x00000180 },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
- { SMCA_GMI_PCS, 0x00000241 },
+ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
/* Ext Global Memory Interconnect PHY MCA type */
- { SMCA_XGMI_PHY, 0x00000259 },
+ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
/* WAFL PHY MCA type */
- { SMCA_WAFL_PHY, 0x00000267 },
+ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
- { SMCA_GMI_PHY, 0x00000269 },
+ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
};
struct smca_bank_name {
@@ -862,12 +868,12 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
case 0x10 ... 0x1F:
case 0x60 ... 0x7B:
case 0xA0 ... 0xAF:
- if (*hwid_mcatype == 0x0002002E)
- *hwid_mcatype = 0x00010000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1);
break;
case 0x90 ... 0x9F:
- if ((*hwid_mcatype & 0xFF) == 0x00000096)
- *hwid_mcatype = 0x00020000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x96, 0x0))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x2);
break;
default:
break;
@@ -875,8 +881,8 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
} else if (m->family == 0x1A) {
switch (m->model) {
case 0x40 ... 0x4F:
- if (*hwid_mcatype == 0x0002002E)
- *hwid_mcatype = 0x00010000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1);
break;
default:
break;
@@ -889,13 +895,17 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
{
enum smca_bank_types bank_type;
const char *ip_name;
+ uint32_t mcatype_hwid = 0;
unsigned short xec = (e->status >> 16) & 0x3f;
const struct smca_hwid *s_hwid;
- uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
+ uint32_t ipid_high = EXTRACT(e->ipid, 32, 63);
uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
unsigned int csrow = -1, channel = -1;
unsigned int i;
+ mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
+ (ipid_high & MCI_IPID_MCATYPE) >> 16);
+
fixup_hwid(m, &mcatype_hwid);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {

View File

@ -0,0 +1,37 @@
commit c785d309dcbdeb7ecd219975244f3944a8d047e9
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Thu Jul 27 10:18:12 2023 +0000
rasdaemon: Identify the Die Number in multi-die system
Some AMD systems have 4 dies in each socket, and the Die ID indicates
whether the error occurred on a CPU die or a GPU die.
The respective Die ID is also used for FRU identification.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 54060ee..a20f03c 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -935,10 +935,15 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
xec);
if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_QUIRK) && xec == 0) {
- channel = find_umc_channel(e);
- csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
- mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
- channel, csrow);
+ if ((m->family == 0x19) && (m->model >= 0x90 && m->model <= 0x9f)) {
+ /* MCA_IPID[InstanceIdHi] give the AMD Node Die ID */
+ mce_snprintf(e->mc_location, "memory_die_id=%d", mcatype_instancehi / 4);
+ } else {
+ channel = find_umc_channel(e);
+ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
+ channel, csrow);
+ }
}
if (bank_type == SMCA_UMC_V2 && xec == 0) {

View File

@ -1,6 +1,6 @@
Name: rasdaemon
Version: 0.6.7
Release: 8%{?dist}
Release: 9%{?dist}
Summary: Utility to receive RAS error tracings
License: GPLv2
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
@ -26,6 +26,13 @@ Patch17: 2b6a54b0d31e02e657171fd27f4e31d996756bc6.patch
Patch18: 7ccf12f5ae26a055926d175d908c7930293438c4.patch
Patch19: 9415b7449c70f5ea4a0209ddb89c2f5f392d3b4b.patch
Patch20: d0e0bb3d73c4bc5060da20270a089857bba2a64c.patch
Patch21: 30158ef8d7aebc3e5201bf39b73ce7644f8e419e.patch
Patch22: aa36c96cd52d775570dae989dd95a060f1149077.patch
Patch23: 932118b04a04104dfac6b8536419803f236e6118.patch
Patch24: 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch
Patch25: 2d15882a0cbfce0b905039bebc811ac8311cd739.patch
Patch26: c785d309dcbdeb7ecd219975244f3944a8d047e9.patch
Patch27: b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch
ExcludeArch: s390 s390x
BuildRequires: make
@ -81,6 +88,13 @@ an utility for reporting current error counts from the EDAC sysfs files.
%patch18 -p1
%patch19 -p1
%patch20 -p1
%patch21 -p1
%patch22 -p1
%patch23 -p1
%patch24 -p1
%patch25 -p1
%patch26 -p1
%patch27 -p1
# The tarball is locked in time the first time aclocal was ran and will keep
# requiring an older version of automake
@ -116,6 +130,9 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir
%{_sysconfdir}/sysconfig/rasdaemon
%changelog
* Thu Oct 26 2023 Aristeu Rozanski <aris@redhat.com> 0.6.7-9
- Update SMCA support for AMD processors [RHEL-11092]
* Tue May 03 2022 Aristeu Rozanski <aris@redhat.com> 0.6.7-8
- Update ras-mc-ctl manpage to match current options [2079132]