rasdaemon/b6a64416ab31b66ce92cabcc7fa...

209 lines
7.2 KiB
Diff

commit b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87
Author: Avadhut Naik <avadhut.naik@amd.com>
Date: Thu Aug 31 02:23:48 2023 -0500
rasdaemon: Fix SMCA bank type decoding
On AMD systems with Scalable MCA (SMCA), the (HWID, MCATYPE) tuple from
the MCA_IPID MSR, bits 43:32 and 63:48 respectively, is used for SMCA
bank type decoding. On occurrence of an SMCA error, the cached tuples are
compared against the tuple read from the MCA_IPID MSR to determine the
SMCA bank type.
Currently, however, all high 32 bits of the MCA_IPID register are cached in
rasdaemon for all SMCA bank types. Bits 47:44, which do not play a part
in bank type decoding, are zeroed out. Likewise, when an SMCA error occurs,
all high 32 bits of the MCA_IPID register are read and compared against
the cached values in the smca_hwid_mcatypes array.
This can lead to erroneous bank type decoding since bits 47:44 are
not guaranteed to be zero. They are either reserved or, on some modern
AMD systems, viz. Genoa, denote the InstanceIdHi value. These bits, therefore,
should not be associated with SMCA bank type decoding.
Import the HWID_MCATYPE macro from the kernel to ensure that only the
relevant fields i.e. (HWID, MCATYPE) tuples are used for SMCA bank type
decoding on occurrence of an SMCA error.
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index a20f03c..55620e2 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -90,6 +90,12 @@ enum smca_bank_types {
/* Maximum number of MCA banks per CPU. */
#define MAX_NR_BANKS 64
+#define MCI_IPID_MCATYPE 0xFFFF0000
+#define MCI_IPID_HWID 0xFFF
+
+/* Obtain HWID_MCATYPE Tuple on SMCA Systems */
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
/*
* On Newer heterogeneous systems from AMD with CPU and GPU nodes connected
* via xGMI links, the NON CPU Nodes are enumerated from index 8
@@ -699,76 +705,76 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, mcatype_hwid } */
/* ZN Core (HWID=0xB0) MCA types */
- { SMCA_LS, 0x000000B0 },
- { SMCA_LS_V2, 0x001000B0 },
- { SMCA_IF, 0x000100B0 },
- { SMCA_L2_CACHE, 0x000200B0 },
- { SMCA_DE, 0x000300B0 },
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) },
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) },
/* HWID 0xB0 MCATYPE 0x4 is Reserved */
- { SMCA_EX, 0x000500B0 },
- { SMCA_FP, 0x000600B0 },
- { SMCA_L3_CACHE, 0x000700B0 },
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) },
/* Data Fabric MCA types */
- { SMCA_CS, 0x0000002E },
- { SMCA_CS_V2, 0x0002002E },
- {SMCA_CS_V2_QUIRK, 0x00010000 },
- { SMCA_PIE, 0x0001002E },
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
+ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_CS_V2_QUIRK, HWID_MCATYPE(0x0, 0x1) },
/* Unified Memory Controller MCA type */
- { SMCA_UMC, 0x00000096 },
- { SMCA_UMC_QUIRK, 0x00020000 },
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
+ { SMCA_UMC_QUIRK, HWID_MCATYPE(0x0, 0x2) },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
- { SMCA_UMC_V2, 0x00010096 },
+ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) },
/* Memory Attached Last Level Cache */
- { SMCA_MA_LLC, 0x0004002E },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
/* Parameter Block MCA type */
- { SMCA_PB, 0x00000005 },
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
/* Platform Security Processor MCA type */
- { SMCA_PSP, 0x000000FF },
- { SMCA_PSP_V2, 0x000100FF },
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) },
+ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) },
/* System Management Unit MCA type */
- { SMCA_SMU, 0x00000001 },
- { SMCA_SMU_V2, 0x00010001 },
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) },
+ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) },
/* Microprocessor 5 Unit MCA type */
- { SMCA_MP5, 0x00020001 },
+ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
/* MPDMA MCA Type */
- { SMCA_MPDMA, 0x00030001 },
+ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
/* Northbridge IO Unit MCA type */
- { SMCA_NBIO, 0x00000018 },
+ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
/* PCI Express Unit MCA type */
- { SMCA_PCIE, 0x00000046 },
- { SMCA_PCIE_V2, 0x00010046 },
+ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
+ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
/* Ext Global Memory Interconnect PCS MCA type */
- { SMCA_XGMI_PCS, 0x00000050 },
+ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
- { SMCA_NBIF, 0x0000006C },
+ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
- { SMCA_SHUB, 0x00000080 },
- { SMCA_SATA, 0x000000A8 },
- { SMCA_USB, 0x000000AA },
+ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
+ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
+ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
/* Ultra Short Reach Data and Control Plane Controller */
- { SMCA_USR_DP, 0x00000170 },
- { SMCA_USR_CP, 0x00000180 },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
- { SMCA_GMI_PCS, 0x00000241 },
+ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
/* Ext Global Memory Interconnect PHY MCA type */
- { SMCA_XGMI_PHY, 0x00000259 },
+ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
/* WAFL PHY MCA type */
- { SMCA_WAFL_PHY, 0x00000267 },
+ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
- { SMCA_GMI_PHY, 0x00000269 },
+ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
};
struct smca_bank_name {
@@ -862,12 +868,12 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
case 0x10 ... 0x1F:
case 0x60 ... 0x7B:
case 0xA0 ... 0xAF:
- if (*hwid_mcatype == 0x0002002E)
- *hwid_mcatype = 0x00010000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1);
break;
case 0x90 ... 0x9F:
- if ((*hwid_mcatype & 0xFF) == 0x00000096)
- *hwid_mcatype = 0x00020000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x96, 0x0))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x2);
break;
default:
break;
@@ -875,8 +881,8 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
} else if (m->family == 0x1A) {
switch (m->model) {
case 0x40 ... 0x4F:
- if (*hwid_mcatype == 0x0002002E)
- *hwid_mcatype = 0x00010000;
+ if (*hwid_mcatype == HWID_MCATYPE(0x2E, 0x2))
+ *hwid_mcatype = HWID_MCATYPE(0x0, 0x1);
break;
default:
break;
@@ -889,13 +895,17 @@ void decode_smca_error(struct mce_event *e, struct mce_priv *m)
{
enum smca_bank_types bank_type;
const char *ip_name;
+ uint32_t mcatype_hwid = 0;
unsigned short xec = (e->status >> 16) & 0x3f;
const struct smca_hwid *s_hwid;
- uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
+ uint32_t ipid_high = EXTRACT(e->ipid, 32, 63);
uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
unsigned int csrow = -1, channel = -1;
unsigned int i;
+ mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
+ (ipid_high & MCI_IPID_MCATYPE) >> 16);
+
fixup_hwid(m, &mcatype_hwid);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {