Compare commits

...

No commits in common. "c8" and "c10s" have entirely different histories.
c8 ... c10s

53 changed files with 6250 additions and 4141 deletions

3
.gitignore vendored
View File

@ -1 +1,2 @@
SOURCES/rasdaemon-0.6.1.tar.bz2
/rasdaemon-*.tar.bz2
/rasdaemon-0.6.8.tar.gz

View File

@ -1 +0,0 @@
742eda555cccb8ca8f9b6a18bab1f4a732c11318 SOURCES/rasdaemon-0.6.1.tar.bz2

View File

@ -0,0 +1,66 @@
commit 2ff9bc453998ddb145c7bb8ba30a57c56bd18eab
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Tue Apr 4 14:40:42 2023 +0100
rasdaemon: Add common function to convert timestamp in the CXL event records to the broken-down time format
Add common function to convert the timestamp in the CXL event records
in nanoseconds to the broken-down time format.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index 8f6342d..59534a4 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -23,6 +23,25 @@
#include "ras-report.h"
#include <endian.h>
+/* Common Functions */
+/*
+ * convert_timestamp() - format a CXL event record timestamp as local time.
+ * @ts:     nanoseconds elapsed since midnight, 01-Jan-1970 UTC
+ *          (CXL Specification 3.0)
+ * @ts_ptr: destination buffer
+ * @size:   size of @ts_ptr in bytes
+ *
+ * Writes the time as "%Y-%m-%d %H:%M:%S %z". The epoch string is used
+ * instead when @ts is zero, when localtime() fails, or when the formatted
+ * time does not fit in @size bytes. The result is always NUL-terminated
+ * (strncpy() did not guarantee that for a short buffer).
+ */
+static void convert_timestamp(unsigned long long ts, char *ts_ptr, uint16_t size)
+{
+	time_t ts_secs = ts / 1000000000ULL;
+	struct tm *tm;
+
+	if (!ts_ptr || !size)
+		return;
+
+	if (ts) {
+		tm = localtime(&ts_secs);
+		/* strftime() returns 0 when the output did not fit */
+		if (tm && strftime(ts_ptr, size, "%Y-%m-%d %H:%M:%S %z", tm))
+			return;
+	}
+
+	snprintf(ts_ptr, size, "%s", "1970-01-01 00:00:00 +0000");
+}
+
/* Poison List: Payload out flags */
#define CXL_POISON_FLAG_MORE BIT(0)
#define CXL_POISON_FLAG_OVERFLOW BIT(1)
@@ -168,22 +187,7 @@ int ras_cxl_poison_event_handler(struct trace_seq *s,
if (ev.flags & CXL_POISON_FLAG_OVERFLOW) {
if (tep_get_field_val(s, event, "overflow_ts", record, &val, 1) < 0)
return -1;
- if (val) {
- /* CXL Specification 3.0
- * Overflow timestamp - The number of unsigned nanoseconds
- * that have elapsed since midnight, 01-Jan-1970 UTC
- */
- time_t ovf_ts_secs = val / 1000000000ULL;
-
- tm = localtime(&ovf_ts_secs);
- if (tm) {
- strftime(ev.overflow_ts, sizeof(ev.overflow_ts),
- "%Y-%m-%d %H:%M:%S %z", tm);
- }
- }
- if (!val || !tm)
- strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000",
- sizeof(ev.overflow_ts));
+ convert_timestamp(val, ev.overflow_ts, sizeof(ev.overflow_ts));
} else
strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", sizeof(ev.overflow_ts));
if (trace_seq_printf(s, "overflow timestamp:%s\n", ev.overflow_ts) <= 0)

View File

@ -0,0 +1,54 @@
commit 31c7578ddb0fc15aa7247f2b8885956540031221
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Tue Feb 6 12:08:00 2024 +0000
rasdaemon: ras-memory-failure-handler: update memory failure action page types
Update memory failure action page types corresponding to the same in
mm/memory-failure.c in the kernel.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
index 97e8840..a5acc08 100644
--- a/ras-memory-failure-handler.c
+++ b/ras-memory-failure-handler.c
@@ -26,10 +26,8 @@ enum mf_action_page_type {
MF_MSG_KERNEL_HIGH_ORDER,
MF_MSG_SLAB,
MF_MSG_DIFFERENT_COMPOUND,
- MF_MSG_POISONED_HUGE,
MF_MSG_HUGE,
MF_MSG_FREE_HUGE,
- MF_MSG_NON_PMD_HUGE,
MF_MSG_UNMAP_FAILED,
MF_MSG_DIRTY_SWAPCACHE,
MF_MSG_CLEAN_SWAPCACHE,
@@ -41,7 +39,6 @@ enum mf_action_page_type {
MF_MSG_CLEAN_LRU,
MF_MSG_TRUNCATED_LRU,
MF_MSG_BUDDY,
- MF_MSG_BUDDY_2ND,
MF_MSG_DAX,
MF_MSG_UNSPLIT_THP,
MF_MSG_UNKNOWN,
@@ -64,10 +61,8 @@ static const struct {
{ MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"},
{ MF_MSG_SLAB, "kernel slab page"},
{ MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"},
- { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"},
{ MF_MSG_HUGE, "huge page"},
{ MF_MSG_FREE_HUGE, "free huge page"},
- { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"},
{ MF_MSG_UNMAP_FAILED, "unmapping failed page"},
{ MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"},
{ MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"},
@@ -79,7 +74,6 @@ static const struct {
{ MF_MSG_CLEAN_LRU, "clean LRU page"},
{ MF_MSG_TRUNCATED_LRU, "already truncated LRU page"},
{ MF_MSG_BUDDY, "free buddy page"},
- { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"},
{ MF_MSG_DAX, "dax page"},
{ MF_MSG_UNSPLIT_THP, "unsplit thp"},
{ MF_MSG_UNKNOWN, "unknown page"},

View File

@ -0,0 +1,551 @@
commit 53c682fb45c2909c128be4ee8f51a3e42fe2f7fd
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Wed Apr 5 11:54:41 2023 +0100
rasdaemon: Add support for the CXL general media events
Add support to log and record the CXL general media events.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index 83ada56..2de96f6 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -99,6 +99,14 @@ static char *uuid_be(const char *uu)
return uuid;
}
+/*
+ * Map @type to its name in @type_array, which holds @num_elems entries.
+ * Values outside the table are reported as "Unknown".
+ */
+static const char* get_cxl_type_str(const char** type_array, uint8_t num_elems, uint8_t type)
+{
+	return (type < num_elems) ? type_array[type] : "Unknown";
+}
+
/* Poison List: Payload out flags */
#define CXL_POISON_FLAG_MORE BIT(0)
#define CXL_POISON_FLAG_OVERFLOW BIT(1)
@@ -709,3 +717,151 @@ int ras_cxl_generic_event_handler(struct trace_seq *s,
return 0;
}
+
+#define CXL_DPA_VOLATILE BIT(0)
+#define CXL_DPA_NOT_REPAIRABLE BIT(1)
+
+static const struct cxl_event_flags cxl_dpa_flags[] = {
+ { .bit = CXL_DPA_VOLATILE, .flag = "VOLATILE" },
+ { .bit = CXL_DPA_NOT_REPAIRABLE, .flag = "NOT_REPAIRABLE" },
+};
+
+/*
+ * General Media Event Record - GMER
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ */
+#define CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT BIT(0)
+#define CXL_GMER_EVT_DESC_THRESHOLD_EVENT BIT(1)
+#define CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW BIT(2)
+
+static const struct cxl_event_flags cxl_gmer_event_desc_flags[] = {
+ { .bit = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, .flag = "UNCORRECTABLE EVENT" },
+ { .bit = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, .flag = "THRESHOLD EVENT" },
+ { .bit = CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW, .flag = "POISON LIST OVERFLOW" },
+};
+
+#define CXL_GMER_VALID_CHANNEL BIT(0)
+#define CXL_GMER_VALID_RANK BIT(1)
+#define CXL_GMER_VALID_DEVICE BIT(2)
+#define CXL_GMER_VALID_COMPONENT BIT(3)
+
+static const char* cxl_gmer_mem_event_type[] = {
+ "ECC Error",
+ "Invalid Address",
+ "Data Path Error",
+};
+
+static const char* cxl_gmer_trans_type[] = {
+ "Unknown",
+ "Host Read",
+ "Host Write",
+ "Host Scan Media",
+ "Host Inject Poison",
+ "Internal Media Scrub",
+ "Internal Media Management",
+};
+
+/*
+ * ras_cxl_general_media_event_handler() - decode a cxl:cxl_general_media event.
+ * @s:       trace sequence that receives the human-readable decoding
+ * @record:  raw trace record
+ * @event:   event format used to extract the fields
+ * @context: the daemon's struct ras_events
+ *
+ * Prints the common header followed by the General Media Event Record
+ * fields, then passes the event to the SQLite and ABRT back ends when they
+ * are compiled in. Channel, rank, device and component id are only read
+ * when the matching bit of validity_flags is set.
+ *
+ * Return: 0 on success, -1 when a field cannot be read or printed.
+ */
+int ras_cxl_general_media_event_handler(struct trace_seq *s,
+					struct tep_record *record,
+					struct tep_event *event, void *context)
+{
+	int len, i;
+	unsigned long long val;
+	struct ras_events *ras = context;
+	struct ras_cxl_general_media_event ev;
+
+	memset(&ev, 0, sizeof(ev));
+	if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0)
+		return -1;
+	ev.dpa = val;
+	if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "dpa_flags", record, &val, 1) < 0)
+		return -1;
+	ev.dpa_flags = val;
+	if (trace_seq_printf(s, "dpa_flags:") <= 0)
+		return -1;
+	if (decode_cxl_event_flags(s, ev.dpa_flags, cxl_dpa_flags, ARRAY_SIZE(cxl_dpa_flags)) < 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "descriptor", record, &val, 1) < 0)
+		return -1;
+	ev.descriptor = val;
+	if (trace_seq_printf(s, "descriptor:") <= 0)
+		return -1;
+	if (decode_cxl_event_flags(s, ev.descriptor, cxl_gmer_event_desc_flags,
+				   ARRAY_SIZE(cxl_gmer_event_desc_flags)) < 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "type", record, &val, 1) < 0)
+		return -1;
+	ev.type = val;
+	if (trace_seq_printf(s, "type:%s ", get_cxl_type_str(cxl_gmer_mem_event_type,
+			     ARRAY_SIZE(cxl_gmer_mem_event_type), ev.type)) <= 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "transaction_type", record, &val, 1) < 0)
+		return -1;
+	ev.transaction_type = val;
+	if (trace_seq_printf(s, "transaction_type:%s ",
+			     get_cxl_type_str(cxl_gmer_trans_type,
+					      ARRAY_SIZE(cxl_gmer_trans_type),
+					      ev.transaction_type)) <= 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "validity_flags", record, &val, 1) < 0)
+		return -1;
+	ev.validity_flags = val;
+
+	if (ev.validity_flags & CXL_GMER_VALID_CHANNEL) {
+		if (tep_get_field_val(s, event, "channel", record, &val, 1) < 0)
+			return -1;
+		ev.channel = val;
+		if (trace_seq_printf(s, "channel:%u ", ev.channel) <= 0)
+			return -1;
+	}
+
+	if (ev.validity_flags & CXL_GMER_VALID_RANK) {
+		if (tep_get_field_val(s, event, "rank", record, &val, 1) < 0)
+			return -1;
+		ev.rank = val;
+		if (trace_seq_printf(s, "rank:%u ", ev.rank) <= 0)
+			return -1;
+	}
+
+	if (ev.validity_flags & CXL_GMER_VALID_DEVICE) {
+		if (tep_get_field_val(s, event, "device", record, &val, 1) < 0)
+			return -1;
+		ev.device = val;
+		if (trace_seq_printf(s, "device:%x ", ev.device) <= 0)
+			return -1;
+	}
+
+	if (ev.validity_flags & CXL_GMER_VALID_COMPONENT) {
+		ev.comp_id = tep_get_field_raw(s, event, "comp_id", record, &len, 1);
+		/* A short field would make the fixed-size loop below over-read */
+		if (!ev.comp_id || len < CXL_EVENT_GEN_MED_COMP_ID_SIZE)
+			return -1;
+		if (trace_seq_printf(s, "comp_id:") <= 0)
+			return -1;
+		for (i = 0; i < CXL_EVENT_GEN_MED_COMP_ID_SIZE; i++) {
+			/* Fail like every other print error instead of storing a half-decoded event */
+			if (trace_seq_printf(s, "%02x ", ev.comp_id[i]) <= 0)
+				return -1;
+		}
+	}
+
+	/* Insert data into the database */
+#ifdef HAVE_SQLITE3
+	ras_store_cxl_general_media_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+	/* Report event to ABRT */
+	ras_report_cxl_general_media_event(ras, &ev);
+#endif
+
+	return 0;
+}
diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h
index 9f77cb7..3adca4a 100644
--- a/ras-cxl-handler.h
+++ b/ras-cxl-handler.h
@@ -35,4 +35,7 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s,
int ras_cxl_generic_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
+int ras_cxl_general_media_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context);
#endif
diff --git a/ras-events.c b/ras-events.c
index 4036933..978dee4 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -250,6 +250,7 @@ int toggle_ras_mc_event(int enable)
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable);
+ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable);
#endif
free_ras:
@@ -1063,6 +1064,14 @@ int handle_ras_events(int record_events)
else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
"cxl", "cxl_generic_event");
+
+ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_general_media",
+ ras_cxl_general_media_event_handler, NULL, CXL_GENERAL_MEDIA_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "cxl", "cxl_general_media");
#endif
if (!num_events) {
diff --git a/ras-events.h b/ras-events.h
index 96c299e..9b83df3 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -44,6 +44,7 @@ enum {
CXL_AER_CE_EVENT,
CXL_OVERFLOW_EVENT,
CXL_GENERIC_EVENT,
+ CXL_GENERAL_MEDIA_EVENT,
NR_EVENTS
};
diff --git a/ras-record.c b/ras-record.c
index a65d9c0..507a58e 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -846,6 +846,75 @@ int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_e
return rc;
}
+
+/*
+ * Table and functions to handle cxl:cxl_general_media_event
+ */
+static const struct db_fields cxl_general_media_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "memdev", .type = "TEXT" },
+ { .name = "host", .type = "TEXT" },
+ { .name = "serial", .type = "INTEGER" },
+ { .name = "log_type", .type = "TEXT" },
+ { .name = "hdr_uuid", .type = "TEXT" },
+ { .name = "hdr_flags", .type = "INTEGER" },
+ { .name = "hdr_handle", .type = "INTEGER" },
+ { .name = "hdr_related_handle", .type = "INTEGER" },
+ { .name = "hdr_ts", .type = "TEXT" },
+ { .name = "hdr_length", .type = "INTEGER" },
+ { .name = "hdr_maint_op_class", .type = "INTEGER" },
+ { .name = "dpa", .type = "INTEGER" },
+ { .name = "dpa_flags", .type = "INTEGER" },
+ { .name = "descriptor", .type = "INTEGER" },
+ { .name = "type", .type = "INTEGER" },
+ { .name = "transaction_type", .type = "INTEGER" },
+ { .name = "channel", .type = "INTEGER" },
+ { .name = "rank", .type = "INTEGER" },
+ { .name = "device", .type = "INTEGER" },
+ { .name = "comp_id", .type = "BLOB" },
+};
+
+static const struct db_table_descriptor cxl_general_media_event_tab = {
+ .name = "cxl_general_media_event",
+ .fields = cxl_general_media_event_fields,
+ .num_fields = ARRAY_SIZE(cxl_general_media_event_fields),
+};
+
+/*
+ * Bind the general media fields of @ev to the prepared
+ * cxl_general_media_event statement, step it and reset it.
+ *
+ * Returns 0 without touching the database when no private data or prepared
+ * statement is available; otherwise returns the result of sqlite3_reset().
+ */
+int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev)
+{
+	struct sqlite3_priv *priv = ras->db_priv;
+	sqlite3_stmt *stmt;
+	int col = 13;	/* first parameter after the common header; presumably 1-12 are bound by ras_store_cxl_common_hdr() - verify */
+	int rc;
+
+	if (!priv || !priv->stmt_cxl_general_media_event)
+		return 0;
+
+	stmt = priv->stmt_cxl_general_media_event;
+	log(TERM, LOG_INFO, "cxl_general_media_event store: %p\n", stmt);
+
+	ras_store_cxl_common_hdr(stmt, &ev->hdr);
+	sqlite3_bind_int64(stmt, col++, ev->dpa);
+	sqlite3_bind_int(stmt, col++, ev->dpa_flags);
+	sqlite3_bind_int(stmt, col++, ev->descriptor);
+	sqlite3_bind_int(stmt, col++, ev->type);
+	sqlite3_bind_int(stmt, col++, ev->transaction_type);
+	sqlite3_bind_int(stmt, col++, ev->channel);
+	sqlite3_bind_int(stmt, col++, ev->rank);
+	sqlite3_bind_int(stmt, col++, ev->device);
+	sqlite3_bind_blob(stmt, col, ev->comp_id,
+			  CXL_EVENT_GEN_MED_COMP_ID_SIZE, NULL);
+
+	rc = sqlite3_step(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed to do stmt_cxl_general_media_event step on sqlite: error = %d\n", rc);
+
+	rc = sqlite3_reset(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed reset stmt_cxl_general_media_event on sqlite: error = %d\n", rc);
+
+	log(TERM, LOG_INFO, "register inserted at db\n");
+
+	return rc;
+}
#endif
/*
@@ -1229,6 +1298,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
if (rc != SQLITE_OK)
goto error;
}
+
+ rc = ras_mc_create_table(priv, &cxl_general_media_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_general_media_event,
+ &cxl_general_media_event_tab);
+ if (rc != SQLITE_OK)
+ goto error;
+ }
#endif
ras->db_priv = priv;
@@ -1390,6 +1467,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
"cpu %u: Failed to finalize cxl_generic_event sqlite: error = %d\n",
cpu, rc);
}
+
+ if (priv->stmt_cxl_general_media_event) {
+ rc = sqlite3_finalize(priv->stmt_cxl_general_media_event);
+ if (rc != SQLITE_OK)
+ log(TERM, LOG_ERR,
+ "cpu %u: Failed to finalize cxl_general_media_event sqlite: error = %d\n",
+ cpu, rc);
+ }
#endif
rc = sqlite3_close_v2(db);
diff --git a/ras-record.h b/ras-record.h
index 9ecfcda..37c32de 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -134,6 +134,7 @@ struct ras_cxl_poison_event {
#define CXL_HEADERLOG_SIZE SZ_512
#define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t))
#define CXL_EVENT_RECORD_DATA_LENGTH 0x50
+#define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10
struct ras_cxl_aer_ue_event {
char timestamp[64];
@@ -184,6 +185,20 @@ struct ras_cxl_generic_event {
uint8_t *data;
};
+struct ras_cxl_general_media_event {
+ struct ras_cxl_event_common_hdr hdr;
+ uint64_t dpa;
+ uint8_t dpa_flags;
+ uint8_t descriptor;
+ uint8_t type;
+ uint8_t transaction_type;
+ uint8_t channel;
+ uint8_t rank;
+ uint32_t device;
+ uint8_t *comp_id;
+ uint16_t validity_flags;
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
@@ -198,6 +213,7 @@ struct ras_cxl_aer_ue_event;
struct ras_cxl_aer_ce_event;
struct ras_cxl_overflow_event;
struct ras_cxl_generic_event;
+struct ras_cxl_general_media_event;
#ifdef HAVE_SQLITE3
@@ -236,6 +252,7 @@ struct sqlite3_priv {
sqlite3_stmt *stmt_cxl_aer_ce_event;
sqlite3_stmt *stmt_cxl_overflow_event;
sqlite3_stmt *stmt_cxl_generic_event;
+ sqlite3_stmt *stmt_cxl_general_media_event;
#endif
};
@@ -269,6 +286,7 @@ int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_eve
int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev);
int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev);
int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
+int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -287,6 +305,7 @@ static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_
static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; };
static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; };
static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
+static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; };
#endif
diff --git a/ras-report.c b/ras-report.c
index 8d7b76a..725dc9b 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -489,6 +489,60 @@ static int set_cxl_generic_event_backtrace(char *buf, struct ras_cxl_generic_eve
return 0;
}
+/*
+ * set_cxl_general_media_event_backtrace() - append the BACKTRACE item for a
+ * CXL general media event to @buf.
+ * @buf: NUL-terminated report buffer that the item is appended to
+ * @ev:  event to describe
+ *
+ * The text is built in a local buffer with snprintf(), so an oversized
+ * event description is truncated instead of overflowing the stack. The
+ * 64-bit values are printed through unsigned long long so the format also
+ * matches on targets where long is 32 bits wide.
+ *
+ * Return: 0 on success, -1 when @buf or @ev is NULL.
+ */
+static int set_cxl_general_media_event_backtrace(char *buf, struct ras_cxl_general_media_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+
+	if (!buf || !ev)
+		return -1;
+
+	snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+		 "timestamp=%s\n"
+		 "memdev=%s\n"
+		 "host=%s\n"
+		 "serial=0x%llx\n"
+		 "log_type=%s\n"
+		 "hdr_uuid=%s\n"
+		 "hdr_flags=0x%x\n"
+		 "hdr_handle=0x%x\n"
+		 "hdr_related_handle=0x%x\n"
+		 "hdr_timestamp=%s\n"
+		 "hdr_length=%u\n"
+		 "hdr_maint_op_class=%u\n"
+		 "dpa=0x%llx\n"
+		 "dpa_flags=%u\n"
+		 "descriptor=%u\n"
+		 "type=%u\n"
+		 "transaction_type=%u\n"
+		 "channel=%u\n"
+		 "rank=%u\n"
+		 "device=0x%x\n",
+		 ev->hdr.timestamp,
+		 ev->hdr.memdev,
+		 ev->hdr.host,
+		 (unsigned long long)ev->hdr.serial,
+		 ev->hdr.log_type,
+		 ev->hdr.hdr_uuid,
+		 ev->hdr.hdr_flags,
+		 ev->hdr.hdr_handle,
+		 ev->hdr.hdr_related_handle,
+		 ev->hdr.hdr_timestamp,
+		 ev->hdr.hdr_length,
+		 ev->hdr.hdr_maint_op_class,
+		 (unsigned long long)ev->dpa,
+		 ev->dpa_flags,
+		 ev->descriptor,
+		 ev->type,
+		 ev->transaction_type,
+		 ev->channel,
+		 ev->rank,
+		 ev->device);
+
+	/* NOTE(review): assumes the caller's buf has room for bt_buf - confirm */
+	strcat(buf, bt_buf);
+
+	return 0;
+}
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -541,6 +595,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
case CXL_GENERIC_EVENT:
rc = set_cxl_generic_event_backtrace(buf, (struct ras_cxl_generic_event *)ev);
break;
+ case CXL_GENERAL_MEDIA_EVENT:
+ rc = set_cxl_general_media_event_backtrace(buf, (struct ras_cxl_general_media_event *)ev);
+ break;
default:
return -1;
}
@@ -1170,3 +1227,47 @@ cxl_generic_fail:
return -1;
}
+
+/*
+ * ras_report_cxl_general_media_event() - send a CXL general media event over
+ * the report socket.
+ * @ras: daemon state (not used by this function)
+ * @ev:  event to report
+ *
+ * Sends the basic report items, the backtrace for @ev, then the ANALYZER
+ * and REASON items, each including its terminating NUL. The socket is
+ * closed before returning.
+ *
+ * Return: 0 when everything was written, -1 on any failure.
+ */
+int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	int sockfd = 0;
+	int done = 0;
+	int rc = -1;
+
+	memset(buf, 0, sizeof(buf));
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	rc = commit_report_basic(sockfd);
+	if (rc < 0)
+		goto cxl_general_media_fail;
+
+	rc = commit_report_backtrace(sockfd, CXL_GENERAL_MEDIA_EVENT, ev);
+	if (rc < 0)
+		goto cxl_general_media_fail;
+
+	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-cxl_general_media_event");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	/*
+	 * Test for a negative result first: comparing the int directly with
+	 * a size_t converts -1 to a huge unsigned value and hides the error.
+	 */
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto cxl_general_media_fail;
+
+	snprintf(buf, sizeof(buf), "REASON=%s", "CXL General Media Event");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto cxl_general_media_fail;
+
+	done = 1;
+
+cxl_general_media_fail:
+
+	if (sockfd >= 0)
+		close(sockfd);
+
+	if (done)
+		return 0;
+	else
+		return -1;
+}
diff --git a/ras-report.h b/ras-report.h
index bf591a6..d9ec7df 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -44,6 +44,7 @@ int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_ev
int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev);
int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev);
int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
+int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev);
#else
@@ -60,6 +61,7 @@ static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras
static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; };
static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; };
static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
+static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; };
#endif

View File

@ -0,0 +1,182 @@
commit dea649c9f9a6f2941e80cade9ed311a398e267be
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Feb 12 11:14:03 2024 +0000
rasdaemon: ras-mc-ctl: Add support for CXL general media trace events
Add support for CXL general media events to the ras-mc-ctl tool.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
(cherry picked from commit 572de9d57691be9e630abee9ffa56a2fb155d558)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 5528021..99b3c10 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1298,6 +1298,84 @@ sub get_cxl_hdr_flags_text
return join (", ", @out);
}
+use constant {
+ CXL_DPA_VOLATILE => 0x0001,
+ CXL_DPA_NOT_REPAIRABLE => 0x0002,
+};
+
+# Return the quoted names of the DPA flag bits set in the argument,
+# joined with ", ".
+sub get_cxl_dpa_flags_text
+{
+    my ($flags) = @_;
+    my @names;
+
+    push @names, "\'VOLATILE\' " if ($flags & CXL_DPA_VOLATILE);
+    push @names, "\'NOT_REPAIRABLE\' " if ($flags & CXL_DPA_NOT_REPAIRABLE);
+
+    return join(", ", @names);
+}
+
+use constant {
+ CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT => 0x0001,
+ CXL_GMER_EVT_DESC_THRESHOLD_EVENT => 0x0002,
+ CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW => 0x0004,
+};
+
+# Return the quoted names of the general media event descriptor bits set
+# in the argument, joined with ", ".
+sub get_cxl_descriptor_flags_text
+{
+    my ($flags) = @_;
+    my @names;
+
+    push @names, "\'UNCORRECTABLE EVENT\' " if ($flags & CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT);
+    push @names, "\'THRESHOLD EVENT\' " if ($flags & CXL_GMER_EVT_DESC_THRESHOLD_EVENT);
+    push @names, "\'POISON LIST OVERFLOW\' " if ($flags & CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW);
+
+    return join(", ", @names);
+}
+
+# Translate a general media memory event type number into its name;
+# values outside the table yield "unknown-type".
+sub get_cxl_mem_event_type
+{
+    my ($type) = @_;
+    my @types = ("ECC Error",
+                 "Invalid Address",
+                 "Data Path Error");
+
+    return "unknown-type" if ($type < 0 || $type > $#types);
+
+    return $types[$type];
+}
+
+# Translate a general media transaction type number into its name;
+# values outside the table yield "unknown-type".
+sub get_cxl_transaction_type
+{
+    my ($type) = @_;
+    my @types = ("Unknown",
+                 "Host Read",
+                 "Host Write",
+                 "Host Scan Media",
+                 "Host Inject Poison",
+                 "Internal Media Scrub",
+                 "Internal Media Management");
+
+    return "unknown-type" if ($type < 0 || $type > $#types);
+
+    return $types[$type];
+}
+
sub summary
{
require DBI;
@@ -1442,6 +1520,22 @@ sub summary
print "No CXL generic errors.\n\n";
}
$query_handle->finish;
+
+ # CXL general media errors
+ $query = "select memdev, count(*) from cxl_general_media_event$conf{opt}{since} group by memdev";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($memdev, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$memdev errors: $count\n";
+ }
+ if ($out ne "") {
+ print "CXL general media events summary:\n$out\n";
+ } else {
+ print "No CXL general media errors.\n\n";
+ }
+ $query_handle->finish;
}
# extlog errors
@@ -1553,6 +1647,7 @@ sub errors
my ($log_type, $first_ts, $last_ts);
my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts);
my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data);
+ my ($dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1764,6 +1859,49 @@ sub errors
} else {
print "No CXL generic errors.\n\n";
}
+
+ # CXL general media errors
+ use constant CXL_EVENT_GEN_MED_COMP_ID_SIZE => 0x10;
+ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, dpa, dpa_flags, descriptor, type, transaction_type, channel, rank, device, comp_id from cxl_general_media_event$conf{opt}{since} order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $dpa, $dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev);
+ $out .= "host=$host, " if (defined $host && length $host);
+ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial);
+ $out .= "log=$log_type, " if (defined $log_type && length $log_type);
+ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid);
+ $out .= sprintf "hdr_flags=0x%llx %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags);
+ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle);
+ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle);
+ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts);
+ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length);
+ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class);
+ $out .= sprintf "dpa=0x%llx, ", $dpa if (defined $dpa && length $dpa);
+ $out .= sprintf "dpa_flags: %s, ", get_cxl_dpa_flags_text($dpa_flags) if (defined $dpa_flags && length $dpa_flags);
+ $out .= sprintf "descriptor_flags: %s, ", get_cxl_descriptor_flags_text($descriptor) if (defined $descriptor && length $descriptor);
+ $out .= sprintf "memory event type: %s, ", get_cxl_mem_event_type($mem_event_type) if (defined $mem_event_type && length $mem_event_type);
+ $out .= sprintf "transaction_type: %s, ", get_cxl_transaction_type($transaction_type) if (defined $transaction_type && length $transaction_type);
+ $out .= sprintf "channel=%u, ", $channel if (defined $channel && length $channel);
+ $out .= sprintf "rank=%u, ", $rank if (defined $rank && length $rank);
+ $out .= sprintf "device=0x%x, ", $device if (defined $device && length $device);
+ if (defined $comp_id && length $comp_id) {
+ $out .= sprintf "component_id:";
+ my @bytes = unpack "C*", $comp_id;
+ for (my $i = 0; $i < CXL_EVENT_GEN_MED_COMP_ID_SIZE; $i++) {
+ $out .= sprintf "%02x ", $bytes[$i];
+ }
+ }
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "CXL general media events:\n$out\n";
+ } else {
+ print "No CXL general media errors.\n\n";
+ }
}
# Extlog errors

View File

@ -0,0 +1,663 @@
commit 75c8fec559641f843345ef8fbc36d124b60b914d
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Fri Mar 31 13:35:13 2023 +0100
rasdaemon: Add support for the CXL poison events
Add support to log and record the CXL poison events.
The corresponding Kernel patches here:
https://lore.kernel.org/linux-cxl/64457d30bae07_2028294ac@dwillia2-xfh.jf.intel.com.notmuch/
Presently for logging only, could be extended for the policy
based recovery action for the frequent poison events depending on the above
kernel patches.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/Makefile.am b/Makefile.am
index 56c144e..5bddeac 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -73,6 +73,11 @@ endif
if WITH_CPU_FAULT_ISOLATION
rasdaemon_SOURCES += ras-cpu-isolation.c queue.c
endif
+
+if WITH_CXL
+ rasdaemon_SOURCES += ras-cxl-handler.c
+endif
+
rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS)
rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS)
@@ -81,7 +86,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \
non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \
- ras-cpu-isolation.h queue.h
+ ras-cxl-handler.h ras-cpu-isolation.h queue.h
# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
diff --git a/configure.ac b/configure.ac
index f588090..ab5697d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -127,6 +127,16 @@ AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" = "xyes"],
AM_CONDITIONAL([WITH_MEMORY_FAILURE], [test x$enable_memory_failure = xyes || test x$enable_all = xyes])
AM_COND_IF([WITH_MEMORY_FAILURE], [USE_MEMORY_FAILURE="yes"], [USE_MEMORY_FAILURE="no"])
+# CXL event support: enabled by --enable-cxl or --enable-all.
+# Use "=" in test(1); "==" is a bash extension and fails with strict
+# POSIX shells.
+AC_ARG_ENABLE([cxl],
+    AS_HELP_STRING([--enable-cxl], [enable CXL events (currently experimental)]))
+
+AS_IF([test "x$enable_cxl" = "xyes" || test "x$enable_all" = "xyes"], [
+  AC_DEFINE(HAVE_CXL,1,"have CXL events collect")
+  AC_SUBST([WITH_CXL])
+])
+AM_CONDITIONAL([WITH_CXL], [test x$enable_cxl = xyes || test x$enable_all = xyes])
+AM_COND_IF([WITH_CXL], [USE_CXL="yes"], [USE_CXL="no"])
+
AC_ARG_ENABLE([abrt_report],
AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)]))
@@ -215,6 +225,7 @@ compile time options summary
DEVLINK : $USE_DEVLINK
Disk I/O errors : $USE_DISKERROR
Memory Failure : $USE_MEMORY_FAILURE
+ CXL events : $USE_CXL
Memory CE PFA : $USE_MEMORY_CE_PFA
AMP RAS errors : $USE_AMP_NS_DECODE
CPU fault isolation : $USE_CPU_FAULT_ISOLATION
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
new file mode 100644
index 0000000..cb23ba2
--- /dev/null
+++ b/ras-cxl-handler.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <traceevent/kbuffer.h>
+#include "ras-cxl-handler.h"
+#include "ras-record.h"
+#include "ras-logger.h"
+#include "ras-report.h"
+
+/* Poison List: Payload out flags */
+#define CXL_POISON_FLAG_MORE BIT(0)
+#define CXL_POISON_FLAG_OVERFLOW BIT(1)
+#define CXL_POISON_FLAG_SCANNING BIT(2)
+
+/*
+ * CXL poison - source types
+ *
+ * Raw values of the "source" field of the cxl_poison trace event.
+ * ras_cxl_poison_event_handler() maps each value listed here to a display
+ * string and reports every other value as "Invalid".
+ */
+enum cxl_poison_source {
+ CXL_POISON_SOURCE_UNKNOWN = 0,
+ CXL_POISON_SOURCE_EXTERNAL = 1,
+ CXL_POISON_SOURCE_INTERNAL = 2,
+ CXL_POISON_SOURCE_INJECTED = 3,
+ CXL_POISON_SOURCE_VENDOR = 7,
+};
+
+/*
+ * CXL poison - trace types
+ *
+ * Raw values of the "trace_type" field of the cxl_poison trace event. No
+ * explicit values are given, so the enumerators are 0, 1 and 2 in order.
+ */
+enum cxl_poison_trace_type {
+ CXL_POISON_TRACE_LIST,
+ CXL_POISON_TRACE_INJECT,
+ CXL_POISON_TRACE_CLEAR,
+};
+
+/*
+ * Handler for the cxl:cxl_poison trace event.
+ *
+ * Reads the event fields from @record, appends a one-line textual rendering
+ * of them to @s and fills a struct ras_cxl_poison_event, which is then handed
+ * to ras_store_cxl_poison_event() (HAVE_SQLITE3) and
+ * ras_report_cxl_poison_event() (HAVE_ABRT_REPORT).
+ *
+ * @s:       trace sequence the formatted text is appended to
+ * @record:  trace record being decoded
+ * @event:   event description used to look the fields up by name
+ * @context: pointer to the struct ras_events of the daemon
+ *
+ * Returns 0 on success, or -1 as soon as a field cannot be read or text
+ * cannot be appended to @s; in that case nothing is stored or reported.
+ */
+int ras_cxl_poison_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context)
+{
+ int len;
+ unsigned long long val;
+ struct ras_events *ras = context;
+ time_t now;
+ struct tm *tm;
+ struct ras_cxl_poison_event ev;
+
+ /*
+ * NOTE(review): presumably converts the record timestamp to wall-clock
+ * seconds using user_hz and ras->uptime_diff - confirm against the code
+ * that computes uptime_diff.
+ */
+ now = record->ts / user_hz + ras->uptime_diff;
+ tm = localtime(&now);
+ if (tm)
+ strftime(ev.timestamp, sizeof(ev.timestamp),
+ "%Y-%m-%d %H:%M:%S %z", tm);
+ else
+ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp));
+ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
+ return -1;
+
+ /*
+ * String fields are used in place: ev keeps the pointer returned by
+ * tep_get_field_raw(), no copy is made.
+ */
+ ev.memdev = tep_get_field_raw(s, event, "memdev",
+ record, &len, 1);
+ if (!ev.memdev)
+ return -1;
+ if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0)
+ return -1;
+
+ ev.host = tep_get_field_raw(s, event, "host",
+ record, &len, 1);
+ if (!ev.host)
+ return -1;
+ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0)
+ return -1;
+ ev.serial = val;
+ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0)
+ return -1;
+
+ /* Map the numeric trace type to a display string; unknown values become "Invalid" */
+ if (tep_get_field_val(s, event, "trace_type", record, &val, 1) < 0)
+ return -1;
+ switch (val) {
+ case CXL_POISON_TRACE_LIST:
+ ev.trace_type = "List";
+ break;
+ case CXL_POISON_TRACE_INJECT:
+ ev.trace_type = "Inject";
+ break;
+ case CXL_POISON_TRACE_CLEAR:
+ ev.trace_type = "Clear";
+ break;
+ default:
+ ev.trace_type = "Invalid";
+ }
+ if (trace_seq_printf(s, "trace_type:%s ", ev.trace_type) <= 0)
+ return -1;
+
+ ev.region = tep_get_field_raw(s, event, "region",
+ record, &len, 1);
+ if (!ev.region)
+ return -1;
+ if (trace_seq_printf(s, "region:%s ", ev.region) <= 0)
+ return -1;
+
+ /*
+ * NOTE(review): the raw "uuid" field is printed with %s - confirm that the
+ * trace record holds a NUL-terminated string here and not a binary UUID.
+ */
+ ev.uuid = tep_get_field_raw(s, event, "uuid",
+ record, &len, 1);
+ if (!ev.uuid)
+ return -1;
+ if (trace_seq_printf(s, "region_uuid:%s ", ev.uuid) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "hpa", record, &val, 1) < 0)
+ return -1;
+ ev.hpa = val;
+ if (trace_seq_printf(s, "poison list: hpa:0x%llx ", (unsigned long long)ev.hpa) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0)
+ return -1;
+ ev.dpa = val;
+ if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "dpa_length", record, &val, 1) < 0)
+ return -1;
+ ev.dpa_length = val;
+ if (trace_seq_printf(s, "dpa_length:0x%x ", ev.dpa_length) <= 0)
+ return -1;
+
+ /* Map the numeric poison source to a display string; unknown values become "Invalid" */
+ if (tep_get_field_val(s, event, "source", record, &val, 1) < 0)
+ return -1;
+ switch (val) {
+ case CXL_POISON_SOURCE_UNKNOWN:
+ ev.source = "Unknown";
+ break;
+ case CXL_POISON_SOURCE_EXTERNAL:
+ ev.source = "External";
+ break;
+ case CXL_POISON_SOURCE_INTERNAL:
+ ev.source = "Internal";
+ break;
+ case CXL_POISON_SOURCE_INJECTED:
+ ev.source = "Injected";
+ break;
+ case CXL_POISON_SOURCE_VENDOR:
+ ev.source = "Vendor";
+ break;
+ default:
+ ev.source = "Invalid";
+ }
+ if (trace_seq_printf(s, "source:%s ", ev.source) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "flags", record, &val, 1) < 0)
+ return -1;
+ ev.flags = val;
+ if (trace_seq_printf(s, "flags:%d ", ev.flags) <= 0)
+ return -1;
+
+ /*
+ * The overflow timestamp is only read when the OVERFLOW flag is set. The
+ * epoch string is used instead when the flag is clear, when the value is 0
+ * or when localtime() fails.
+ */
+ if (ev.flags & CXL_POISON_FLAG_OVERFLOW) {
+ if (tep_get_field_val(s, event, "overflow_ts", record, &val, 1) < 0)
+ return -1;
+ if (val) {
+ /* CXL Specification 3.0
+ * Overflow timestamp - The number of unsigned nanoseconds
+ * that have elapsed since midnight, 01-Jan-1970 UTC
+ */
+ time_t ovf_ts_secs = val / 1000000000ULL;
+
+ tm = localtime(&ovf_ts_secs);
+ if (tm) {
+ strftime(ev.overflow_ts, sizeof(ev.overflow_ts),
+ "%Y-%m-%d %H:%M:%S %z", tm);
+ }
+ }
+ /* !tm is only evaluated when val != 0, i.e. after the localtime() call above */
+ if (!val || !tm)
+ strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000",
+ sizeof(ev.overflow_ts));
+ } else
+ strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", sizeof(ev.overflow_ts));
+ if (trace_seq_printf(s, "overflow timestamp:%s\n", ev.overflow_ts) <= 0)
+ return -1;
+
+ /* Insert data into the SGBD */
+#ifdef HAVE_SQLITE3
+ ras_store_cxl_poison_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+ /* Report event to ABRT */
+ ras_report_cxl_poison_event(ras, &ev);
+#endif
+
+ return 0;
+}
diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h
new file mode 100644
index 0000000..84d5cc6
--- /dev/null
+++ b/ras-cxl-handler.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAS_CXL_HANDLER_H
+#define __RAS_CXL_HANDLER_H
+
+#include "ras-events.h"
+#include <traceevent/event-parse.h>
+
+int ras_cxl_poison_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context);
+#endif
diff --git a/ras-events.c b/ras-events.c
index 5fe8e19..f95844a 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -40,6 +40,7 @@
#include "ras-devlink-handler.h"
#include "ras-diskerror-handler.h"
#include "ras-memory-failure-handler.h"
+#include "ras-cxl-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"
@@ -243,6 +244,10 @@ int toggle_ras_mc_event(int enable)
rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable);
#endif
+#ifdef HAVE_CXL
+ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable);
+#endif
+
free_ras:
free(ras);
return rc;
@@ -979,6 +984,16 @@ int handle_ras_events(int record_events)
"ras", "memory_failure_event");
#endif
+#ifdef HAVE_CXL
+ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_poison",
+ ras_cxl_poison_event_handler, NULL, CXL_POISON_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "cxl", "cxl_poison");
+#endif
+
if (!num_events) {
log(ALL, LOG_INFO,
"Failed to trace all supported RAS events. Aborting.\n");
diff --git a/ras-events.h b/ras-events.h
index 649b0c0..1ef3ecd 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -39,6 +39,7 @@ enum {
DEVLINK_EVENT,
DISKERROR_EVENT,
MF_EVENT,
+ CXL_POISON_EVENT,
NR_EVENTS
};
diff --git a/ras-record.c b/ras-record.c
index adc97a4..c31baa0 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -559,6 +559,71 @@ int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
}
#endif
+#ifdef HAVE_CXL
+/*
+ * Table and functions to handle cxl:cxl_poison
+ *
+ * The columns after "id" are listed in the order in which
+ * ras_store_cxl_poison_event() binds its parameters: bind index 1 is
+ * "timestamp" and bind index 13 is "overflow_ts". Keep both in sync when a
+ * column is added or moved.
+ */
+static const struct db_fields cxl_poison_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "memdev", .type = "TEXT" },
+ { .name = "host", .type = "TEXT" },
+ { .name = "serial", .type = "INTEGER" },
+ { .name = "trace_type", .type = "TEXT" },
+ { .name = "region", .type = "TEXT" },
+ { .name = "region_uuid", .type = "TEXT" },
+ { .name = "hpa", .type = "INTEGER" },
+ { .name = "dpa", .type = "INTEGER" },
+ { .name = "dpa_length", .type = "INTEGER" },
+ { .name = "source", .type = "TEXT" },
+ { .name = "flags", .type = "INTEGER" },
+ { .name = "overflow_ts", .type = "TEXT" },
+};
+
+/* Descriptor passed to ras_mc_create_table() / ras_mc_prepare_stmt() for the cxl_poison_event table */
+static const struct db_table_descriptor cxl_poison_event_tab = {
+ .name = "cxl_poison_event",
+ .fields = cxl_poison_event_fields,
+ .num_fields = ARRAY_SIZE(cxl_poison_event_fields),
+};
+
+/*
+ * Store one cxl_poison event in the cxl_poison_event table.
+ *
+ * @ras: daemon state; ras->db_priv holds the SQLite handles
+ * @ev:  decoded event to insert
+ *
+ * Returns 0 without doing anything when the database or the prepared
+ * statement is not available, otherwise the result of sqlite3_reset().
+ */
+int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev)
+{
+	int rc;
+	struct sqlite3_priv *priv = ras->db_priv;
+	sqlite3_stmt *stmt;
+
+	if (!priv || !priv->stmt_cxl_poison_event)
+		return 0;
+	stmt = priv->stmt_cxl_poison_event;
+	log(TERM, LOG_INFO, "cxl_poison_event store: %p\n", stmt);
+
+	/* Bind indexes follow the column order of cxl_poison_event_fields[] */
+	sqlite3_bind_text(stmt, 1, ev->timestamp, -1, NULL);
+	sqlite3_bind_text(stmt, 2, ev->memdev, -1, NULL);
+	sqlite3_bind_text(stmt, 3, ev->host, -1, NULL);
+	sqlite3_bind_int64(stmt, 4, ev->serial);
+	sqlite3_bind_text(stmt, 5, ev->trace_type, -1, NULL);
+	sqlite3_bind_text(stmt, 6, ev->region, -1, NULL);
+	sqlite3_bind_text(stmt, 7, ev->uuid, -1, NULL);
+	sqlite3_bind_int64(stmt, 8, ev->hpa);
+	sqlite3_bind_int64(stmt, 9, ev->dpa);
+	/*
+	 * dpa_length is a uint32_t: bind it as a 64-bit value so that lengths
+	 * above INT_MAX are not stored as negative numbers.
+	 */
+	sqlite3_bind_int64(stmt, 10, ev->dpa_length);
+	sqlite3_bind_text(stmt, 11, ev->source, -1, NULL);
+	sqlite3_bind_int(stmt, 12, ev->flags);
+	sqlite3_bind_text(stmt, 13, ev->overflow_ts, -1, NULL);
+
+	rc = sqlite3_step(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed to do cxl_poison_event step on sqlite: error = %d\n", rc);
+	/* The statement is reset even after a failed step so it can be reused */
+	rc = sqlite3_reset(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed reset cxl_poison_event on sqlite: error = %d\n",
+		    rc);
+	log(TERM, LOG_INFO, "register inserted at db\n");
+
+	return rc;
+}
+#endif
+
/*
* Generic code
*/
@@ -900,6 +965,16 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
}
#endif
+#ifdef HAVE_CXL
+ rc = ras_mc_create_table(priv, &cxl_poison_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_poison_event,
+ &cxl_poison_event_tab);
+ if (rc != SQLITE_OK)
+ goto error;
+ }
+#endif
+
ras->db_priv = priv;
return 0;
@@ -1019,6 +1094,16 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
}
#endif
+#ifdef HAVE_CXL
+ if (priv->stmt_cxl_poison_event) {
+ rc = sqlite3_finalize(priv->stmt_cxl_poison_event);
+ if (rc != SQLITE_OK)
+ log(TERM, LOG_ERR,
+ "cpu %u: Failed to finalize cxl_poison_event sqlite: error = %d\n",
+ cpu, rc);
+ }
+#endif
+
rc = sqlite3_close_v2(db);
if (rc != SQLITE_OK)
log(TERM, LOG_ERR,
diff --git a/ras-record.h b/ras-record.h
index 219f10b..fd15215 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -114,6 +114,22 @@ struct ras_mf_event {
const char *action_result;
};
+/*
+ * Decoded cxl:cxl_poison trace event. Filled by
+ * ras_cxl_poison_event_handler() and read by the store and report code.
+ */
+struct ras_cxl_poison_event {
+ /* Event time, formatted by the handler as "%Y-%m-%d %H:%M:%S %z" */
+ char timestamp[64];
+ /* memdev, host, region, uuid: pointers returned by tep_get_field_raw(); the handler does not copy the data */
+ const char *memdev;
+ const char *host;
+ uint64_t serial;
+ /* trace_type, source: string literals selected by the handler from the numeric field */
+ const char *trace_type;
+ const char *region;
+ const char *uuid;
+ uint64_t hpa;
+ uint64_t dpa;
+ uint32_t dpa_length;
+ const char *source;
+ /* Raw "flags" field; the handler tests it against CXL_POISON_FLAG_OVERFLOW */
+ uint8_t flags;
+ /* Same format as timestamp; the epoch string when no overflow timestamp is available */
+ char overflow_ts[64];
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
@@ -123,6 +139,7 @@ struct mce_event;
struct devlink_event;
struct diskerror_event;
struct ras_mf_event;
+struct ras_cxl_poison_event;
#ifdef HAVE_SQLITE3
@@ -155,6 +172,9 @@ struct sqlite3_priv {
#ifdef HAVE_MEMORY_FAILURE
sqlite3_stmt *stmt_mf_event;
#endif
+#ifdef HAVE_CXL
+ sqlite3_stmt *stmt_cxl_poison_event;
+#endif
};
struct db_fields {
@@ -182,6 +202,7 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev);
int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev);
int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
+int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -195,6 +216,7 @@ static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_ev
static inline int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; };
static inline int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
+static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
#endif
diff --git a/ras-report.c b/ras-report.c
index 62d5eb7..3daecc0 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -331,6 +331,46 @@ static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev)
return 0;
}
+/*
+ * Append the "BACKTRACE=" item describing a CXL poison event to @buf.
+ *
+ * @buf: NUL-terminated report buffer the text is appended to
+ * @ev:  decoded event
+ *
+ * Returns 0 on success, -1 if @buf or @ev is NULL.
+ */
+static int set_cxl_poison_event_backtrace(char *buf, struct ras_cxl_poison_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+
+	if (!buf || !ev)
+		return -1;
+
+	/*
+	 * snprintf() bounds the write: memdev, host, region and uuid come from
+	 * the trace record and have no fixed length. The 64-bit fields are cast
+	 * to unsigned long long so the format is also correct where uint64_t is
+	 * not unsigned long.
+	 */
+	snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+		 "timestamp=%s\n"
+		 "memdev=%s\n"
+		 "host=%s\n"
+		 "serial=0x%llx\n"
+		 "trace_type=%s\n"
+		 "region=%s\n"
+		 "region_uuid=%s\n"
+		 "hpa=0x%llx\n"
+		 "dpa=0x%llx\n"
+		 "dpa_length=0x%x\n"
+		 "source=%s\n"
+		 "flags=%u\n"
+		 "overflow_timestamp=%s\n",
+		 ev->timestamp,
+		 ev->memdev,
+		 ev->host,
+		 (unsigned long long)ev->serial,
+		 ev->trace_type,
+		 ev->region,
+		 ev->uuid,
+		 (unsigned long long)ev->hpa,
+		 (unsigned long long)ev->dpa,
+		 ev->dpa_length,
+		 ev->source,
+		 (unsigned int)ev->flags,
+		 ev->overflow_ts);
+
+	/* NOTE(review): assumes the caller's buffer has room for bt_buf - confirm in commit_report_backtrace() */
+	strcat(buf, bt_buf);
+
+	return 0;
+}
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -368,6 +408,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
case MF_EVENT:
rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev);
break;
+ case CXL_POISON_EVENT:
+ rc = set_cxl_poison_event_backtrace(buf, (struct ras_cxl_poison_event *)ev);
+ break;
default:
return -1;
}
@@ -776,3 +819,47 @@ mf_fail:
else
return -1;
}
+
+/*
+ * Report a CXL poison event over the report socket.
+ *
+ * Sends the basic report items, the event backtrace, then the ANALYZER and
+ * REASON items, and closes the socket.
+ *
+ * @ras: daemon state (not used by this function)
+ * @ev:  decoded event to report
+ *
+ * Returns 0 when every item was written completely, -1 otherwise.
+ */
+int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	int sockfd;
+	int done = 0;
+	int rc = -1;
+
+	memset(buf, 0, sizeof(buf));
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	rc = commit_report_basic(sockfd);
+	if (rc < 0)
+		goto cxl_poison_fail;
+
+	rc = commit_report_backtrace(sockfd, CXL_POISON_EVENT, ev);
+	if (rc < 0)
+		goto cxl_poison_fail;
+
+	/*
+	 * write() returns -1 on error. That must be tested before comparing
+	 * with the unsigned length: in "rc < strlen(buf) + 1" the -1 is converted
+	 * to a huge unsigned value and the failure goes unnoticed.
+	 */
+	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-cxl-poison");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto cxl_poison_fail;
+
+	snprintf(buf, sizeof(buf), "REASON=%s", "CXL poison");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto cxl_poison_fail;
+
+	done = 1;
+
+cxl_poison_fail:
+
+	/* sockfd is always valid here: the only earlier exit is the return above */
+	close(sockfd);
+
+	if (done)
+		return 0;
+	else
+		return -1;
+}
diff --git a/ras-report.h b/ras-report.h
index e605eb1..d1591ce 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -39,6 +39,7 @@ int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev);
int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev);
int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
+int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
#else
@@ -50,6 +51,7 @@ static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_ev
static inline int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; };
static inline int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
+static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
#endif

View File

@ -0,0 +1,97 @@
commit 7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Tue Apr 4 16:07:21 2023 +0100
rasdaemon: Add common function to get timestamp for the event
Add common function to get the timestamp for the event
reported.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index 59534a4..d540ebb 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -42,6 +42,20 @@ static void convert_timestamp(unsigned long long ts, char *ts_ptr, uint16_t size
size);
}
+/*
+ * Format the time of @record as "%Y-%m-%d %H:%M:%S %z" into @ts_ptr.
+ *
+ * @s:      trace sequence (not used by this function)
+ * @record: trace record whose timestamp is converted
+ * @ras:    daemon state, supplies uptime_diff
+ * @ts_ptr: destination buffer
+ * @size:   size of @ts_ptr in bytes
+ *
+ * When the time cannot be converted or does not fit in @ts_ptr, the string
+ * "1970-01-01 00:00:00 +0000" is written instead.
+ */
+static void get_timestamp(struct trace_seq *s, struct tep_record *record,
+			  struct ras_events *ras, char *ts_ptr, uint16_t size)
+{
+	time_t now;
+	struct tm *tm;
+
+	now = record->ts / user_hz + ras->uptime_diff;
+	tm = localtime(&now);
+	/*
+	 * strftime() returns 0 and leaves the buffer contents unspecified when
+	 * the result does not fit, so that case takes the fallback as well.
+	 * snprintf() always NUL-terminates (for size > 0), unlike strncpy().
+	 */
+	if (!tm || !strftime(ts_ptr, size, "%Y-%m-%d %H:%M:%S %z", tm))
+		snprintf(ts_ptr, size, "%s", "1970-01-01 00:00:00 +0000");
+}
+
/* Poison List: Payload out flags */
#define CXL_POISON_FLAG_MORE BIT(0)
#define CXL_POISON_FLAG_OVERFLOW BIT(1)
@@ -70,17 +84,9 @@ int ras_cxl_poison_event_handler(struct trace_seq *s,
int len;
unsigned long long val;
struct ras_events *ras = context;
- time_t now;
- struct tm *tm;
struct ras_cxl_poison_event ev;
- now = record->ts / user_hz + ras->uptime_diff;
- tm = localtime(&now);
- if (tm)
- strftime(ev.timestamp, sizeof(ev.timestamp),
- "%Y-%m-%d %H:%M:%S %z", tm);
- else
- strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp));
+ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp));
if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
return -1;
@@ -285,19 +291,11 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s,
{
int len, i;
unsigned long long val;
- time_t now;
- struct tm *tm;
struct ras_events *ras = context;
struct ras_cxl_aer_ue_event ev;
memset(&ev, 0, sizeof(ev));
- now = record->ts / user_hz + ras->uptime_diff;
- tm = localtime(&now);
- if (tm)
- strftime(ev.timestamp, sizeof(ev.timestamp),
- "%Y-%m-%d %H:%M:%S %z", tm);
- else
- strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp));
+ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp));
if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
return -1;
@@ -380,18 +378,10 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s,
{
int len;
unsigned long long val;
- time_t now;
- struct tm *tm;
struct ras_events *ras = context;
struct ras_cxl_aer_ce_event ev;
- now = record->ts / user_hz + ras->uptime_diff;
- tm = localtime(&now);
- if (tm)
- strftime(ev.timestamp, sizeof(ev.timestamp),
- "%Y-%m-%d %H:%M:%S %z", tm);
- else
- strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp));
+ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp));
if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
return -1;

View File

@ -0,0 +1,78 @@
commit 8f79833e3d78424f4a594985fbeb91890f4af81c
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Mar 4 11:49:50 2024 +0000
rasdaemon: Fix build warnings unused variable if AMP RAS errors is not enabled
This patch fixes the following unused-variable build warnings, which appear
when AMP RAS error decoding is not enabled (--enable-amp-ns-decode).
==================================================
ras-aer-handler.c: In function ras_aer_event_handler:
ras-aer-handler.c:72:21: warning: unused variable fn [-Wunused-variable]
int seg, bus, dev, fn;
^~
ras-aer-handler.c:72:16: warning: unused variable dev [-Wunused-variable]
int seg, bus, dev, fn;
^~~
ras-aer-handler.c:72:11: warning: unused variable bus [-Wunused-variable]
int seg, bus, dev, fn;
^~~
ras-aer-handler.c:72:6: warning: unused variable seg [-Wunused-variable]
int seg, bus, dev, fn;
^~~
ras-aer-handler.c:71:10: warning: variable sel_data set but not used [-Wunused-but-set-variable]
uint8_t sel_data[5];
^~~~~~~~
ras-aer-handler.c:70:7: warning: unused variable ipmi_add_sel [-Wunused-variable]
char ipmi_add_sel[105];
^~~~~~~~~~~~
==================================================
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-aer-handler.c b/ras-aer-handler.c
index bb1a6f6..29f6551 100644
--- a/ras-aer-handler.c
+++ b/ras-aer-handler.c
@@ -67,9 +67,11 @@ int ras_aer_event_handler(struct trace_seq *s,
struct tm *tm;
struct ras_aer_event ev;
char buf[BUF_LEN];
+#ifdef HAVE_AMP_NS_DECODE
char ipmi_add_sel[105];
uint8_t sel_data[5];
int seg, bus, dev, fn;
+#endif
/*
* Newer kernels (3.10-rc1 or upper) provide an uptime clock.
@@ -132,19 +134,27 @@ int ras_aer_event_handler(struct trace_seq *s,
switch (severity_val) {
case HW_EVENT_AER_UNCORRECTED_NON_FATAL:
ev.error_type = "Uncorrected (Non-Fatal)";
+#ifdef HAVE_AMP_NS_DECODE
sel_data[0] = 0xca;
+#endif
break;
case HW_EVENT_AER_UNCORRECTED_FATAL:
ev.error_type = "Uncorrected (Fatal)";
+#ifdef HAVE_AMP_NS_DECODE
sel_data[0] = 0xca;
+#endif
break;
case HW_EVENT_AER_CORRECTED:
ev.error_type = "Corrected";
+#ifdef HAVE_AMP_NS_DECODE
sel_data[0] = 0xbf;
+#endif
break;
default:
ev.error_type = "Unknown severity";
+#ifdef HAVE_AMP_NS_DECODE
sel_data[0] = 0xbf;
+#endif
}
trace_seq_puts(s, ev.error_type);

View File

@ -0,0 +1,82 @@
commit b6506f22fb2d7f44d9d633d44656dff2a94f257e
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Feb 12 10:49:10 2024 +0000
rasdaemon: ras-mc-ctl: Add support for CXL poison trace events
Add support for CXL poison events to the ras-mc-ctl tool.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
(cherry picked from commit 93ca96b66c917af37b2ae9295dc5df46a7d64dd2)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 6a319a7..16b0589 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1382,6 +1382,22 @@ sub summary
print "No CXL overflow errors.\n\n";
}
$query_handle->finish;
+
+ # CXL poison errors
+ $query = "select memdev, count(*) from cxl_poison_event$conf{opt}{since} group by memdev";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($memdev, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$memdev errors: $count\n";
+ }
+ if ($out ne "") {
+ print "CXL poison events summary:\n$out\n";
+ } else {
+ print "No CXL poison errors.\n\n";
+ }
+ $query_handle->finish;
}
# extlog errors
@@ -1491,6 +1507,7 @@ sub errors
my ($pfn, $page_type, $action_result);
my ($memdev, $host, $serial, $error_status, $first_error, $header_log);
my ($log_type, $first_ts, $last_ts);
+ my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1636,6 +1653,34 @@ sub errors
} else {
print "No CXL overflow errors.\n\n";
}
+
+ # CXL poison errors
+ $query = "select id, timestamp, memdev, host, serial, trace_type, region, region_uuid, hpa, dpa, dpa_length, source, flags, overflow_ts from cxl_poison_event$conf{opt}{since} order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev);
+ $out .= "host=$host, " if (defined $host && length $host);
+ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial);
+ $out .= "trace_type=$trace_type, " if (defined $trace_type && length $trace_type);
+ $out .= "region=$region, " if (defined $region && length $region);
+ $out .= "region_uuid=$region_uuid, " if (defined $region_uuid && length $region_uuid);
+ $out .= sprintf "hpa=0x%llx, ", $hpa if (defined $hpa && length $hpa);
+ $out .= sprintf "dpa=0x%llx, ", $dpa if (defined $dpa && length $dpa);
+ $out .= sprintf "dpa_length=0x%x, ", $dpa_length if (defined $dpa_length && length $dpa_length);
+ $out .= "source=$source, " if (defined $source && length $source);
+ $out .= sprintf "flags=%d, ", $flags if (defined $flags && length $flags);
+ $out .= "overflow timestamp=$overflow_ts " if (defined $overflow_ts && length $overflow_ts);
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "CXL poison events:\n$out\n";
+ } else {
+ print "No CXL poison errors.\n\n";
+ }
}
# Extlog errors

View File

@ -0,0 +1,559 @@
commit 9a2f6186db2622788f8868d8ec082684d6a06d4d
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Wed Apr 5 13:28:20 2023 +0100
rasdaemon: Add support for the CXL dram events
Add support to log and record the CXL dram events.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index 2de96f6..64b0b50 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -865,3 +865,154 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s,
return 0;
}
+
+/*
+ * DRAM Event Record - DER
+ *
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ */
+#define CXL_DER_VALID_CHANNEL BIT(0)
+#define CXL_DER_VALID_RANK BIT(1)
+#define CXL_DER_VALID_NIBBLE BIT(2)
+#define CXL_DER_VALID_BANK_GROUP BIT(3)
+#define CXL_DER_VALID_BANK BIT(4)
+#define CXL_DER_VALID_ROW BIT(5)
+#define CXL_DER_VALID_COLUMN BIT(6)
+#define CXL_DER_VALID_CORRECTION_MASK BIT(7)
+
+/*
+ * Handler for the cxl:cxl_dram trace event.
+ *
+ * Decodes the common header through handle_ras_cxl_common_hdr(), then the
+ * DRAM-specific fields, appending a textual rendering to @s and filling a
+ * struct ras_cxl_dram_event that is passed to ras_store_cxl_dram_event()
+ * (HAVE_SQLITE3) and ras_report_cxl_dram_event() (HAVE_ABRT_REPORT).
+ *
+ * @s:       trace sequence the formatted text is appended to
+ * @record:  trace record being decoded
+ * @event:   event description used to look the fields up by name
+ * @context: pointer to the struct ras_events of the daemon
+ *
+ * Returns 0 on success, -1 when a field cannot be read or text cannot be
+ * appended to @s (except inside the correction-mask dump, see below).
+ */
+int ras_cxl_dram_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context)
+{
+ int len, i;
+ unsigned long long val;
+ struct ras_events *ras = context;
+ struct ras_cxl_dram_event ev;
+
+ /* Zero the event first: optional fields that are not valid stay 0 */
+ memset(&ev, 0, sizeof(ev));
+ if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0)
+ return -1;
+ ev.dpa = val;
+ if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "dpa_flags", record, &val, 1) < 0)
+ return -1;
+ ev.dpa_flags = val;
+ if (trace_seq_printf(s, "dpa_flags:") <= 0)
+ return -1;
+ if (decode_cxl_event_flags(s, ev.dpa_flags, cxl_dpa_flags, ARRAY_SIZE(cxl_dpa_flags)) < 0)
+ return -1;
+
+ /* descriptor, type and transaction_type are decoded with the cxl_gmer_* tables defined elsewhere in this file */
+ if (tep_get_field_val(s, event, "descriptor", record, &val, 1) < 0)
+ return -1;
+ ev.descriptor = val;
+ if (trace_seq_printf(s, "descriptor:") <= 0)
+ return -1;
+ if (decode_cxl_event_flags(s, ev.descriptor, cxl_gmer_event_desc_flags,
+ ARRAY_SIZE(cxl_gmer_event_desc_flags)) < 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "type", record, &val, 1) < 0)
+ return -1;
+ ev.type = val;
+ if (trace_seq_printf(s, "type:%s ", get_cxl_type_str(cxl_gmer_mem_event_type,
+ ARRAY_SIZE(cxl_gmer_mem_event_type), ev.type)) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "transaction_type", record, &val, 1) < 0)
+ return -1;
+ ev.transaction_type = val;
+ if (trace_seq_printf(s, "transaction_type:%s ",
+ get_cxl_type_str(cxl_gmer_trans_type,
+ ARRAY_SIZE(cxl_gmer_trans_type),
+ ev.transaction_type)) <= 0)
+ return -1;
+
+ /*
+ * Each field below is read and printed only when its bit is set in
+ * validity_flags; validity_flags itself is not printed.
+ */
+ if (tep_get_field_val(s, event, "validity_flags", record, &val, 1) < 0)
+ return -1;
+ ev.validity_flags = val;
+
+ if (ev.validity_flags & CXL_DER_VALID_CHANNEL) {
+ if (tep_get_field_val(s, event, "channel", record, &val, 1) < 0)
+ return -1;
+ ev.channel = val;
+ if (trace_seq_printf(s, "channel:%u ", ev.channel) <= 0)
+ return -1;
+ }
+
+ if (ev.validity_flags & CXL_DER_VALID_RANK) {
+ if (tep_get_field_val(s, event, "rank", record, &val, 1) < 0)
+ return -1;
+ ev.rank = val;
+ if (trace_seq_printf(s, "rank:%u ", ev.rank) <= 0)
+ return -1;
+ }
+
+ if (ev.validity_flags & CXL_DER_VALID_NIBBLE) {
+ if (tep_get_field_val(s, event, "nibble_mask", record, &val, 1) < 0)
+ return -1;
+ ev.nibble_mask = val;
+ if (trace_seq_printf(s, "nibble_mask:%u ", ev.nibble_mask) <= 0)
+ return -1;
+ }
+
+ if (ev.validity_flags & CXL_DER_VALID_BANK_GROUP) {
+ if (tep_get_field_val(s, event, "bank_group", record, &val, 1) < 0)
+ return -1;
+ ev.bank_group = val;
+ if (trace_seq_printf(s, "bank_group:%u ", ev.bank_group) <= 0)
+ return -1;
+ }
+
+ if (ev.validity_flags & CXL_DER_VALID_BANK) {
+ if (tep_get_field_val(s, event, "bank", record, &val, 1) < 0)
+ return -1;
+ ev.bank = val;
+ if (trace_seq_printf(s, "bank:%u ", ev.bank) <= 0)
+ return -1;
+ }
+
+ if (ev.validity_flags & CXL_DER_VALID_ROW) {
+ if (tep_get_field_val(s, event, "row", record, &val, 1) < 0)
+ return -1;
+ ev.row = val;
+ if (trace_seq_printf(s, "row:%u ", ev.row) <= 0)
+ return -1;
+ }
+
+ if (ev.validity_flags & CXL_DER_VALID_COLUMN) {
+ if (tep_get_field_val(s, event, "column", record, &val, 1) < 0)
+ return -1;
+ ev.column = val;
+ if (trace_seq_printf(s, "column:%u ", ev.column) <= 0)
+ return -1;
+ }
+
+ if (ev.validity_flags & CXL_DER_VALID_CORRECTION_MASK) {
+ ev.cor_mask = tep_get_field_raw(s, event, "cor_mask", record, &len, 1);
+ if (!ev.cor_mask)
+ return -1;
+ if (trace_seq_printf(s, "correction_mask:") <= 0)
+ return -1;
+ /*
+ * NOTE(review): the loop always prints
+ * CXL_EVENT_DER_CORRECTION_MASK_SIZE bytes and ignores the length
+ * returned in len - confirm the field is never shorter. A failed
+ * trace_seq_printf() here only stops the dump (break); unlike the
+ * other fields it does not make the handler return -1.
+ */
+ for (i = 0; i < CXL_EVENT_DER_CORRECTION_MASK_SIZE; i++) {
+ if (trace_seq_printf(s, "%02x ", ev.cor_mask[i]) <= 0)
+ break;
+ }
+ }
+
+ /* Insert data into the SGBD */
+#ifdef HAVE_SQLITE3
+ ras_store_cxl_dram_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+ /* Report event to ABRT */
+ ras_report_cxl_dram_event(ras, &ev);
+#endif
+
+ return 0;
+}
diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h
index 3adca4a..35455af 100644
--- a/ras-cxl-handler.h
+++ b/ras-cxl-handler.h
@@ -38,4 +38,7 @@ int ras_cxl_generic_event_handler(struct trace_seq *s,
int ras_cxl_general_media_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
+int ras_cxl_dram_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context);
#endif
diff --git a/ras-events.c b/ras-events.c
index 978dee4..d27e0c4 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -251,6 +251,7 @@ int toggle_ras_mc_event(int enable)
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable);
+ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_dram", enable);
#endif
free_ras:
@@ -1072,6 +1073,14 @@ int handle_ras_events(int record_events)
else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
"cxl", "cxl_general_media");
+
+ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_dram",
+ ras_cxl_dram_event_handler, NULL, CXL_DRAM_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "cxl", "cxl_dram");
#endif
if (!num_events) {
diff --git a/ras-events.h b/ras-events.h
index 9b83df3..d192a6b 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -45,6 +45,7 @@ enum {
CXL_OVERFLOW_EVENT,
CXL_GENERIC_EVENT,
CXL_GENERAL_MEDIA_EVENT,
+ CXL_DRAM_EVENT,
NR_EVENTS
};
diff --git a/ras-record.c b/ras-record.c
index 507a58e..fffa81c 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -915,6 +915,83 @@ int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_gen
return rc;
}
+
+/*
+ * Table and functions to handle the cxl:cxl_dram trace event.
+ *
+ * Column layout of the "cxl_dram_event" table. ras_store_cxl_dram_event()
+ * binds statement parameters 13..25 to the entries "dpa" through "cor_mask"
+ * in exactly this order, so any reordering here must be mirrored there.
+ * NOTE(review): parameters 1..12 (timestamp .. hdr_maint_op_class) are
+ * presumably bound by ras_store_cxl_common_hdr(), which is not visible
+ * here -- confirm before touching the header columns.
+ */
+static const struct db_fields cxl_dram_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "memdev", .type = "TEXT" },
+ { .name = "host", .type = "TEXT" },
+ { .name = "serial", .type = "INTEGER" },
+ { .name = "log_type", .type = "TEXT" },
+ { .name = "hdr_uuid", .type = "TEXT" },
+ { .name = "hdr_flags", .type = "INTEGER" },
+ { .name = "hdr_handle", .type = "INTEGER" },
+ { .name = "hdr_related_handle", .type = "INTEGER" },
+ { .name = "hdr_ts", .type = "TEXT" },
+ { .name = "hdr_length", .type = "INTEGER" },
+ { .name = "hdr_maint_op_class", .type = "INTEGER" },
+ { .name = "dpa", .type = "INTEGER" },
+ { .name = "dpa_flags", .type = "INTEGER" },
+ { .name = "descriptor", .type = "INTEGER" },
+ { .name = "type", .type = "INTEGER" },
+ { .name = "transaction_type", .type = "INTEGER" },
+ { .name = "channel", .type = "INTEGER" },
+ { .name = "rank", .type = "INTEGER" },
+ { .name = "nibble_mask", .type = "INTEGER" },
+ { .name = "bank_group", .type = "INTEGER" },
+ { .name = "bank", .type = "INTEGER" },
+ { .name = "row", .type = "INTEGER" },
+ { .name = "column", .type = "INTEGER" },
+ { .name = "cor_mask", .type = "BLOB" },
+};
+
+/*
+ * Descriptor tying the "cxl_dram_event" table name to its column list; it is
+ * passed to ras_mc_create_table() and ras_mc_prepare_stmt() when the
+ * database is opened.
+ */
+static const struct db_table_descriptor cxl_dram_event_tab = {
+ .name = "cxl_dram_event",
+ .fields = cxl_dram_event_fields,
+ .num_fields = ARRAY_SIZE(cxl_dram_event_fields),
+};
+
+/*
+ * Store one CXL DRAM event as a row of the cxl_dram_event table.
+ *
+ * @ras: daemon state; ras->db_priv holds the prepared statement.
+ * @ev:  decoded event to store.
+ *
+ * Returns 0 when no database or prepared statement is available. Otherwise
+ * returns the result of sqlite3_reset(), which repeats the error code of a
+ * failed sqlite3_step(), so a non-SQLITE_OK value means the row was not
+ * stored.
+ */
+int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev)
+{
+	struct sqlite3_priv *priv = ras->db_priv;
+	sqlite3_stmt *stmt;
+	int step_rc;
+	int rc;
+
+	if (!priv || !priv->stmt_cxl_dram_event)
+		return 0;
+	stmt = priv->stmt_cxl_dram_event;
+	log(TERM, LOG_INFO, "cxl_dram_event store: %p\n", stmt);
+
+	/* The common header helper binds the leading columns; ours start at 13. */
+	ras_store_cxl_common_hdr(stmt, &ev->hdr);
+	sqlite3_bind_int64(stmt, 13, ev->dpa);
+	sqlite3_bind_int(stmt, 14, ev->dpa_flags);
+	sqlite3_bind_int(stmt, 15, ev->descriptor);
+	sqlite3_bind_int(stmt, 16, ev->type);
+	sqlite3_bind_int(stmt, 17, ev->transaction_type);
+	sqlite3_bind_int(stmt, 18, ev->channel);
+	sqlite3_bind_int(stmt, 19, ev->rank);
+	sqlite3_bind_int(stmt, 20, ev->nibble_mask);
+	sqlite3_bind_int(stmt, 21, ev->bank_group);
+	sqlite3_bind_int(stmt, 22, ev->bank);
+	sqlite3_bind_int(stmt, 23, ev->row);
+	sqlite3_bind_int(stmt, 24, ev->column);
+	/* NULL destructor: the blob is not copied and must outlive the step. */
+	sqlite3_bind_blob(stmt, 25, ev->cor_mask,
+			  CXL_EVENT_DER_CORRECTION_MASK_SIZE, NULL);
+
+	step_rc = sqlite3_step(stmt);
+	if (step_rc != SQLITE_OK && step_rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed to do stmt_cxl_dram_event step on sqlite: error = %d\n", step_rc);
+	rc = sqlite3_reset(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed reset stmt_cxl_dram_event on sqlite: error = %d\n", rc);
+
+	/* Only announce the insert when the step really succeeded. */
+	if (step_rc == SQLITE_OK || step_rc == SQLITE_DONE)
+		log(TERM, LOG_INFO, "register inserted at db\n");
+
+	return rc;
+}
#endif
/*
@@ -1306,6 +1383,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
if (rc != SQLITE_OK)
goto error;
}
+
+ rc = ras_mc_create_table(priv, &cxl_dram_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_dram_event,
+ &cxl_dram_event_tab);
+ if (rc != SQLITE_OK)
+ goto error;
+ }
#endif
ras->db_priv = priv;
@@ -1475,6 +1560,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
"cpu %u: Failed to finalize cxl_general_media_event sqlite: error = %d\n",
cpu, rc);
}
+
+ if (priv->stmt_cxl_dram_event) {
+ rc = sqlite3_finalize(priv->stmt_cxl_dram_event);
+ if (rc != SQLITE_OK)
+ log(TERM, LOG_ERR,
+ "cpu %u: Failed to finalize cxl_dram_event sqlite: error = %d\n",
+ cpu, rc);
+ }
#endif
rc = sqlite3_close_v2(db);
diff --git a/ras-record.h b/ras-record.h
index 37c32de..480ff92 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -135,6 +135,7 @@ struct ras_cxl_poison_event {
#define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t))
#define CXL_EVENT_RECORD_DATA_LENGTH 0x50
#define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10
+#define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20
struct ras_cxl_aer_ue_event {
char timestamp[64];
@@ -199,6 +200,24 @@ struct ras_cxl_general_media_event {
uint16_t validity_flags;
};
+/*
+ * Decoded cxl:cxl_dram trace event, as handed to ras_store_cxl_dram_event()
+ * and ras_report_cxl_dram_event().
+ */
+struct ras_cxl_dram_event {
+ struct ras_cxl_event_common_hdr hdr; /* common CXL event header fields */
+ uint64_t dpa;
+ uint8_t dpa_flags;
+ uint8_t descriptor;
+ uint8_t type;
+ uint8_t transaction_type;
+ uint8_t channel;
+ uint8_t rank;
+ uint32_t nibble_mask;
+ uint8_t bank_group;
+ uint8_t bank;
+ uint32_t row;
+ uint16_t column;
+ uint8_t *cor_mask; /* CXL_EVENT_DER_CORRECTION_MASK_SIZE bytes; stored as a BLOB. NOTE(review): ownership/lifetime not visible here -- confirm */
+ uint16_t validity_flags; /* not written to the database or the ABRT backtrace */
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
@@ -214,6 +233,7 @@ struct ras_cxl_aer_ce_event;
struct ras_cxl_overflow_event;
struct ras_cxl_generic_event;
struct ras_cxl_general_media_event;
+struct ras_cxl_dram_event;
#ifdef HAVE_SQLITE3
@@ -253,6 +273,7 @@ struct sqlite3_priv {
sqlite3_stmt *stmt_cxl_overflow_event;
sqlite3_stmt *stmt_cxl_generic_event;
sqlite3_stmt *stmt_cxl_general_media_event;
+ sqlite3_stmt *stmt_cxl_dram_event;
#endif
};
@@ -287,6 +308,7 @@ int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_eve
int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev);
int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev);
+int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -306,6 +328,7 @@ static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_
static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; };
static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; };
+static inline int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; };
#endif
diff --git a/ras-report.c b/ras-report.c
index 725dc9b..21180b1 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -543,6 +543,68 @@ static int set_cxl_general_media_event_backtrace(char *buf, struct ras_cxl_gener
return 0;
}
+/*
+ * Append the "BACKTRACE=" section describing a CXL DRAM event to buf.
+ *
+ * @buf: NUL-terminated destination; the caller in this file
+ *       (commit_report_backtrace) passes a MAX_BACKTRACE_SIZE buffer.
+ * @ev:  event to describe.
+ *
+ * Returns 0 on success, -1 when either argument is NULL. Output that does
+ * not fit is truncated instead of overflowing the buffers.
+ */
+static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+	size_t used;
+
+	if (!buf || !ev)
+		return -1;
+
+	/* snprintf() bounds the write; dpa is cast so the format is portable. */
+	snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+		 "timestamp=%s\n"
+		 "memdev=%s\n"
+		 "host=%s\n"
+		 "serial=0x%lx\n"
+		 "log_type=%s\n"
+		 "hdr_uuid=%s\n"
+		 "hdr_flags=0x%x\n"
+		 "hdr_handle=0x%x\n"
+		 "hdr_related_handle=0x%x\n"
+		 "hdr_timestamp=%s\n"
+		 "hdr_length=%u\n"
+		 "hdr_maint_op_class=%u\n"
+		 "dpa=0x%llx\n"
+		 "dpa_flags=%u\n"
+		 "descriptor=%u\n"
+		 "type=%u\n"
+		 "transaction_type=%u\n"
+		 "channel=%u\n"
+		 "rank=%u\n"
+		 "nibble_mask=%u\n"
+		 "bank_group=%u\n"
+		 "bank=%u\n"
+		 "row=%u\n"
+		 "column=%u\n",
+		 ev->hdr.timestamp,
+		 ev->hdr.memdev,
+		 ev->hdr.host,
+		 ev->hdr.serial,
+		 ev->hdr.log_type,
+		 ev->hdr.hdr_uuid,
+		 ev->hdr.hdr_flags,
+		 ev->hdr.hdr_handle,
+		 ev->hdr.hdr_related_handle,
+		 ev->hdr.hdr_timestamp,
+		 ev->hdr.hdr_length,
+		 ev->hdr.hdr_maint_op_class,
+		 (unsigned long long)ev->dpa,
+		 ev->dpa_flags,
+		 ev->descriptor,
+		 ev->type,
+		 ev->transaction_type,
+		 ev->channel,
+		 ev->rank,
+		 ev->nibble_mask,
+		 ev->bank_group,
+		 ev->bank,
+		 ev->row,
+		 ev->column);
+
+	/* Append only what still fits in the caller's MAX_BACKTRACE_SIZE buffer. */
+	used = strlen(buf);
+	if (used < sizeof(bt_buf) - 1)
+		strncat(buf, bt_buf, sizeof(bt_buf) - 1 - used);
+
+	return 0;
+}
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -598,6 +660,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
case CXL_GENERAL_MEDIA_EVENT:
rc = set_cxl_general_media_event_backtrace(buf, (struct ras_cxl_general_media_event *)ev);
break;
+ case CXL_DRAM_EVENT:
+ rc = set_cxl_dram_event_backtrace(buf, (struct ras_cxl_dram_event *)ev);
+ break;
default:
return -1;
}
@@ -1271,3 +1336,47 @@ cxl_general_media_fail:
else
return -1;
}
+
+/*
+ * Report a CXL DRAM event to ABRT over the report socket.
+ *
+ * Sends the basic report fields, the event backtrace, then the ANALYZER and
+ * REASON items (each including its terminating NUL).
+ *
+ * Returns 0 when everything was written, -1 on any failure.
+ */
+int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	int sockfd = 0;
+	int done = 0;
+	int rc = -1;
+
+	memset(buf, 0, sizeof(buf));
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	rc = commit_report_basic(sockfd);
+	if (rc < 0)
+		goto cxl_dram_fail;
+
+	rc = commit_report_backtrace(sockfd, CXL_DRAM_EVENT, ev);
+	if (rc < 0)
+		goto cxl_dram_fail;
+
+	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-cxl_dram_event");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	/*
+	 * Test for a negative result first: comparing a negative int against
+	 * the size_t from strlen() would convert it to a huge unsigned value
+	 * and hide the write error.
+	 */
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto cxl_dram_fail;
+
+	snprintf(buf, sizeof(buf), "REASON=%s", "CXL DRAM Event");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto cxl_dram_fail;
+
+	done = 1;
+
+cxl_dram_fail:
+
+	if (sockfd >= 0)
+		close(sockfd);
+
+	if (done)
+		return 0;
+	else
+		return -1;
+}
diff --git a/ras-report.h b/ras-report.h
index d9ec7df..1ad00e0 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -45,6 +45,7 @@ int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_ev
int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev);
int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev);
+int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev);
#else
@@ -62,6 +63,7 @@ static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras
static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; };
static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; };
+static inline int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; };
#endif

View File

@ -1,85 +0,0 @@
commit 0862a096c3a1d0f993703ab3299f1ddfadf53d7f
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Tue Aug 11 13:31:46 2020 +0100
rasdaemon: ras-mc-ctl: Add ARM processor error information
Add supporting ARM processor error in the ras-mc-ctl tool.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
util/ras-mc-ctl.in | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
--- rasdaemon-0.6.1.orig/util/ras-mc-ctl.in 2021-10-06 14:14:25.000440090 -0400
+++ rasdaemon-0.6.1/util/ras-mc-ctl.in 2021-10-06 14:15:59.995598590 -0400
@@ -1124,6 +1124,7 @@ sub summary
my ($query, $query_handle, $out);
my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg);
my ($etype, $severity, $etype_string, $severity_string);
+ my ($affinity, $mpidr);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1159,6 +1160,22 @@ sub summary
}
$query_handle->finish;
+ # ARM processor arm_event errors
+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($affinity, $mpidr, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count errors\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events summary:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
+
# extlog errors
$query = "select etype, severity, count(*) from extlog_event group by etype, severity";
$query_handle = $dbh->prepare($query);
@@ -1202,6 +1219,7 @@ sub errors
my ($query, $query_handle, $id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
+ my ($error_count, $affinity, $mpidr, $r_state, $psci_state);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1241,6 +1259,28 @@ sub errors
}
$query_handle->finish;
+ # ARM processor arm_event errors
+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "error_count=$error_count, " if ($error_count);
+ $out .= "affinity_level=$affinity, ";
+ $out .= sprintf "mpidr=0x%x, ", $mpidr;
+ $out .= sprintf "running_state=0x%x, ", $r_state;
+ $out .= sprintf "psci_state=0x%x", $psci_state;
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
+
# Extlog errors
$query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
$query_handle = $dbh->prepare($query);

View File

@ -1,32 +0,0 @@
commit 16d929b024c31d54a7f8a72eab094376c7be27f5
Author: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed May 26 10:20:39 2021 +0200
Makefile.am: fix build header rules
non-standard-hisilicon.h was added twice;
ras-memory-failure-handler.h is missing.
Due to that, the tarball becomes incomplete, causing build
errors.
While here, also adjust .travis.yml to use --enable-all.
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
Makefile.am | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/Makefile.am 2021-10-13 13:27:53.402685179 -0400
+++ b/Makefile.am 2021-10-13 13:28:11.664525173 -0400
@@ -54,7 +54,8 @@ rasdaemon_LDADD = -lpthread $(SQLITE3_LI
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
- ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h
+ ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
+ ras-memory-failure-handler.h
# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that

View File

@ -1,538 +0,0 @@
commit 2290d65b97311dd5736838f1e285355f7f357046
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Mar 8 16:57:26 2021 +0000
rasdaemon: add support for memory_failure events
Add support to log the memory_failure kernel trace
events.
Example rasdaemon log and SQLite DB output for the
memory_failure event,
=================================================
rasdaemon: memory_failure_event store: 0x126ce8f8
rasdaemon: register inserted at db
<...>-785 [000] 0.000024: memory_failure_event: 2020-10-02 13:27:13 -0400 pfn=0x204000000 page_type=free buddy page action_result=Delayed
CREATE TABLE memory_failure_event (id INTEGER PRIMARY KEY, timestamp TEXT, pfn TEXT, page_type TEXT, action_result TEXT);
INSERT INTO memory_failure_event VALUES(1,'2020-10-02 13:27:13 -0400','0x204000000','free buddy page','Delayed');
==================================================
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
Makefile.am | 4
ras-events.c | 15 +++
ras-memory-failure-handler.c | 179 +++++++++++++++++++++++++++++++++++++++++++
ras-memory-failure-handler.h | 25 ++++++
ras-record.c | 56 +++++++++++++
ras-record.h | 13 +++
ras-report.c | 68 ++++++++++++++++
ras-report.h | 5 -
8 files changed, 364 insertions(+), 1 deletion(-)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/ras-memory-failure-handler.c 2021-10-14 16:31:36.840657728 -0400
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libtrace/kbuffer.h"
+#include "ras-memory-failure-handler.h"
+#include "ras-record.h"
+#include "ras-logger.h"
+#include "ras-report.h"
+
+/*
+ * Memory failure - various types of pages.
+ *
+ * These values are matched against the "type" field of the memory_failure
+ * trace event via the mf_page_type[] table. The numbering is implicit, so
+ * the order of the enumerators matters.
+ * NOTE(review): presumably mirrors the kernel's enum of the same name --
+ * confirm against the kernel version being traced.
+ */
+enum mf_action_page_type {
+ MF_MSG_KERNEL,
+ MF_MSG_KERNEL_HIGH_ORDER,
+ MF_MSG_SLAB,
+ MF_MSG_DIFFERENT_COMPOUND,
+ MF_MSG_POISONED_HUGE,
+ MF_MSG_HUGE,
+ MF_MSG_FREE_HUGE,
+ MF_MSG_NON_PMD_HUGE,
+ MF_MSG_UNMAP_FAILED,
+ MF_MSG_DIRTY_SWAPCACHE,
+ MF_MSG_CLEAN_SWAPCACHE,
+ MF_MSG_DIRTY_MLOCKED_LRU,
+ MF_MSG_CLEAN_MLOCKED_LRU,
+ MF_MSG_DIRTY_UNEVICTABLE_LRU,
+ MF_MSG_CLEAN_UNEVICTABLE_LRU,
+ MF_MSG_DIRTY_LRU,
+ MF_MSG_CLEAN_LRU,
+ MF_MSG_TRUNCATED_LRU,
+ MF_MSG_BUDDY,
+ MF_MSG_BUDDY_2ND,
+ MF_MSG_DAX,
+ MF_MSG_UNSPLIT_THP,
+ MF_MSG_UNKNOWN,
+};
+
+/*
+ * Action results for various types of pages.
+ *
+ * Matched against the "result" field of the memory_failure trace event via
+ * the mf_action_result[] table; the numbering is implicit, so order matters.
+ */
+enum mf_action_result {
+ MF_IGNORED, /* Error: cannot be handled */
+ MF_FAILED, /* Error: handling failed */
+ MF_DELAYED, /* Will be handled later */
+ MF_RECOVERED, /* Successfully recovered */
+};
+
+/*
+ * Memory failure page types: maps each enum mf_action_page_type value to the
+ * human-readable string that is logged and stored. get_page_type() searches
+ * this table linearly and falls back to "unknown page" for unlisted values.
+ */
+static const struct {
+ int type;
+ const char *page_type;
+} mf_page_type[] = {
+ { MF_MSG_KERNEL, "reserved kernel page" },
+ { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"},
+ { MF_MSG_SLAB, "kernel slab page"},
+ { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"},
+ { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"},
+ { MF_MSG_HUGE, "huge page"},
+ { MF_MSG_FREE_HUGE, "free huge page"},
+ { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"},
+ { MF_MSG_UNMAP_FAILED, "unmapping failed page"},
+ { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"},
+ { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"},
+ { MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page"},
+ { MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page"},
+ { MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page"},
+ { MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page"},
+ { MF_MSG_DIRTY_LRU, "dirty LRU page"},
+ { MF_MSG_CLEAN_LRU, "clean LRU page"},
+ { MF_MSG_TRUNCATED_LRU, "already truncated LRU page"},
+ { MF_MSG_BUDDY, "free buddy page"},
+ { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"},
+ { MF_MSG_DAX, "dax page"},
+ { MF_MSG_UNSPLIT_THP, "unsplit thp"},
+ { MF_MSG_UNKNOWN, "unknown page"},
+};
+
+/*
+ * Memory failure action results: maps each enum mf_action_result value to
+ * the string that is logged and stored. get_action_result() searches this
+ * table linearly and falls back to "unknown" for unlisted values.
+ */
+static const struct {
+ int result;
+ const char *action_result;
+} mf_action_result[] = {
+ { MF_IGNORED, "Ignored" },
+ { MF_FAILED, "Failed" },
+ { MF_DELAYED, "Delayed" },
+ { MF_RECOVERED, "Recovered" },
+};
+
+/*
+ * Translate a memory_failure page type code into its description.
+ * Returns a pointer to a static string; "unknown page" when the code is not
+ * listed in mf_page_type[].
+ */
+static const char *get_page_type(int page_type)
+{
+	const char *name = "unknown page";
+	int idx = 0;
+
+	while (idx < ARRAY_SIZE(mf_page_type)) {
+		if (mf_page_type[idx].type == page_type) {
+			name = mf_page_type[idx].page_type;
+			break;
+		}
+		idx++;
+	}
+
+	return name;
+}
+
+/*
+ * Translate a memory_failure action result code into its description.
+ * Returns a pointer to a static string; "unknown" when the code is not
+ * listed in mf_action_result[].
+ */
+static const char *get_action_result(int result)
+{
+	const char *text = "unknown";
+	int idx = 0;
+
+	while (idx < ARRAY_SIZE(mf_action_result)) {
+		if (mf_action_result[idx].result == result) {
+			text = mf_action_result[idx].action_result;
+			break;
+		}
+		idx++;
+	}
+
+	return text;
+}
+
+
+/*
+ * Handle one ras:memory_failure_event trace record.
+ *
+ * Formats the event (timestamp, pfn, page type, action result) into the
+ * trace sequence s, then stores it in SQLite and/or reports it to ABRT when
+ * those features are compiled in.
+ *
+ * @s:       trace sequence receiving the human-readable text.
+ * @record:  raw trace record (supplies the event timestamp).
+ * @event:   event format used to extract the fields.
+ * @context: the struct ras_events of the daemon.
+ *
+ * Returns 0 on success, -1 when a required field cannot be read.
+ */
+int ras_memory_failure_event_handler(struct trace_seq *s,
+				     struct pevent_record *record,
+				     struct event_format *event, void *context)
+{
+	unsigned long long val;
+	struct ras_events *ras = context;
+	time_t now;
+	struct tm *tm;
+	struct ras_mf_event ev;
+
+	/*
+	 * Newer kernels (3.10-rc1 or later) provide an uptime clock.
+	 * On previous kernels, the way to properly generate an event would
+	 * be to inject a fake one, measure its timestamp and diff it against
+	 * gettimeofday. We won't do it here. Instead, let's use uptime,
+	 * falling back to the event report's time, if the "uptime" clock is
+	 * not available (legacy kernels).
+	 */
+
+	if (ras->use_uptime)
+		now = record->ts/user_hz + ras->uptime_diff;
+	else
+		now = time(NULL);
+
+	tm = localtime(&now);
+	if (tm)
+		strftime(ev.timestamp, sizeof(ev.timestamp),
+			 "%Y-%m-%d %H:%M:%S %z", tm);
+	else
+		/* Never leave the timestamp uninitialized: it is printed and stored below. */
+		snprintf(ev.timestamp, sizeof(ev.timestamp), "%s",
+			 "1970-01-01 00:00:00 +0000");
+	trace_seq_printf(s, "%s ", ev.timestamp);
+
+	if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0)
+		return -1;
+	snprintf(ev.pfn, sizeof(ev.pfn), "0x%llx", val);
+	trace_seq_printf(s, "pfn=0x%llx ", val);
+
+	if (pevent_get_field_val(s, event, "type", record, &val, 1) < 0)
+		return -1;
+	ev.page_type = get_page_type(val);
+	trace_seq_printf(s, "page_type=%s ", ev.page_type);
+
+	if (pevent_get_field_val(s, event, "result", record, &val, 1) < 0)
+		return -1;
+	ev.action_result = get_action_result(val);
+	trace_seq_printf(s, "action_result=%s ", ev.action_result);
+
+	/* Store data into the SQLite DB */
+#ifdef HAVE_SQLITE3
+	ras_store_mf_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+	/* Report event to ABRT */
+	ras_report_mf_event(ras, &ev);
+#endif
+
+	return 0;
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/ras-memory-failure-handler.h 2021-10-14 16:31:36.840657728 -0400
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+*/
+
+#ifndef __RAS_MEMORY_FAILURE_HANDLER_H
+#define __RAS_MEMORY_FAILURE_HANDLER_H
+
+#include "ras-events.h"
+#include "libtrace/event-parse.h"
+
+int ras_memory_failure_event_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context);
+
+#endif
--- a/ras-record.c 2018-04-25 06:19:03.000000000 -0400
+++ b/ras-record.c 2021-10-14 16:31:36.840657728 -0400
@@ -404,6 +404,55 @@ sqlite3_bind_text(priv->stmt_mce_record,
}
#endif
+/*
+ * Table and functions to handle ras:memory_failure
+ */
+
+#ifdef HAVE_MEMORY_FAILURE
+/*
+ * Column layout of the "memory_failure_event" table. ras_store_mf_event()
+ * binds statement parameters 1..4 to "timestamp" through "action_result" in
+ * this order, so keep both in sync.
+ */
+static const struct db_fields mf_event_fields[] = {
+ { .name="id", .type="INTEGER PRIMARY KEY" },
+ { .name="timestamp", .type="TEXT" },
+ { .name="pfn", .type="TEXT" },
+ { .name="page_type", .type="TEXT" },
+ { .name="action_result", .type="TEXT" },
+};
+
+/*
+ * Descriptor tying the "memory_failure_event" table name to its column list;
+ * used to create the table and prepare the insert statement.
+ */
+static const struct db_table_descriptor mf_event_tab = {
+ .name = "memory_failure_event",
+ .fields = mf_event_fields,
+ .num_fields = ARRAY_SIZE(mf_event_fields),
+};
+
+/*
+ * Store one memory_failure event as a row of the memory_failure_event table.
+ *
+ * @ras: daemon state; ras->db_priv holds the prepared statement.
+ * @ev:  decoded event to store.
+ *
+ * Returns 0 when no database or prepared statement is available. Otherwise
+ * returns the result of sqlite3_reset(), which repeats the error code of a
+ * failed sqlite3_step().
+ */
+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
+{
+	struct sqlite3_priv *priv = ras->db_priv;
+	sqlite3_stmt *stmt;
+	int step_rc;
+	int rc;
+
+	if (!priv || !priv->stmt_mf_event)
+		return 0;
+	stmt = priv->stmt_mf_event;
+	log(TERM, LOG_INFO, "memory_failure_event store: %p\n", stmt);
+
+	/* NULL destructor: the strings are not copied and must outlive the step. */
+	sqlite3_bind_text(stmt, 1, ev->timestamp, -1, NULL);
+	sqlite3_bind_text(stmt, 2, ev->pfn, -1, NULL);
+	sqlite3_bind_text(stmt, 3, ev->page_type, -1, NULL);
+	sqlite3_bind_text(stmt, 4, ev->action_result, -1, NULL);
+
+	step_rc = sqlite3_step(stmt);
+	if (step_rc != SQLITE_OK && step_rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed to do memory_failure_event step on sqlite: error = %d\n", step_rc);
+
+	rc = sqlite3_reset(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed reset memory_failure_event on sqlite: error = %d\n",
+		    rc);
+
+	/* Only announce the insert when the step really succeeded. */
+	if (step_rc == SQLITE_OK || step_rc == SQLITE_DONE)
+		log(TERM, LOG_INFO, "register inserted at db\n");
+
+	return rc;
+}
+#endif
/*
* Generic code
@@ -567,6 +616,13 @@ usleep(10000);
rc = ras_mc_prepare_stmt(priv, &priv->stmt_arm_record,
&arm_event_tab);
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc = ras_mc_create_table(priv, &mf_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mf_event,
+ &mf_event_tab);
+ }
+#endif
ras->db_priv = priv;
return 0;
--- a/ras-record.h 2018-04-25 06:19:03.000000000 -0400
+++ b/ras-record.h 2021-10-14 16:31:36.840657728 -0400
@@ -75,12 +75,20 @@ struct ras_arm_event {
int32_t psci_state;
};
+/* Decoded ras:memory_failure_event, as stored in the DB and reported to ABRT. */
+struct ras_mf_event {
+ char timestamp[64]; /* formatted with "%Y-%m-%d %H:%M:%S %z" */
+ char pfn[30]; /* page frame number formatted as "0x%llx" */
+ const char *page_type; /* static string returned by get_page_type() */
+ const char *action_result; /* static string returned by get_action_result() */
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
struct ras_non_standard_event;
struct ras_arm_event;
struct mce_event;
+struct ras_mf_event;
#ifdef HAVE_SQLITE3
@@ -104,6 +112,9 @@ struct sqlite3_priv {
#ifdef HAVE_ARM
sqlite3_stmt *stmt_arm_record;
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ sqlite3_stmt *stmt_mf_event;
+#endif
};
int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras);
@@ -113,6 +124,7 @@ int ras_store_mce_record(struct ras_even
int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev);
int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev);
int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev);
+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -122,6 +134,7 @@ static inline int ras_store_mce_record(s
static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; };
static inline int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
+static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
#endif
--- a/ras-report.c 2017-10-14 05:11:34.000000000 -0400
+++ b/ras-report.c 2021-10-14 16:31:36.840657728 -0400
@@ -255,6 +255,28 @@ "midr=0x%lx\n" \
return 0;
}
+/*
+ * Append the "BACKTRACE=" section describing a memory_failure event to buf.
+ *
+ * @buf: NUL-terminated destination; the caller in this file
+ *       (commit_report_backtrace) passes a MAX_BACKTRACE_SIZE buffer.
+ * @ev:  event to describe.
+ *
+ * Returns 0 on success, -1 when either argument is NULL. Output that does
+ * not fit is truncated instead of overflowing the buffers.
+ */
+static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+	size_t used;
+
+	if (!buf || !ev)
+		return -1;
+
+	snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+		 "timestamp=%s\n"
+		 "pfn=%s\n"
+		 "page_type=%s\n"
+		 "action_result=%s\n",
+		 ev->timestamp,
+		 ev->pfn,
+		 ev->page_type,
+		 ev->action_result);
+
+	/* Append only what still fits in the caller's MAX_BACKTRACE_SIZE buffer. */
+	used = strlen(buf);
+	if (used < sizeof(bt_buf) - 1)
+		strncat(buf, bt_buf, sizeof(bt_buf) - 1 - used);
+
+	return 0;
+}
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -283,6 +305,9 @@ memset(buf, 0, MAX_BACKTRACE_SIZE);
case ARM_EVENT:
rc = set_arm_event_backtrace(buf, (struct ras_arm_event *)ev);
break;
+ case MF_EVENT:
+ rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev);
+ break;
default:
return -1;
}
@@ -549,3 +574,46 @@ return 0;
return -1;
}
}
+
+/*
+ * Report a memory_failure event to ABRT over the report socket.
+ *
+ * Sends the basic report fields, the event backtrace, then the ANALYZER and
+ * REASON items (each including its terminating NUL).
+ *
+ * Returns 0 when everything was written, -1 on any failure.
+ */
+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	int sockfd = 0;
+	int done = 0;
+	int rc = -1;
+
+	memset(buf, 0, sizeof(buf));
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	rc = commit_report_basic(sockfd);
+	if (rc < 0)
+		goto mf_fail;
+
+	rc = commit_report_backtrace(sockfd, MF_EVENT, ev);
+	if (rc < 0)
+		goto mf_fail;
+
+	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-memory_failure");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	/*
+	 * Test for a negative result first: comparing a negative int against
+	 * the size_t from strlen() would convert it to a huge unsigned value
+	 * and hide the write error.
+	 */
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto mf_fail;
+
+	snprintf(buf, sizeof(buf), "REASON=%s", "memory failure problem");
+	rc = write(sockfd, buf, strlen(buf) + 1);
+	if (rc < 0 || (size_t)rc < strlen(buf) + 1)
+		goto mf_fail;
+
+	done = 1;
+
+mf_fail:
+	/* 0 is a valid descriptor, so close anything that is not an error value. */
+	if (sockfd >= 0)
+		close(sockfd);
+
+	if (done)
+		return 0;
+	else
+		return -1;
+}
--- a/ras-report.h 2017-10-14 05:11:34.000000000 -0400
+++ b/ras-report.h 2021-10-14 16:31:36.840657728 -0400
@@ -34,7 +34,8 @@ enum {
MCE_EVENT,
AER_EVENT,
NON_STANDARD_EVENT,
- ARM_EVENT
+ ARM_EVENT,
+ MF_EVENT,
};
#ifdef HAVE_ABRT_REPORT
@@ -44,6 +45,7 @@ int ras_report_aer_event(struct ras_even
int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev);
int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev);
int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev);
+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
#else
@@ -52,6 +54,7 @@ static inline int ras_report_aer_event(s
static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; };
static inline int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
+static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
#endif
--- a/Makefile.am 2018-04-25 06:21:56.000000000 -0400
+++ b/Makefile.am 2021-10-14 16:37:42.423639762 -0400
@@ -41,12 +41,16 @@ endif
if WITH_EXTLOG
rasdaemon_SOURCES += ras-extlog-handler.c
endif
+if WITH_MEMORY_FAILURE
+ rasdaemon_SOURCES += ras-memory-failure-handler.c
+endif
if WITH_ABRT_REPORT
rasdaemon_SOURCES += ras-report.c
endif
if WITH_HISI_NS_DECODE
rasdaemon_SOURCES += non-standard-hisi_hip07.c
endif
+
rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
--- a/ras-events.c 2021-10-14 16:31:36.730658636 -0400
+++ b/ras-events.c 2021-10-14 16:37:11.043898809 -0400
@@ -33,6 +33,7 @@ * Foundation, Inc., 51 Franklin Street,
#include "ras-arm-handler.h"
#include "ras-mce-handler.h"
#include "ras-extlog-handler.h"
+#include "ras-memory-failure-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
@@ -218,6 +219,10 @@ if (rc < 0) {
rc |= __toggle_ras_mc_event(ras, "ras", "arm_event", enable);
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable);
+#endif
+
free_ras:
free(ras);
return rc;
@@ -736,6 +741,16 @@ (void)open("/sys/kernel/debug/ras/daemon
"ras", "aer_event");
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc = add_event_handler(ras, pevent, page_size, "ras", "memory_failure_event",
+ ras_memory_failure_event_handler);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "ras", "memory_failure_event");
+#endif
+
if (!num_events) {
log(ALL, LOG_INFO,
"Failed to trace all supported RAS events. Aborting.\n");

View File

@ -1,28 +0,0 @@
commit 28ea956acc2dab7c18b4701f9657afb9ab3ddc79
Author: Muralidhara M K <muralimk@amd.com>
Date: Mon Jul 12 05:18:43 2021 -0500
rasdaemon: set SMCA maximum number of banks to 64
Newer AMD systems with SMCA banks support up to 64 MCA banks per CPU.
This patch is based on the commit below upstreamed into the kernel:
a0bc32b3cacf ("x86/mce: Increase maximum number of banks to 64")
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index e0cf512..3c346f4 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -75,6 +75,9 @@ enum smca_bank_types {
N_SMCA_BANK_TYPES
};
+/* Maximum number of MCA banks per CPU. */
+#define MAX_NR_BANKS 64
+
/* SMCA Extended error strings */
/* Load Store */
static const char * const smca_ls_mce_desc[] = {

View File

@ -1,66 +0,0 @@
commit 2a1d217660351c08eb2f8bccebf939abba2f7e69
Author: Brian Woods <brian.woods@amd.com>, Yazen Ghannam <Yazen.Ghannam@amd.com>
Date: Fri Nov 1 15:48:13 2019 +0100
rasdaemon: rename CPU_NAPLES cputype
Change CPU_NAPLES to CPU_AMD_SMCA to reflect that it isn't just NAPLES
that is supported, but AMD's Scalable Machine Check Architecture (SMCA).
[ Yazen: change family check to feature check, and change CPU name. ]
CC: "mchehab+samsung@kernel.org" <mchehab+samsung@kernel.org>, "Namburu, Chandu-babu" <chandu@amd.com> # Thread-Topic: [PATCH 1/2] rasdaemon: rename CPU_NAPLES cputype
Signed-off-by: Brian Woods <brian.woods@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Cc: Chandu-babu Namburu <chandu@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
ras-mce-handler.c | 10 ++++++----
ras-mce-handler.h | 2 +-
2 files changed, 7 insertions(+), 5 deletions(-)
--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-05-26 15:16:24.699096556 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-05-26 15:18:06.543162745 -0400
@@ -55,7 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_KNIGHTS_LANDING] = "Knights Landing",
[CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_SKYLAKE_XEON] = "Skylake server",
- [CPU_NAPLES] = "AMD Family 17h Zen1"
+ [CPU_AMD_SMCA] = "AMD Scalable MCA",
};
static enum cputype select_intel_cputype(struct ras_events *ras)
@@ -191,8 +191,10 @@ ret = 0;
if (!strcmp(mce->vendor, "AuthenticAMD")) {
if (mce->family == 15)
mce->cputype = CPU_K8;
- if (mce->family == 23)
- mce->cputype = CPU_NAPLES;
+ if (strstr(mce->processor_flags, "smca")) {
+ mce->cputype = CPU_AMD_SMCA;
+ goto ret;
+ }
if (mce->family > 23) {
log(ALL, LOG_INFO,
"Can't parse MCE for this AMD CPU yet %d\n",
@@ -435,7 +437,7 @@ if (pevent_get_field_val(s, event, "ipid
case CPU_K8:
rc = parse_amd_k8_event(ras, &e);
break;
- case CPU_NAPLES:
+ case CPU_AMD_SMCA:
rc = parse_amd_smca_event(ras, &e);
break;
default: /* All other CPU types are Intel */
--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-05-26 15:17:15.409631590 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-05-26 15:18:20.102038424 -0400
@@ -50,7 +50,7 @@ enum cputype {
CPU_KNIGHTS_LANDING,
CPU_KNIGHTS_MILL,
CPU_SKYLAKE_XEON,
- CPU_NAPLES,
+ CPU_AMD_SMCA,
};
struct mce_event {

View File

@ -1,372 +0,0 @@
commit 546cf713f667437fb6e283cc3dc090679eb47d08
Author: Subhendu Saha <subhends@akamai.com>
Date: Tue Jan 12 03:29:55 2021 -0500
Fix ras-mc-ctl script.
When rasdaemon is compiled without enabling aer, mce, devlink,
etc., those tables are not created in the database file. Then
ras-mc-ctl script breaks trying to query data from non-existent
tables.
Signed-off-by: Subhendu Saha <subhends@akamai.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
util/ras-mc-ctl.in | 310 ++++++++++++++++++++++++++++-------------------------
1 file changed, 168 insertions(+), 142 deletions(-)
--- a/util/ras-mc-ctl.in 2021-10-12 13:45:43.260646935 -0400
+++ b/util/ras-mc-ctl.in 2021-10-12 13:46:38.610158949 -0400
@@ -41,6 +41,16 @@ my $sysconfdir = "@sysconfdir@";
my $dmidecode = find_prog ("dmidecode");
my $modprobe = find_prog ("modprobe") or exit (1);
+my $has_aer = 0;
+my $has_arm = 0;
+my $has_extlog = 0;
+my $has_mce = 0;
+
+@WITH_AER_TRUE@$has_aer = 1;
+@WITH_ARM_TRUE@$has_arm = 1;
+@WITH_EXTLOG_TRUE@$has_extlog = 1;
+@WITH_MCE_TRUE@$has_mce = 1;
+
my %conf = ();
my %bus = ();
my %dimm_size = ();
@@ -1145,70 +1155,78 @@ sub summary
$query_handle->finish;
# PCIe AER aer_event errors
- $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($err_type, $msg, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$count $err_type errors: $msg\n";
- }
- if ($out ne "") {
- print "PCIe AER events summary:\n$out\n";
- } else {
- print "No PCIe AER errors.\n\n";
+ if ($has_aer == 1) {
+ $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($err_type, $msg, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count $err_type errors: $msg\n";
+ }
+ if ($out ne "") {
+ print "PCIe AER events summary:\n$out\n";
+ } else {
+ print "No PCIe AER errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# ARM processor arm_event errors
- $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($affinity, $mpidr, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$count errors\n";
- }
- if ($out ne "") {
- print "ARM processor events summary:\n$out\n";
- } else {
- print "No ARM processor errors.\n\n";
+ if ($has_arm == 1) {
+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($affinity, $mpidr, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count errors\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events summary:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# extlog errors
- $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($etype, $severity, $count));
- $out = "";
- while($query_handle->fetch()) {
- $etype_string = get_extlog_type($etype);
- $severity_string = get_extlog_severity($severity);
- $out .= "\t$count $etype_string $severity_string errors\n";
- }
- if ($out ne "") {
- print "Extlog records summary:\n$out";
- } else {
- print "No Extlog errors.\n";
+ if ($has_extlog == 1) {
+ $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($etype, $severity, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $etype_string = get_extlog_type($etype);
+ $severity_string = get_extlog_severity($severity);
+ $out .= "\t$count $etype_string $severity_string errors\n";
+ }
+ if ($out ne "") {
+ print "Extlog records summary:\n$out";
+ } else {
+ print "No Extlog errors.\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# MCE mce_record errors
- $query = "select error_msg, count(*) from mce_record group by error_msg";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($msg, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$count $msg errors\n";
- }
- if ($out ne "") {
- print "MCE records summary:\n$out";
- } else {
- print "No MCE errors.\n";
+ if ($has_mce == 1) {
+ $query = "select error_msg, count(*) from mce_record group by error_msg";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($msg, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count $msg errors\n";
+ }
+ if ($out ne "") {
+ print "MCE records summary:\n$out";
+ } else {
+ print "No MCE errors.\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
undef($dbh);
}
@@ -1244,105 +1262,113 @@ sub errors
$query_handle->finish;
# PCIe AER aer_event errors
- $query = "select id, timestamp, err_type, err_msg from aer_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $type, $msg));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $time $type error: $msg\n";
- }
- if ($out ne "") {
- print "PCIe AER events:\n$out\n";
- } else {
- print "No PCIe AER errors.\n\n";
+ if ($has_aer == 1) {
+ $query = "select id, timestamp, err_type, err_msg from aer_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $time, $type, $msg));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $time $type error: $msg\n";
+ }
+ if ($out ne "") {
+ print "PCIe AER events:\n$out\n";
+ } else {
+ print "No PCIe AER errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# ARM processor arm_event errors
- $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $timestamp error: ";
- $out .= "error_count=$error_count, " if ($error_count);
- $out .= "affinity_level=$affinity, ";
- $out .= sprintf "mpidr=0x%x, ", $mpidr;
- $out .= sprintf "running_state=0x%x, ", $r_state;
- $out .= sprintf "psci_state=0x%x", $psci_state;
- $out .= "\n";
- }
- if ($out ne "") {
- print "ARM processor events:\n$out\n";
- } else {
- print "No ARM processor errors.\n\n";
+ if ($has_arm == 1) {
+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "error_count=$error_count, " if ($error_count);
+ $out .= "affinity_level=$affinity, ";
+ $out .= sprintf "mpidr=0x%x, ", $mpidr;
+ $out .= sprintf "running_state=0x%x, ", $r_state;
+ $out .= sprintf "psci_state=0x%x", $psci_state;
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# Extlog errors
- $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
- $out = "";
- while($query_handle->fetch()) {
- $etype_string = get_extlog_type($etype);
- $severity_string = get_extlog_severity($severity);
- $out .= "$id $timestamp error: ";
- $out .= "type=$etype_string, ";
- $out .= "severity=$severity_string, ";
- $out .= sprintf "address=0x%08x, ", $addr;
- $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id);
- $out .= "fru_text='$fru_text', ";
- $out .= get_cper_data_text($cper_data) if ($cper_data);
- $out .= "\n";
- }
- if ($out ne "") {
- print "Extlog events:\n$out\n";
- } else {
- print "No Extlog errors.\n\n";
+ if ($has_extlog) {
+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
+ $out = "";
+ while($query_handle->fetch()) {
+ $etype_string = get_extlog_type($etype);
+ $severity_string = get_extlog_severity($severity);
+ $out .= "$id $timestamp error: ";
+ $out .= "type=$etype_string, ";
+ $out .= "severity=$severity_string, ";
+ $out .= sprintf "address=0x%08x, ", $addr;
+ $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id);
+ $out .= "fru_text='$fru_text', ";
+ $out .= get_cper_data_text($cper_data) if ($cper_data);
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "Extlog events:\n$out\n";
+ } else {
+ print "No Extlog errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# MCE mce_record errors
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $time error: $msg";
- $out .= ", CPU $cpuvendor" if ($cpuvendor);
- $out .= ", bank $bank_name" if ($bank_name);
- $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
- $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
- $out .= ", $mc_location" if ($mc_location);
- $out .= ", $user_action" if ($user_action);
- $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
- $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus);
- $out .= sprintf ", status=0x%08x", $status if ($status);
- $out .= sprintf ", addr=0x%08x", $addr if ($addr);
- $out .= sprintf ", misc=0x%08x", $misc if ($misc);
- $out .= sprintf ", ip=0x%08x", $ip if ($ip);
- $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
- $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
- $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
- $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
- $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
- $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
- $out .= sprintf ", cs=0x%08x", $cs if ($cs);
- $out .= sprintf ", bank=0x%08x", $bank if ($bank);
+ if ($has_mce == 1) {
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $time error: $msg";
+ $out .= ", CPU $cpuvendor" if ($cpuvendor);
+ $out .= ", bank $bank_name" if ($bank_name);
+ $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
+ $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
+ $out .= ", $mc_location" if ($mc_location);
+ $out .= ", $user_action" if ($user_action);
+ $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
+ $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus);
+ $out .= sprintf ", status=0x%08x", $status if ($status);
+ $out .= sprintf ", addr=0x%08x", $addr if ($addr);
+ $out .= sprintf ", misc=0x%08x", $misc if ($misc);
+ $out .= sprintf ", ip=0x%08x", $ip if ($ip);
+ $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
+ $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
+ $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
+ $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
+ $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
+ $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
+ $out .= sprintf ", cs=0x%08x", $cs if ($cs);
+ $out .= sprintf ", bank=0x%08x", $bank if ($bank);
- $out .= "\n";
- }
- if ($out ne "") {
- print "MCE events:\n$out\n";
- } else {
- print "No MCE errors.\n\n";
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "MCE events:\n$out\n";
+ } else {
+ print "No MCE errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
undef($dbh);
}

View File

@ -1,149 +0,0 @@
commit 60a91e4da4f2daf2b10143fc148a8043312b61e5
Author: Aristeu Rozanski <aris@redhat.com>
Date: Wed Aug 1 16:29:58 2018 -0400
rasdaemon: ras-mc-ctl: add option to show error counts
In some scenarios it might not be desirable to have a daemon running
to parse and store the errors provided by EDAC and only having the
number of CEs and UEs is enough. This patch implements this feature
as an ras-mc-ctl option.
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 38b7824..aee431a 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -50,6 +50,8 @@ my %dimm_location = ();
my %csrow_size = ();
my %rank_size = ();
my %csrow_ranks = ();
+my %dimm_ce_count = ();
+my %dimm_ue_count = ();
my @layers;
my @max_pos;
@@ -76,6 +78,7 @@ Usage: $prog [OPTIONS...]
--layout Display the memory layout.
--summary Presents a summary of the logged errors.
--errors Shows the errors stored at the error database.
+ --error-count Shows the corrected and uncorrected error counts using sysfs.
--help This help message.
EOF
@@ -83,7 +86,7 @@ parse_cmdline();
if ( $conf{opt}{mainboard} || $conf{opt}{print_labels}
|| $conf{opt}{register_labels} || $conf{opt}{display_memory_layout}
- || $conf{opt}{guess_dimm_label}) {
+ || $conf{opt}{guess_dimm_label} || $conf{opt}{error_count}) {
get_mainboard_info();
@@ -105,6 +108,9 @@ if ( $conf{opt}{mainboard} || $conf{opt}{print_labels}
if ($conf{opt}{guess_dimm_label}) {
guess_dimm_label ();
}
+ if ($conf{opt}{error_count}) {
+ display_error_count ();
+ }
}
if ($conf{opt}{status}) {
@@ -134,6 +140,7 @@ sub parse_cmdline
$conf{opt}{guess_dimm_label} = 0;
$conf{opt}{summary} = 0;
$conf{opt}{errors} = 0;
+ $conf{opt}{error_count} = 0;
my $rref = \$conf{opt}{report};
my $mref = \$conf{opt}{mainboard};
@@ -150,7 +157,8 @@ sub parse_cmdline
"status" => \$conf{opt}{status},
"layout" => \$conf{opt}{display_memory_layout},
"summary" => \$conf{opt}{summary},
- "errors" => \$conf{opt}{errors}
+ "errors" => \$conf{opt}{errors},
+ "error-count" => \$conf{opt}{error_count}
);
usage(1) if !$rc;
@@ -284,6 +292,30 @@ sub parse_dimm_nodes
$dimm_label_file{$str_loc} = $file;
$dimm_location{$str_loc} = $location;
+ my $count;
+
+ $file =~s/dimm_label/dimm_ce_count/;
+ if (-e $file) {
+ open IN, $file;
+ chomp($count = <IN>);
+ close IN;
+ } else {
+ log_error ("dimm_ce_count not found in sysfs. Old kernel?\n");
+ exit -1;
+ }
+ $dimm_ce_count{$str_loc} = $count;
+
+ $file =~s/dimm_ce_count/dimm_ue_count/;
+ if (-e $file) {
+ open IN, $file;
+ chomp($count = <IN>);
+ close IN;
+ } else {
+ log_error ("dimm_ue_count not found in sysfs. Old kernel?\n");
+ exit -1;
+ }
+ $dimm_ue_count{$str_loc} = $count;
+
return;
}
}
@@ -906,6 +938,45 @@ sub display_memory_layout
dimm_display_mem();
}
+sub display_error_count
+{
+ my $sysfs_dir = "/sys/devices/system/edac/mc";
+ my $key;
+ my $max_width = 0;
+ my %dimm_labels = ();
+
+ find ({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir);
+
+ if (!scalar(keys %dimm_node)) {
+ log_error ("No DIMMs found in /sys or new sysfs EDAC interface not found.\n");
+ exit -1;
+ }
+
+ foreach $key (keys %dimm_node) {
+ my $label_width;
+
+ open IN, $dimm_label_file{$key};
+ chomp(my $label = <IN>);
+ close IN;
+ $label_width = length $label;
+
+ if ($label_width > $max_width) {
+ $max_width = $label_width;
+ }
+ $dimm_labels{$key} = $label;
+ }
+ my $string = "Label";
+ $string .= " " x ($max_width - length $string);
+ print($string . "\tCE\tUE\n");
+
+ foreach $key (keys %dimm_node) {
+ my $ce_count = $dimm_ce_count{$key};
+ my $ue_count = $dimm_ue_count{$key};
+
+ print("$dimm_labels{$key}\t$ce_count\t$ue_count\n");
+ }
+}
+
sub find_prog
{
my ($file) = @_;

View File

@ -1,24 +0,0 @@
commit 7937f0d6c2aaaed096f3a3d306416743c0dcb7a4
Author: Muralidhara M K <muralimk@amd.com>
Date: Wed Jul 28 01:52:12 2021 -0500
rasdaemon: Support MCE for AMD CPU family 19h
Add support for family 19h x86 CPUs from AMD.
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
index 805004a..f2b53d4 100644
--- a/ras-mce-handler.c
+++ b/ras-mce-handler.c
@@ -208,7 +208,7 @@ static int detect_cpu(struct ras_events *ras)
mce->cputype = CPU_AMD_SMCA;
goto ret;
}
- if (mce->family > 23) {
+ if (mce->family > 25) {
log(ALL, LOG_INFO,
"Can't parse MCE for this AMD CPU yet %d\n",
mce->family);

View File

@ -1,38 +0,0 @@
commit 854364ba44aee9bc5646f6537fc744b0b54aff37
Author: Muralidhara M K <muralimk@amd.com>
Date: Thu Aug 20 21:00:57 2020 +0530
rasdaemon: Add 8 channel decoding for SMCA systems
Current Scalable Machine Check Architecture (SMCA) systems support up
to 8 UMC channels.
To find the UMC channel represented by a bank, look at the 6th nibble
in the MCA_IPID[InstanceId] field.
Signed-off-by: Muralidhara M K <muralimk@amd.com>
[ Adjust commit message. ]
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index d0b6cb6..7c619fd 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -438,15 +438,7 @@ static void amd_decode_errcode(struct mce_event *e)
*/
static int find_umc_channel(struct mce_event *e)
{
- uint32_t umc_instance_id[] = {0x50f00, 0x150f00};
- uint32_t instance_id = EXTRACT(e->ipid, 0, 31);
- int i, channel = -1;
-
- for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++)
- if (umc_instance_id[i] == instance_id)
- channel = i;
-
- return channel;
+ return EXTRACT(e->ipid, 0, 31) >> 20;
}
/* Decode extended errors according to Scalable MCA specification */
static void decode_smca_error(struct mce_event *e)

View File

@ -1,207 +0,0 @@
commit 8704a85d8dc3483423ec2934fee8132f85f8fdb6
Author: Brian Woods <brian.woods@amd.com>, Yazen Ghannam <Yazen.Ghannam@amd.com>
Date: Fri Nov 1 15:48:14 2019 +0100
rasdaemon: add support for new AMD SMCA bank types
Going forward, the Scalable Machine Check Architecture (SMCA) has some
updated and additional bank types which show up in Zen2. The differing
bank types include: CS_V2, PSP_V2, SMU_V2, MP5, NBIO, and PCIE. The V2
bank types replace the original bank types but have unique HWID/MCAtype
IDs from the originals so there's no conflicts between different
versions or other bank types. All of the differing bank types have new
MCE descriptions which have been added as well.
CC: "mchehab+samsung@kernel.org" <mchehab+samsung@kernel.org>, "Namburu, Chandu-babu" <chandu@amd.com> # Thread-Topic: [PATCH 2/2] rasdaemon: add support for new AMD SMCA bank types
Signed-off-by: Brian Woods <brian.woods@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Cc: Chandu-babu Namburu <chandu@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 6c3e8a5..114e786 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -49,11 +49,17 @@ enum smca_bank_types {
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
+ SMCA_CS_V2, /* Coherent Slave V2 */
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
+ SMCA_PSP_V2, /* Platform Security Processor V2 */
SMCA_SMU, /* System Management Unit */
+ SMCA_SMU_V2, /* System Management Unit V2 */
+ SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_NBIO, /* Northbridge IO Unit */
+ SMCA_PCIE, /* PCI Express Unit */
N_SMCA_BANK_TYPES
};
@@ -165,6 +171,23 @@ static const char * const smca_cs_mce_desc[] = {
"Atomic request parity",
"ECC error on probe filter access",
};
+/* Coherent Slave Unit V2 */
+static const char * const smca_cs2_mce_desc[] = {
+ "Illegal Request",
+ "Address Violation",
+ "Security Violation",
+ "Illegal Response",
+ "Unexpected Response",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic Request Parity Error",
+ "SDP read response had no match in the CS queue",
+ "Probe Filter Protocol Error",
+ "Probe Filter ECC Error",
+ "SDP read response had an unexpected RETRY error",
+ "Counter overflow error",
+ "Counter underflow error",
+};
/* Power, Interrupt, etc.. */
static const char * const smca_pie_mce_desc[] = {
"HW assert",
@@ -189,10 +212,75 @@ static const char * const smca_pb_mce_desc[] = {
static const char * const smca_psp_mce_desc[] = {
"PSP RAM ECC or parity error",
};
+/* Platform Security Processor V2 */
+static const char * const smca_psp2_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Instruction Cache Bank 0 ECC or parity error",
+ "Instruction Cache Bank 1 ECC or parity error",
+ "Instruction Tag Ram 0 parity error",
+ "Instruction Tag Ram 1 parity error",
+ "Data Cache Bank 0 ECC or parity error",
+ "Data Cache Bank 1 ECC or parity error",
+ "Data Cache Bank 2 ECC or parity error",
+ "Data Cache Bank 3 ECC or parity error",
+ "Data Tag Bank 0 parity error",
+ "Data Tag Bank 1 parity error",
+ "Data Tag Bank 2 parity error",
+ "Data Tag Bank 3 parity error",
+ "Dirty Data Ram parity error",
+ "TLB Bank 0 parity error",
+ "TLB Bank 1 parity error",
+ "System Hub Read Buffer ECC or parity error",
+};
/* System Management Unit */
static const char * const smca_smu_mce_desc[] = {
"SMU RAM ECC or parity error",
};
+/* System Management Unit V2 */
+static const char * const smca_smu2_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "System Hub Read Buffer ECC or parity error",
+};
+/* Microprocessor 5 Unit */
+static const char * const smca_mp5_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+};
+/* Northbridge IO Unit */
+static const char * const smca_nbio_mce_desc[] = {
+ "ECC or Parity error",
+ "PCIE error",
+ "SDP ErrEvent error",
+ "SDP Egress Poison Error",
+ "IOHC Internal Poison Error",
+};
+/* PCI Express Unit */
+static const char * const smca_pcie_mce_desc[] = {
+ "CCIX PER Message logging",
+ "CCIX Read Response with Status: Non-Data Error",
+ "CCIX Write Response with Status: Non-Data Error",
+ "CCIX Read Response with Status: Data Error",
+ "CCIX Non-okay write response with data error",
+};
+
struct smca_mce_desc {
const char * const *descs;
@@ -208,11 +296,17 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
+ [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
+ [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
+ [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)},
+ [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
+ [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)},
+ [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)},
};
struct smca_hwid {
@@ -235,6 +329,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Data Fabric MCA types */
{ SMCA_CS, 0x0000002E },
+ { SMCA_CS_V2, 0x0002002E },
{ SMCA_PIE, 0x0001002E },
/* Unified Memory Controller MCA type */
@@ -245,9 +340,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Platform Security Processor MCA type */
{ SMCA_PSP, 0x000000FF },
+ { SMCA_PSP_V2, 0x000100FF },
/* System Management Unit MCA type */
{ SMCA_SMU, 0x00000001 },
+ { SMCA_SMU_V2, 0x00010001 },
+
+ /* Microprocessor 5 Unit MCA type */
+ { SMCA_MP5, 0x00020001 },
+
+ /* Northbridge IO Unit MCA type */
+ { SMCA_NBIO, 0x00000018 },
+
+ /* PCI Express Unit MCA type */
+ { SMCA_PCIE, 0x00000046 },
};
struct smca_bank_name {
@@ -264,11 +370,17 @@ static struct smca_bank_name smca_names[] = {
[SMCA_FP] = { "Floating Point Unit" },
[SMCA_L3_CACHE] = { "L3 Cache" },
[SMCA_CS] = { "Coherent Slave" },
+ [SMCA_CS_V2] = { "Coherent Slave" },
[SMCA_PIE] = { "Power, Interrupts, etc." },
[SMCA_UMC] = { "Unified Memory Controller" },
[SMCA_PB] = { "Parameter Block" },
[SMCA_PSP] = { "Platform Security Processor" },
+ [SMCA_PSP_V2] = { "Platform Security Processor" },
[SMCA_SMU] = { "System Management Unit" },
+ [SMCA_SMU_V2] = { "System Management Unit" },
+ [SMCA_MP5] = { "Microprocessor 5 Unit" },
+ [SMCA_NBIO] = { "Northbridge IO Unit" },
+ [SMCA_PCIE] = { "PCI Express Unit" },
};
static void amd_decode_errcode(struct mce_event *e)

View File

@ -1,71 +0,0 @@
commit 899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d
Author: Aristeu Rozanski <arozansk@redhat.com>
Date: Thu Jan 19 08:45:57 2023 -0500
rasdaemon: ras-report: fix possible but unlikely file descriptor leak
Found with covscan.
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
ras-report.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
--- rasdaemon-0.6.1.orig/ras-report.c 2023-01-23 11:36:20.972368760 -0500
+++ rasdaemon-0.6.1/ras-report.c 2023-01-23 11:36:23.236343267 -0500
@@ -374,7 +374,7 @@ if(rc < 0){
mc_fail:
- if(sockfd > 0){
+ if(sockfd >= 0){
close(sockfd);
}
@@ -424,7 +424,7 @@ if(rc < 0){
aer_fail:
- if(sockfd > 0){
+ if(sockfd >= 0){
close(sockfd);
}
@@ -473,7 +473,7 @@ rc = 0;
non_standard_fail:
- if(sockfd > 0){
+ if(sockfd >= 0){
close(sockfd);
}
@@ -518,7 +518,7 @@ rc = 0;
arm_fail:
- if(sockfd > 0){
+ if(sockfd >= 0){
close(sockfd);
}
@@ -564,7 +564,7 @@ if(rc < 0){
mce_fail:
- if(sockfd > 0){
+ if(sockfd >= 0){
close(sockfd);
}
@@ -609,7 +609,7 @@ if (rc < 0)
done = 1;
mf_fail:
- if (sockfd > 0)
+ if (sockfd >= 0)
close(sockfd);
if (done)

View File

@ -1,230 +0,0 @@
commit 9acef39f13833f7d53ef96abc5a72e79384260f4
Author: Naveen Krishna Chatradhi <nchatrad@amd.com>
Date: Tue Jun 1 11:01:17 2021 +0530
rasdaemon: Add new SMCA bank types with error decoding
Upcoming systems with Scalable Machine Check Architecture (SMCA) have
new MCA banks added.
This patch adds the (HWID, MCATYPE) tuple, name and error decoding for
those new SMCA banks.
While at it, optimize the string names in smca_bank_name[].
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7c619fd..e0cf512 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -47,7 +47,7 @@
/* These may be used by multiple smca_hwid_mcatypes */
enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
- SMCA_LS_V2, /* Load Store */
+ SMCA_LS_V2,
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
@@ -56,17 +56,22 @@ enum smca_bank_types {
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
- SMCA_CS_V2, /* Coherent Slave V2 */
+ SMCA_CS_V2,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
+ SMCA_UMC_V2,
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
- SMCA_PSP_V2, /* Platform Security Processor V2 */
+ SMCA_PSP_V2,
SMCA_SMU, /* System Management Unit */
- SMCA_SMU_V2, /* System Management Unit V2 */
+ SMCA_SMU_V2,
SMCA_MP5, /* Microprocessor 5 Unit */
SMCA_NBIO, /* Northbridge IO Unit */
SMCA_PCIE, /* PCI Express Unit */
+ SMCA_PCIE_V2,
+ SMCA_XGMI_PCS, /* xGMI PCS Unit */
+ SMCA_XGMI_PHY, /* xGMI PHY Unit */
+ SMCA_WAFL_PHY, /* WAFL PHY Unit */
N_SMCA_BANK_TYPES
};
@@ -237,6 +242,22 @@ static const char * const smca_umc_mce_desc[] = {
"Command/address parity error",
"Write data CRC error",
};
+
+static const char * const smca_umc2_mce_desc[] = {
+ "DRAM ECC error",
+ "Data poison error",
+ "SDP parity error",
+ "Reserved",
+ "Address/Command parity error",
+ "Write data parity error",
+ "DCQ SRAM ECC error",
+ "Reserved",
+ "Read data parity error",
+ "Rdb SRAM ECC error",
+ "RdRsp SRAM ECC error",
+ "LM32 MP errors",
+};
+
/* Parameter Block */
static const char * const smca_pb_mce_desc[] = {
"Parameter Block RAM ECC error",
@@ -314,6 +335,55 @@ static const char * const smca_pcie_mce_desc[] = {
"CCIX Non-okay write response with data error",
};
+static const char * const smca_pcie2_mce_desc[] = {
+ "SDP Parity Error logging",
+};
+
+static const char * const smca_xgmipcs_mce_desc[] = {
+ "Data Loss Error",
+ "Training Error",
+ "Flow Control Acknowledge Error",
+ "Rx Fifo Underflow Error",
+ "Rx Fifo Overflow Error",
+ "CRC Error",
+ "BER Exceeded Error",
+ "Tx Vcid Data Error",
+ "Replay Buffer Parity Error",
+ "Data Parity Error",
+ "Replay Fifo Overflow Error",
+ "Replay Fifo Underflow Error",
+ "Elastic Fifo Overflow Error",
+ "Deskew Error",
+ "Flow Control CRC Error",
+ "Data Startup Limit Error",
+ "FC Init Timeout Error",
+ "Recovery Timeout Error",
+ "Ready Serial Timeout Error",
+ "Ready Serial Attempt Error",
+ "Recovery Attempt Error",
+ "Recovery Relock Attempt Error",
+ "Replay Attempt Error",
+ "Sync Header Error",
+ "Tx Replay Timeout Error",
+ "Rx Replay Timeout Error",
+ "LinkSub Tx Timeout Error",
+ "LinkSub Rx Timeout Error",
+ "Rx CMD Pocket Error",
+};
+
+static const char * const smca_xgmiphy_mce_desc[] = {
+ "RAM ECC Error",
+ "ARC instruction buffer parity error",
+ "ARC data buffer parity error",
+ "PHY APB error",
+};
+
+static const char * const smca_waflphy_mce_desc[] = {
+ "RAM ECC Error",
+ "ARC instruction buffer parity error",
+ "ARC data buffer parity error",
+ "PHY APB error",
+};
struct smca_mce_desc {
const char * const *descs;
@@ -333,6 +403,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
+ [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
[SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
@@ -341,6 +412,10 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
[SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)},
[SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)},
+ [SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) },
+ [SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) },
+ [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
+ [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) },
};
struct smca_hwid {
@@ -369,6 +444,8 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Unified Memory Controller MCA type */
{ SMCA_UMC, 0x00000096 },
+ /* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
+ { SMCA_UMC_V2, 0x00010096 },
/* Parameter Block MCA type */
{ SMCA_PB, 0x00000005 },
@@ -389,6 +466,16 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* PCI Express Unit MCA type */
{ SMCA_PCIE, 0x00000046 },
+ { SMCA_PCIE_V2, 0x00010046 },
+
+ /* Ext Global Memory Interconnect PCS MCA type */
+ { SMCA_XGMI_PCS, 0x00000050 },
+
+ /* Ext Global Memory Interconnect PHY MCA type */
+ { SMCA_XGMI_PHY, 0x00000259 },
+
+ /* WAFL PHY MCA type */
+ { SMCA_WAFL_PHY, 0x00000267 },
};
struct smca_bank_name {
@@ -396,27 +483,28 @@ struct smca_bank_name {
};
static struct smca_bank_name smca_names[] = {
- [SMCA_LS] = { "Load Store Unit" },
- [SMCA_LS_V2] = { "Load Store Unit" },
- [SMCA_IF] = { "Instruction Fetch Unit" },
- [SMCA_L2_CACHE] = { "L2 Cache" },
- [SMCA_DE] = { "Decode Unit" },
- [SMCA_RESERVED] = { "Reserved" },
- [SMCA_EX] = { "Execution Unit" },
- [SMCA_FP] = { "Floating Point Unit" },
- [SMCA_L3_CACHE] = { "L3 Cache" },
- [SMCA_CS] = { "Coherent Slave" },
- [SMCA_CS_V2] = { "Coherent Slave" },
- [SMCA_PIE] = { "Power, Interrupts, etc." },
- [SMCA_UMC] = { "Unified Memory Controller" },
- [SMCA_PB] = { "Parameter Block" },
- [SMCA_PSP] = { "Platform Security Processor" },
- [SMCA_PSP_V2] = { "Platform Security Processor" },
- [SMCA_SMU] = { "System Management Unit" },
- [SMCA_SMU_V2] = { "System Management Unit" },
- [SMCA_MP5] = { "Microprocessor 5 Unit" },
- [SMCA_NBIO] = { "Northbridge IO Unit" },
- [SMCA_PCIE] = { "PCI Express Unit" },
+ [SMCA_LS ... SMCA_LS_V2] = { "Load Store Unit" },
+ [SMCA_IF] = { "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "L2 Cache" },
+ [SMCA_DE] = { "Decode Unit" },
+ [SMCA_RESERVED] = { "Reserved" },
+ [SMCA_EX] = { "Execution Unit" },
+ [SMCA_FP] = { "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "L3 Cache" },
+ [SMCA_CS ... SMCA_CS_V2] = { "Coherent Slave" },
+ [SMCA_PIE] = { "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "Unified Memory Controller" },
+ [SMCA_UMC_V2] = { "Unified Memory Controller V2" },
+ [SMCA_PB] = { "Parameter Block" },
+ [SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
+ [SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
+ [SMCA_MP5] = { "Microprocessor 5 Unit" },
+ [SMCA_NBIO] = { "Northbridge IO Unit" },
+ [SMCA_PCIE ... SMCA_PCIE_V2] = { "PCI Express Unit" },
+ [SMCA_XGMI_PCS] = { "Ext Global Memory Interconnect PCS Unit" },
+ [SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
+ [SMCA_WAFL_PHY] = { "WAFL PHY Unit" },
+
};
static void amd_decode_errcode(struct mce_event *e)

View File

@ -1,670 +0,0 @@
commit a16ca0711001957ee98f2c124abce0fa1f801529
Author: Chandu-babu Namburu <chandu@amd.com>
Date: Wed Jan 30 20:36:45 2019 +0530
rasdaemon: add support for AMD Scalable MCA
Add logic here to decode errors from all known IP blocks for
AMD Scalable MCA supported processors
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Chandu-babu Namburu <chandu@amd.com>
---
mce-amd-smca.c | 371 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
mce-amd.c | 122 +++++++++++++++++
ras-mce-handler.c | 24 +++
ras-mce-handler.h | 15 ++
4 files changed, 530 insertions(+), 2 deletions(-)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/mce-amd-smca.c 2019-07-12 11:35:04.836470461 -0400
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2018, AMD, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "ras-mce-handler.h"
+#include "bitfield.h"
+
+/* MCA_STATUS REGISTER FOR FAMILY 17H
+ *********************** Higher 32-bits *****************************
+ * 63: VALIDERROR, 62: OVERFLOW, 61: UC, 60: Err ENABLE,
+ * 59: Misc Valid, 58: Addr Valid, 57: PCC, 56: ErrCoreID Valid,
+ * 55: TCC, 54: RES, 53: Syndrome Valid, 52: Transparent,
+ * 51: RES, 50: RES, 49: RES, 48: RES,
+ * 47: RES, 46: CECC, 45: UECC, 44: Deferred,
+ * 43: Poison, 42: RES, 41: RES, 40: RES,
+ * 39: RES, 38: RES, 37: ErrCoreID[5], 36: ErrCoreID[4],
+ * 35: ErrCoreID[3], 34: ErrCoreID[2] 33: ErrCoreID[1] 32: ErrCoreID[0]
+ *********************** Lower 32-bits ******************************
+ * 31: RES, 30: RES, 29: RES, 28: RES,
+ * 27: RES, 26: RES, 25: RES, 24: RES
+ * 23: RES, 22: RES, 21: XEC[5], 20: XEC[4],
+ * 19: XEC[3], 18: XEC[2], 17: XEC[1], 16: XEC[0]
+ * 15: EC[15], 14: EC[14], 13: EC[13], 12: EC[12],
+ * 11: EC[11], 10: EC[10], 09: EC[9], 08: EC[8],
+ * 07: EC[7], 06: EC[6], 05: EC[5], 04: EC[4],
+ * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0]
+ */
+
+/* These may be used by multiple smca_hwid_mcatypes */
+enum smca_bank_types {
+ SMCA_LS = 0, /* Load Store */
+ SMCA_IF, /* Instruction Fetch */
+ SMCA_L2_CACHE, /* L2 Cache */
+ SMCA_DE, /* Decoder Unit */
+ SMCA_RESERVED, /* Reserved */
+ SMCA_EX, /* Execution Unit */
+ SMCA_FP, /* Floating Point */
+ SMCA_L3_CACHE, /* L3 Cache */
+ SMCA_CS, /* Coherent Slave */
+ SMCA_PIE, /* Power, Interrupts, etc. */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_SMCA_BANK_TYPES
+};
+
+/* SMCA Extended error strings */
+/* Load Store */
+static const char * const smca_ls_mce_desc[] = {
+ "Load queue parity",
+ "Store queue parity",
+ "Miss address buffer payload parity",
+ "L1 TLB parity",
+ "Reserved",
+ "DC tag error type 6",
+ "DC tag error type 1",
+ "Internal error type 1",
+ "Internal error type 2",
+ "Sys Read data error thread 0",
+ "Sys read data error thread 1",
+ "DC tag error type 2",
+ "DC data error type 1 (poison consumption)",
+ "DC data error type 2",
+ "DC data error type 3",
+ "DC tag error type 4",
+ "L2 TLB parity",
+ "PDC parity error",
+ "DC tag error type 3",
+ "DC tag error type 5",
+ "L2 fill data error",
+};
+/* Instruction Fetch */
+static const char * const smca_if_mce_desc[] = {
+ "microtag probe port parity error",
+ "IC microtag or full tag multi-hit error",
+ "IC full tag parity",
+ "IC data array parity",
+ "Decoupling queue phys addr parity error",
+ "L0 ITLB parity error",
+ "L1 ITLB parity error",
+ "L2 ITLB parity error",
+ "BPQ snoop parity on Thread 0",
+ "BPQ snoop parity on Thread 1",
+ "L1 BTB multi-match error",
+ "L2 BTB multi-match error",
+ "L2 Cache Response Poison error",
+ "System Read Data error",
+};
+/* L2 Cache */
+static const char * const smca_l2_mce_desc[] = {
+ "L2M tag multi-way-hit error",
+ "L2M tag ECC error",
+ "L2M data ECC error",
+ "HW assert",
+};
+/* Decoder Unit */
+static const char * const smca_de_mce_desc[] = {
+ "uop cache tag parity error",
+ "uop cache data parity error",
+ "Insn buffer parity error",
+ "uop queue parity error",
+ "Insn dispatch queue parity error",
+ "Fetch address FIFO parity",
+ "Patch RAM data parity",
+ "Patch RAM sequencer parity",
+ "uop buffer parity"
+};
+/* Execution Unit */
+static const char * const smca_ex_mce_desc[] = {
+ "Watchdog timeout error",
+ "Phy register file parity",
+ "Flag register file parity",
+ "Immediate displacement register file parity",
+ "Address generator payload parity",
+ "EX payload parity",
+ "Checkpoint queue parity",
+ "Retire dispatch queue parity",
+ "Retire status queue parity error",
+ "Scheduling queue parity error",
+ "Branch buffer queue parity error",
+};
+/* Floating Point Unit */
+static const char * const smca_fp_mce_desc[] = {
+ "Physical register file parity",
+ "Freelist parity error",
+ "Schedule queue parity",
+ "NSQ parity error",
+ "Retire queue parity",
+ "Status register file parity",
+ "Hardware assertion",
+};
+/* L3 Cache */
+static const char * const smca_l3_mce_desc[] = {
+ "Shadow tag macro ECC error",
+ "Shadow tag macro multi-way-hit error",
+ "L3M tag ECC error",
+ "L3M tag multi-way-hit error",
+ "L3M data ECC error",
+ "XI parity, L3 fill done channel error",
+ "L3 victim queue parity",
+ "L3 HW assert",
+};
+/* Coherent Slave Unit */
+static const char * const smca_cs_mce_desc[] = {
+ "Illegal request from transport layer",
+ "Address violation",
+ "Security violation",
+ "Illegal response from transport layer",
+ "Unexpected response",
+ "Parity error on incoming request or probe response data",
+ "Parity error on incoming read response data",
+ "Atomic request parity",
+ "ECC error on probe filter access",
+};
+/* Power, Interrupt, etc.. */
+static const char * const smca_pie_mce_desc[] = {
+ "HW assert",
+ "Internal PIE register security violation",
+ "Error on GMI link",
+ "Poison data written to internal PIE register",
+};
+/* Unified Memory Controller */
+static const char * const smca_umc_mce_desc[] = {
+ "DRAM ECC error",
+ "Data poison error on DRAM",
+ "SDP parity error",
+ "Advanced peripheral bus error",
+ "Command/address parity error",
+ "Write data CRC error",
+};
+/* Parameter Block */
+static const char * const smca_pb_mce_desc[] = {
+ "Parameter Block RAM ECC error",
+};
+/* Platform Security Processor */
+static const char * const smca_psp_mce_desc[] = {
+ "PSP RAM ECC or parity error",
+};
+/* System Management Unit */
+static const char * const smca_smu_mce_desc[] = {
+ "SMU RAM ECC or parity error",
+};
+
+struct smca_mce_desc {
+ const char * const *descs;
+ unsigned int num_descs;
+};
+
+static struct smca_mce_desc smca_mce_descs[] = {
+ [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
+ [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
+ [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
+ [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
+ [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) },
+ [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
+ [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
+ [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
+ [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
+ [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
+ [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
+ [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
+ [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
+};
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/
+ uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/
+};
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
+ /* { bank_type, mcatype_hwid } */
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, 0x000000B0 },
+ { SMCA_IF, 0x000100B0 },
+ { SMCA_L2_CACHE, 0x000200B0 },
+ { SMCA_DE, 0x000300B0 },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, 0x000500B0 },
+ { SMCA_FP, 0x000600B0 },
+ { SMCA_L3_CACHE, 0x000700B0 },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, 0x0000002E },
+ { SMCA_PIE, 0x0001002E },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, 0x00000096 },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, 0x00000005 },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, 0x000000FF },
+
+ /* System Management Unit MCA type */
+ { SMCA_SMU, 0x00000001 },
+};
+
+struct smca_bank_name {
+ const char *name;
+};
+
+static struct smca_bank_name smca_names[] = {
+ [SMCA_LS] = { "Load Store Unit" },
+ [SMCA_IF] = { "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "L2 Cache" },
+ [SMCA_DE] = { "Decode Unit" },
+ [SMCA_RESERVED] = { "Reserved" },
+ [SMCA_EX] = { "Execution Unit" },
+ [SMCA_FP] = { "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "L3 Cache" },
+ [SMCA_CS] = { "Coherent Slave" },
+ [SMCA_PIE] = { "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "Unified Memory Controller" },
+ [SMCA_PB] = { "Parameter Block" },
+ [SMCA_PSP] = { "Platform Security Processor" },
+ [SMCA_SMU] = { "System Management Unit" },
+};
+
+static void amd_decode_errcode(struct mce_event *e)
+{
+ /* Run the common AMD error-code decoder first, then report the AMD-specific status bits. */
+ decode_amd_errcode(e);
+
+ if (e->status & MCI_STATUS_POISON)
+ mce_snprintf(e->mcistatus_msg, "Poison consumed");
+
+ if (e->status & MCI_STATUS_TCC)
+ mce_snprintf(e->mcistatus_msg, "Task_context_corrupt");
+
+}
+/*
+ * To find the UMC channel represented by this bank we need to match on its
+ * instance_id. The instance_id of a bank is held in the lower 32 bits of its
+ * IPID. Returns the matching index into umc_instance_id[], or -1 if none matches.
+ */
+static int find_umc_channel(struct mce_event *e)
+{
+ uint32_t umc_instance_id[] = {0x50f00, 0x150f00};
+ uint32_t instance_id = EXTRACT(e->ipid, 0, 31);
+ int i, channel = -1;
+
+ for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++)
+ if (umc_instance_id[i] == instance_id)
+ channel = i;
+
+ return channel;
+}
+/* Decode extended errors according to Scalable MCA specification into bank_name, mcastatus_msg and mc_location of @e */
+static void decode_smca_error(struct mce_event *e)
+{
+ enum smca_bank_types bank_type;
+ const char *ip_name;
+ unsigned short xec = (e->status >> 16) & 0x3f; /* extended error code: status bits 21:16 */
+ const struct smca_hwid *s_hwid;
+ uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63); /* compared against smca_hwid.mcatype_hwid below */
+ unsigned int csrow = -1, channel = -1;
+ unsigned int i;
+ /* Look the IPID value up in smca_hwid_mcatypes[] to learn the bank type. */
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ s_hwid = &smca_hwid_mcatypes[i];
+ if (mcatype_hwid == s_hwid->mcatype_hwid) {
+ bank_type = s_hwid->bank_type;
+ break;
+ }
+ }
+ /* i ran past the table: nothing matched and bank_type was never assigned. */
+ if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
+ strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
+ return;
+ }
+
+ if (bank_type >= N_SMCA_BANK_TYPES) {
+ strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
+ return;
+ }
+
+ if (bank_type == SMCA_RESERVED) {
+ strcpy(e->mcastatus_msg, "Bank 4 is reserved.\n");
+ return;
+ }
+
+ ip_name = smca_names[bank_type].name;
+
+ mce_snprintf(e->bank_name, "%s (bank=%d)", ip_name, e->bank);
+
+ /* Only print the descriptor of valid extended error code */
+ if (xec < smca_mce_descs[bank_type].num_descs)
+ mce_snprintf(e->mcastatus_msg,
+ " %s.\n", smca_mce_descs[bank_type].descs[xec]);
+ /* xec 0 is "DRAM ECC error" in smca_umc_mce_desc[]: also report channel and chip-select row. */
+ if (bank_type == SMCA_UMC && xec == 0) {
+ channel = find_umc_channel(e); /* NOTE(review): -1 on no match lands in an unsigned and is printed with %d */
+ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
+ channel, csrow);
+ }
+}
+
+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
+{
+ uint64_t mcgstatus = e->mcgstatus;
+ /* Entry point for SMCA decoding; always returns 0 and does not use @ras. */
+ mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld",
+ (long long)e->mcgstatus);
+ /* Name each MCG status flag that is set. */
+ if (mcgstatus & MCG_STATUS_RIPV)
+ mce_snprintf(e->mcgstatus_msg, "RIPV");
+ if (mcgstatus & MCG_STATUS_EIPV)
+ mce_snprintf(e->mcgstatus_msg, "EIPV");
+ if (mcgstatus & MCG_STATUS_MCIP)
+ mce_snprintf(e->mcgstatus_msg, "MCIP");
+ /* SMCA bank decoding runs first, then the AMD status/error-code decoding. */
+ decode_smca_error(e);
+ amd_decode_errcode(e);
+ return 0;
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/mce-amd.c 2019-07-12 11:35:04.836470461 -0400
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, The AMD, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "ras-mce-handler.h"
+
+/* Error Code Types */
+#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010)
+#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100)
+#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800)
+#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400)
+
+/* Error code: transaction type (TT) */
+static char *transaction[] = {
+ "instruction", "data", "generic", "reserved"
+};
+/* Error codes: cache level (LL) */
+static char *cachelevel[] = {
+ "reserved", "L1", "L2", "L3/generic"
+};
+/* Error codes: memory transaction type (RRRR) */
+static char *memtrans[] = {
+ "generic", "generic read", "generic write", "data read",
+ "data write", "instruction fetch", "prefetch", "evict", "snoop",
+ "?", "?", "?", "?", "?", "?", "?"
+};
+/* Participation Processor */
+static char *partproc[] = {
+ "local node origin", "local node response",
+ "local node observed", "generic participation"
+};
+/* Timeout */
+static char *timeout[] = {
+ "request didn't time out",
+ "request timed out"
+};
+/* internal unclassified error code */
+static char *internal[] = { "reserved",
+ "reserved",
+ "hardware assert",
+ "reserved" };
+
+#define TT(x) (((x) >> 2) & 0x3) /*bit 2, bit 3*/
+#define TT_MSG(x) transaction[TT(x)]
+#define LL(x) ((x) & 0x3) /*bit 0, bit 1*/
+#define LL_MSG(x) cachelevel[LL(x)]
+
+#define R4(x) (((x) >> 4) & 0xF) /*bit 4, bit 5, bit 6, bit 7 */
+#define R4_MSG(x) ((R4(x) < 9) ? memtrans[R4(x)] : "Wrong R4!")
+
+#define TO(x) (((x) >> 8) & 0x1) /*bit 8*/
+#define TO_MSG(x) timeout[TO(x)]
+#define PP(x) (((x) >> 9) & 0x3) /*bit 9, bit 10*/
+#define PP_MSG(x) partproc[PP(x)]
+
+#define UU(x) (((x) >> 8) & 0x3) /*bit 8, bit 9*/
+#define UU_MSG(x) internal[UU(x)]
+
+/* Fill error_msg, mcistatus_msg and mcastatus_msg of @e from its AMD MCA status and error code. */
+void decode_amd_errcode(struct mce_event *e)
+{
+	uint16_t ec = e->status & 0xffff;	/* error code: status bits 15:0 */
+	uint16_t ecc = (e->status >> 45) & 0x3;	/* status bits 46:45 */
+	/* Severity: store exactly one message; an unchained strcpy() would overwrite the earlier ones. */
+	if (e->status & MCI_STATUS_UC) {
+		if (e->status & MCI_STATUS_PCC)
+			strcpy(e->error_msg, "System Fatal error.");
+		else if (e->mcgstatus & MCG_STATUS_RIPV)
+			strcpy(e->error_msg,
+			       "Uncorrected, software restartable error.");
+		else
+			strcpy(e->error_msg,
+			       "Uncorrected, software containable error.");
+	} else if (e->status & MCI_STATUS_DEFERRED)
+		strcpy(e->error_msg, "Deferred error, no action required.");
+	else
+		strcpy(e->error_msg, "Corrected error, no action required.");
+
+	if (!(e->status & MCI_STATUS_VAL))
+		mce_snprintf(e->mcistatus_msg, "MCE_INVALID");
+
+	if (e->status & MCI_STATUS_OVER)
+		mce_snprintf(e->mcistatus_msg, "Error_overflow");
+
+	if (e->status & MCI_STATUS_PCC)
+		mce_snprintf(e->mcistatus_msg, "Processor_context_corrupt");
+
+	if (ecc)	/* value 2 means only bit 46 is set: reported as "CECC", anything else as "UECC" */
+		mce_snprintf(e->mcistatus_msg,
+			     "%sECC", ((ecc == 2) ? "C" : "U"));
+
+	if (INT_ERROR(ec)) {
+		mce_snprintf(e->mcastatus_msg, "Internal '%s'", UU_MSG(ec));
+		return;
+	}
+	/* Otherwise report at most one of the TLB / memory / bus error-code classes. */
+	if (TLB_ERROR(ec))
+		mce_snprintf(e->mcastatus_msg,
+			     "TLB Error 'tx: %s, level: %s'",
+			     TT_MSG(ec), LL_MSG(ec));
+	else if (MEM_ERROR(ec))
+		mce_snprintf(e->mcastatus_msg,
+			     "Memory Error 'mem-tx: %s, tx: %s, level: %s'",
+			     R4_MSG(ec), TT_MSG(ec), LL_MSG(ec));
+	else if (BUS_ERROR(ec))
+		mce_snprintf(e->mcastatus_msg,
+			     "Bus Error '%s, %s, mem-tx: %s, level: %s'",
+			     PP_MSG(ec), TO_MSG(ec),
+			     R4_MSG(ec), LL_MSG(ec));
+}
--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2019-07-12 11:35:01.585502811 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.c 2019-07-12 11:35:04.836470461 -0400
@@ -55,6 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_KNIGHTS_LANDING] = "Knights Landing",
[CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_SKYLAKE_XEON] = "Skylake server",
+ [CPU_NAPLES] = "AMD Family 17h Zen1"
};
static enum cputype select_intel_cputype(struct ras_events *ras)
@@ -190,9 +191,12 @@ ret = 0;
if (!strcmp(mce->vendor, "AuthenticAMD")) {
if (mce->family == 15)
mce->cputype = CPU_K8;
- if (mce->family > 15) {
+ if (mce->family == 23)
+ mce->cputype = CPU_NAPLES;
+ if (mce->family > 23) {
log(ALL, LOG_INFO,
- "Can't parse MCE for this AMD CPU yet\n");
+ "Can't parse MCE for this AMD CPU yet %d\n",
+ mce->family);
ret = EINVAL;
}
goto ret;
@@ -331,6 +335,12 @@ #if 0
if (e->status & MCI_STATUS_ADDRV)
trace_seq_printf(s, ", addr= %llx", (long long)e->addr);
+ if (e->status & MCI_STATUS_SYNDV)
+ trace_seq_printf(s, ", synd= %llx", (long long)e->synd);
+
+ if (e->ipid)
+ trace_seq_printf(s, ", ipid= %llx", (long long)e->ipid);
+
if (e->mcgstatus_msg)
trace_seq_printf(s, ", %s", e->mcgstatus_msg);
else
@@ -411,6 +421,13 @@ if (pevent_get_field_val(s, event, "bank
if (pevent_get_field_val(s, event, "cpuvendor", record, &val, 1) < 0)
return -1;
e.cpuvendor = val;
+ /* Get New entries */
+ if (pevent_get_field_val(s, event, "synd", record, &val, 1) < 0)
+ return -1;
+ e.synd = val;
+ if (pevent_get_field_val(s, event, "ipid", record, &val, 1) < 0)
+ return -1;
+ e.ipid = val;
switch (mce->cputype) {
case CPU_GENERIC:
@@ -418,6 +435,9 @@ if (pevent_get_field_val(s, event, "cpuv
case CPU_K8:
rc = parse_amd_k8_event(ras, &e);
break;
+ case CPU_NAPLES:
+ rc = parse_amd_smca_event(ras, &e);
+ break;
default: /* All other CPU types are Intel */
rc = parse_intel_event(ras, &e);
}
--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2019-07-12 11:35:01.585502811 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.h 2019-07-12 11:35:04.836470461 -0400
@@ -50,6 +50,7 @@ enum cputype {
CPU_KNIGHTS_LANDING,
CPU_KNIGHTS_MILL,
CPU_SKYLAKE_XEON,
+ CPU_NAPLES,
};
struct mce_event {
@@ -69,6 +70,8 @@ struct mce_event {
uint8_t cs;
uint8_t bank;
uint8_t cpuvendor;
+ uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */
+ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
/* Parsed data */
char timestamp[64];
@@ -129,6 +132,9 @@ void broadwell_de_decode_model(struct ra
void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e);
void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e);
+/* AMD error code decode function */
+void decode_amd_errcode(struct mce_event *e);
+
/* Software defined banks */
#define MCE_EXTENDED_BANK 128
@@ -144,6 +150,13 @@ #define MCI_STATUS_EN (1ULL<<60) /*
#define MCI_STATUS_S (1ULL<<56) /* signalled */
#define MCI_STATUS_AR (1ULL<<55) /* action-required */
+/* AMD-specific bits */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */
+/* uncorrected error,deferred exception */
+#define MCI_STATUS_DEFERRED (1ULL<<44)
+#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
+
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
@@ -154,4 +167,6 @@ int parse_intel_event(struct ras_events
int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e);
+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e);
+
#endif
--- rasdaemon-0.6.1.orig/Makefile.in 2018-04-25 06:29:05.000000000 -0400
+++ rasdaemon-0.6.1/Makefile.in 2019-07-15 14:41:22.308278851 -0400
@@ -100,7 +100,7 @@ sbin_PROGRAMS = rasdaemon$(EXEEXT)
@WITH_MCE_TRUE@ mce-intel-dunnington.c mce-intel-tulsa.c \
@WITH_MCE_TRUE@ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
@WITH_MCE_TRUE@ mce-intel-knl.c mce-intel-broadwell-de.c \
-@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c
+@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c mce-amd.c mce-amd-smca.c
@WITH_EXTLOG_TRUE@am__append_6 = ras-extlog-handler.c
@WITH_ABRT_REPORT_TRUE@am__append_7 = ras-report.c
@@ -132,7 +132,7 @@ am__rasdaemon_SOURCES_DIST = rasdaemon.c
mce-intel-ivb.c mce-intel-haswell.c mce-intel-knl.c \
mce-intel-broadwell-de.c mce-intel-broadwell-epex.c \
mce-intel-skylake-xeon.c ras-extlog-handler.c ras-report.c \
- non-standard-hisi_hip07.c
+ non-standard-hisi_hip07.c mce-amd-smca.c mce-amd.c
@WITH_SQLITE3_TRUE@am__objects_1 = ras-record.$(OBJEXT)
@WITH_AER_TRUE@am__objects_2 = ras-aer-handler.$(OBJEXT)
@WITH_NON_STANDARD_TRUE@am__objects_3 = \
@@ -149,7 +149,9 @@ non-standard-hisi_hip07.c
@WITH_MCE_TRUE@ mce-intel-knl.$(OBJEXT) \
@WITH_MCE_TRUE@ mce-intel-broadwell-de.$(OBJEXT) \
@WITH_MCE_TRUE@ mce-intel-broadwell-epex.$(OBJEXT) \
-@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT)
+@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) \
+@WITH_MCE_TRUE@ mce-amd-smca.$(OBJEXT) \
+@WITH_MCE_TRUE@ mce-amd.$(OBJEXT)
@WITH_EXTLOG_TRUE@am__objects_6 = ras-extlog-handler.$(OBJEXT)
@WITH_ABRT_REPORT_TRUE@am__objects_7 = ras-report.$(OBJEXT)
@WITH_HISI_NS_DECODE_TRUE@am__objects_8 = \
@@ -595,6 +597,8 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bitfield.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-k8.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-smca.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-de.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-epex.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-dunnington.Po@am__quote@

View File

@ -1,138 +0,0 @@
commit a8c776ed94f68ae31d7b5f74e19545698898c13c
Author: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Tue Aug 14 13:06:27 2018 -0300
mce-intel-*: fix a warning when using FIELD(<num>, NULL)
Internally, FIELD() macro checks the size of an array, by
using ARRAY_SIZE. Well, this macro causes a division by zero
if NULL is used, as its type is void, as warned:
mce-intel-dunnington.c:30:2: note: in expansion of macro FIELD
FIELD(17, NULL),
^~~~~
ras-mce-handler.h:28:33: warning: division sizeof (void *) / sizeof (void) does not compute the number of array elements [-Wsizeof-pointer-div]
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
^
bitfield.h:37:51: note: in expansion of macro ARRAY_SIZE
#define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) }
^~~~~~~~~~
While this warning is harmless, it may prevent seeing more serious
warnings. So, add a FIELD_NULL(<num>) macro to avoid that.
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/bitfield.h b/bitfield.h
index c7dfeb1..fccbb36 100644
--- a/bitfield.h
+++ b/bitfield.h
@@ -35,6 +35,7 @@ struct numfield {
};
#define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) }
+#define FIELD_NULL(start_bit) { start_bit, NULL, 0 }
#define SBITFIELD(start_bit, string) { start_bit, ((char * [2]) { NULL, string }), 2 }
#define NUMBER(start, end, name) { start, end, name, "%Lu", 0 }
diff --git a/mce-intel-dunnington.c b/mce-intel-dunnington.c
index 4b1c7e3..c695c62 100644
--- a/mce-intel-dunnington.c
+++ b/mce-intel-dunnington.c
@@ -27,14 +27,14 @@
static struct field dunnington_bus_status[] = {
SBITFIELD(16, "Parity error detected during FSB request phase"),
- FIELD(17, NULL),
+ FIELD_NULL(17),
SBITFIELD(20, "Hard Failure response received for a local transaction"),
SBITFIELD(21, "Parity error on FSB response field detected"),
SBITFIELD(22, "Parity data error on inbound data detected"),
- FIELD(23, NULL),
- FIELD(25, NULL),
- FIELD(28, NULL),
- FIELD(31, NULL),
+ FIELD_NULL(23),
+ FIELD_NULL(25),
+ FIELD_NULL(28),
+ FIELD_NULL(31),
{}
};
diff --git a/mce-intel-p4-p6.c b/mce-intel-p4-p6.c
index 4615e1a..5c6c3ff 100644
--- a/mce-intel-p4-p6.c
+++ b/mce-intel-p4-p6.c
@@ -60,7 +60,7 @@ static char *bus_queue_error_type[] = {
};
static struct field p6_shared_status[] = {
- FIELD(16, NULL),
+ FIELD_NULL(16),
FIELD(19, bus_queue_req_type),
FIELD(25, bus_queue_error_type),
FIELD(25, bus_queue_error_type),
@@ -68,7 +68,7 @@ static struct field p6_shared_status[] = {
SBITFIELD(36, "received parity error on response transaction"),
SBITFIELD(38, "timeout BINIT (ROB timeout)."
" No micro-instruction retired for some time"),
- FIELD(39, NULL),
+ FIELD_NULL(39),
SBITFIELD(42, "bus transaction received hard error response"),
SBITFIELD(43, "failure that caused IERR"),
/* The following are reserved for Core in the SDM. Let's keep them here anyways*/
@@ -76,15 +76,15 @@ static struct field p6_shared_status[] = {
SBITFIELD(45, "uncorrectable ECC error"),
SBITFIELD(46, "correctable ECC error"),
/* [47..54]: ECC syndrome */
- FIELD(55, NULL),
+ FIELD_NULL(55),
{},
};
static struct field p6old_status[] = {
SBITFIELD(28, "FRC error"),
SBITFIELD(29, "BERR on this CPU"),
- FIELD(31, NULL),
- FIELD(32, NULL),
+ FIELD_NULL(31),
+ FIELD_NULL(32),
SBITFIELD(35, "BINIT received from external bus"),
SBITFIELD(37, "Received hard error reponse on split transaction (Bus BINIT)"),
{}
@@ -94,9 +94,9 @@ static struct field core2_status[] = {
SBITFIELD(28, "MCE driven"),
SBITFIELD(29, "MCE is observed"),
SBITFIELD(31, "BINIT observed"),
- FIELD(32, NULL),
+ FIELD_NULL(32),
SBITFIELD(34, "PIC or FSB data parity error"),
- FIELD(35, NULL),
+ FIELD_NULL(35),
SBITFIELD(37, "FSB address parity error detected"),
{}
};
diff --git a/mce-intel-tulsa.c b/mce-intel-tulsa.c
index 6cea421..e59bf06 100644
--- a/mce-intel-tulsa.c
+++ b/mce-intel-tulsa.c
@@ -39,7 +39,7 @@ static struct field tls_bus_status[] = {
SBITFIELD(16, "Parity error detected during FSB request phase"),
SBITFIELD(17, "Partity error detected on Core 0 request's address field"),
SBITFIELD(18, "Partity error detected on Core 1 request's address field"),
- FIELD(19, NULL),
+ FIELD_NULL(19),
SBITFIELD(20, "Parity error on FSB response field detected"),
SBITFIELD(21, "FSB data parity error on inbound date detected"),
SBITFIELD(22, "Data parity error on data received from Core 0 detected"),
@@ -48,8 +48,8 @@ static struct field tls_bus_status[] = {
SBITFIELD(25, "Data ECC event to error on inbound data correctable or uncorrectable"),
SBITFIELD(26, "Pad logic detected a data strobe glitch or sequencing error"),
SBITFIELD(27, "Pad logic detected a request strobe glitch or sequencing error"),
- FIELD(28, NULL),
- FIELD(31, NULL),
+ FIELD_NULL(28),
+ FIELD_NULL(31),
{}
};

View File

@ -1,159 +0,0 @@
---
labels/dell | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 152 insertions(+)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/labels/dell 2020-02-20 11:53:39.574579258 -0500
@@ -0,0 +1,152 @@
+# RASDAEMON Motherboard DIMM labels Database file.
+#
+# Vendor-name and model-name are found from the program 'dmidecode'
+# labels are found from the silk screen on the motherboard.
+#
+#Vendor: <vendor-name>
+# Product: <product-name>
+# Model: <model-name>
+# <label>: <mc>.<top>.<mid>.<low>
+#
+
+Vendor: Dell Inc.
+# 1-socket
+ Product: PowerEdge R220, PowerEdge R330, PowerEdge T330, PowerEdge R230, PowerEdge T130, PowerEdge T30
+ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1;
+ DIMM_A3: 0.1.0; DIMM_A4: 0.1.1;
+
+ Product: PowerEdge T110 II, PowerEdge T20
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0;
+
+ DIMM_B1: 0.0.1; DIMM_B2: 0.1.1;
+
+ Product: PowerEdge R320, PowerEdge T320
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
+ DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1;
+
+# 2-socket
+ Product: PowerEdge R610
+ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2;
+ DIMM_A4: 0.1.0; DIMM_A5: 0.1.1; DIMM_A6: 0.1.2;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2;
+ DIMM_B4: 1.1.0; DIMM_B5: 1.1.1; DIMM_B6: 1.1.2;
+
+ Product: PowerEdge T710, PowerEdge R710
+ DIMM_A3: 0.0.0; DIMM_A2: 0.1.0; DIMM_A1: 0.2.0;
+ DIMM_A6: 0.0.1; DIMM_A5: 0.1.1; DIMM_A4: 0.2.1;
+ DIMM_A9: 0.0.2; DIMM_A8: 0.1.2; DIMM_A7: 0.2.2;
+
+ DIMM_B3: 1.0.0; DIMM_B2: 1.1.0; DIMM_B1: 1.2.0;
+ DIMM_B6: 1.0.1; DIMM_B5: 1.1.1; DIMM_B4: 1.2.1;
+ DIMM_B9: 1.0.2; DIMM_B8: 1.1.2; DIMM_B7: 1.2.2;
+
+ Product: PowerEdge R620, PowerEdge T620, PowerEdge R720xd, PowerEdge R730xd, PowerEdge T630, PowerEdge R730, PowerEdge R630, PowerEdge T620, PowerEdge M620, PowerEdge FC620, PowerEdge M630, PowerEdge FC630
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+ DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
+ DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2;
+
+ Product: PowerEdge R640, PowerEdge R740, PowerEdge R740xd, PowerEdge T640
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 0.1.1; A9: 0.2.1; A10: 1.0.1; A11: 1.1.1; A12: 1.2.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 2.1.1; B9: 2.2.1; B10: 3.0.1; B11: 3.1.1; B12: 3.2.1;
+
+ Product: PowerEdge M520, PowerEdge R420, PowerEdge T420
+ DIMM_A1: 0.1.0; DIMM_A2: 0.2.0; DIMM_A3: 0.3.0;
+ DIMM_A4: 0.1.1; DIMM_A5: 0.2.1; DIMM_A6: 0.3.1;
+
+ DIMM_B1: 1.1.0; DIMM_B2: 1.2.0; DIMM_B3: 1.3.0;
+ DIMM_B4: 1.1.1; DIMM_B5: 1.2.1; DIMM_B6: 1.3.1;
+
+ Product: PowerEdge FC420, PowerEdge M420
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0;
+
+ Product: PowerEdge C6320, PowerEdge C4130
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
+
+ Product: PowerEdge C6320p
+ A1: 0.0.0; B1: 0.1.0; C1: 0.2.0;
+ D1: 1.0.0; E1: 1.1.0; F1: 1.2.0;
+
+ Product: PowerEdge C6420
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 1.0.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 3.0.1;
+
+ Product: PowerEdge R430, PowerEdge T430, PowerEdge R530
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+
+ Product: PowerEdge FC430
+ DIMM_A1: 0.1.0; DIMM_A2: 0.0.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+
+ DIMM_B1: 1.1.0; DIMM_B2: 1.0.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+
+# 4-socket
+ Product: PowerEdge M820, PowerEdge R830, PowerEdge M830, PowerEdge R930, PowerEdge FC830
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+ DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
+ DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2;
+
+ DIMM_C1: 2.0.0; DIMM_C2: 2.1.0; DIMM_C3: 2.2.0; DIMM_C4: 2.3.0;
+ DIMM_C5: 2.0.1; DIMM_C6: 2.1.1; DIMM_C7: 2.2.1; DIMM_C8: 2.3.1;
+ DIMM_C9: 2.0.2; DIMM_C10: 2.1.2; DIMM_C11: 2.2.2; DIMM_C12: 2.3.2;
+
+ DIMM_D1: 3.0.0; DIMM_D2: 3.1.0; DIMM_D3: 3.2.0; DIMM_D4: 3.3.0;
+ DIMM_D5: 3.0.1; DIMM_D6: 3.1.1; DIMM_D7: 3.2.1; DIMM_D8: 3.3.1;
+ DIMM_D9: 3.0.2; DIMM_D10: 3.1.2; DIMM_D11: 3.2.2; DIMM_D12: 3.3.2;
+
+ Product: PowerEdge FM120x4
+ DIMM_A_A1: 0.1.0; DIMM_A_A2: 0.2.0;
+
+ DIMM_B_A1: 1.1.0; DIMM_B_A2: 1.2.0;
+
+ DIMM_C_A1: 2.1.0; DIMM_C_A2: 2.2.0;
+
+ DIMM_D_A1: 3.1.0; DIMM_D_A2: 3.2.0;
+
+ Product: PowerEdge R940
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 0.1.1; A9: 0.2.1; A10: 1.0.1; A11: 1.1.1; A12: 1.2.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 2.1.1; B9: 2.2.1; B10: 3.0.1; B11: 3.1.1; B12: 3.2.1;
+
+ C1: 4.0.0; C2: 4.1.0; C3: 4.2.0; C4: 5.0.0; C5: 5.1.0; C6: 5.2.0;
+ C7: 4.0.1; C8: 4.1.1; C9: 4.2.1; C10: 5.0.1; C11: 5.1.1; C12: 5.2.1;
+
+ D1: 6.0.0; D2: 6.1.0; D3: 6.2.0; D4: 7.0.0; D5: 7.1.0; D6: 7.2.0;
+ D7: 6.0.1; D8: 6.1.1; D9: 6.2.1; D10: 7.0.1; D11: 7.1.1; D12: 7.2.1;
+
+ Product: PowerEdge R440, PowerEdge R540
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 0.1.1; A9: 1.0.1; A10: 1.1.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+
+ Product: PowerEdge M640, PowerEdge FC640
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 1.0.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 3.0.1;

View File

@ -1,107 +0,0 @@
commit aecf33aa70331670c06db6b652712b476e24051c
Author: Muralidhara M K <muralimk@amd.com>
Date: Mon Jul 12 05:40:46 2021 -0500
rasdaemon: Enumerate memory on noncpu nodes
Newer heterogeneous systems from AMD have GPU nodes (with HBM2 memory
banks) connected via xGMI links to the CPUs.
The node id information is available in the InstanceHI[47:44] of
the IPID register.
The UMC PHYs on Aldebaran nodes are enumerated as csrows.
The UMC channels connected to HBMs are enumerated as ranks.
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 3c346f4..f3379fc 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -78,6 +78,12 @@ enum smca_bank_types {
/* Maximum number of MCA banks per CPU. */
#define MAX_NR_BANKS 64
+/*
+ * On Newer heterogeneous systems from AMD with CPU and GPU nodes connected
+ * via xGMI links, the NON CPU Nodes are enumerated from index 8
+ */
+#define NONCPU_NODE_INDEX 8
+
/* SMCA Extended error strings */
/* Load Store */
static const char * const smca_ls_mce_desc[] = {
@@ -531,6 +537,26 @@ static int find_umc_channel(struct mce_event *e)
{
return EXTRACT(e->ipid, 0, 31) >> 20;
}
+
+/*
+ * The HBM memory managed by the UMCCH of the noncpu node
+ * can be calculated based on the [15:12]bits of IPID
+ */
+static int find_hbm_channel(struct mce_event *e)
+{
+ int umc, tmp;
+
+ umc = EXTRACT(e->ipid, 0, 31) >> 20;
+
+ /*
+ * The HBM channel managed by the UMC of the noncpu node
+ * can be calculated based on the [15:12]bits of IPID as follows
+ */
+ tmp = ((e->ipid >> 12) & 0xf);
+
+ return (umc % 2) ? tmp + 4 : tmp;
+}
+
/* Decode extended errors according to Scalable MCA specification */
static void decode_smca_error(struct mce_event *e)
{
@@ -539,6 +565,7 @@ static void decode_smca_error(struct mce_event *e)
unsigned short xec = (e->status >> 16) & 0x3f;
const struct smca_hwid *s_hwid;
uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
+ uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
unsigned int csrow = -1, channel = -1;
unsigned int i;
@@ -548,14 +575,16 @@ static void decode_smca_error(struct mce_event *e)
bank_type = s_hwid->bank_type;
break;
}
+ if (mcatype_instancehi >= NONCPU_NODE_INDEX)
+ bank_type = SMCA_UMC_V2;
}
- if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
+ if (i >= MAX_NR_BANKS) {
strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
return;
}
- if (bank_type >= N_SMCA_BANK_TYPES) {
+ if (bank_type >= MAX_NR_BANKS) {
strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
return;
}
@@ -580,6 +609,16 @@ static void decode_smca_error(struct mce_event *e)
mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
channel, csrow);
}
+
+ if (bank_type == SMCA_UMC_V2 && xec == 0) {
+ /* The UMCPHY is reported as csrow in case of noncpu nodes */
+ csrow = find_umc_channel(e) / 2;
+ /* UMCCH is managing the HBM memory */
+ channel = find_hbm_channel(e);
+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
+ channel, csrow);
+ }
+
}
int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)

View File

@ -1,37 +0,0 @@
commit b22be68453b2497e86cbd273b9cd56fadc5859e3
Author: Ying Lv <lvying6@huawei.com>
Date: Wed May 15 11:15:42 2019 +0800
fix rasdaemon high CPU usage when part of CPUs offline
When some CPU cores are offline, for example because the kernel cmdline sets
maxcpus=N (where N is less than the total number of system CPU cores),
the CPU usage of some rasdaemon threads
is very close to 100%.
This is because, when some CPUs are offline, read_ras_event_all_cpus() falls
back from poll() to the pthread-based collection.
The thread for an offlined CPU gets a negative value when reading trace_pipe_raw,
and that negative return value is converted to a positive one because of 'unsigned size'.
So the code always takes the 'size > 0' branch, and the CPU usage is too high.
Declaring the variable size as int makes the code take the right branch.
Fixes: eff7c9e0 ("ras-events: Only use pthreads for collect if poll() not available")
Reported-by: Zhipeng Xie <xiezhipeng1@huawei.com>
Signed-off-by: Ying Lv <lvying6@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/ras-events.c b/ras-events.c
index 4e7b815..38ebe1e 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -426,7 +426,7 @@ static int read_ras_event(int fd,
struct kbuffer *kbuf,
void *page)
{
- unsigned size;
+ int size;
unsigned long long time_stamp;
void *data;

View File

@ -1,148 +0,0 @@
commit b497a3d6a39d402c41065e9284d49114b97e3bfe
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Mar 8 16:57:28 2021 +0000
rasdaemon: ras-mc-ctl: Add memory failure events
Add support for memory failure errors (memory_failure_event)
to the ras-mc-ctl tool.
Sample Log,
ras-mc-ctl --summary
...
Memory failure events summary:
Delayed errors: 4
Failed errors: 1
...
ras-mc-ctl --errors
...
Memory failure events:
1 2020-10-28 23:20:41 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
2 2020-10-28 23:31:38 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
3 2020-10-28 23:54:54 -0800 error: pfn=0x205000000, page_type=free buddy page, action_result=Delayed
4 2020-10-29 00:12:25 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
5 2020-10-29 00:26:36 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Failed
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
configure.ac | 11 +++++++++++
util/ras-mc-ctl.in | 46 +++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 54 insertions(+), 3 deletions(-)
--- a/util/ras-mc-ctl.in 2021-10-13 13:51:00.887292563 -0400
+++ b/util/ras-mc-ctl.in 2021-10-13 13:51:27.536061894 -0400
@@ -44,11 +44,13 @@ my $modprobe = find_prog ("modprobe")
my $has_aer = 0;
my $has_arm = 0;
my $has_extlog = 0;
+my $has_mem_failure = 0;
my $has_mce = 0;
@WITH_AER_TRUE@$has_aer = 1;
@WITH_ARM_TRUE@$has_arm = 1;
@WITH_EXTLOG_TRUE@$has_extlog = 1;
+@WITH_MEMORY_FAILURE_TRUE@$has_mem_failure = 1;
@WITH_MCE_TRUE@$has_mce = 1;
my %conf = ();
@@ -1132,7 +1134,7 @@ sub summary
{
require DBI;
my ($query, $query_handle, $out);
- my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg);
+ my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result);
my ($etype, $severity, $etype_string, $severity_string);
my ($affinity, $mpidr);
@@ -1203,9 +1205,27 @@ sub summary
$out .= "\t$count $etype_string $severity_string errors\n";
}
if ($out ne "") {
- print "Extlog records summary:\n$out";
+ print "Extlog records summary:\n$out\n";
} else {
- print "No Extlog errors.\n";
+ print "No Extlog errors.\n\n";
+ }
+ $query_handle->finish;
+ }
+
+ # Memory failure errors
+ if ($has_mem_failure == 1) {
+ $query = "select action_result, count(*) from memory_failure_event group by action_result";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($action_result, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$action_result errors: $count\n";
+ }
+ if ($out ne "") {
+ print "Memory failure events summary:\n$out\n";
+ } else {
+ print "No Memory failure errors.\n\n";
}
$query_handle->finish;
}
@@ -1238,6 +1258,7 @@ sub errors
my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
my ($error_count, $affinity, $mpidr, $r_state, $psci_state);
+ my ($pfn, $page_type, $action_result);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1329,6 +1350,25 @@ $out .= sprintf "address=0x%08x, ", $add
}
$query_handle->finish;
}
+
+ # Memory failure errors
+ if ($has_mem_failure == 1) {
+ $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $pfn, $page_type, $action_result));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "pfn=$pfn, page_type=$page_type, action_result=$action_result\n";
+ }
+ if ($out ne "") {
+ print "Memory failure events:\n$out\n";
+ } else {
+ print "No Memory failure errors.\n\n";
+ }
+ $query_handle->finish;
+ }
# MCE mce_record errors
if ($has_mce == 1) {
--- a/configure.ac 2018-04-25 06:28:51.000000000 -0400
+++ b/configure.ac 2021-10-13 13:51:00.916292312 -0400
@@ -80,6 +80,16 @@ AS_IF([test "x$enable_extlog" = "xyes"],
])
AM_CONDITIONAL([WITH_EXTLOG], [test x$enable_extlog = xyes])
+AC_ARG_ENABLE([memory_failure],
+ AS_HELP_STRING([--enable-memory-failure], [enable memory failure events (currently experimental)]))
+
+AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" == "xyes"], [
+ AC_DEFINE(HAVE_MEMORY_FAILURE,1,"have memory failure events collect")
+ AC_SUBST([WITH_MEMORY_FAILURE])
+])
+AM_CONDITIONAL([WITH_MEMORY_FAILURE], [test x$enable_memory_failure = xyes || test x$enable_all == xyes])
+AM_COND_IF([WITH_MEMORY_FAILURE], [USE_MEMORY_FAILURE="yes"], [USE_MEMORY_FAILURE="no"])
+
AC_ARG_ENABLE([abrt_report],
AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)]))
@@ -127,4 +137,5 @@ compile time options summary
ABRT report : $enable_abrt_report
HIP07 SAS HW errors : $enable_hisi_ns_decode
ARM events : $enable_arm
+ Memory Failure : $USE_MEMORY_FAILURE
EOF

View File

@ -1,94 +0,0 @@
commit cc2ce5c65ed5a42eaa97aa3659854add6d808da5
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Mon Jan 13 19:12:06 2020 +0530
rasdaemon: Add error decoding for new SMCA Load Store bank type
Future Scalable Machine Check Architecture (SMCA) systems will have a
new Load Store bank type.
Add the new type's (HWID, McaType) ID and error decoding.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
[ Adjust commit message. ]
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 114e786..d0b6cb6 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -38,9 +38,16 @@
* 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0]
*/
+/* MCA_STATUS REGISTER FOR FAMILY 19H
+ * The bits 24 ~ 29 contains AddressLsb
+ * 29: ADDRLS[5], 28: ADDRLS[4], 27: ADDRLS[3],
+ * 26: ADDRLS[2], 25: ADDRLS[1], 24: ADDRLS[0]
+ */
+
/* These may be used by multiple smca_hwid_mcatypes */
enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
+ SMCA_LS_V2, /* Load Store */
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
@@ -88,6 +95,32 @@ static const char * const smca_ls_mce_desc[] = {
"DC tag error type 5",
"L2 fill data error",
};
+static const char * const smca_ls2_mce_desc[] = {
+ "An ECC error was detected on a data cache read by a probe or victimization",
+ "An ECC error or L2 poison was detected on a data cache read by a load",
+ "An ECC error was detected on a data cache read-modify-write by a store",
+ "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
+ "An ECC error or poison bit mismatch was detected on a tag read by a load",
+ "An ECC error or poison bit mismatch was detected on a tag read by a store",
+ "An ECC error was detected on an EMEM read by a load",
+ "An ECC error was detected on an EMEM read-modify-write by a store",
+ "A parity error was detected in an L1 TLB entry by any access",
+ "A parity error was detected in an L2 TLB entry by any access",
+ "A parity error was detected in a PWC entry by any access",
+ "A parity error was detected in an STQ entry by any access",
+ "A parity error was detected in an LDQ entry by any access",
+ "A parity error was detected in a MAB entry by any access",
+ "A parity error was detected in an SCB entry state field by any access",
+ "A parity error was detected in an SCB entry address field by any access",
+ "A parity error was detected in an SCB entry data field by any access",
+ "A parity error was detected in a WCB entry by any access",
+ "A poisoned line was detected in an SCB entry by any access",
+ "A SystemReadDataError error was reported on read data returned from L2 for a load",
+ "A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
+ "A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
+ "A hardware assertion error was reported",
+ "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
+};
/* Instruction Fetch */
static const char * const smca_if_mce_desc[] = {
"microtag probe port parity error",
@@ -289,6 +322,7 @@ struct smca_mce_desc {
static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
+ [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) },
[SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
[SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
[SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
@@ -319,6 +353,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* ZN Core (HWID=0xB0) MCA types */
{ SMCA_LS, 0x000000B0 },
+ { SMCA_LS_V2, 0x001000B0 },
{ SMCA_IF, 0x000100B0 },
{ SMCA_L2_CACHE, 0x000200B0 },
{ SMCA_DE, 0x000300B0 },
@@ -362,6 +397,7 @@ struct smca_bank_name {
static struct smca_bank_name smca_names[] = {
[SMCA_LS] = { "Load Store Unit" },
+ [SMCA_LS_V2] = { "Load Store Unit" },
[SMCA_IF] = { "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "L2 Cache" },
[SMCA_DE] = { "Decode Unit" },

View File

@ -1,28 +0,0 @@
commit ce33041e0abfa20054ff5d6874ffbd1ab592558d
Author: Aristeu Rozanski <arozansk@redhat.com>
Date: Thu Jan 19 08:45:57 2023 -0500
rasdaemon: ras-memory-failure-handler: handle localtime() failure correctly
We could just have an empty string but keeping the format could prevent
issues if someone is actually parsing this.
Found with covscan.
v2: fixed the timestamp as pointed out by Robert Elliott
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c
index 9941e68..1951456 100644
--- a/ras-memory-failure-handler.c
+++ b/ras-memory-failure-handler.c
@@ -148,6 +148,8 @@ int ras_memory_failure_event_handler(struct trace_seq *s,
if (tm)
strftime(ev.timestamp, sizeof(ev.timestamp),
"%Y-%m-%d %H:%M:%S %z", tm);
+ else
+ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp));
trace_seq_printf(s, "%s ", ev.timestamp);
if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0)

View File

@ -1,611 +0,0 @@
commit ce6e7864f11f709c4f803828fbc8e507d115d03b
Author: Greg Edwards <gedwards@ddn.com>
Date: Thu Apr 8 15:03:30 2021 -0600
rasdaemon: Add Ice Lake and Sapphire Rapids MSCOD values
Based on mcelog commits:
ee90ff20ce6a ("mcelog: Add support for Icelake server, Icelake-D, and Snow Ridge")
391abaac9bdf ("mcelog: Add decode for MCi_MISC from 10nm memory controller")
59cb7ad4bc72 ("mcelog: i10nm: Fix mapping from bank number to functional unit")
c0acd0e6a639 ("mcelog: Add support for Sapphirerapids server.")
Signed-off-by: Greg Edwards <gedwards@ddn.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
Makefile.am | 3
mce-intel-i10nm.c | 509 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
mce-intel.c | 5
ras-mce-handler.c | 12 +
ras-mce-handler.h | 5
5 files changed, 533 insertions(+), 1 deletion(-)
--- rasdaemon-0.6.1.orig/Makefile.am 2021-09-17 15:29:45.977790658 -0400
+++ rasdaemon-0.6.1/Makefile.am 2021-09-17 15:29:57.439698580 -0400
@@ -36,7 +36,8 @@ if WITH_MCE
mce-intel-dunnington.c mce-intel-tulsa.c \
mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
mce-intel-knl.c mce-intel-broadwell-de.c \
- mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c
+ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c \
+ mce-amd.c mce-amd-smca.c mce-intel-i10nm.c
endif
if WITH_EXTLOG
rasdaemon_SOURCES += ras-extlog-handler.c
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/mce-intel-i10nm.c 2021-09-17 15:29:45.977790658 -0400
@@ -0,0 +1,509 @@
+/*
+ * The code below came from Tony Luck's mcelog code,
+ * released under GNU Public General License, v.2
+ *
+ * Copyright (C) 2019 Intel Corporation
+ * Decode Intel 10nm specific machine check errors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "ras-mce-handler.h"
+#include "bitfield.h"
+
+static char *pcu_1[] = {
+ [0x0D] = "MCA_LLC_BIST_ACTIVE_TIMEOUT",
+ [0x0E] = "MCA_DMI_TRAINING_TIMEOUT",
+ [0x0F] = "MCA_DMI_STRAP_SET_ARRIVAL_TIMEOUT",
+ [0x10] = "MCA_DMI_CPU_RESET_ACK_TIMEOUT",
+ [0x11] = "MCA_MORE_THAN_ONE_LT_AGENT",
+ [0x14] = "MCA_INCOMPATIBLE_PCH_TYPE",
+ [0x1E] = "MCA_BIOS_RST_CPL_INVALID_SEQ",
+ [0x1F] = "MCA_BIOS_INVALID_PKG_STATE_CONFIG",
+ [0x2D] = "MCA_PCU_PMAX_CALIB_ERROR",
+ [0x2E] = "MCA_TSC100_SYNC_TIMEOUT",
+ [0x3A] = "MCA_GPSB_TIMEOUT",
+ [0x3B] = "MCA_PMSB_TIMEOUT",
+ [0x3E] = "MCA_IOSFSB_PMREQ_CMP_TIMEOUT",
+ [0x40] = "MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE",
+ [0x42] = "MCA_SVID_VCCIN_VR_VOUT_FAILURE",
+ [0x43] = "MCA_SVID_CPU_VR_CAPABILITY_ERROR",
+ [0x44] = "MCA_SVID_CRITICAL_VR_FAILED",
+ [0x45] = "MCA_SVID_SA_ITD_ERROR",
+ [0x46] = "MCA_SVID_READ_REG_FAILED",
+ [0x47] = "MCA_SVID_WRITE_REG_FAILED",
+ [0x4A] = "MCA_SVID_PKGC_REQUEST_FAILED",
+ [0x4B] = "MCA_SVID_IMON_REQUEST_FAILED",
+ [0x4C] = "MCA_SVID_ALERT_REQUEST_FAILED",
+ [0x4D] = "MCA_SVID_MCP_VR_RAMP_ERROR",
+ [0x56] = "MCA_FIVR_PD_HARDERR",
+ [0x58] = "MCA_WATCHDOG_TIMEOUT_PKGC_SLAVE",
+ [0x59] = "MCA_WATCHDOG_TIMEOUT_PKGC_MASTER",
+ [0x5A] = "MCA_WATCHDOG_TIMEOUT_PKGS_MASTER",
+ [0x5B] = "MCA_WATCHDOG_TIMEOUT_MSG_CH_FSM",
+ [0x5C] = "MCA_WATCHDOG_TIMEOUT_BULK_CR_FSM",
+ [0x5D] = "MCA_WATCHDOG_TIMEOUT_IOSFSB_FSM",
+ [0x60] = "MCA_PKGS_SAFE_WP_TIMEOUT",
+ [0x61] = "MCA_PKGS_CPD_UNCPD_TIMEOUT",
+ [0x62] = "MCA_PKGS_INVALID_REQ_PCH",
+ [0x63] = "MCA_PKGS_INVALID_REQ_INTERNAL",
+ [0x64] = "MCA_PKGS_INVALID_RSP_INTERNAL",
+ [0x65 ... 0x7A] = "MCA_PKGS_RESET_PREP_TIMEOUT",
+ [0x7B] = "MCA_PKGS_SMBUS_VPP_PAUSE_TIMEOUT",
+ [0x7C] = "MCA_PKGS_SMBUS_MCP_PAUSE_TIMEOUT",
+ [0x7D] = "MCA_PKGS_SMBUS_SPD_PAUSE_TIMEOUT",
+ [0x80] = "MCA_PKGC_DISP_BUSY_TIMEOUT",
+ [0x81] = "MCA_PKGC_INVALID_RSP_PCH",
+ [0x83] = "MCA_PKGC_WATCHDOG_HANG_CBZ_DOWN",
+ [0x84] = "MCA_PKGC_WATCHDOG_HANG_CBZ_UP",
+ [0x87] = "MCA_PKGC_WATCHDOG_HANG_C2_BLKMASTER",
+ [0x88] = "MCA_PKGC_WATCHDOG_HANG_C2_PSLIMIT",
+ [0x89] = "MCA_PKGC_WATCHDOG_HANG_SETDISP",
+ [0x8B] = "MCA_PKGC_ALLOW_L1_ERROR",
+ [0x90] = "MCA_RECOVERABLE_DIE_THERMAL_TOO_HOT",
+ [0xA0] = "MCA_ADR_SIGNAL_TIMEOUT",
+ [0xA1] = "MCA_BCLK_FREQ_OC_ABOVE_THRESHOLD",
+ [0xB0] = "MCA_DISPATCHER_RUN_BUSY_TIMEOUT",
+};
+
+static char *pcu_2[] = {
+ [0x04] = "Clock/power IP response timeout",
+ [0x05] = "SMBus controller raised SMI",
+ [0x09] = "PM controller received invalid transaction",
+};
+
+static char *pcu_3[] = {
+ [0x01] = "Instruction address out of valid space",
+ [0x02] = "Double bit RAM error on Instruction Fetch",
+ [0x03] = "Invalid OpCode seen",
+ [0x04] = "Stack Underflow",
+ [0x05] = "Stack Overflow",
+ [0x06] = "Data address out of valid space",
+ [0x07] = "Double bit RAM error on Data Fetch",
+};
+
+static struct field pcu1[] = {
+ FIELD(0, pcu_1),
+ {}
+};
+
+static struct field pcu2[] = {
+ FIELD(0, pcu_2),
+ {}
+};
+
+static struct field pcu3[] = {
+ FIELD(0, pcu_3),
+ {}
+};
+
+static struct field upi1[] = {
+ SBITFIELD(22, "Phy Control Error"),
+ SBITFIELD(23, "Unexpected Retry.Ack flit"),
+ SBITFIELD(24, "Unexpected Retry.Req flit"),
+ SBITFIELD(25, "RF parity error"),
+ SBITFIELD(26, "Routeback Table error"),
+ SBITFIELD(27, "Unexpected Tx Protocol flit (EOP, Header or Data)"),
+ SBITFIELD(28, "Rx Header-or-Credit BGF credit overflow/underflow"),
+ SBITFIELD(29, "Link Layer Reset still in progress when Phy enters L0"),
+ SBITFIELD(30, "Link Layer reset initiated while protocol traffic not idle"),
+ SBITFIELD(31, "Link Layer Tx Parity Error"),
+ {}
+};
+
+static char *upi_2[] = {
+ [0x00] = "Phy Initialization Failure (NumInit)",
+ [0x01] = "Phy Detected Drift Buffer Alarm",
+ [0x02] = "Phy Detected Latency Buffer Rollover",
+ [0x10] = "LL Rx detected CRC error: unsuccessful LLR (entered Abort state)",
+ [0x11] = "LL Rx Unsupported/Undefined packet",
+ [0x12] = "LL or Phy Control Error",
+ [0x13] = "LL Rx Parameter Exception",
+ [0x1F] = "LL Detected Control Error",
+ [0x20] = "Phy Initialization Abort",
+ [0x21] = "Phy Inband Reset",
+ [0x22] = "Phy Lane failure, recovery in x8 width",
+ [0x23] = "Phy L0c error corrected without Phy reset",
+ [0x24] = "Phy L0c error triggering Phy reset",
+ [0x25] = "Phy L0p exit error corrected with reset",
+ [0x30] = "LL Rx detected CRC error: successful LLR without Phy Reinit",
+ [0x31] = "LL Rx detected CRC error: successful LLR with Phy Reinit",
+ [0x32] = "Tx received LLR",
+};
+
+static struct field upi2[] = {
+ FIELD(0, upi_2),
+ {}
+};
+
+static struct field m2m[] = {
+ SBITFIELD(16, "MC read data error"),
+ SBITFIELD(17, "Reserved"),
+ SBITFIELD(18, "MC partial write data error"),
+ SBITFIELD(19, "Full write data error"),
+ SBITFIELD(20, "M2M clock-domain-crossing buffer (BGF) error"),
+ SBITFIELD(21, "M2M time out"),
+ SBITFIELD(22, "M2M tracker parity error"),
+ SBITFIELD(23, "fatal Bucket1 error"),
+ {}
+};
+
+static char *imc_0[] = {
+ [0x01] = "Address parity error",
+ [0x02] = "Data parity error",
+ [0x03] = "Data ECC error",
+ [0x04] = "Data byte enable parity error",
+ [0x07] = "Transaction ID parity error",
+ [0x08] = "Corrected patrol scrub error",
+ [0x10] = "Uncorrected patrol scrub error",
+ [0x20] = "Corrected spare error",
+ [0x40] = "Uncorrected spare error",
+ [0x80] = "Corrected read error",
+ [0xA0] = "Uncorrected read error",
+ [0xC0] = "Uncorrected metadata",
+};
+
+static char *imc_1[] = {
+ [0x00] = "WDB read parity error",
+ [0x03] = "RPA parity error",
+ [0x06] = "DDR_T_DPPP data BE error",
+ [0x07] = "DDR_T_DPPP data error",
+ [0x08] = "DDR link failure",
+ [0x11] = "PCLS CAM error",
+ [0x12] = "PCLS data error",
+};
+
+static char *imc_2[] = {
+ [0x00] = "DDR4 command / address parity error",
+ [0x20] = "HBM command / address parity error",
+ [0x21] = "HBM data parity error",
+};
+
+static char *imc_4[] = {
+ [0x00] = "RPQ parity (primary) error",
+};
+
+static char *imc_8[] = { /* iMC error strings; reached through imc8 when status bits 24-31 == 8 in i10nm_decode_model(). Presumably indexed by status bits 16-23 - confirm against FIELD()/decode_bitfield() */
+ [0x00] = "DDR-T bad request",
+ [0x01] = "DDR Data response to an invalid entry",
+ [0x02] = "DDR data response to an entry not expecting data",
+ [0x03] = "DDR4 completion to an invalid entry",
+ [0x04] = "DDR-T completion to an invalid entry",
+ [0x05] = "DDR data/completion FIFO overflow",
+ [0x06] = "DDR-T ERID correctable parity error",
+ [0x07] = "DDR-T ERID uncorrectable error",
+ [0x08] = "DDR-T interrupt received while outstanding interrupt was not ACKed",
+ [0x09] = "ERID FIFO overflow", /* was "FI FO": stray space removed */
+ [0x0A] = "DDR-T error on FNV write credits",
+ [0x0B] = "DDR-T error on FNV read credits",
+ [0x0C] = "DDR-T scheduler error",
+ [0x0D] = "DDR-T FNV error event",
+ [0x0E] = "DDR-T FNV thermal event",
+ [0x0F] = "CMI packet while idle",
+ [0x10] = "DDR_T_RPQ_REQ_PARITY_ERR",
+ [0x11] = "DDR_T_WPQ_REQ_PARITY_ERR",
+ [0x12] = "2LM_NMFILLWR_CAM_ERR",
+ [0x13] = "CMI_CREDIT_OVERSUB_ERR",
+ [0x14] = "CMI_CREDIT_TOTAL_ERR",
+ [0x15] = "CMI_CREDIT_RSVD_POOL_ERR",
+ [0x16] = "DDR_T_RD_ERROR",
+ [0x17] = "WDB_FIFO_ERR",
+ [0x18] = "CMI_REQ_FIFO_OVERFLOW",
+ [0x19] = "CMI_REQ_FIFO_UNDERFLOW",
+ [0x1A] = "CMI_RSP_FIFO_OVERFLOW",
+ [0x1B] = "CMI_RSP_FIFO_UNDERFLOW",
+ [0x1C] = "CMI_MISC_MC_CRDT_ERRORS", /* was "CMI _MISC...": stray space removed */
+ [0x1D] = "CMI_MISC_MC_ARB_ERRORS",
+ [0x1E] = "DDR_T_WR_CMPL_FIFO_OVERFLOW", /* was "FI FO": stray space removed */
+ [0x1F] = "DDR_T_WR_CMPL_FIFO_UNDERFLOW", /* was "FI FO": stray space removed */
+ [0x20] = "CMI_RD_CPL_FIFO_OVERFLOW",
+ [0x21] = "CMI_RD_CPL_FIFO_UNDERFLOW",
+ [0x22] = "TME_KEY_PAR_ERR",
+ [0x23] = "TME_CMI_MISC_ERR",
+ [0x24] = "TME_CMI_OVFL_ERR",
+ [0x25] = "TME_CMI_UFL_ERR",
+ [0x26] = "TME_TEM_SECURE_ERR",
+ [0x27] = "TME_UFILL_PAR_ERR", /* 0x28 has no initializer and is a NULL entry */
+ [0x29] = "INTERNAL_ERR",
+ [0x2A] = "TME_INTEGRITY_ERR",
+ [0x2B] = "TME_TDX_ERR",
+ [0x2C] = "TME_UFILL_TEM_SECURE_ERR",
+ [0x2D] = "TME_KEY_POISON_ERR",
+ [0x2E] = "TME_SECURITY_ENGINE_ERR",
+};
+
+static char *imc_10[] = { /* iMC error strings; reached through imc10 when status bits 24-31 == 0x10 in i10nm_decode_model() */
+ [0x08] = "CORR_PATSCRUB_MIRR2ND_ERR",
+ [0x10] = "UC_PATSCRUB_MIRR2ND_ERR",
+ [0x20] = "COR_SPARE_MIRR2ND_ERR",
+ [0x40] = "UC_SPARE_MIRR2ND_ERR",
+ [0x80] = "HA_RD_MIRR2ND_ERR",
+ [0xA0] = "HA_UNCORR_RD_MIRR2ND_ERR",
+}; /* sparse table: every index not listed above is a NULL entry */
+
+static struct field imc0[] = { /* one decode descriptor per string table, each terminated by an empty entry; passed to decode_bitfield() from the BT_IMC case of i10nm_decode_model() */
+ FIELD(0, imc_0), /* NOTE(review): the first FIELD() argument is presumably the start bit within the value handed to decode_bitfield() - confirm against the macro */
+ {}
+};
+
+static struct field imc1[] = { /* selected when status bits 24-31 == 1 */
+ FIELD(0, imc_1),
+ {}
+};
+
+static struct field imc2[] = { /* selected when status bits 24-31 == 2 */
+ FIELD(0, imc_2),
+ {}
+};
+
+static struct field imc4[] = { /* selected when status bits 24-31 == 4 */
+ FIELD(0, imc_4),
+ {}
+};
+
+static struct field imc8[] = { /* selected when status bits 24-31 == 8 */
+ FIELD(0, imc_8),
+ {}
+};
+
+static struct field imc10[] = { /* selected when status bits 24-31 == 0x10 */
+ FIELD(0, imc_10),
+ {}
+};
+
+static void i10nm_imc_misc(struct mce_event *e) /* append the DRAM location and ECC details decoded from e->misc to e->error_msg */
+{
+ uint32_t column = EXTRACT(e->misc, 9, 18) << 2; /* misc field 9-18, shifted left by two */
+ uint32_t row = EXTRACT(e->misc, 19, 39);
+ uint32_t bank = EXTRACT(e->misc, 42, 43);
+ uint32_t bankgroup = EXTRACT(e->misc, 40, 41) | (EXTRACT(e->misc, 44, 44) << 2); /* two low bits from 40-41, third bit from 44 */
+ uint32_t fdevice = EXTRACT(e->misc, 46, 51);
+ uint32_t subrank = EXTRACT(e->misc, 52, 55);
+ uint32_t rank = EXTRACT(e->misc, 56, 58);
+ uint32_t eccmode = EXTRACT(e->misc, 59, 62);
+ uint32_t transient = EXTRACT(e->misc, 63, 63);
+
+ mce_snprintf(e->error_msg, "bank: 0x%x bankgroup: 0x%x row: 0x%x column: 0x%x", bank, bankgroup, row, column);
+ if (!transient && !EXTRACT(e->status, 61, 61)) /* failed device is reported only for non-transient errors with status bit 61 clear (presumably the UC bit - confirm) */
+ mce_snprintf(e->error_msg, "failed device: 0x%x", fdevice);
+ mce_snprintf(e->error_msg, "rank: 0x%x subrank: 0x%x", rank, subrank);
+ mce_snprintf(e->error_msg, "ecc mode: ");
+ switch (eccmode) { /* eccmode values not listed below are reported as "unknown" */
+ case 0: mce_snprintf(e->error_msg, "SDDC memory mode"); break;
+ case 1: mce_snprintf(e->error_msg, "SDDC"); break;
+ case 4: mce_snprintf(e->error_msg, "ADDDC memory mode"); break;
+ case 5: mce_snprintf(e->error_msg, "ADDDC"); break;
+ case 8: mce_snprintf(e->error_msg, "DDRT read"); break;
+ default: mce_snprintf(e->error_msg, "unknown"); break;
+ }
+ if (transient)
+ mce_snprintf(e->error_msg, "transient");
+}
+
+enum banktype { /* kind of hardware unit behind an MCA bank; selects the decode path in i10nm_decode_model() */
+ BT_UNKNOWN, /* value 0, so it is also what unlisted entries of the per-CPU bank tables hold */
+ BT_PCU,
+ BT_UPI,
+ BT_M2M,
+ BT_IMC,
+};
+
+static enum banktype icelake[32] = { /* bank number -> bank type for CPU_ICELAKE_XEON; unlisted banks are BT_UNKNOWN. NOTE(review): indexed by e->bank with no visible range check - confirm e->bank < 32 */
+ [4] = BT_PCU,
+ [5] = BT_UPI,
+ [7 ... 8] = BT_UPI,
+ [12] = BT_M2M,
+ [16] = BT_M2M,
+ [20] = BT_M2M,
+ [24] = BT_M2M,
+ [13 ... 15] = BT_IMC,
+ [17 ... 19] = BT_IMC,
+ [21 ... 23] = BT_IMC,
+ [25 ... 27] = BT_IMC,
+};
+
+static enum banktype icelake_de[32] = { /* bank number -> bank type for CPU_ICELAKE_DE */
+ [4] = BT_PCU,
+ [12] = BT_M2M,
+ [16] = BT_M2M,
+ [13 ... 15] = BT_IMC,
+ [17 ... 19] = BT_IMC,
+};
+
+static enum banktype tremont[32] = { /* bank number -> bank type for CPU_TREMONT_D */
+ [4] = BT_PCU,
+ [12] = BT_M2M,
+ [13 ... 15] = BT_IMC,
+};
+
+static enum banktype sapphire[32] = { /* bank number -> bank type for CPU_SAPPHIRERAPIDS */
+ [4] = BT_PCU,
+ [5] = BT_UPI,
+ [12] = BT_M2M,
+ [13 ... 20] = BT_IMC,
+};
+
+void i10nm_memerr_misc(struct mce_event *e, int *channel);
+
+void i10nm_decode_model(enum cputype cputype, struct ras_events *ras,
+ struct mce_event *e) /* decode e->status (and e->misc for iMC banks) into e->error_msg and, for corrected iMC memory errors, e->mc_location; ras is not referenced in this body */
+{
+ enum banktype banktype;
+ uint64_t f, status = e->status;
+ uint32_t mca = status & 0xffff; /* low 16 bits of status */
+ int channel = -1; /* stays -1 unless i10nm_memerr_misc() derives a channel */
+
+ switch (cputype) { /* map the bank number to a bank type using the table for this CPU; other CPU types are not decoded here */
+ case CPU_ICELAKE_XEON:
+ banktype = icelake[e->bank];
+ break;
+ case CPU_ICELAKE_DE:
+ banktype = icelake_de[e->bank];
+ break;
+ case CPU_TREMONT_D:
+ banktype = tremont[e->bank];
+ break;
+ case CPU_SAPPHIRERAPIDS:
+ banktype = sapphire[e->bank];
+ break;
+ default:
+ return;
+ }
+
+ switch (banktype) {
+ case BT_UNKNOWN:
+ break;
+
+ case BT_PCU:
+ mce_snprintf(e->error_msg, "PCU: ");
+ f = EXTRACT(status, 24, 31);
+ if (f)
+ decode_bitfield(e, f, pcu1);
+ f = EXTRACT(status, 20, 23);
+ if (f)
+ decode_bitfield(e, f, pcu2);
+ f = EXTRACT(status, 16, 19);
+ if (f)
+ decode_bitfield(e, f, pcu3);
+ break;
+
+ case BT_UPI:
+ mce_snprintf(e->error_msg, "UPI: ");
+ f = EXTRACT(status, 22, 31);
+ if (f)
+ decode_bitfield(e, status, upi1); /* NOTE(review): passes the whole status word, unlike the other calls that pass the extracted field - presumably upi1 uses absolute bit offsets; confirm */
+ f = EXTRACT(status, 16, 21);
+ decode_bitfield(e, f, upi2); /* called even when the field is zero */
+ break;
+
+ case BT_M2M:
+ mce_snprintf(e->error_msg, "M2M: ");
+ f = EXTRACT(status, 24, 25);
+ mce_snprintf(e->error_msg, "MscodDDRType=0x%" PRIx64, f);
+ f = EXTRACT(status, 26, 31);
+ mce_snprintf(e->error_msg, "MscodMiscErrs=0x%" PRIx64, f);
+ decode_bitfield(e, status, m2m);
+ break;
+
+ case BT_IMC:
+ mce_snprintf(e->error_msg, "MemCtrl: ");
+ f = EXTRACT(status, 16, 23);
+ switch (EXTRACT(status, 24, 31)) { /* status bits 24-31 pick the string table; other values add no description */
+ case 0: decode_bitfield(e, f, imc0); break;
+ case 1: decode_bitfield(e, f, imc1); break;
+ case 2: decode_bitfield(e, f, imc2); break;
+ case 4: decode_bitfield(e, f, imc4); break;
+ case 8: decode_bitfield(e, f, imc8); break;
+ case 0x10: decode_bitfield(e, f, imc10); break;
+ }
+ i10nm_imc_misc(e);
+ break;
+ }
+
+ /*
+ * Memory error specific code. Returns if the error is not a MC one
+ */
+
+ /* Check if the error is at the memory controller */
+ if ((mca >> 7) != 1)
+ return;
+
+ /* Ignore unless this is a corrected extended error from an iMC bank */
+ if (banktype != BT_IMC || (status & MCI_STATUS_UC))
+ return;
+
+ /*
+ * Parse the reported channel
+ */
+
+ i10nm_memerr_misc(e, &channel);
+ if (channel == -1) /* helper left the value untouched: no channel could be derived */
+ return;
+ mce_snprintf(e->mc_location, "memory_channel=%d", channel);
+}
+
+/*
+ * There isn't enough information to identify the DIMM. But
+ * we can derive the channel from the bank number.
+ * Banks 12-27 map to four memory controllers with three channel banks each;
+ * the result is imc * 3 + chan. *channel is left untouched when nothing can be derived.
+ */
+void i10nm_memerr_misc(struct mce_event *e, int *channel)
+{
+ uint64_t status = e->status;
+ unsigned int chan, imc;
+
+ /* Check this is a memory error */
+ if (!test_prefix(7, status & 0xefff))
+ return;
+
+ chan = EXTRACT(status, 0, 3);
+ if (chan == 0xf) /* 0xf is rejected; presumably it means channel not specified - confirm */
+ return;
+
+ switch (e->bank) {
+ case 12: /* M2M 0 */
+ case 13: /* IMC 0, Channel 0 */
+ case 14: /* IMC 0, Channel 1 */
+ case 15: /* IMC 0, Channel 2 */
+ imc = 0;
+ break;
+ case 16: /* M2M 1 */
+ case 17: /* IMC 1, Channel 0 */
+ case 18: /* IMC 1, Channel 1 */
+ case 19: /* IMC 1, Channel 2 */
+ imc = 1;
+ break;
+ case 20: /* M2M 2 */
+ case 21: /* IMC 2, Channel 0 */
+ case 22: /* IMC 2, Channel 1 */
+ case 23: /* IMC 2, Channel 2 */
+ imc = 2;
+ break;
+ case 24: /* M2M 3 */
+ case 25: /* IMC 3, Channel 0 */
+ case 26: /* IMC 3, Channel 1 */
+ case 27: /* IMC 3, Channel 2 */
+ imc = 3;
+ break;
+ default: /* any other bank: no channel is reported */
+ return;
+ }
+
+ channel[0] = imc * 3 + chan; /* chan comes from status bits 0-3, not from the bank number */
+}
--- rasdaemon-0.6.1.orig/mce-intel.c 2021-09-17 15:29:39.189845188 -0400
+++ rasdaemon-0.6.1/mce-intel.c 2021-09-17 15:29:45.977790658 -0400
@@ -411,6 +411,11 @@ if (test_prefix(11, (e->status & 0xffffL
case CPU_SKYLAKE_XEON:
skylake_s_decode_model(ras, e);
break;
+ case CPU_ICELAKE_XEON:
+ case CPU_ICELAKE_DE:
+ case CPU_TREMONT_D:
+ case CPU_SAPPHIRERAPIDS:
+ i10nm_decode_model(mce->cputype, ras, e);
default:
break;
}
--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-09-17 15:29:39.189845188 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-09-17 15:29:45.977790658 -0400
@@ -56,6 +56,10 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_SKYLAKE_XEON] = "Skylake server",
[CPU_AMD_SMCA] = "AMD Scalable MCA",
+ [CPU_ICELAKE_XEON] = "Icelake server",
+ [CPU_ICELAKE_DE] = "Icelake server D Family",
+ [CPU_TREMONT_D] = "Tremont microserver",
+ [CPU_SAPPHIRERAPIDS] = "Sapphirerapids server",
};
static enum cputype select_intel_cputype(struct ras_events *ras)
@@ -107,6 +111,14 @@ else if (mce->model == 0x85)
return CPU_KNIGHTS_MILL;
else if (mce->model == 0x55)
return CPU_SKYLAKE_XEON;
+ else if (mce->model == 0x6a)
+ return CPU_ICELAKE_XEON;
+ else if (mce->model == 0x6c)
+ return CPU_ICELAKE_DE;
+ else if (mce->model == 0x86)
+ return CPU_TREMONT_D;
+ else if (mce->model == 0x8f)
+ return CPU_SAPPHIRERAPIDS;
if (mce->model > 0x1a) {
log(ALL, LOG_INFO,
--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-09-17 15:29:39.189845188 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-09-17 15:29:45.977790658 -0400
@@ -51,6 +51,10 @@ enum cputype {
CPU_KNIGHTS_MILL,
CPU_SKYLAKE_XEON,
CPU_AMD_SMCA,
+ CPU_ICELAKE_XEON,
+ CPU_ICELAKE_DE,
+ CPU_TREMONT_D,
+ CPU_SAPPHIRERAPIDS,
};
struct mce_event {
@@ -131,6 +135,7 @@ void tulsa_decode_model(struct mce_event
void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e);
void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e);
void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e);
+void i10nm_decode_model(enum cputype cputype, struct ras_events *ras, struct mce_event *e);
/* AMD error code decode function */
void decode_amd_errcode(struct mce_event *e);

View File

@ -1,24 +0,0 @@
commit e8b97ec14a11764fedfea50bd4d96ddda43c7fc1
Author: Aristeu Rozanski <arozansk@redhat.com>
Date: Thu Jan 19 08:45:57 2023 -0500
rasdaemon: mce-amd-smca: properly limit bank types
Found with covscan.
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index f3379fc..27ca8aa 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -584,7 +584,7 @@ static void decode_smca_error(struct mce_event *e)
return;
}
- if (bank_type >= MAX_NR_BANKS) {
+ if (bank_type >= N_SMCA_BANK_TYPES) {
strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
return;
}

View File

@ -1,47 +0,0 @@
From: Muralidhara M K <muralimk@amd.com>
This patch removes trailing spaces at the end of a line from
file location and fixes --layout option to parse dimm nodes
to get the size from ras-mc-ctl.
Issue is reported https://github.com/mchehab/rasdaemon/issues/43
Where '> ras-mc-ctl --layout' reports all 0s
With this change the layout prints the correct dimm sizes
> sudo ras-mc-ctl --layout
+-----------------------------------------------+
| mc0 |
| csrow0 | csrow1 | csrow2 | csrow3 |
----------+-----------------------------------------------+
...
channel7: | 16384 MB | 0 MB | 0 MB | 0 MB |
channel6: | 16384 MB | 0 MB | 0 MB | 0 MB |
...
----------+-----------------------------------------------+
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
util/ras-mc-ctl.in | 2 ++
1 file changed, 2 insertions(+)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 1e3aeb7..b22dd60 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -246,6 +246,7 @@ sub parse_dimm_nodes
if (($file =~ /max_location$/)) {
open IN, $file;
my $location = <IN>;
+ $location =~ s/\s+$//;
close IN;
my @temp = split(/ /, $location);
@@ -288,6 +289,7 @@ sub parse_dimm_nodes
open IN, $file;
my $location = <IN>;
+ $location =~ s/\s+$//;
close IN;
my @pos;

View File

@ -1,236 +0,0 @@
Name: rasdaemon
Version: 0.6.1
Release: 13%{?dist}
Summary: Utility to receive RAS error tracings
Group: Applications/System
License: GPLv2
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2
ExcludeArch: s390 s390x
BuildRequires: gettext-devel
BuildRequires: perl-generators
BuildRequires: sqlite-devel
BuildRequires: systemd
BuildRequires: libtool
Provides: bundled(kernel-event-lib)
Requires: hwdata
Requires: perl-DBD-SQLite
%ifarch %{ix86} x86_64
Requires: dmidecode
%endif
Requires(post): systemd
Requires(preun): systemd
Requires(postun): systemd
Patch1: 60a91e4da4f2daf2b10143fc148a8043312b61e5.patch
Patch2: a16ca0711001957ee98f2c124abce0fa1f801529.patch
Patch3: add_upstream_labels.patch
Patch4: b22be68453b2497e86cbd273b9cd56fadc5859e3.patch
Patch5: 2a1d217660351c08eb2f8bccebf939abba2f7e69.patch
Patch6: 8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch
Patch7: cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch
Patch8: 854364ba44aee9bc5646f6537fc744b0b54aff37.patch
Patch9: 9acef39f13833f7d53ef96abc5a72e79384260f4.patch
Patch10: 28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch
Patch11: aecf33aa70331670c06db6b652712b476e24051c.patch
Patch12: 7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch
Patch13: rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch
Patch14: 0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch
Patch15: 546cf713f667437fb6e283cc3dc090679eb47d08.patch
Patch16: 2290d65b97311dd5736838f1e285355f7f357046.patch
Patch17: 16d929b024c31d54a7f8a72eab094376c7be27f5.patch
Patch18: b497a3d6a39d402c41065e9284d49114b97e3bfe.patch
Patch19: ce6e7864f11f709c4f803828fbc8e507d115d03b.patch
Patch20: a8c776ed94f68ae31d7b5f74e19545698898c13c.patch
Patch21: 899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d.patch
Patch22: e8b97ec14a11764fedfea50bd4d96ddda43c7fc1.patch
Patch23: ce33041e0abfa20054ff5d6874ffbd1ab592558d.patch
%description
%{name} is a RAS (Reliability, Availability and Serviceability) logging tool.
It currently records memory errors, using the EDAC tracing events.
EDAC is a set of drivers in the Linux kernel that handle detection of ECC errors
from memory controllers for most chipsets on i386 and x86_64 architectures.
EDAC drivers for other architectures, such as arm, also exist.
This userspace component consists of an init script which makes sure
EDAC drivers and DIMM labels are loaded at system startup, as well as
a utility for reporting current error counts from the EDAC sysfs files.
%prep
%setup -q
%patch1 -p1
%patch2 -p1
%patch3 -p1
%patch4 -p1
%patch5 -p1
%patch6 -p1
%patch7 -p1
%patch8 -p1
%patch9 -p1
%patch10 -p1
%patch11 -p1
%patch12 -p1
%patch13 -p1
%patch14 -p1
%patch15 -p1
%patch16 -p1
%patch17 -p1
%patch18 -p1
%patch19 -p1
%patch20 -p1
%patch21 -p1
%patch22 -p1
%patch23 -p1
# The tarball is locked in time the first time aclocal was ran and will keep
# requiring an older version of automake
autoreconf -vfi
%build
%ifarch %{arm} aarch64
%configure --enable-aer --enable-sqlite3 --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm
%else
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-memory-failure
%endif
make %{?_smp_mflags}
%install
make install DESTDIR=%{buildroot}
install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service
install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service
install -D -p -m 0655 labels/* %{buildroot}%{_sysconfdir}/ras/dimm_labels.d
rm INSTALL %{buildroot}/usr/include/*.h
%files
%doc AUTHORS ChangeLog COPYING README TODO
%{_sbindir}/rasdaemon
%{_sbindir}/ras-mc-ctl
%{_mandir}/*/*
%{_unitdir}/*.service
%{_sharedstatedir}/rasdaemon
%{_sysconfdir}/ras/dimm_labels.d
%changelog
* Mon Jan 23 2023 Aristeu Rozanski <aris@redhat.com> 0.6.1-13
- Fixing covscan issues [2073516]
* Tue Oct 12 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-12
- Adding missing bits from b497a3d6a39d402c41065e9284d49114b97e3bfe [1923254]
* Tue Oct 12 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-11
- Removed bits from devlink and diskerrors that aren't used yet [1923254]
* Tue Oct 12 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-10
- Add miscellaneous patches required by customer [1923254]
* Wed Oct 06 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-9
- Prevent ras-mc-ctl trying to access extlog and mce tables if rasdaemon was built without support for them [2011404]
* Thu Aug 26 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-8
- Disable MCE and extlog in arm packages [2009499]
* Thu Aug 26 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-7
- Add support for AMD SMCA banks for family 19 [1991955]
* Wed May 26 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-6
- Add support for AMD SMCA [1965011]
* Wed Apr 08 2020 Aristeu Rozanski <aris@redhat.com> 0.6.1-5
- Fix high CPU usage when CPUs are offline [1683420]
* Wed Apr 08 2020 Aristeu Rozanski <aris@redhat.com> 0.6.1-4
- Include upstream labels [1665418]
* Thu Jul 11 2019 Aristeu Rozanski <aris@redhat.com> 0.6.1-3
- Add support for AMD scalable MCA [1725488]
* Mon Aug 20 2018 Aristeu Rozanski <aris@redhat.com> 0.6.1-2
- Add support for error count display [1573685]
* Wed Apr 25 2018 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.1-1
- Bump to version 0.6.1 adding support for Skylake Xeon MSCOD, a bug fix and some new DELL labels
* Fri Feb 09 2018 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.0-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild
* Sat Oct 14 2017 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.6.0-1
- Bump to version 0.6.0 adding support for Arm and Hisilicon events and update Dell Skylake labels
* Thu Aug 03 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-6
- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild
* Thu Jul 27 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-5
- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Mass_Rebuild
* Sat Feb 11 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-4
- Rebuilt for https://fedoraproject.org/wiki/Fedora_26_Mass_Rebuild
* Fri Apr 15 2016 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.8-3
- Add a virtual provide, per BZ#104132
* Fri Apr 15 2016 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.8-2
- Bump to version 0.5.8 with support for Broadwell EP/EX MSCOD/DE MSCOD
* Thu Feb 04 2016 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.6-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_24_Mass_Rebuild
* Fri Jul 03 2015 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.6-1
- Bump to version 0.5.6 with support for LMCE and some fixes
* Thu Jun 18 2015 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.5-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_23_Mass_Rebuild
* Wed Jun 03 2015 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.5-1
- Bump to version 0.5.5 with support for newer Intel platforms & some fixes
* Tue Sep 16 2014 Peter Robinson <pbrobinson@fedoraproject.org> 0.5.4-3
- aarch64/ppc64 have edac capabilities
- spec cleanups
- No need to run autoreconf
* Sun Aug 17 2014 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.4-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_22_Mass_Rebuild
* Fri Aug 15 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.4-1
- Bump to version 0.5.4 with some fixes, mainly for amd64
* Sun Aug 10 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.3-1
- Bump to version 0.5.3 and enable ABRT and ExtLog
* Sun Jun 08 2014 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.2-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_Mass_Rebuild
* Thu Apr 03 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.2-1
- fix and enable ABRT report support
* Fri Mar 28 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.1-1
- Do some fixes at the service files and add some documentation for --record
* Sun Feb 16 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.0-1
- Add experimental ABRT support
* Tue Sep 10 2013 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.4.2-1
- Fix ras-mc-ctl layout filling
* Sun Aug 04 2013 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.4.1-5
- Rebuilt for https://fedoraproject.org/wiki/Fedora_20_Mass_Rebuild
* Wed Jul 17 2013 Petr Pisar <ppisar@redhat.com> - 0.4.1-4
- Perl 5.18 rebuild
* Sun Jun 2 2013 Peter Robinson <pbrobinson@fedoraproject.org> 0.4.1-3
- ARM has EDMA drivers (currently supported in Calxeda highbank)
* Wed May 29 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-2
- Fix the name of perl-DBD-SQLite package
* Wed May 29 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-1
- Updated to version 0.4.1 with contains some bug fixes
* Tue May 28 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.0-1
- Updated to version 0.4.0 and added support for mce, aer and sqlite3 storage
* Mon May 20 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.3.0-1
- Package created

View File

@ -0,0 +1,424 @@
commit a247baf7110ab6427259eb1421a103e2021a8735
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Fri Mar 17 13:07:01 2023 +0000
rasdaemon: Add support for the CXL AER correctable errors
Add support to log and record the CXL AER correctable errors.
The corresponding Kernel patches are here:
https://lore.kernel.org/linux-cxl/166974401763.1608150.5424589924034481387.stgit@djiang5-desk3.ch.intel.com/T/#t
https://lore.kernel.org/linux-cxl/63e5ed38d77d9_138fbc2947a@iweiny-mobl.notmuch/T/#t
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index 0f2c9e4..8f6342d 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -220,6 +220,14 @@ int ras_cxl_poison_event_handler(struct trace_seq *s,
#define CXL_AER_UE_IDE_TX_ERR BIT(15)
#define CXL_AER_UE_IDE_RX_ERR BIT(16)
+#define CXL_AER_CE_CACHE_DATA_ECC BIT(0) /* correctable-error status bit masks; each is paired with a description in cxl_aer_ce[] */
+#define CXL_AER_CE_MEM_DATA_ECC BIT(1)
+#define CXL_AER_CE_CRC_THRESH BIT(2)
+#define CXL_AER_CE_RETRY_THRESH BIT(3)
+#define CXL_AER_CE_CACHE_POISON BIT(4)
+#define CXL_AER_CE_MEM_POISON BIT(5)
+#define CXL_AER_CE_PHYS_LAYER_ERR BIT(6)
+
struct cxl_error_list {
uint32_t bit;
const char *error;
@@ -243,6 +251,16 @@ static const struct cxl_error_list cxl_aer_ue[] = {
{ .bit = CXL_AER_UE_IDE_RX_ERR, .error = "IDE Rx Error" },
};
+static const struct cxl_error_list cxl_aer_ce[] = { /* status bit -> description; passed to decode_cxl_error_status() by ras_cxl_aer_ce_event_handler() */
+ { .bit = CXL_AER_CE_CACHE_DATA_ECC, .error = "Cache Data ECC Error" },
+ { .bit = CXL_AER_CE_MEM_DATA_ECC, .error = "Memory Data ECC Error" },
+ { .bit = CXL_AER_CE_CRC_THRESH, .error = "CRC Threshold Hit" },
+ { .bit = CXL_AER_CE_RETRY_THRESH, .error = "Retry Threshold" },
+ { .bit = CXL_AER_CE_CACHE_POISON, .error = "Received Cache Poison From Peer" },
+ { .bit = CXL_AER_CE_MEM_POISON, .error = "Received Memory Poison From Peer" },
+ { .bit = CXL_AER_CE_PHYS_LAYER_ERR, .error = "Received Error From Physical Layer" },
+};
+
static int decode_cxl_error_status(struct trace_seq *s, uint32_t status,
const struct cxl_error_list *cxl_error_list,
uint8_t num_elems)
@@ -351,3 +369,66 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s,
return 0;
}
+
+int ras_cxl_aer_ce_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context) /* print a cxl_aer_correctable_error trace record into s, then store/report it; returns 0 on success, -1 if a field is missing or printing fails */
+{
+ int len;
+ unsigned long long val;
+ time_t now;
+ struct tm *tm;
+ struct ras_events *ras = context; /* context is used as the struct ras_events pointer */
+ struct ras_cxl_aer_ce_event ev; /* not zero-initialized; every member is assigned below before ev is passed on */
+
+ now = record->ts / user_hz + ras->uptime_diff; /* presumably converts the trace timestamp to wall-clock seconds - confirm the units of record->ts and user_hz */
+ tm = localtime(&now);
+ if (tm)
+ strftime(ev.timestamp, sizeof(ev.timestamp),
+ "%Y-%m-%d %H:%M:%S %z", tm);
+ else
+ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); /* fallback text when localtime() fails; the literal is shorter than the buffer, so strncpy zero-pads it */
+ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
+ return -1;
+
+ ev.memdev = tep_get_field_raw(s, event, "memdev",
+ record, &len, 1); /* len is written by the call but not used afterwards */
+ if (!ev.memdev)
+ return -1;
+ if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0)
+ return -1;
+
+ ev.host = tep_get_field_raw(s, event, "host",
+ record, &len, 1);
+ if (!ev.host)
+ return -1;
+ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0)
+ return -1;
+ ev.serial = val;
+ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "status", record, &val, 1) < 0)
+ return -1;
+ ev.error_status = val; /* val is 64-bit; error_status keeps the low 32 bits */
+ if (trace_seq_printf(s, "error status:") <= 0)
+ return -1;
+ if (decode_cxl_error_status(s, ev.error_status,
+ cxl_aer_ce, ARRAY_SIZE(cxl_aer_ce)) < 0)
+ return -1;
+
+ /* Insert data into the database */
+#ifdef HAVE_SQLITE3
+ ras_store_cxl_aer_ce_event(ras, &ev); /* return value is ignored: storing is best effort */
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+ /* Report event to ABRT */
+ ras_report_cxl_aer_ce_event(ras, &ev); /* return value is ignored as well */
+#endif
+
+ return 0;
+}
diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h
index 35efadd..711daf4 100644
--- a/ras-cxl-handler.h
+++ b/ras-cxl-handler.h
@@ -25,4 +25,8 @@ int ras_cxl_poison_event_handler(struct trace_seq *s,
int ras_cxl_aer_ue_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
+
+int ras_cxl_aer_ce_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context);
#endif
diff --git a/ras-events.c b/ras-events.c
index 5d73df1..2662467 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -247,6 +247,7 @@ int toggle_ras_mc_event(int enable)
#ifdef HAVE_CXL
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable);
+ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable);
#endif
free_ras:
@@ -1001,6 +1002,14 @@ int handle_ras_events(int record_events)
else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
"cxl", "cxl_aer_uncorrectable_error");
+
+ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_aer_correctable_error",
+ ras_cxl_aer_ce_event_handler, NULL, CXL_AER_CE_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "cxl", "cxl_aer_correctable_error");
#endif
if (!num_events) {
diff --git a/ras-events.h b/ras-events.h
index 4acbe57..a9d67c2 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -41,6 +41,7 @@ enum {
MF_EVENT,
CXL_POISON_EVENT,
CXL_AER_UE_EVENT,
+ CXL_AER_CE_EVENT,
NR_EVENTS
};
diff --git a/ras-record.c b/ras-record.c
index 97a2a37..86133c4 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -673,6 +673,53 @@ int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_eve
return rc;
}
+
+/*
+ * Table and functions to handle cxl:cxl_aer_correctable_error
+ * Columns after "id" are bound in this order (indexes 1-5) by ras_store_cxl_aer_ce_event().
+ */
+static const struct db_fields cxl_aer_ce_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "memdev", .type = "TEXT" },
+ { .name = "host", .type = "TEXT" },
+ { .name = "serial", .type = "INTEGER" },
+ { .name = "error_status", .type = "INTEGER" },
+};
+
+static const struct db_table_descriptor cxl_aer_ce_event_tab = { /* descriptor handed to ras_mc_create_table() and ras_mc_prepare_stmt() when the database is opened */
+ .name = "cxl_aer_ce_event",
+ .fields = cxl_aer_ce_event_fields,
+ .num_fields = ARRAY_SIZE(cxl_aer_ce_event_fields),
+};
+
+int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) /* insert ev into the cxl_aer_ce_event table; returns 0 when no database/statement is available, otherwise the result of sqlite3_reset() */
+{
+ int rc;
+ struct sqlite3_priv *priv = ras->db_priv;
+
+ if (!priv || !priv->stmt_cxl_aer_ce_event) /* nothing to do when the database or the prepared statement is absent */
+ return 0;
+ log(TERM, LOG_INFO, "cxl_aer_ce_event store: %p\n", priv->stmt_cxl_aer_ce_event);
+
+ sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 1, ev->timestamp, -1, NULL); /* bind indexes follow the column order of cxl_aer_ce_event_fields after "id" */
+ sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 2, ev->memdev, -1, NULL);
+ sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 3, ev->host, -1, NULL);
+ sqlite3_bind_int64(priv->stmt_cxl_aer_ce_event, 4, ev->serial);
+ sqlite3_bind_int(priv->stmt_cxl_aer_ce_event, 5, ev->error_status);
+
+ rc = sqlite3_step(priv->stmt_cxl_aer_ce_event);
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
+ log(TERM, LOG_ERR,
+ "Failed to do cxl_aer_ce_event step on sqlite: error = %d\n", rc);
+ rc = sqlite3_reset(priv->stmt_cxl_aer_ce_event); /* overwrites the step result; the statement is reset for the next event */
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
+ log(TERM, LOG_ERR,
+ "Failed reset cxl_aer_ce_event on sqlite: error = %d\n",
+ rc);
+ log(TERM, LOG_INFO, "register inserted at db\n"); /* NOTE(review): logged even when step or reset failed */
+
+ return rc;
+}
#endif
/*
@@ -1032,6 +1079,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
if (rc != SQLITE_OK)
goto error;
}
+
+ rc = ras_mc_create_table(priv, &cxl_aer_ce_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_aer_ce_event,
+ &cxl_aer_ce_event_tab);
+ if (rc != SQLITE_OK)
+ goto error;
+ }
#endif
ras->db_priv = priv;
@@ -1169,6 +1224,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
"cpu %u: Failed to finalize cxl_aer_ue_event sqlite: error = %d\n",
cpu, rc);
}
+
+ if (priv->stmt_cxl_aer_ce_event) {
+ rc = sqlite3_finalize(priv->stmt_cxl_aer_ce_event);
+ if (rc != SQLITE_OK)
+ log(TERM, LOG_ERR,
+ "cpu %u: Failed to finalize cxl_aer_ce_event sqlite: error = %d\n",
+ cpu, rc);
+ }
#endif
rc = sqlite3_close_v2(db);
diff --git a/ras-record.h b/ras-record.h
index f11985f..ab7153d 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -144,6 +144,14 @@ struct ras_cxl_aer_ue_event {
uint32_t *header_log;
};
+struct ras_cxl_aer_ce_event { /* one decoded cxl:cxl_aer_correctable_error record, filled by ras_cxl_aer_ce_event_handler() */
+ char timestamp[64]; /* "%Y-%m-%d %H:%M:%S %z" text */
+ const char *memdev; /* pointer returned by tep_get_field_raw(); presumably points into the trace record, so not owned here - confirm */
+ const char *host; /* same origin as memdev */
+ uint64_t serial;
+ uint32_t error_status; /* bit mask decoded with cxl_aer_ce[] */
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
@@ -155,6 +163,7 @@ struct diskerror_event;
struct ras_mf_event;
struct ras_cxl_poison_event;
struct ras_cxl_aer_ue_event;
+struct ras_cxl_aer_ce_event;
#ifdef HAVE_SQLITE3
@@ -190,6 +199,7 @@ struct sqlite3_priv {
#ifdef HAVE_CXL
sqlite3_stmt *stmt_cxl_poison_event;
sqlite3_stmt *stmt_cxl_aer_ue_event;
+ sqlite3_stmt *stmt_cxl_aer_ce_event;
#endif
};
@@ -220,6 +230,7 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev
int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
+int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -235,6 +246,7 @@ static inline int ras_store_diskerror_event(struct ras_events *ras, struct diske
static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
+static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; };
#endif
diff --git a/ras-report.c b/ras-report.c
index 2ebdc80..63b47f5 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -397,6 +397,30 @@ static int set_cxl_aer_ue_event_backtrace(char *buf, struct ras_cxl_aer_ue_event
return 0;
}
+static int set_cxl_aer_ce_event_backtrace(char *buf, struct ras_cxl_aer_ce_event *ev) /* append a "BACKTRACE=" record describing ev to buf; returns 0, or -1 when buf or ev is NULL */
+{
+ char bt_buf[MAX_BACKTRACE_SIZE]; /* formatted here first, then appended to buf */
+
+ if (!buf || !ev)
+ return -1;
+
+ snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE=" \
+ "timestamp=%s\n" \
+ "memdev=%s\n" \
+ "host=%s\n" \
+ "serial=0x%llx\n" \
+ "error_status=%u\n", \
+ ev->timestamp, \
+ ev->memdev, \
+ ev->host, \
+ (unsigned long long)ev->serial, /* cast matches %llx on every ABI; %lx was wrong where long is 32-bit */ \
+ ev->error_status);
+
+ strcat(buf, bt_buf); /* the caller must provide room in buf for the appended text */
+
+ return 0;
+}
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -440,6 +464,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
case CXL_AER_UE_EVENT:
rc = set_cxl_aer_ue_event_backtrace(buf, (struct ras_cxl_aer_ue_event *)ev);
break;
+ case CXL_AER_CE_EVENT:
+ rc = set_cxl_aer_ce_event_backtrace(buf, (struct ras_cxl_aer_ce_event *)ev);
+ break;
default:
return -1;
}
@@ -936,3 +963,47 @@ cxl_aer_ue_fail:
else
return -1;
}
+
+int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) /* send ev to ABRT over the report socket; returns 0 when every part was written, -1 otherwise. ras is not referenced in this body */
+{
+ char buf[MAX_MESSAGE_SIZE];
+ int sockfd = 0;
+ int done = 0;
+ int rc = -1;
+
+ memset(buf, 0, sizeof(buf));
+
+ sockfd = setup_report_socket();
+ if (sockfd < 0)
+ return -1;
+
+ rc = commit_report_basic(sockfd);
+ if (rc < 0)
+ goto cxl_aer_ce_fail;
+
+ rc = commit_report_backtrace(sockfd, CXL_AER_CE_EVENT, ev);
+ if (rc < 0)
+ goto cxl_aer_ce_fail;
+
+ snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-cxl-aer-correctable-error");
+ rc = write(sockfd, buf, strlen(buf) + 1); /* the terminating NUL is sent as well */
+ if (rc < 0 || (size_t)rc < strlen(buf) + 1) /* test the sign first: -1 compared against a size_t would convert to a huge value and hide the failure */
+ goto cxl_aer_ce_fail;
+
+ snprintf(buf, sizeof(buf), "REASON=%s", "CXL AER correctable error");
+ rc = write(sockfd, buf, strlen(buf) + 1);
+ if (rc < 0 || (size_t)rc < strlen(buf) + 1) /* short writes are treated as failures too */
+ goto cxl_aer_ce_fail;
+
+ done = 1;
+
+cxl_aer_ce_fail:
+
+ if (sockfd >= 0)
+ close(sockfd); /* the socket is closed on both the success and the failure path */
+
+ if (done)
+ return 0;
+ else
+ return -1;
+}
diff --git a/ras-report.h b/ras-report.h
index dfe89d1..46155ee 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -41,6 +41,7 @@ int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *e
int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
+int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev);
#else
@@ -54,6 +55,7 @@ static inline int ras_report_diskerror_event(struct ras_events *ras, struct disk
static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
+static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; };
#endif

View File

@ -0,0 +1,503 @@
commit a7524917befe7e67c02253cc27cb0c724e5992c0
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Fri Mar 17 12:51:02 2023 +0000
rasdaemon: Add support for the CXL AER uncorrectable errors
Add support to log and record the CXL AER uncorrectable errors.
The corresponding Kernel patches are here:
https://lore.kernel.org/linux-cxl/166974401763.1608150.5424589924034481387.stgit@djiang5-desk3.ch.intel.com/T/#t
https://lore.kernel.org/lkml/63eeb2a8c9e3f_32d612941f@dwillia2-xfh.jf.intel.com.notmuch/T/
It was found that the header log data has to be converted to the
big-endian format to be stored correctly in the SQLite DB, likely
because the SQLite database seems to use big-endian storage.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index cb23ba2..0f2c9e4 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -21,6 +21,7 @@
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-report.h"
+#include <endian.h>
/* Poison List: Payload out flags */
#define CXL_POISON_FLAG_MORE BIT(0)
@@ -200,3 +201,153 @@ int ras_cxl_poison_event_handler(struct trace_seq *s,
return 0;
}
+
+/* CXL AER Errors */
+
+#define CXL_AER_UE_CACHE_DATA_PARITY BIT(0)
+#define CXL_AER_UE_CACHE_ADDR_PARITY BIT(1)
+#define CXL_AER_UE_CACHE_BE_PARITY BIT(2)
+#define CXL_AER_UE_CACHE_DATA_ECC BIT(3)
+#define CXL_AER_UE_MEM_DATA_PARITY BIT(4)
+#define CXL_AER_UE_MEM_ADDR_PARITY BIT(5)
+#define CXL_AER_UE_MEM_BE_PARITY BIT(6)
+#define CXL_AER_UE_MEM_DATA_ECC BIT(7)
+#define CXL_AER_UE_REINIT_THRESH BIT(8)
+#define CXL_AER_UE_RSVD_ENCODE BIT(9)
+#define CXL_AER_UE_POISON BIT(10)
+#define CXL_AER_UE_RECV_OVERFLOW BIT(11)
+#define CXL_AER_UE_INTERNAL_ERR BIT(14)
+#define CXL_AER_UE_IDE_TX_ERR BIT(15)
+#define CXL_AER_UE_IDE_RX_ERR BIT(16)
+
+struct cxl_error_list {
+ uint32_t bit;
+ const char *error;
+};
+
+static const struct cxl_error_list cxl_aer_ue[] = {
+ { .bit = CXL_AER_UE_CACHE_DATA_PARITY, .error = "Cache Data Parity Error" },
+ { .bit = CXL_AER_UE_CACHE_ADDR_PARITY, .error = "Cache Address Parity Error" },
+ { .bit = CXL_AER_UE_CACHE_BE_PARITY, .error = "Cache Byte Enable Parity Error" },
+ { .bit = CXL_AER_UE_CACHE_DATA_ECC, .error = "Cache Data ECC Error" },
+ { .bit = CXL_AER_UE_MEM_DATA_PARITY, .error = "Memory Data Parity Error" },
+ { .bit = CXL_AER_UE_MEM_ADDR_PARITY, .error = "Memory Address Parity Error" },
+ { .bit = CXL_AER_UE_MEM_BE_PARITY, .error = "Memory Byte Enable Parity Error" },
+ { .bit = CXL_AER_UE_MEM_DATA_ECC, .error = "Memory Data ECC Error" },
+ { .bit = CXL_AER_UE_REINIT_THRESH, .error = "REINIT Threshold Hit" },
+ { .bit = CXL_AER_UE_RSVD_ENCODE, .error = "Received Unrecognized Encoding" },
+ { .bit = CXL_AER_UE_POISON, .error = "Received Poison From Peer" },
+ { .bit = CXL_AER_UE_RECV_OVERFLOW, .error = "Receiver Overflow" },
+ { .bit = CXL_AER_UE_INTERNAL_ERR, .error = "Component Specific Error" },
+ { .bit = CXL_AER_UE_IDE_TX_ERR, .error = "IDE Tx Error" },
+ { .bit = CXL_AER_UE_IDE_RX_ERR, .error = "IDE Rx Error" },
+};
+
+/*
+ * Print, in table order, the quoted description of every entry of
+ * cxl_error_list whose bit is set in status.
+ *
+ * Returns 0 when all matching entries were printed, or -1 as soon as
+ * trace_seq_printf() returns a value <= 0.
+ */
+static int decode_cxl_error_status(struct trace_seq *s, uint32_t status,
+				   const struct cxl_error_list *cxl_error_list,
+				   uint8_t num_elems)
+{
+	int idx = 0;
+
+	while (idx < num_elems) {
+		const struct cxl_error_list *entry = &cxl_error_list[idx++];
+
+		/* Skip errors that are not flagged in the status word */
+		if (!(status & entry->bit))
+			continue;
+
+		if (trace_seq_printf(s, "\'%s\' ", entry->error) <= 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Handler for the cxl:cxl_aer_uncorrectable_error trace event.
+ *
+ * Reads the memdev, host, serial, status, first_error and header_log fields
+ * from the record, prints them (preceded by a timestamp) to the trace_seq
+ * and, depending on the build options, stores the event in the SQLite
+ * database and reports it to ABRT.
+ *
+ * context is the struct ras_events of the daemon.
+ *
+ * Returns 0 on success, -1 when a field cannot be read or printed.
+ */
+int ras_cxl_aer_ue_event_handler(struct trace_seq *s,
+				 struct tep_record *record,
+				 struct tep_event *event, void *context)
+{
+	int len, i;
+	unsigned long long val;
+	time_t now;
+	struct tm *tm;
+	struct ras_events *ras = context;
+	struct ras_cxl_aer_ue_event ev;
+
+	memset(&ev, 0, sizeof(ev));
+	now = record->ts / user_hz + ras->uptime_diff;
+	tm = localtime(&now);
+	if (tm)
+		strftime(ev.timestamp, sizeof(ev.timestamp),
+			 "%Y-%m-%d %H:%M:%S %z", tm);
+	else
+		strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp));
+	if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
+		return -1;
+
+	ev.memdev = tep_get_field_raw(s, event, "memdev",
+				      record, &len, 1);
+	if (!ev.memdev)
+		return -1;
+	if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0)
+		return -1;
+
+	ev.host = tep_get_field_raw(s, event, "host",
+				    record, &len, 1);
+	if (!ev.host)
+		return -1;
+	if (trace_seq_printf(s, "host:%s ", ev.host) <= 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0)
+		return -1;
+	ev.serial = val;
+	if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "status", record, &val, 1) < 0)
+		return -1;
+	ev.error_status = val;
+
+	if (trace_seq_printf(s, "error status:") <= 0)
+		return -1;
+	if (decode_cxl_error_status(s, ev.error_status,
+				    cxl_aer_ue, ARRAY_SIZE(cxl_aer_ue)) < 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "first_error", record, &val, 1) < 0)
+		return -1;
+	ev.first_error = val;
+
+	if (trace_seq_printf(s, "first error:") <= 0)
+		return -1;
+	if (decode_cxl_error_status(s, ev.first_error,
+				    cxl_aer_ue, ARRAY_SIZE(cxl_aer_ue)) < 0)
+		return -1;
+
+	ev.header_log = tep_get_field_raw(s, event, "header_log",
+					  record, &len, 1);
+	if (!ev.header_log)
+		return -1;
+	/* The loop below reads and rewrites CXL_HEADERLOG_SIZE bytes of the
+	 * field, so reject a field that is shorter than that.
+	 */
+	if (len < CXL_HEADERLOG_SIZE)
+		return -1;
+	if (trace_seq_printf(s, "header log:\n") <= 0)
+		return -1;
+	for (i = 0; i < CXL_HEADERLOG_SIZE_U32; i++) {
+		if (trace_seq_printf(s, "%08x ", ev.header_log[i]) <= 0)
+			break;
+		if ((i > 0) && ((i % 20) == 0))
+			if (trace_seq_printf(s, "\n") <= 0)
+				break;
+		/* Convert the header log data to the big-endian format because
+		 * the SQLite database seems to use big-endian storage.
+		 */
+		ev.header_log[i] = htobe32(ev.header_log[i]);
+	}
+	/* Leaving the loop early means one of the prints above failed */
+	if (i < CXL_HEADERLOG_SIZE_U32)
+		return -1;
+
+	/* Insert data into the database */
+#ifdef HAVE_SQLITE3
+	ras_store_cxl_aer_ue_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+	/* Report event to ABRT */
+	ras_report_cxl_aer_ue_event(ras, &ev);
+#endif
+
+	return 0;
+}
diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h
index 84d5cc6..35efadd 100644
--- a/ras-cxl-handler.h
+++ b/ras-cxl-handler.h
@@ -21,4 +21,8 @@
int ras_cxl_poison_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
+
+int ras_cxl_aer_ue_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context);
#endif
diff --git a/ras-events.c b/ras-events.c
index f95844a..5d73df1 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -246,6 +246,7 @@ int toggle_ras_mc_event(int enable)
#ifdef HAVE_CXL
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable);
+ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable);
#endif
free_ras:
@@ -992,6 +993,14 @@ int handle_ras_events(int record_events)
else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
"cxl", "cxl_poison");
+
+ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_aer_uncorrectable_error",
+ ras_cxl_aer_ue_event_handler, NULL, CXL_AER_UE_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "cxl", "cxl_aer_uncorrectable_error");
#endif
if (!num_events) {
diff --git a/ras-events.h b/ras-events.h
index 1ef3ecd..4acbe57 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -40,6 +40,7 @@ enum {
DISKERROR_EVENT,
MF_EVENT,
CXL_POISON_EVENT,
+ CXL_AER_UE_EVENT,
NR_EVENTS
};
diff --git a/ras-record.c b/ras-record.c
index c31baa0..97a2a37 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -622,6 +622,57 @@ int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_eve
return rc;
}
+
+/*
+ * Table and functions to handle cxl:cxl_aer_uncorrectable_error
+ */
+static const struct db_fields cxl_aer_ue_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "memdev", .type = "TEXT" },
+ { .name = "host", .type = "TEXT" },
+ { .name = "serial", .type = "INTEGER" },
+ { .name = "error_status", .type = "INTEGER" },
+ { .name = "first_error", .type = "INTEGER" },
+ { .name = "header_log", .type = "BLOB" },
+};
+
+static const struct db_table_descriptor cxl_aer_ue_event_tab = {
+ .name = "cxl_aer_ue_event",
+ .fields = cxl_aer_ue_event_fields,
+ .num_fields = ARRAY_SIZE(cxl_aer_ue_event_fields),
+};
+
+/*
+ * Insert one cxl_aer_ue_event row using the prepared statement kept in
+ * ras->db_priv.
+ *
+ * Returns 0 without doing anything when there is no database context or no
+ * prepared statement; otherwise returns the result of sqlite3_reset().
+ */
+int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev)
+{
+	struct sqlite3_priv *priv = ras->db_priv;
+	sqlite3_stmt *stmt;
+	int rc;
+
+	if (!priv)
+		return 0;
+
+	stmt = priv->stmt_cxl_aer_ue_event;
+	if (!stmt)
+		return 0;
+
+	log(TERM, LOG_INFO, "cxl_aer_ue_event store: %p\n", stmt);
+
+	/* Parameter indexes follow the table columns after the "id" key */
+	sqlite3_bind_text(stmt, 1, ev->timestamp, -1, NULL);
+	sqlite3_bind_text(stmt, 2, ev->memdev, -1, NULL);
+	sqlite3_bind_text(stmt, 3, ev->host, -1, NULL);
+	sqlite3_bind_int64(stmt, 4, ev->serial);
+	sqlite3_bind_int(stmt, 5, ev->error_status);
+	sqlite3_bind_int(stmt, 6, ev->first_error);
+	sqlite3_bind_blob(stmt, 7, ev->header_log, CXL_HEADERLOG_SIZE, NULL);
+
+	rc = sqlite3_step(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed to do cxl_aer_ue_event step on sqlite: error = %d\n", rc);
+
+	rc = sqlite3_reset(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed reset cxl_aer_ue_event on sqlite: error = %d\n",
+		    rc);
+
+	log(TERM, LOG_INFO, "register inserted at db\n");
+
+	return rc;
+}
#endif
/*
@@ -973,6 +1024,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
if (rc != SQLITE_OK)
goto error;
}
+
+ rc = ras_mc_create_table(priv, &cxl_aer_ue_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_aer_ue_event,
+ &cxl_aer_ue_event_tab);
+ if (rc != SQLITE_OK)
+ goto error;
+ }
#endif
ras->db_priv = priv;
@@ -1102,6 +1161,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
"cpu %u: Failed to finalize cxl_poison_event sqlite: error = %d\n",
cpu, rc);
}
+
+ if (priv->stmt_cxl_aer_ue_event) {
+ rc = sqlite3_finalize(priv->stmt_cxl_aer_ue_event);
+ if (rc != SQLITE_OK)
+ log(TERM, LOG_ERR,
+ "cpu %u: Failed to finalize cxl_aer_ue_event sqlite: error = %d\n",
+ cpu, rc);
+ }
#endif
rc = sqlite3_close_v2(db);
diff --git a/ras-record.h b/ras-record.h
index fd15215..f11985f 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -130,6 +130,20 @@ struct ras_cxl_poison_event {
char overflow_ts[64];
};
+#define SZ_512 0x200
+#define CXL_HEADERLOG_SIZE SZ_512
+#define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t))
+
+struct ras_cxl_aer_ue_event {
+ char timestamp[64];
+ const char *memdev;
+ const char *host;
+ uint64_t serial;
+ uint32_t error_status;
+ uint32_t first_error;
+ uint32_t *header_log;
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
@@ -140,6 +154,7 @@ struct devlink_event;
struct diskerror_event;
struct ras_mf_event;
struct ras_cxl_poison_event;
+struct ras_cxl_aer_ue_event;
#ifdef HAVE_SQLITE3
@@ -174,6 +189,7 @@ struct sqlite3_priv {
#endif
#ifdef HAVE_CXL
sqlite3_stmt *stmt_cxl_poison_event;
+ sqlite3_stmt *stmt_cxl_aer_ue_event;
#endif
};
@@ -203,6 +219,7 @@ int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev);
int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
+int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -217,6 +234,7 @@ static inline int ras_store_devlink_event(struct ras_events *ras, struct devlink
static inline int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
+static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
#endif
diff --git a/ras-report.c b/ras-report.c
index 3daecc0..2ebdc80 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -371,6 +371,32 @@ static int set_cxl_poison_event_backtrace(char *buf, struct ras_cxl_poison_event
return 0;
}
+/*
+ * Format the BACKTRACE item of a CXL AER uncorrectable error event and
+ * append it to buf with strcat().
+ *
+ * Returns 0 on success, -1 when buf or ev is NULL.
+ */
+static int set_cxl_aer_ue_event_backtrace(char *buf, struct ras_cxl_aer_ue_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+
+	if (!buf || !ev)
+		return -1;
+
+	/*
+	 * memdev and host are variable-length strings, so bound the write to
+	 * bt_buf. serial is a uint64_t: cast it to match %llx on every ABI.
+	 */
+	snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+		 "timestamp=%s\n"
+		 "memdev=%s\n"
+		 "host=%s\n"
+		 "serial=0x%llx\n"
+		 "error_status=%u\n"
+		 "first_error=%u\n",
+		 ev->timestamp,
+		 ev->memdev,
+		 ev->host,
+		 (unsigned long long)ev->serial,
+		 ev->error_status,
+		 ev->first_error);
+
+	strcat(buf, bt_buf);
+
+	return 0;
+}
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -411,6 +437,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
case CXL_POISON_EVENT:
rc = set_cxl_poison_event_backtrace(buf, (struct ras_cxl_poison_event *)ev);
break;
+ case CXL_AER_UE_EVENT:
+ rc = set_cxl_aer_ue_event_backtrace(buf, (struct ras_cxl_aer_ue_event *)ev);
+ break;
default:
return -1;
}
@@ -863,3 +892,47 @@ cxl_poison_fail:
else
return -1;
}
+
+/*
+ * Send a CXL AER uncorrectable error report over the report socket.
+ *
+ * ras is not used by this function. ev is forwarded unchanged to
+ * commit_report_backtrace().
+ *
+ * The basic fields, the backtrace, and the ANALYZER and REASON items are
+ * written in that order; each item is sent together with its terminating NUL.
+ *
+ * Returns 0 when every item was written in full, -1 otherwise.
+ */
+int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	size_t len;
+	int sockfd;
+	int ret = -1;
+	int rc;
+
+	memset(buf, 0, sizeof(buf));
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	rc = commit_report_basic(sockfd);
+	if (rc < 0)
+		goto cxl_aer_ue_fail;
+
+	rc = commit_report_backtrace(sockfd, CXL_AER_UE_EVENT, ev);
+	if (rc < 0)
+		goto cxl_aer_ue_fail;
+
+	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-cxl-aer-uncorrectable-error");
+	len = strlen(buf) + 1;
+	rc = write(sockfd, buf, len);
+	/*
+	 * Test the sign before the length: comparing a negative int directly
+	 * with a size_t converts it to a huge unsigned value, which would make
+	 * a failed write() look like a complete one.
+	 */
+	if (rc < 0 || (size_t)rc < len)
+		goto cxl_aer_ue_fail;
+
+	snprintf(buf, sizeof(buf), "REASON=%s", "CXL AER uncorrectable error");
+	len = strlen(buf) + 1;
+	rc = write(sockfd, buf, len);
+	if (rc < 0 || (size_t)rc < len)
+		goto cxl_aer_ue_fail;
+
+	ret = 0;
+
+cxl_aer_ue_fail:
+	/* sockfd is known to be valid here: the only earlier exit is the return above */
+	close(sockfd);
+
+	return ret;
+}
diff --git a/ras-report.h b/ras-report.h
index d1591ce..dfe89d1 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -40,6 +40,7 @@ int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev);
int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev);
int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
+int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
#else
@@ -52,6 +53,7 @@ static inline int ras_report_devlink_event(struct ras_events *ras, struct devlin
static inline int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; };
static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
+static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
#endif

View File

@ -0,0 +1,116 @@
commit 81b362f0412eb9769098c2f4317b84b9bd82cce9
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Feb 12 10:35:25 2024 +0000
rasdaemon: ras-mc-ctl: Add support for CXL AER correctable trace events
Add support for CXL AER correctable events to the ras-mc-ctl tool.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
(cherry picked from commit ae1647624486fca0070b297d0e2fd4e53443c10b)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index c0a2ec6..9519279 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1230,6 +1230,46 @@ sub get_cxl_ue_error_status_text
return join (", ", @out);
}
+use constant {
+ CXL_AER_CE_CACHE_DATA_ECC => 0x0001,
+ CXL_AER_CE_MEM_DATA_ECC => 0x0002,
+ CXL_AER_CE_CRC_THRESH => 0x0004,
+ CXL_AER_CE_RETRY_THRESH => 0x0008,
+ CXL_AER_CE_CACHE_POISON => 0x0010,
+ CXL_AER_CE_MEM_POISON => 0x0020,
+ CXL_AER_CE_PHYS_LAYER_ERR => 0x0040,
+};
+
+# Build the text for a CXL AER correctable error status bitmask: the quoted
+# description of every bit that is set, in bit order, joined with ", ".
+sub get_cxl_ce_error_status_text
+{
+    my ($error_status) = @_;
+
+    # [ bit mask, description ] pairs, lowest bit first
+    my @descriptions = (
+        [CXL_AER_CE_CACHE_DATA_ECC, "\'Cache Data ECC Error\' "],
+        [CXL_AER_CE_MEM_DATA_ECC, "\'Memory Data ECC Error\' "],
+        [CXL_AER_CE_CRC_THRESH, "\'CRC Threshold Hit\' "],
+        [CXL_AER_CE_RETRY_THRESH, "\'Retry Threshold\' "],
+        [CXL_AER_CE_CACHE_POISON, "\'Received Cache Poison From Peer\' "],
+        [CXL_AER_CE_MEM_POISON, "\'Received Memory Poison From Peer\' "],
+        [CXL_AER_CE_PHYS_LAYER_ERR, "\'Received Error From Physical Layer\' "],
+    );
+
+    my @out = map { $_->[1] } grep { $error_status & $_->[0] } @descriptions;
+
+    return join (", ", @out);
+}
+
sub summary
{
require DBI;
@@ -1310,6 +1350,22 @@ sub summary
print "No CXL AER uncorrectable errors.\n\n";
}
$query_handle->finish;
+
+ # CXL AER correctable errors
+ $query = "select memdev, count(*) from cxl_aer_ce_event$conf{opt}{since} group by memdev";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($memdev, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$memdev errors: $count\n";
+ }
+ if ($out ne "") {
+ print "CXL AER correctable events summary:\n$out\n";
+ } else {
+ print "No CXL AER correctable errors.\n\n";
+ }
+ $query_handle->finish;
}
# extlog errors
@@ -1519,6 +1575,29 @@ sub errors
print "No CXL AER uncorrectable errors.\n\n";
}
$query_handle->finish;
+
+ # CXL AER correctable errors
+ $query = "select id, timestamp, memdev, host, serial, error_status from cxl_aer_ce_event$conf{opt}{since} order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $error_status));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev);
+ $out .= "host=$host, " if (defined $host && length $host);
+ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial);
+ if (defined $error_status && length $error_status) {
+ $out .= sprintf "error_status: %s, ", get_cxl_ce_error_status_text($error_status);
+ }
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "CXL AER correctable events:\n$out\n";
+ } else {
+ print "No CXL AER correctable errors.\n\n";
+ }
+ $query_handle->finish;
}
# Extlog errors

View File

@ -0,0 +1,161 @@
commit b2e5a6821fae4278cc37803a223a5a64bf50c8cc
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Feb 12 11:29:13 2024 +0000
rasdaemon: ras-mc-ctl: Add support for CXL memory module trace events
Add support for CXL memory module events to the ras-mc-ctl tool.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
(cherry picked from commit aee13f74266382c64128bd7367a5eeb46277f490)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 5e45889..5e120d9 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1376,6 +1376,70 @@ sub get_cxl_transaction_type
return $types[$_[0]];
}
+# Map a CXL device event type code (0..5) to its name; any other value
+# yields "unknown-type".
+sub get_cxl_dev_event_type
+{
+    my ($type) = @_;
+    my @types = ("Health Status Change",
+                 "Media Status Change",
+                 "Life Used Change",
+                 "Temperature Change",
+                 "Data Path Error",
+                 "LSA Error");
+
+    # $#types is the last valid index of the table above
+    return "unknown-type" if ($type < 0 || $type > $#types);
+
+    return $types[$type];
+}
+
+use constant {
+ CXL_DHI_HS_MAINTENANCE_NEEDED => 0x0001,
+ CXL_DHI_HS_PERFORMANCE_DEGRADED => 0x0002,
+ CXL_DHI_HS_HW_REPLACEMENT_NEEDED => 0x0004,
+};
+
+# Build the text for a CXL device health status bitmask: the quoted name of
+# every flag that is set, in bit order, joined with ", ".
+sub get_cxl_health_status_text
+{
+    my ($flags) = @_;
+
+    # [ bit mask, description ] pairs, lowest bit first
+    my @descriptions = (
+        [CXL_DHI_HS_MAINTENANCE_NEEDED, "\'MAINTENANCE_NEEDED\' "],
+        [CXL_DHI_HS_PERFORMANCE_DEGRADED, "\'PERFORMANCE_DEGRADED\' "],
+        [CXL_DHI_HS_HW_REPLACEMENT_NEEDED, "\'REPLACEMENT_NEEDED\' "],
+    );
+
+    my @out = map { $_->[1] } grep { $flags & $_->[0] } @descriptions;
+
+    return join (", ", @out);
+}
+
+# Map a CXL media status code (0..9) to its description; any other value
+# yields "unknown".
+sub get_cxl_media_status
+{
+    my ($status) = @_;
+    my @types = ("Normal",
+                 "Not Ready",
+                 "Write Persistency Lost",
+                 "All Data Lost",
+                 "Write Persistency Loss in the Event of Power Loss",
+                 "Write Persistency Loss in Event of Shutdown",
+                 "Write Persistency Loss Imminent",
+                 "All Data Loss in Event of Power Loss",
+                 "All Data loss in the Event of Shutdown",
+                 "All Data Loss Imminent");
+
+    # $#types is the last valid index of the table above
+    return "unknown" if ($status < 0 || $status > $#types);
+
+    return $types[$status];
+}
+
sub summary
{
require DBI;
@@ -1552,6 +1616,22 @@ sub summary
print "No CXL DRAM errors.\n\n";
}
$query_handle->finish;
+
+ # CXL memory module errors
+ $query = "select memdev, count(*) from cxl_memory_module_event$conf{opt}{since} group by memdev";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($memdev, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$memdev errors: $count\n";
+ }
+ if ($out ne "") {
+ print "CXL memory module events summary:\n$out\n";
+ } else {
+ print "No CXL memory module errors.\n\n";
+ }
+ $query_handle->finish;
}
# extlog errors
@@ -1665,6 +1745,7 @@ sub errors
my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data);
my ($dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id);
my ($nibble_mask, $bank_group, $row, $column, $cor_mask);
+ my ($event_type, $health_status, $media_status, $life_used, $dirty_shutdown_cnt, $cor_vol_err_cnt, $cor_per_err_cnt, $device_temp, $add_status);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1966,6 +2047,42 @@ sub errors
} else {
print "No CXL DRAM errors.\n\n";
}
+
+ # CXL memory module errors
+ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, event_type, health_status, media_status, life_used, dirty_shutdown_cnt, cor_vol_err_cnt, cor_per_err_cnt, device_temp, add_status from cxl_memory_module_event$conf{opt}{since} order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $event_type, $health_status, $media_status, $life_used, $dirty_shutdown_cnt, $cor_vol_err_cnt, $cor_per_err_cnt, $device_temp, $add_status));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev);
+ $out .= "host=$host, " if (defined $host && length $host);
+ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial);
+ $out .= "log=$log_type, " if (defined $log_type && length $log_type);
+ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid);
+ $out .= sprintf "hdr_flags=0x%llx, %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags);
+ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle);
+ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle);
+ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts);
+ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length);
+ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class);
+ $out .= sprintf "event_type: %s, ", get_cxl_dev_event_type($event_type) if (defined $event_type && length $event_type);
+ $out .= sprintf "health_status: %s, ", get_cxl_health_status_text($health_status) if (defined $health_status && length $health_status);
+ $out .= sprintf "media_status: %s, ", get_cxl_media_status($media_status) if (defined $media_status && length $media_status);
+ $out .= sprintf "life_used=%u, ", $life_used if (defined $life_used && length $life_used);
+ $out .= sprintf "dirty_shutdown_cnt=%u, ", $dirty_shutdown_cnt if (defined $dirty_shutdown_cnt && length $dirty_shutdown_cnt);
+ $out .= sprintf "cor_vol_err_cnt=%u, ", $cor_vol_err_cnt if (defined $cor_vol_err_cnt && length $cor_vol_err_cnt);
+ $out .= sprintf "cor_per_err_cnt=%u, ", $cor_per_err_cnt if (defined $cor_per_err_cnt && length $cor_per_err_cnt);
+ $out .= sprintf "device_temp=%u, ", $device_temp if (defined $device_temp && length $device_temp);
+ $out .= sprintf "add_status=%u ", $add_status if (defined $add_status && length $add_status);
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "CXL memory module events:\n$out\n";
+ } else {
+ print "No CXL memory module errors.\n\n";
+ }
}
# Extlog errors

View File

@ -0,0 +1,75 @@
commit 25ef3044f38224d653d880fb9f20be9e7c9bf570
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Feb 12 10:38:51 2024 +0000
rasdaemon: ras-mc-ctl: Add support for CXL overflow trace events
Add support for CXL overflow events to the ras-mc-ctl tool.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
(cherry picked from commit b22cb067755f4604770f9864a0babed8f93a1553)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 9519279..6a319a7 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1366,6 +1366,22 @@ sub summary
print "No CXL AER correctable errors.\n\n";
}
$query_handle->finish;
+
+ # CXL overflow errors
+ $query = "select memdev, count(*) from cxl_overflow_event$conf{opt}{since} group by memdev";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($memdev, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$memdev errors: $count\n";
+ }
+ if ($out ne "") {
+ print "CXL overflow events summary:\n$out\n";
+ } else {
+ print "No CXL overflow errors.\n\n";
+ }
+ $query_handle->finish;
}
# extlog errors
@@ -1474,6 +1490,7 @@ sub errors
my ($error_count, $affinity, $mpidr, $r_state, $psci_state);
my ($pfn, $page_type, $action_result);
my ($memdev, $host, $serial, $error_status, $first_error, $header_log);
+ my ($log_type, $first_ts, $last_ts);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1598,6 +1615,27 @@ sub errors
print "No CXL AER correctable errors.\n\n";
}
$query_handle->finish;
+
+ # CXL overflow errors
+ $query = "select id, timestamp, memdev, host, serial, log_type, count, first_ts, last_ts from cxl_overflow_event$conf{opt}{since} order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $count, $first_ts, $last_ts));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev);
+ $out .= "host=$host, " if (defined $host && length $host);
+ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial);
+ $out .= "log=$log_type, " if (defined $log_type && length $log_type);
+ $out .= sprintf "%u records from $first_ts to $last_ts", $count if (defined $count && length $count);
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "CXL overflow events:\n$out\n";
+ } else {
+ print "No CXL overflow errors.\n\n";
+ }
}
# Extlog errors

View File

@ -0,0 +1,267 @@
commit bd27251e3d52f57be1e245dff1cf221e09c5686f
Author: Marcus Sundman <sundman@iki.fi>
Date: Thu Apr 20 18:17:17 2023 +0300
ras-mc-ctl: add option to exclude old events from reports
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 5e120d9..712a105 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -97,6 +97,7 @@ Usage: $prog [OPTIONS...]
--summary Presents a summary of the logged errors.
--errors Shows the errors stored at the error database.
--error-count Shows the corrected and uncorrected error counts using sysfs.
+ --since=YYYY-MM-DD Only include events since the date YYYY-MM-DD.
--vendor-errors-summary <platform-id> Presents a summary of the vendor-specific logged errors.
--vendor-errors <platform-id> Shows the vendor-specific errors stored in the error database.
--vendor-errors <platform-id> <module-name> Shows the vendor-specific errors for a specific module stored in the error database.
@@ -177,6 +178,7 @@ sub parse_cmdline
$conf{opt}{error_count} = 0;
$conf{opt}{vendor_errors_summary} = 0;
$conf{opt}{vendor_errors} = 0;
+ $conf{opt}{since} = '';
$conf{opt}{vendor_platforms} = 0;
my $rref = \$conf{opt}{report};
@@ -198,6 +200,7 @@ sub parse_cmdline
"error-count" => \$conf{opt}{error_count},
"vendor-errors-summary" => \$conf{opt}{vendor_errors_summary},
"vendor-errors" => \$conf{opt}{vendor_errors},
+ "since=s" => \$conf{opt}{since},
"vendor-platforms" => \$conf{opt}{vendor_platforms},
);
@@ -209,6 +212,14 @@ sub parse_cmdline
log_error ("Only use --delay with --register-labels\n");
exit (1);
}
+
+ if ($conf{opt}{since}) {
+ if ($conf{opt}{since} !~ /^20\d\d-[01]\d-[0-3]\d/) {
+ log_error ("--since requires a date like yyyy-mm-dd where yyyy is the year, mm the month, and dd the day\n");
+ exit (1);
+ }
+ $conf{opt}{since} = " where timestamp>='$conf{opt}{since}'";
+ }
}
sub usage
@@ -1452,7 +1463,7 @@ sub summary
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
# Memory controller mc_event errors
- $query = "select err_type, label, mc, top_layer,middle_layer,lower_layer, count(*) from mc_event group by err_type, label, mc, top_layer, middle_layer, lower_layer";
+ $query = "select err_type, label, mc, top_layer,middle_layer,lower_layer, count(*) from mc_event$conf{opt}{since} group by err_type, label, mc, top_layer, middle_layer, lower_layer";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($err_type, $label, $mc, $top, $mid, $low, $count));
@@ -1469,7 +1480,7 @@ sub summary
# PCIe AER aer_event errors
if ($has_aer == 1) {
- $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
+ $query = "select err_type, err_msg, count(*) from aer_event$conf{opt}{since} group by err_type, err_msg";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($err_type, $msg, $count));
@@ -1487,7 +1498,7 @@ sub summary
# ARM processor arm_event errors
if ($has_arm == 1) {
- $query = "select mpidr, count(*) from arm_event group by mpidr";
+ $query = "select mpidr, count(*) from arm_event$conf{opt}{since} group by mpidr";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($mpidr, $count));
@@ -1636,7 +1647,7 @@ sub summary
# extlog errors
if ($has_extlog == 1) {
- $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
+ $query = "select etype, severity, count(*) from extlog_event$conf{opt}{since} group by etype, severity";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($etype, $severity, $count));
@@ -1656,7 +1667,7 @@ sub summary
# devlink errors
if ($has_devlink == 1) {
- $query = "select dev_name, count(*) from devlink_event group by dev_name";
+ $query = "select dev_name, count(*) from devlink_event$conf{opt}{since} group by dev_name";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($dev_name, $count));
@@ -1674,7 +1685,7 @@ sub summary
# Disk errors
if ($has_disk_errors == 1) {
- $query = "select dev, count(*) from disk_errors group by dev";
+ $query = "select dev, count(*) from disk_errors$conf{opt}{since} group by dev";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($dev, $count));
@@ -1692,7 +1703,7 @@ sub summary
# Memory failure errors
if ($has_mem_failure == 1) {
- $query = "select action_result, count(*) from memory_failure_event group by action_result";
+ $query = "select action_result, count(*) from memory_failure_event$conf{opt}{since} group by action_result";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($action_result, $count));
@@ -1710,7 +1721,7 @@ sub summary
# MCE mce_record errors
if ($has_mce == 1) {
- $query = "select error_msg, count(*) from mce_record group by error_msg";
+ $query = "select error_msg, count(*) from mce_record$conf{opt}{since} group by error_msg";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($msg, $count));
@@ -1750,7 +1761,7 @@ sub errors
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
# Memory controller mc_event errors
- $query = "select id, timestamp, err_count, err_type, err_msg, label, mc, top_layer,middle_layer,lower_layer, address, grain, syndrome, driver_detail from mc_event order by id";
+ $query = "select id, timestamp, err_count, err_type, err_msg, label, mc, top_layer,middle_layer,lower_layer, address, grain, syndrome, driver_detail from mc_event$conf{opt}{since} order by id";
$query_handle = $dbh->prepare($query);
if (!$query_handle) {
log_error ("mc_event table missing from $dbname. Run 'rasdaemon --record'.\n");
@@ -1771,7 +1782,7 @@ sub errors
# PCIe AER aer_event errors
if ($has_aer == 1) {
- $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event order by id";
+ $query = "select id, timestamp, dev_name, err_type, err_msg from aer_event$conf{opt}{since} order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($id, $time, $devname, $type, $msg));
@@ -1789,7 +1800,7 @@ sub errors
# ARM processor arm_event errors
if ($has_arm == 1) {
- $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event$conf{opt}{since} order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
@@ -2087,7 +2098,7 @@ sub errors
# Extlog errors
if ($has_extlog == 1) {
- $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event$conf{opt}{since} order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
@@ -2114,7 +2125,7 @@ sub errors
# devlink errors
if ($has_devlink == 1) {
- $query = "select id, timestamp, bus_name, dev_name, driver_name, reporter_name, msg from devlink_event order by id";
+ $query = "select id, timestamp, bus_name, dev_name, driver_name, reporter_name, msg from devlink_event$conf{opt}{since} order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($id, $timestamp, $bus_name, $dev_name, $driver_name, $reporter_name, $msg));
@@ -2138,7 +2149,7 @@ sub errors
# Disk errors
if ($has_disk_errors == 1) {
- $query = "select id, timestamp, dev, sector, nr_sector, error, rwbs, cmd from disk_errors order by id";
+ $query = "select id, timestamp, dev, sector, nr_sector, error, rwbs, cmd from disk_errors$conf{opt}{since} order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($id, $timestamp, $dev, $sector, $nr_sector, $error, $rwbs, $cmd));
@@ -2163,7 +2174,7 @@ sub errors
# Memory failure errors
if ($has_mem_failure == 1) {
- $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event order by id";
+ $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event$conf{opt}{since} order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($id, $timestamp, $pfn, $page_type, $action_result));
@@ -2182,7 +2193,7 @@ sub errors
# MCE mce_record errors
if ($has_mce == 1) {
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record$conf{opt}{since} order by id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
@@ -2251,7 +2262,7 @@ sub vendor_errors_summary
# HiSilicon KunPeng9xx errors
if ($platform_id eq HISILICON_KUNPENG_9XX) {
$found_platform = 1;
- $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2 group by err_severity, module_id";
+ $query = "select err_severity, module_id, count(*) from hip08_oem_type1_event_v2$conf{opt}{since} group by err_severity, module_id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($err_severity, $module_id, $count));
@@ -2269,7 +2280,7 @@ sub vendor_errors_summary
}
$query_handle->finish;
- $query = "select err_severity, module_id, count(*) from hip08_oem_type2_event_v2 group by err_severity, module_id";
+ $query = "select err_severity, module_id, count(*) from hip08_oem_type2_event_v2$conf{opt}{since} group by err_severity, module_id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($err_severity, $module_id, $count));
@@ -2287,7 +2298,7 @@ sub vendor_errors_summary
}
$query_handle->finish;
- $query = "select err_severity, sub_module_id, count(*) from hip08_pcie_local_event_v2 group by err_severity, sub_module_id";
+ $query = "select err_severity, sub_module_id, count(*) from hip08_pcie_local_event_v2$conf{opt}{since} group by err_severity, sub_module_id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($err_severity, $sub_module_id, $count));
@@ -2305,7 +2316,7 @@ sub vendor_errors_summary
}
$query_handle->finish;
- $query = "select err_severity, module_id, count(*) from hisi_common_section_v2 group by err_severity, module_id";
+ $query = "select err_severity, module_id, count(*) from hisi_common_section_v2$conf{opt}{since} group by err_severity, module_id";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($err_severity, $module_id, $count));
@@ -2359,7 +2370,7 @@ sub vendor_errors
# HiSilicon KunPeng9xx errors
if ($platform_id eq HISILICON_KUNPENG_9XX) {
$found_platform = 1;
- $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2 order by id, module_id, err_severity";
+ $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type1_event_v2$conf{opt}{since} order by id, module_id, err_severity";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs));
@@ -2384,7 +2395,7 @@ sub vendor_errors
}
$query_handle->finish;
- $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type2_event_v2 order by id, module_id, err_severity";
+ $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, module_id, sub_module_id, err_severity, regs_dump from hip08_oem_type2_event_v2$conf{opt}{since} order by id, module_id, err_severity";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $module_id, $sub_module_id, $err_severity, $regs));
@@ -2409,7 +2420,7 @@ sub vendor_errors
}
$query_handle->finish;
- $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, sub_module_id, core_id, port_id, err_severity, err_type, regs_dump from hip08_pcie_local_event_v2 order by id, sub_module_id, err_severity";
+ $query = "select id, timestamp, version, soc_id, socket_id, nimbus_id, sub_module_id, core_id, port_id, err_severity, err_type, regs_dump from hip08_pcie_local_event_v2$conf{opt}{since} order by id, sub_module_id, err_severity";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $nimbus_id, $sub_module_id, $core_id, $port_id, $err_severity, $err_type, $regs));
@@ -2436,7 +2447,7 @@ sub vendor_errors
}
$query_handle->finish;
- $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2 order by id, module_id, err_severity";
+ $query = "select id, timestamp, version, soc_id, socket_id, totem_id, nimbus_id, sub_system_id, module_id, sub_module_id, core_id, port_id, err_type, pcie_info, err_severity, regs_dump from hisi_common_section_v2$conf{opt}{since} order by id, module_id, err_severity";
$query_handle = $dbh->prepare($query);
$query_handle->execute();
$query_handle->bind_columns(\($id, $timestamp, $version, $soc_id, $socket_id, $totem_id, $nimbus_id, $sub_system_id, $module_id, $sub_module_id, $core_id, $port_id, $err_type, $pcie_info, $err_severity, $regs));

View File

@ -0,0 +1,101 @@
commit 703e0f8eabbe1e191a8bd85632066c155ec1f4fa
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Feb 12 11:22:03 2024 +0000
rasdaemon: ras-mc-ctl: Add support for CXL DRAM trace events
Add support for CXL DRAM events to the ras-mc-ctl tool.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
(cherry picked from commit c38c14afc5d7bb6c8c52d1023271d755deb23008)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 99b3c10..5e45889 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1536,6 +1536,22 @@ sub summary
print "No CXL general media errors.\n\n";
}
$query_handle->finish;
+
+ # CXL DRAM errors
+ $query = "select memdev, count(*) from cxl_dram_event$conf{opt}{since} group by memdev";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($memdev, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$memdev errors: $count\n";
+ }
+ if ($out ne "") {
+ print "CXL DRAM events summary:\n$out\n";
+ } else {
+ print "No CXL DRAM errors.\n\n";
+ }
+ $query_handle->finish;
}
# extlog errors
@@ -1648,6 +1664,7 @@ sub errors
my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts);
my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data);
my ($dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id);
+ my ($nibble_mask, $bank_group, $row, $column, $cor_mask);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1902,6 +1919,53 @@ sub errors
} else {
print "No CXL general media errors.\n\n";
}
+
+ # CXL DRAM errors
+ use constant CXL_EVENT_DER_CORRECTION_MASK_SIZE => 0x20;
+ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, dpa, dpa_flags, descriptor, type, transaction_type, channel, rank, nibble_mask, bank_group, bank, row, column, cor_mask from cxl_dram_event$conf{opt}{since} order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $dpa, $dpa_flags, $descriptor, $type, $transaction_type, $channel, $rank, $nibble_mask, $bank_group, $bank, $row, $column, $cor_mask));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev);
+ $out .= "host=$host, " if (defined $host && length $host);
+ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial);
+ $out .= "log=$log_type, " if (defined $log_type && length $log_type);
+ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid);
+ $out .= sprintf "hdr_flags=0x%llx, %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags);
+ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle);
+ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle);
+ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts);
+ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length);
+ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class);
+ $out .= sprintf "dpa=0x%llx, ", $dpa if (defined $dpa && length $dpa);
+ $out .= sprintf "dpa_flags: %s, ", get_cxl_dpa_flags_text($dpa_flags) if (defined $dpa_flags && length $dpa_flags);
+ $out .= sprintf "descriptor_flags: %s, ", get_cxl_descriptor_flags_text($descriptor) if (defined $descriptor && length $descriptor);
+ $out .= sprintf "memory event type: %s, ", get_cxl_mem_event_type($type) if (defined $type && length $type);
+ $out .= sprintf "transaction_type: %s, ", get_cxl_transaction_type($transaction_type) if (defined $transaction_type && length $transaction_type);
+ $out .= sprintf "channel=%u, ", $channel if (defined $channel && length $channel);
+ $out .= sprintf "rank=%u, ", $rank if (defined $rank && length $rank);
+ $out .= sprintf "nibble_mask=%u, ", $nibble_mask if (defined $nibble_mask && length $nibble_mask);
+ $out .= sprintf "bank_group=%u, ", $bank_group if (defined $bank_group && length $bank_group);
+ $out .= sprintf "bank=%u, ", $bank if (defined $bank && length $bank);
+ $out .= sprintf "row=%u, ", $row if (defined $row && length $row);
+ $out .= sprintf "column=%u, ", $column if (defined $column && length $column);
+ if (defined $cor_mask && length $cor_mask) {
+ $out .= sprintf "correction_mask:";
+ my @bytes = unpack "C*", $cor_mask;
+ for (my $i = 0; $i < CXL_EVENT_DER_CORRECTION_MASK_SIZE; $i++) {
+ $out .= sprintf "%02x ", $bytes[$i];
+ }
+ }
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "CXL DRAM events:\n$out\n";
+ } else {
+ print "No CXL DRAM errors.\n\n";
+ }
}
# Extlog errors

View File

@ -0,0 +1,42 @@
commit d3836aa061f677232f99c514247d3dbf80812a1b
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Jan 16 17:13:32 2023 +0000
rasdaemon: Move definition for BIT and BIT_ULL to a common file
Move definition for BIT() and BIT_ULL() to the
common file ras-record.h
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h
index 4d9f938..c360eaf 100644
--- a/ras-non-standard-handler.h
+++ b/ras-non-standard-handler.h
@@ -17,9 +17,6 @@
#include "ras-events.h"
#include <traceevent/event-parse.h>
-#define BIT(nr) (1UL << (nr))
-#define BIT_ULL(nr) (1ULL << (nr))
-
struct ras_ns_ev_decoder {
struct ras_ns_ev_decoder *next;
const char *sec_type;
diff --git a/ras-record.h b/ras-record.h
index d9f7733..219f10b 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -25,6 +25,9 @@
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
+#define BIT(nr) (1UL << (nr))
+#define BIT_ULL(nr) (1ULL << (nr))
+
extern long user_hz;
struct ras_events;

View File

@ -0,0 +1,575 @@
commit e0cde0edf073b939d345aeba0aed23e238dbc53b
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Tue Apr 4 18:49:09 2023 +0100
rasdaemon: Add support for the CXL generic events
Add support to log and record the CXL generic events.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index d4c845e..83ada56 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -56,6 +56,49 @@ static void get_timestamp(struct trace_seq *s, struct tep_record *record,
strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", size);
}
+struct cxl_event_flags { /* pairs one flag bit mask with its printable name */
+ uint32_t bit; /* mask tested against an event's flags word */
+ const char *flag; /* text printed when that mask is set */
+};
+
+/*
+ * Print the name of every entry in @cxl_ev_flags whose bit is set in @flags.
+ * Each name is written to @s as a single-quoted word followed by a space.
+ * @num_elems is the number of entries in @cxl_ev_flags.
+ *
+ * Returns 0 on success, or -1 as soon as a write to @s fails.
+ */
+static int decode_cxl_event_flags(struct trace_seq *s, uint32_t flags,
+				  const struct cxl_event_flags *cxl_ev_flags,
+				  uint8_t num_elems)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx != num_elems; idx++) {
+		const struct cxl_event_flags *entry = &cxl_ev_flags[idx];
+
+		if (!(flags & entry->bit))
+			continue;
+		if (trace_seq_printf(s, "\'%s\' ", entry->flag) <= 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Format the 16 raw bytes at @uu as a lowercase UUID string laid out as
+ * 8-4-4-4-12 hex digits, taking the bytes in the order they are stored.
+ *
+ * The result lives in a function-static buffer, so it is overwritten by the
+ * next call.
+ */
+static char *uuid_be(const char *uu)
+{
+	static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")];
+	const unsigned char *src = (const unsigned char *)uu;
+	char *out = uuid;
+	int idx;
+
+	for (idx = 0; idx < 16; idx++) {
+		out += sprintf(out, "%.2x", src[idx]);
+		/* a dash follows bytes 3, 5, 7 and 9 */
+		if (idx == 3 || idx == 5 || idx == 7 || idx == 9)
+			*out++ = '-';
+	}
+	*out = 0;
+
+	return uuid;
+}
+
/* Poison List: Payload out flags */
#define CXL_POISON_FLAG_MORE BIT(0)
#define CXL_POISON_FLAG_OVERFLOW BIT(1)
@@ -524,3 +567,145 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s,
return 0;
}
+
+/*
+ * Common Event Record Format
+ * CXL 3.0 section 8.2.9.2.1; Table 8-42
+ *
+ * Flag bits of the "hdr_flags" trace field, and the names that
+ * decode_cxl_event_flags() prints for the bits that are set.
+ */
+#define CXL_EVENT_RECORD_FLAG_PERMANENT BIT(2)
+#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED BIT(3)
+#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED BIT(4)
+#define CXL_EVENT_RECORD_FLAG_HW_REPLACE BIT(5)
+
+static const struct cxl_event_flags cxl_hdr_flags[] = { /* one entry per flag bit defined above */
+ { .bit = CXL_EVENT_RECORD_FLAG_PERMANENT, .flag = "PERMANENT_CONDITION" },
+ { .bit = CXL_EVENT_RECORD_FLAG_MAINT_NEEDED, .flag = "MAINTENANCE_NEEDED" },
+ { .bit = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, .flag = "PERFORMANCE_DEGRADED" },
+ { .bit = CXL_EVENT_RECORD_FLAG_HW_REPLACE, .flag = "HARDWARE_REPLACEMENT_NEEDED" },
+};
+/*
+ * Decode the fields shared by CXL event records into @hdr and print each
+ * one to @s in the order it is read. @context is used as the struct
+ * ras_events pointer that is handed to get_timestamp().
+ *
+ * memdev and host are stored as the raw pointers returned by
+ * tep_get_field_raw(); hdr_uuid is replaced by the string returned by
+ * uuid_be().
+ *
+ * Returns 0 on success, or -1 if a field cannot be read or a write to @s
+ * fails.
+ */
+static int handle_ras_cxl_common_hdr(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context,
+ struct ras_cxl_event_common_hdr *hdr)
+{
+ int len;
+ unsigned long long val;
+ struct ras_events *ras = context;
+
+ get_timestamp(s, record, ras, (char *)&hdr->timestamp, sizeof(hdr->timestamp));
+ if (trace_seq_printf(s, "%s ", hdr->timestamp) <= 0)
+ return -1;
+
+ hdr->memdev = tep_get_field_raw(s, event, "memdev", record, &len, 1);
+ if (!hdr->memdev)
+ return -1;
+ if (trace_seq_printf(s, "memdev:%s ", hdr->memdev) <= 0)
+ return -1;
+
+ hdr->host = tep_get_field_raw(s, event, "host", record, &len, 1);
+ if (!hdr->host)
+ return -1;
+ if (trace_seq_printf(s, "host:%s ", hdr->host) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0)
+ return -1;
+ hdr->serial = val;
+ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)hdr->serial) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "log", record, &val, 1) < 0)
+ return -1;
+ hdr->log_type = cxl_event_log_type_str(val); /* numeric log id is kept only as its string form */
+ if (trace_seq_printf(s, "log type:%s ", hdr->log_type) <= 0)
+ return -1;
+
+ hdr->hdr_uuid = tep_get_field_raw(s, event, "hdr_uuid", record, &len, 1);
+ if (!hdr->hdr_uuid)
+ return -1;
+ hdr->hdr_uuid = uuid_be(hdr->hdr_uuid); /* from here on points at uuid_be()'s static buffer */
+ if (trace_seq_printf(s, "hdr_uuid:%s ", hdr->hdr_uuid) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "hdr_flags", record, &val, 1) < 0)
+ return -1;
+ hdr->hdr_flags = val;
+ if (decode_cxl_event_flags(s, hdr->hdr_flags, cxl_hdr_flags, /* prints only the names of the set flags */
+ ARRAY_SIZE(cxl_hdr_flags)) < 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "hdr_handle", record, &val, 1) < 0)
+ return -1;
+ hdr->hdr_handle = val;
+ if (trace_seq_printf(s, "hdr_handle:0x%x ", hdr->hdr_handle) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "hdr_related_handle", record, &val, 1) < 0)
+ return -1;
+ hdr->hdr_related_handle = val;
+ if (trace_seq_printf(s, "hdr_related_handle:0x%x ", hdr->hdr_related_handle) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "hdr_timestamp", record, &val, 1) < 0)
+ return -1;
+ convert_timestamp(val, hdr->hdr_timestamp, sizeof(hdr->hdr_timestamp)); /* stored as formatted text, not as the raw value */
+ if (trace_seq_printf(s, "hdr_timestamp:%s ", hdr->hdr_timestamp) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "hdr_length", record, &val, 1) < 0)
+ return -1;
+ hdr->hdr_length = val;
+ if (trace_seq_printf(s, "hdr_length:%u ", hdr->hdr_length) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "hdr_maint_op_class", record, &val, 1) < 0)
+ return -1;
+ hdr->hdr_maint_op_class = val;
+ if (trace_seq_printf(s, "hdr_maint_op_class:%u ", hdr->hdr_maint_op_class) <= 0)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Handle one cxl:cxl_generic_event trace record.
+ *
+ * Decodes the common header via handle_ras_cxl_common_hdr(), prints the
+ * raw "data" field to @s as a hex dump (16 bytes per row, each row prefixed
+ * with its offset), then passes the event to the database store and, when
+ * built with ABRT support, to the ABRT reporter. @context is the
+ * struct ras_events pointer.
+ *
+ * Returns 0 on success, or -1 if the header or the "data" field cannot be
+ * read, if "data" is shorter than CXL_EVENT_RECORD_DATA_LENGTH, or if the
+ * dump heading cannot be written. A write failure in the middle of the
+ * dump only stops the dump; the event is still stored and reported.
+ */
+int ras_cxl_generic_event_handler(struct trace_seq *s,
+				  struct tep_record *record,
+				  struct tep_event *event, void *context)
+{
+	int len, i;
+	struct ras_events *ras = context;
+	struct ras_cxl_generic_event ev;
+	const uint8_t *buf;
+
+	memset(&ev, 0, sizeof(ev));
+	if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0)
+		return -1;
+
+	ev.data = tep_get_field_raw(s, event, "data", record, &len, 1);
+	if (!ev.data)
+		return -1;
+	/*
+	 * The dump below and the database store both read
+	 * CXL_EVENT_RECORD_DATA_LENGTH bytes from ev.data, so refuse a
+	 * shorter field instead of reading past its end.
+	 */
+	if (len < CXL_EVENT_RECORD_DATA_LENGTH)
+		return -1;
+
+	i = 0;
+	buf = ev.data;
+	if (trace_seq_printf(s, "\ndata:\n %08x: ", i) <= 0)
+		return -1;
+	for (i = 0; i < CXL_EVENT_RECORD_DATA_LENGTH; i += 4) {
+		/* start a new offset-prefixed row every 16 bytes */
+		if (i > 0 && (i % 16) == 0 &&
+		    trace_seq_printf(s, "\n %08x: ", i) <= 0)
+			break;
+		if (trace_seq_printf(s, "%02x%02x%02x%02x ",
+				     buf[i], buf[i+1], buf[i+2], buf[i+3]) <= 0)
+			break;
+	}
+
+	/* Insert data into the database */
+#ifdef HAVE_SQLITE3
+	ras_store_cxl_generic_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+	/* Report event to ABRT */
+	ras_report_cxl_generic_event(ras, &ev);
+#endif
+
+	return 0;
+}
diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h
index e7847ec..9f77cb7 100644
--- a/ras-cxl-handler.h
+++ b/ras-cxl-handler.h
@@ -32,4 +32,7 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s,
int ras_cxl_overflow_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
+int ras_cxl_generic_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context);
#endif
diff --git a/ras-events.c b/ras-events.c
index f2a869a..4036933 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -249,6 +249,7 @@ int toggle_ras_mc_event(int enable)
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable);
+ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable);
#endif
free_ras:
@@ -1054,6 +1055,14 @@ int handle_ras_events(int record_events)
else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
"cxl", "cxl_overflow");
+
+ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_generic_event",
+ ras_cxl_generic_event_handler, NULL, CXL_GENERIC_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "cxl", "cxl_generic_event");
#endif
if (!num_events) {
diff --git a/ras-events.h b/ras-events.h
index 7c869d9..96c299e 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -43,6 +43,7 @@ enum {
CXL_AER_UE_EVENT,
CXL_AER_CE_EVENT,
CXL_OVERFLOW_EVENT,
+ CXL_GENERIC_EVENT,
NR_EVENTS
};
diff --git a/ras-record.c b/ras-record.c
index 7b808a5..a65d9c0 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -773,6 +773,79 @@ int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow
return rc;
}
+
+/*
+ * Bind the common CXL event header in @hdr to parameters 1-12 of @stmt, in
+ * this order: timestamp, memdev, host, serial, log_type, hdr_uuid,
+ * hdr_flags, hdr_handle, hdr_related_handle, hdr_timestamp, hdr_length,
+ * hdr_maint_op_class.
+ *
+ * Does nothing when either argument is NULL. Always returns 0.
+ */
+static int ras_store_cxl_common_hdr(sqlite3_stmt *stmt, struct ras_cxl_event_common_hdr *hdr)
+{
+	int idx = 0;	/* pre-incremented, so the first bind uses parameter 1 */
+
+	if (stmt && hdr) {
+		sqlite3_bind_text(stmt, ++idx, hdr->timestamp, -1, NULL);
+		sqlite3_bind_text(stmt, ++idx, hdr->memdev, -1, NULL);
+		sqlite3_bind_text(stmt, ++idx, hdr->host, -1, NULL);
+		sqlite3_bind_int64(stmt, ++idx, hdr->serial);
+		sqlite3_bind_text(stmt, ++idx, hdr->log_type, -1, NULL);
+		sqlite3_bind_text(stmt, ++idx, hdr->hdr_uuid, -1, NULL);
+		sqlite3_bind_int(stmt, ++idx, hdr->hdr_flags);
+		sqlite3_bind_int(stmt, ++idx, hdr->hdr_handle);
+		sqlite3_bind_int(stmt, ++idx, hdr->hdr_related_handle);
+		sqlite3_bind_text(stmt, ++idx, hdr->hdr_timestamp, -1, NULL);
+		sqlite3_bind_int(stmt, ++idx, hdr->hdr_length);
+		sqlite3_bind_int(stmt, ++idx, hdr->hdr_maint_op_class);
+	}
+
+	return 0;
+}
+
+/*
+ * Table and functions to handle cxl:cxl_generic_event
+ *
+ * Columns "timestamp" through "hdr_maint_op_class" are listed in the same
+ * order in which ras_store_cxl_common_hdr() binds parameters 1-12; "data"
+ * is bound as parameter 13 by ras_store_cxl_generic_event().
+ */
+static const struct db_fields cxl_generic_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "memdev", .type = "TEXT" },
+ { .name = "host", .type = "TEXT" },
+ { .name = "serial", .type = "INTEGER" },
+ { .name = "log_type", .type = "TEXT" },
+ { .name = "hdr_uuid", .type = "TEXT" },
+ { .name = "hdr_flags", .type = "INTEGER" },
+ { .name = "hdr_handle", .type = "INTEGER" },
+ { .name = "hdr_related_handle", .type = "INTEGER" },
+ { .name = "hdr_ts", .type = "TEXT" },
+ { .name = "hdr_length", .type = "INTEGER" },
+ { .name = "hdr_maint_op_class", .type = "INTEGER" },
+ { .name = "data", .type = "BLOB" },
+};
+
+static const struct db_table_descriptor cxl_generic_event_tab = { /* table name plus the column list above */
+ .name = "cxl_generic_event",
+ .fields = cxl_generic_event_fields,
+ .num_fields = ARRAY_SIZE(cxl_generic_event_fields),
+};
+
+/*
+ * Store one CXL generic event in the cxl_generic_event table.
+ *
+ * Binds the common header through ras_store_cxl_common_hdr() and the raw
+ * event data (CXL_EVENT_RECORD_DATA_LENGTH bytes) as parameter 13, then
+ * steps and resets the prepared statement. Step and reset failures are
+ * logged.
+ *
+ * Returns 0 when no database or prepared statement is available, otherwise
+ * the result of sqlite3_reset().
+ */
+int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev)
+{
+	struct sqlite3_priv *priv = ras->db_priv;
+	sqlite3_stmt *stmt;
+	int rc;
+
+	if (!priv)
+		return 0;
+	stmt = priv->stmt_cxl_generic_event;
+	if (!stmt)
+		return 0;
+	log(TERM, LOG_INFO, "cxl_generic_event store: %p\n", stmt);
+
+	ras_store_cxl_common_hdr(stmt, &ev->hdr);
+	sqlite3_bind_blob(stmt, 13, ev->data,
+			  CXL_EVENT_RECORD_DATA_LENGTH, NULL);
+
+	rc = sqlite3_step(stmt);
+	if (!(rc == SQLITE_OK || rc == SQLITE_DONE))
+		log(TERM, LOG_ERR,
+		    "Failed to do stmt_cxl_generic_event step on sqlite: error = %d\n", rc);
+
+	rc = sqlite3_reset(stmt);
+	if (!(rc == SQLITE_OK || rc == SQLITE_DONE))
+		log(TERM, LOG_ERR,
+		    "Failed reset stmt_cxl_generic_event on sqlite: error = %d\n", rc);
+
+	log(TERM, LOG_INFO, "register inserted at db\n");
+
+	return rc;
+}
#endif
/*
@@ -1148,6 +1221,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
if (rc != SQLITE_OK)
goto error;
}
+
+ rc = ras_mc_create_table(priv, &cxl_generic_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_generic_event,
+ &cxl_generic_event_tab);
+ if (rc != SQLITE_OK)
+ goto error;
+ }
#endif
ras->db_priv = priv;
@@ -1301,6 +1382,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
"cpu %u: Failed to finalize cxl_overflow_event sqlite: error = %d\n",
cpu, rc);
}
+
+ if (priv->stmt_cxl_generic_event) {
+ rc = sqlite3_finalize(priv->stmt_cxl_generic_event);
+ if (rc != SQLITE_OK)
+ log(TERM, LOG_ERR,
+ "cpu %u: Failed to finalize cxl_generic_event sqlite: error = %d\n",
+ cpu, rc);
+ }
#endif
rc = sqlite3_close_v2(db);
diff --git a/ras-record.h b/ras-record.h
index 90db6ad..9ecfcda 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -133,6 +133,7 @@ struct ras_cxl_poison_event {
#define SZ_512 0x200
#define CXL_HEADERLOG_SIZE SZ_512
#define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t))
+#define CXL_EVENT_RECORD_DATA_LENGTH 0x50
struct ras_cxl_aer_ue_event {
char timestamp[64];
@@ -163,6 +164,26 @@ struct ras_cxl_overflow_event {
uint16_t count;
};
+struct ras_cxl_event_common_hdr { /* fields decoded by handle_ras_cxl_common_hdr() */
+ char timestamp[64]; /* filled in by get_timestamp() */
+ const char *memdev; /* raw "memdev" trace field; not copied */
+ const char *host; /* raw "host" trace field; not copied */
+ uint64_t serial;
+ const char *log_type; /* string returned by cxl_event_log_type_str() */
+ const char *hdr_uuid; /* string returned by uuid_be() */
+ uint32_t hdr_flags; /* CXL_EVENT_RECORD_FLAG_* bits */
+ uint16_t hdr_handle;
+ uint16_t hdr_related_handle;
+ char hdr_timestamp[64]; /* "hdr_timestamp" field formatted by convert_timestamp() */
+ uint8_t hdr_length;
+ uint8_t hdr_maint_op_class;
+};
+
+struct ras_cxl_generic_event { /* one decoded cxl:cxl_generic_event record */
+ struct ras_cxl_event_common_hdr hdr;
+ uint8_t *data; /* raw "data" trace field; CXL_EVENT_RECORD_DATA_LENGTH bytes are dumped and stored */
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
@@ -176,6 +197,7 @@ struct ras_cxl_poison_event;
struct ras_cxl_aer_ue_event;
struct ras_cxl_aer_ce_event;
struct ras_cxl_overflow_event;
+struct ras_cxl_generic_event;
#ifdef HAVE_SQLITE3
@@ -213,6 +235,7 @@ struct sqlite3_priv {
sqlite3_stmt *stmt_cxl_aer_ue_event;
sqlite3_stmt *stmt_cxl_aer_ce_event;
sqlite3_stmt *stmt_cxl_overflow_event;
+ sqlite3_stmt *stmt_cxl_generic_event;
#endif
};
@@ -245,6 +268,7 @@ int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_eve
int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev);
int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev);
+int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -262,6 +286,7 @@ static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_
static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; };
static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; };
+static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
#endif
diff --git a/ras-report.c b/ras-report.c
index dbed454..8d7b76a 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -451,6 +451,44 @@ static int set_cxl_overflow_event_backtrace(char *buf, struct ras_cxl_overflow_e
return 0;
}
+/*
+ * Format a CXL generic event as a "BACKTRACE=" report item and append it
+ * to buf.
+ *
+ * buf must hold a NUL-terminated string and have room for the appended text
+ * (at most MAX_BACKTRACE_SIZE - 1 characters plus the terminator).
+ *
+ * Returns 0 on success, -1 if buf or ev is NULL.
+ */
+static int set_cxl_generic_event_backtrace(char *buf, struct ras_cxl_generic_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+
+	if (!buf || !ev)
+		return -1;
+
+	/*
+	 * Bounded formatting: an oversized field is truncated instead of
+	 * overrunning bt_buf. The serial is cast to unsigned long long so the
+	 * argument matches %llx whatever the width of long is on the target.
+	 */
+	snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+		 "timestamp=%s\n"
+		 "memdev=%s\n"
+		 "host=%s\n"
+		 "serial=0x%llx\n"
+		 "log_type=%s\n"
+		 "hdr_uuid=%s\n"
+		 "hdr_flags=0x%x\n"
+		 "hdr_handle=0x%x\n"
+		 "hdr_related_handle=0x%x\n"
+		 "hdr_timestamp=%s\n"
+		 "hdr_length=%u\n"
+		 "hdr_maint_op_class=%u\n",
+		 ev->hdr.timestamp,
+		 ev->hdr.memdev,
+		 ev->hdr.host,
+		 (unsigned long long)ev->hdr.serial,
+		 ev->hdr.log_type,
+		 ev->hdr.hdr_uuid,
+		 ev->hdr.hdr_flags,
+		 ev->hdr.hdr_handle,
+		 ev->hdr.hdr_related_handle,
+		 ev->hdr.hdr_timestamp,
+		 ev->hdr.hdr_length,
+		 ev->hdr.hdr_maint_op_class);
+
+	strcat(buf, bt_buf);
+
+	return 0;
+}
+
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -500,6 +538,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
case CXL_OVERFLOW_EVENT:
rc = set_cxl_overflow_event_backtrace(buf, (struct ras_cxl_overflow_event *)ev);
break;
+ case CXL_GENERIC_EVENT:
+ rc = set_cxl_generic_event_backtrace(buf, (struct ras_cxl_generic_event *)ev);
+ break;
default:
return -1;
}
@@ -1084,3 +1125,48 @@ cxl_overflow_fail:
else
return -1;
}
+
+/*
+ * Report a CXL generic event to ABRT.
+ *
+ * Gets a socket from setup_report_socket(), sends the basic items and the
+ * event backtrace, then the ANALYZER and REASON items (each written
+ * including its terminating NUL). The ras argument is not used.
+ *
+ * Returns 0 if every item was sent completely, -1 on any failure.
+ */
+int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	size_t len;
+	ssize_t written;
+	int sockfd;
+	int rc = -1;
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	if (commit_report_basic(sockfd) < 0)
+		goto cxl_generic_fail;
+
+	if (commit_report_backtrace(sockfd, CXL_GENERIC_EVENT, ev) < 0)
+		goto cxl_generic_fail;
+
+	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-cxl_generic_event");
+	len = strlen(buf) + 1;
+	written = write(sockfd, buf, len);
+	/*
+	 * Test for a negative result first: compared directly with a size_t,
+	 * -1 is converted to a huge unsigned value and a failed write would be
+	 * mistaken for a complete one.
+	 */
+	if (written < 0 || (size_t)written < len)
+		goto cxl_generic_fail;
+
+	snprintf(buf, sizeof(buf), "REASON=%s", "CXL Generic Event ");
+	len = strlen(buf) + 1;
+	written = write(sockfd, buf, len);
+	if (written < 0 || (size_t)written < len)
+		goto cxl_generic_fail;
+
+	rc = 0;
+
+cxl_generic_fail:
+	/* sockfd is known to be valid here; the only earlier exit is the check above. */
+	close(sockfd);
+
+	return rc;
+}
diff --git a/ras-report.h b/ras-report.h
index 204d485..bf591a6 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -43,6 +43,7 @@ int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_ev
int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev);
int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev);
+int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
#else
@@ -58,6 +59,7 @@ static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras
static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; };
static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; };
+static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
#endif

View File

@ -0,0 +1,536 @@
commit f63b4c942e19a0da1e85a88783ed6e222ad4bdba
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Wed Apr 5 16:16:19 2023 +0100
rasdaemon: Add support for the CXL memory module events
Add support to log and record the CXL memory module events.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index 64b0b50..a0b6780 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -1016,3 +1016,159 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
return 0;
}
+
+/*
+ * Memory Module Event Record - MMER
+ *
+ * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
+ *
+ * Display names for the event_type field; the memory module handler passes
+ * this table to get_cxl_type_str() together with the event_type value.
+ */
+static const char* cxl_dev_evt_type[] = {
+ "Health Status Change",
+ "Media Status Change",
+ "Life Used Change",
+ "Temperature Change",
+ "Data Path Error",
+ "LSA Error",
+};
+
+/*
+ * Device Health Information - DHI
+ *
+ * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100
+ *
+ * Health status bits and the names used when the memory module handler
+ * passes health_status to decode_cxl_event_flags().
+ */
+#define CXL_DHI_HS_MAINTENANCE_NEEDED BIT(0)
+#define CXL_DHI_HS_PERFORMANCE_DEGRADED BIT(1)
+#define CXL_DHI_HS_HW_REPLACEMENT_NEEDED BIT(2)
+
+static const struct cxl_event_flags cxl_health_status[] = {
+ { .bit = CXL_DHI_HS_MAINTENANCE_NEEDED, .flag = "MAINTENANCE_NEEDED" },
+ { .bit = CXL_DHI_HS_PERFORMANCE_DEGRADED, .flag = "PERFORMANCE_DEGRADED" },
+ { .bit = CXL_DHI_HS_HW_REPLACEMENT_NEEDED, .flag = "REPLACEMENT_NEEDED" },
+};
+
+/*
+ * Display names for the media_status field; passed to get_cxl_type_str()
+ * together with the media_status value by the memory module handler.
+ */
+static const char* cxl_media_status[] = {
+ "Normal",
+ "Not Ready",
+ "Write Persistency Lost",
+ "All Data Lost",
+ "Write Persistency Loss in the Event of Power Loss",
+ "Write Persistency Loss in Event of Shutdown",
+ "Write Persistency Loss Imminent",
+ "All Data Loss in Event of Power Loss",
+ "All Data loss in the Event of Shutdown",
+ "All Data Loss Imminent",
+};
+
+/*
+ * Names for the two-bit additional-status sub-fields (life used, device
+ * temperature). Only three entries exist; what get_cxl_type_str() returns
+ * for the value 3 is not visible here.
+ */
+static const char* cxl_two_bit_status[] = {
+ "Normal",
+ "Warning",
+ "Critical",
+};
+
+/*
+ * Names for the one-bit additional-status sub-fields (corrected volatile and
+ * persistent error counts).
+ */
+static const char* cxl_one_bit_status[] = {
+ "Normal",
+ "Warning",
+};
+
+/*
+ * Extract the sub-fields of the additional-status value. The argument is
+ * parenthesized so that an expression such as (a | b) is masked as a whole.
+ */
+#define CXL_DHI_AS_LIFE_USED(as) ((as) & 0x3)
+#define CXL_DHI_AS_DEV_TEMP(as) (((as) & 0xC) >> 2)
+#define CXL_DHI_AS_COR_VOL_ERR_CNT(as) (((as) & 0x10) >> 4)
+#define CXL_DHI_AS_COR_PER_ERR_CNT(as) (((as) & 0x20) >> 5)
+
+/*
+ * Handle a cxl:cxl_memory_module trace event.
+ *
+ * Reads each event field from the record and appends a readable form to the
+ * trace_seq s. When the matching backends are compiled in, the event is then
+ * stored in SQLite and reported to ABRT; the results of those two calls are
+ * ignored.
+ *
+ * context is used as the struct ras_events handle.
+ *
+ * Returns 0 on success, or -1 as soon as a field cannot be read or output
+ * cannot be appended to s.
+ */
+int ras_cxl_memory_module_event_handler(struct trace_seq *s,
+					struct tep_record *record,
+					struct tep_event *event, void *context)
+{
+	unsigned long long val;
+	struct ras_events *ras = context;
+	struct ras_cxl_memory_module_event ev;
+
+	memset(&ev, 0, sizeof(ev));
+	if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "event_type", record, &val, 1) < 0)
+		return -1;
+	ev.event_type = val;
+	if (trace_seq_printf(s, "event_type:%s ",
+			     get_cxl_type_str(cxl_dev_evt_type,
+					      ARRAY_SIZE(cxl_dev_evt_type),
+					      ev.event_type)) <= 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "health_status", record, &val, 1) < 0)
+		return -1;
+	ev.health_status = val;
+	if (trace_seq_printf(s, "health_status:") <= 0)
+		return -1;
+	if (decode_cxl_event_flags(s, ev.health_status, cxl_health_status,
+				   ARRAY_SIZE(cxl_health_status)) < 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "media_status", record, &val, 1) < 0)
+		return -1;
+	ev.media_status = val;
+	if (trace_seq_printf(s, "media_status:%s ",
+			     get_cxl_type_str(cxl_media_status,
+					      ARRAY_SIZE(cxl_media_status),
+					      ev.media_status)) <= 0)
+		return -1;
+
+	/* add_status packs four sub-fields; each one is printed separately. */
+	if (tep_get_field_val(s, event, "add_status", record, &val, 1) < 0)
+		return -1;
+	ev.add_status = val;
+	if (trace_seq_printf(s, "as_life_used:%s ",
+			     get_cxl_type_str(cxl_two_bit_status,
+					      ARRAY_SIZE(cxl_two_bit_status),
+					      CXL_DHI_AS_LIFE_USED(ev.add_status))) <= 0)
+		return -1;
+	if (trace_seq_printf(s, "as_dev_temp:%s ",
+			     get_cxl_type_str(cxl_two_bit_status,
+					      ARRAY_SIZE(cxl_two_bit_status),
+					      CXL_DHI_AS_DEV_TEMP(ev.add_status))) <= 0)
+		return -1;
+	if (trace_seq_printf(s, "as_cor_vol_err_cnt:%s ",
+			     get_cxl_type_str(cxl_one_bit_status,
+					      ARRAY_SIZE(cxl_one_bit_status),
+					      CXL_DHI_AS_COR_VOL_ERR_CNT(ev.add_status))) <= 0)
+		return -1;
+	if (trace_seq_printf(s, "as_cor_per_err_cnt:%s ",
+			     get_cxl_type_str(cxl_one_bit_status,
+					      ARRAY_SIZE(cxl_one_bit_status),
+					      CXL_DHI_AS_COR_PER_ERR_CNT(ev.add_status))) <= 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "life_used", record, &val, 1) < 0)
+		return -1;
+	ev.life_used = val;
+	if (trace_seq_printf(s, "life_used:%u ", ev.life_used) <= 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "device_temp", record, &val, 1) < 0)
+		return -1;
+	ev.device_temp = val;
+	/*
+	 * device_temp is a signed 16-bit field: print it as signed so that a
+	 * negative temperature is not shown as a large unsigned number.
+	 */
+	if (trace_seq_printf(s, "device_temp:%d ", ev.device_temp) <= 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "dirty_shutdown_cnt", record, &val, 1) < 0)
+		return -1;
+	ev.dirty_shutdown_cnt = val;
+	if (trace_seq_printf(s, "dirty_shutdown_cnt:%u ", ev.dirty_shutdown_cnt) <= 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "cor_vol_err_cnt", record, &val, 1) < 0)
+		return -1;
+	ev.cor_vol_err_cnt = val;
+	if (trace_seq_printf(s, "cor_vol_err_cnt:%u ", ev.cor_vol_err_cnt) <= 0)
+		return -1;
+
+	if (tep_get_field_val(s, event, "cor_per_err_cnt", record, &val, 1) < 0)
+		return -1;
+	ev.cor_per_err_cnt = val;
+	if (trace_seq_printf(s, "cor_per_err_cnt:%u ", ev.cor_per_err_cnt) <= 0)
+		return -1;
+
+	/* Insert data into the database */
+#ifdef HAVE_SQLITE3
+	ras_store_cxl_memory_module_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+	/* Report event to ABRT */
+	ras_report_cxl_memory_module_event(ras, &ev);
+#endif
+
+	return 0;
+}
diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h
index 35455af..1ea0f93 100644
--- a/ras-cxl-handler.h
+++ b/ras-cxl-handler.h
@@ -41,4 +41,7 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s,
int ras_cxl_dram_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
+int ras_cxl_memory_module_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context);
#endif
diff --git a/ras-events.c b/ras-events.c
index d27e0c4..a82dab2 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -252,6 +252,7 @@ int toggle_ras_mc_event(int enable)
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_dram", enable);
+ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_memory_module", enable);
#endif
free_ras:
@@ -1081,6 +1082,14 @@ int handle_ras_events(int record_events)
else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
"cxl", "cxl_dram");
+
+	rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_memory_module",
+			       ras_cxl_memory_module_event_handler, NULL, CXL_MEMORY_MODULE_EVENT);
+	if (!rc)
+		num_events++;
+	else
+		/* Log the event under the same name it was registered with above. */
+		log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+		    "cxl", "cxl_memory_module");
#endif
if (!num_events) {
diff --git a/ras-events.h b/ras-events.h
index d192a6b..c4d54e3 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -46,6 +46,7 @@ enum {
CXL_GENERIC_EVENT,
CXL_GENERAL_MEDIA_EVENT,
CXL_DRAM_EVENT,
+ CXL_MEMORY_MODULE_EVENT,
NR_EVENTS
};
diff --git a/ras-record.c b/ras-record.c
index fffa81c..a5f99ae 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -992,6 +992,74 @@ int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *
return rc;
}
+
+/*
+ * Table and functions to handle cxl:cxl_memory_module_event
+ *
+ * ras_store_cxl_memory_module_event() binds the columns from event_type to
+ * add_status as statement parameters 13 to 21, in the order listed here, so
+ * a column added or moved in this table needs the matching bind updated too.
+ */
+static const struct db_fields cxl_memory_module_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "memdev", .type = "TEXT" },
+ { .name = "host", .type = "TEXT" },
+ { .name = "serial", .type = "INTEGER" },
+ { .name = "log_type", .type = "TEXT" },
+ { .name = "hdr_uuid", .type = "TEXT" },
+ { .name = "hdr_flags", .type = "INTEGER" },
+ { .name = "hdr_handle", .type = "INTEGER" },
+ { .name = "hdr_related_handle", .type = "INTEGER" },
+ { .name = "hdr_ts", .type = "TEXT" },
+ { .name = "hdr_length", .type = "INTEGER" },
+ { .name = "hdr_maint_op_class", .type = "INTEGER" },
+ { .name = "event_type", .type = "INTEGER" },
+ { .name = "health_status", .type = "INTEGER" },
+ { .name = "media_status", .type = "INTEGER" },
+ { .name = "life_used", .type = "INTEGER" },
+ { .name = "dirty_shutdown_cnt", .type = "INTEGER" },
+ { .name = "cor_vol_err_cnt", .type = "INTEGER" },
+ { .name = "cor_per_err_cnt", .type = "INTEGER" },
+ { .name = "device_temp", .type = "INTEGER" },
+ { .name = "add_status", .type = "INTEGER" },
+};
+
+/* Descriptor handed to ras_mc_create_table() and ras_mc_prepare_stmt(). */
+static const struct db_table_descriptor cxl_memory_module_event_tab = {
+ .name = "cxl_memory_module_event",
+ .fields = cxl_memory_module_event_fields,
+ .num_fields = ARRAY_SIZE(cxl_memory_module_event_fields),
+};
+
+/*
+ * Insert one CXL memory module event into the cxl_memory_module_event table.
+ *
+ * The common header is bound by ras_store_cxl_common_hdr(); the event
+ * specific values follow as parameters 13 to 21. Step and reset failures
+ * are logged.
+ *
+ * Returns 0 when no database or prepared statement is available, otherwise
+ * the result of sqlite3_reset().
+ */
+int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev)
+{
+	struct sqlite3_priv *priv = ras->db_priv;
+	sqlite3_stmt *stmt;
+	int rc;
+
+	if (!priv || !priv->stmt_cxl_memory_module_event)
+		return 0;
+
+	stmt = priv->stmt_cxl_memory_module_event;
+	log(TERM, LOG_INFO, "cxl_memory_module_event store: %p\n",
+	    stmt);
+
+	ras_store_cxl_common_hdr(stmt, &ev->hdr);
+	sqlite3_bind_int(stmt, 13, ev->event_type);
+	sqlite3_bind_int(stmt, 14, ev->health_status);
+	sqlite3_bind_int(stmt, 15, ev->media_status);
+	sqlite3_bind_int(stmt, 16, ev->life_used);
+	sqlite3_bind_int(stmt, 17, ev->dirty_shutdown_cnt);
+	sqlite3_bind_int(stmt, 18, ev->cor_vol_err_cnt);
+	sqlite3_bind_int(stmt, 19, ev->cor_per_err_cnt);
+	sqlite3_bind_int(stmt, 20, ev->device_temp);
+	sqlite3_bind_int(stmt, 21, ev->add_status);
+
+	rc = sqlite3_step(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed to do stmt_cxl_memory_module_event step on sqlite: error = %d\n", rc);
+
+	/* Reset so the prepared statement can be reused for the next event. */
+	rc = sqlite3_reset(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed reset stmt_cxl_memory_module_event on sqlite: error = %d\n", rc);
+	log(TERM, LOG_INFO, "register inserted at db\n");
+
+	return rc;
+}
#endif
/*
@@ -1391,6 +1459,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
if (rc != SQLITE_OK)
goto error;
}
+
+ rc = ras_mc_create_table(priv, &cxl_memory_module_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_memory_module_event,
+ &cxl_memory_module_event_tab);
+ if (rc != SQLITE_OK)
+ goto error;
+ }
#endif
ras->db_priv = priv;
@@ -1568,6 +1644,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
"cpu %u: Failed to finalize cxl_dram_event sqlite: error = %d\n",
cpu, rc);
}
+
+ if (priv->stmt_cxl_memory_module_event) {
+ rc = sqlite3_finalize(priv->stmt_cxl_memory_module_event);
+ if (rc != SQLITE_OK)
+ log(TERM, LOG_ERR,
+ "cpu %u: Failed to finalize stmt_cxl_memory_module_event sqlite: error = %d\n",
+ cpu, rc);
+ }
#endif
rc = sqlite3_close_v2(db);
diff --git a/ras-record.h b/ras-record.h
index 480ff92..a7b9ab9 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -218,6 +218,19 @@ struct ras_cxl_dram_event {
uint16_t validity_flags;
};
+/*
+ * Decoded cxl:cxl_memory_module trace event. Apart from hdr, each member
+ * holds the value of the trace field of the same name.
+ */
+struct ras_cxl_memory_module_event {
+ struct ras_cxl_event_common_hdr hdr;
+ uint8_t event_type;
+ uint8_t health_status; /* bit flags, see CXL_DHI_HS_* */
+ uint8_t media_status;
+ uint8_t life_used;
+ uint32_t dirty_shutdown_cnt;
+ uint32_t cor_vol_err_cnt;
+ uint32_t cor_per_err_cnt;
+ int16_t device_temp; /* signed */
+ uint8_t add_status; /* packed sub-fields, see CXL_DHI_AS_* */
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
@@ -234,6 +247,7 @@ struct ras_cxl_overflow_event;
struct ras_cxl_generic_event;
struct ras_cxl_general_media_event;
struct ras_cxl_dram_event;
+struct ras_cxl_memory_module_event;
#ifdef HAVE_SQLITE3
@@ -274,6 +288,7 @@ struct sqlite3_priv {
sqlite3_stmt *stmt_cxl_generic_event;
sqlite3_stmt *stmt_cxl_general_media_event;
sqlite3_stmt *stmt_cxl_dram_event;
+ sqlite3_stmt *stmt_cxl_memory_module_event;
#endif
};
@@ -309,6 +324,7 @@ int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow
int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev);
int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev);
+int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -329,6 +345,7 @@ static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ra
static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; };
static inline int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; };
+static inline int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) { return 0; };
#endif
diff --git a/ras-report.c b/ras-report.c
index 21180b1..a30b66d 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -605,6 +605,62 @@ static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev
return 0;
}
+/*
+ * Format a CXL memory module event as a "BACKTRACE=" report item and append
+ * it to buf.
+ *
+ * buf must hold a NUL-terminated string and have room for the appended text
+ * (at most MAX_BACKTRACE_SIZE - 1 characters plus the terminator).
+ *
+ * Returns 0 on success, -1 if buf or ev is NULL.
+ */
+static int set_cxl_memory_module_event_backtrace(char *buf, struct ras_cxl_memory_module_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+
+	if (!buf || !ev)
+		return -1;
+
+	/*
+	 * Bounded formatting: an oversized field is truncated instead of
+	 * overrunning bt_buf. The serial is cast to unsigned long long so the
+	 * argument matches %llx whatever the width of long is on the target.
+	 */
+	snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+		 "timestamp=%s\n"
+		 "memdev=%s\n"
+		 "host=%s\n"
+		 "serial=0x%llx\n"
+		 "log_type=%s\n"
+		 "hdr_uuid=%s\n"
+		 "hdr_flags=0x%x\n"
+		 "hdr_handle=0x%x\n"
+		 "hdr_related_handle=0x%x\n"
+		 "hdr_timestamp=%s\n"
+		 "hdr_length=%u\n"
+		 "hdr_maint_op_class=%u\n"
+		 "event_type=%u\n"
+		 "health_status=%u\n"
+		 "media_status=%u\n"
+		 "life_used=%u\n"
+		 "dirty_shutdown_cnt=%u\n"
+		 "cor_vol_err_cnt=%u\n"
+		 "cor_per_err_cnt=%u\n"
+		 "device_temp=%d\n"
+		 "add_status=%u\n",
+		 ev->hdr.timestamp,
+		 ev->hdr.memdev,
+		 ev->hdr.host,
+		 (unsigned long long)ev->hdr.serial,
+		 ev->hdr.log_type,
+		 ev->hdr.hdr_uuid,
+		 ev->hdr.hdr_flags,
+		 ev->hdr.hdr_handle,
+		 ev->hdr.hdr_related_handle,
+		 ev->hdr.hdr_timestamp,
+		 ev->hdr.hdr_length,
+		 ev->hdr.hdr_maint_op_class,
+		 ev->event_type,
+		 ev->health_status,
+		 ev->media_status,
+		 ev->life_used,
+		 ev->dirty_shutdown_cnt,
+		 ev->cor_vol_err_cnt,
+		 ev->cor_per_err_cnt,
+		 ev->device_temp,
+		 ev->add_status);
+
+	strcat(buf, bt_buf);
+
+	return 0;
+}
+
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -663,6 +719,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
case CXL_DRAM_EVENT:
rc = set_cxl_dram_event_backtrace(buf, (struct ras_cxl_dram_event *)ev);
break;
+ case CXL_MEMORY_MODULE_EVENT:
+ rc = set_cxl_memory_module_event_backtrace(buf, (struct ras_cxl_memory_module_event *)ev);
+ break;
default:
return -1;
}
@@ -1380,3 +1439,47 @@ cxl_dram_fail:
else
return -1;
}
+
+/*
+ * Report a CXL memory module event to ABRT.
+ *
+ * Gets a socket from setup_report_socket(), sends the basic items and the
+ * event backtrace, then the ANALYZER and REASON items (each written
+ * including its terminating NUL). The ras argument is not used.
+ *
+ * Returns 0 if every item was sent completely, -1 on any failure.
+ */
+int ras_report_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	size_t len;
+	ssize_t written;
+	int sockfd;
+	int rc = -1;
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	if (commit_report_basic(sockfd) < 0)
+		goto cxl_memory_module_fail;
+
+	if (commit_report_backtrace(sockfd, CXL_MEMORY_MODULE_EVENT, ev) < 0)
+		goto cxl_memory_module_fail;
+
+	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-cxl_memory_module_event");
+	len = strlen(buf) + 1;
+	written = write(sockfd, buf, len);
+	/*
+	 * Test for a negative result first: compared directly with a size_t,
+	 * -1 is converted to a huge unsigned value and a failed write would be
+	 * mistaken for a complete one.
+	 */
+	if (written < 0 || (size_t)written < len)
+		goto cxl_memory_module_fail;
+
+	snprintf(buf, sizeof(buf), "REASON=%s", "CXL Memory Module Event");
+	len = strlen(buf) + 1;
+	written = write(sockfd, buf, len);
+	if (written < 0 || (size_t)written < len)
+		goto cxl_memory_module_fail;
+
+	rc = 0;
+
+cxl_memory_module_fail:
+	/* sockfd is known to be valid here; the only earlier exit is the check above. */
+	close(sockfd);
+
+	return rc;
+}
diff --git a/ras-report.h b/ras-report.h
index 1ad00e0..e401850 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -46,6 +46,7 @@ int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflo
int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev);
int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev);
+int ras_report_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev);
#else
@@ -64,6 +65,7 @@ static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct r
static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; };
static inline int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; };
+static inline int ras_report_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) { return 0; };
#endif

View File

@ -0,0 +1,435 @@
commit f73ed45b91244eb3986ac2574cd7d36ae1d4d22a
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Tue Apr 4 16:50:50 2023 +0100
rasdaemon: Add support for the CXL overflow events
Add support to log and record the CXL overflow events.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index d540ebb..d4c845e 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -426,3 +426,101 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s,
return 0;
}
+
+/*
+ * CXL rev 3.0 section 8.2.9.2.2; Table 8-49
+ *
+ * Event log types; cxl_event_log_type_str() turns these into display strings.
+ */
+enum cxl_event_log_type {
+ CXL_EVENT_TYPE_INFO = 0x00,
+ CXL_EVENT_TYPE_WARN,
+ CXL_EVENT_TYPE_FAIL,
+ CXL_EVENT_TYPE_FATAL,
+ CXL_EVENT_TYPE_UNKNOWN
+};
+
+/*
+ * Return the display string for a CXL event log type. Any value other than
+ * the four named log types yields "Unknown". The returned strings are
+ * literals and must not be modified or freed.
+ */
+static char *cxl_event_log_type_str(uint32_t log_type)
+{
+	if (log_type == CXL_EVENT_TYPE_INFO)
+		return "Informational";
+	if (log_type == CXL_EVENT_TYPE_WARN)
+		return "Warning";
+	if (log_type == CXL_EVENT_TYPE_FAIL)
+		return "Failure";
+	if (log_type == CXL_EVENT_TYPE_FATAL)
+		return "Fatal";
+
+	return "Unknown";
+}
+
+/*
+ * Handle a cxl:cxl_overflow trace event.
+ *
+ * Reads the event fields from the record, appends a readable form to the
+ * trace_seq s and, when the matching backends are compiled in, stores the
+ * event in SQLite and reports it to ABRT (the results of those two calls
+ * are ignored).
+ *
+ * context is used as the struct ras_events handle.
+ *
+ * Returns 0 on success, or -1 as soon as a field cannot be read or output
+ * cannot be appended to s.
+ */
+int ras_cxl_overflow_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context)
+{
+ int len;
+ unsigned long long val;
+ struct ras_events *ras = context;
+ struct ras_cxl_overflow_event ev;
+
+ memset(&ev, 0, sizeof(ev));
+ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp));
+ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0)
+ return -1;
+
+ ev.memdev = tep_get_field_raw(s, event, "memdev", record, &len, 1);
+ if (!ev.memdev)
+ return -1;
+ if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0)
+ return -1;
+
+ ev.host = tep_get_field_raw(s, event, "host", record, &len, 1);
+ if (!ev.host)
+ return -1;
+ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0)
+ return -1;
+ ev.serial = val;
+ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "log", record, &val, 1) < 0)
+ return -1;
+ ev.log_type = cxl_event_log_type_str(val);
+ if (trace_seq_printf(s, "log type:%s ", ev.log_type) <= 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "count", record, &val, 1) < 0)
+ return -1;
+ ev.count = val;
+
+ if (tep_get_field_val(s, event, "first_ts", record, &val, 1) < 0)
+ return -1;
+ convert_timestamp(val, ev.first_ts, sizeof(ev.first_ts));
+
+ if (tep_get_field_val(s, event, "last_ts", record, &val, 1) < 0)
+ return -1;
+ convert_timestamp(val, ev.last_ts, sizeof(ev.last_ts));
+
+ /* The count and time range are only printed when count is non-zero. */
+ if (ev.count) {
+ if (trace_seq_printf(s, "%u errors from %s to %s\n",
+ ev.count, ev.first_ts, ev.last_ts) <= 0)
+ return -1;
+ }
+ /* Insert data into the database */
+#ifdef HAVE_SQLITE3
+ ras_store_cxl_overflow_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+ /* Report event to ABRT */
+ ras_report_cxl_overflow_event(ras, &ev);
+#endif
+
+ return 0;
+}
diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h
index 711daf4..e7847ec 100644
--- a/ras-cxl-handler.h
+++ b/ras-cxl-handler.h
@@ -29,4 +29,7 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s,
int ras_cxl_aer_ce_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context);
+int ras_cxl_overflow_event_handler(struct trace_seq *s,
+ struct tep_record *record,
+ struct tep_event *event, void *context);
#endif
diff --git a/ras-events.c b/ras-events.c
index d0251e0..f2a869a 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -248,6 +248,7 @@ int toggle_ras_mc_event(int enable)
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable);
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable);
+ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable);
#endif
free_ras:
@@ -1045,6 +1046,14 @@ int handle_ras_events(int record_events)
else
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
"cxl", "cxl_aer_correctable_error");
+
+ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_overflow",
+ ras_cxl_overflow_event_handler, NULL, CXL_OVERFLOW_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "cxl", "cxl_overflow");
#endif
if (!num_events) {
diff --git a/ras-events.h b/ras-events.h
index a9d67c2..7c869d9 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -42,6 +42,7 @@ enum {
CXL_POISON_EVENT,
CXL_AER_UE_EVENT,
CXL_AER_CE_EVENT,
+ CXL_OVERFLOW_EVENT,
NR_EVENTS
};
diff --git a/ras-record.c b/ras-record.c
index 86133c4..7b808a5 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -720,6 +720,59 @@ int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_eve
return rc;
}
+
+/*
+ * Table and functions to handle cxl:cxl_overflow
+ *
+ * ras_store_cxl_overflow_event() binds the columns from timestamp to last_ts
+ * as statement parameters 1 to 8, in the order listed here, so a column
+ * added or moved in this table needs the matching bind updated too.
+ */
+static const struct db_fields cxl_overflow_event_fields[] = {
+ { .name = "id", .type = "INTEGER PRIMARY KEY" },
+ { .name = "timestamp", .type = "TEXT" },
+ { .name = "memdev", .type = "TEXT" },
+ { .name = "host", .type = "TEXT" },
+ { .name = "serial", .type = "INTEGER" },
+ { .name = "log_type", .type = "TEXT" },
+ { .name = "count", .type = "INTEGER" },
+ { .name = "first_ts", .type = "TEXT" },
+ { .name = "last_ts", .type = "TEXT" },
+};
+
+/* Descriptor handed to ras_mc_create_table() and ras_mc_prepare_stmt(). */
+static const struct db_table_descriptor cxl_overflow_event_tab = {
+ .name = "cxl_overflow_event",
+ .fields = cxl_overflow_event_fields,
+ .num_fields = ARRAY_SIZE(cxl_overflow_event_fields),
+};
+
+/*
+ * Insert one CXL overflow event into the cxl_overflow_event table.
+ *
+ * The event values are bound as parameters 1 to 8. Step and reset failures
+ * are logged.
+ *
+ * Returns 0 when no database or prepared statement is available, otherwise
+ * the result of sqlite3_reset().
+ */
+int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev)
+{
+	struct sqlite3_priv *priv = ras->db_priv;
+	sqlite3_stmt *stmt;
+	int rc;
+
+	if (!priv || !priv->stmt_cxl_overflow_event)
+		return 0;
+
+	stmt = priv->stmt_cxl_overflow_event;
+	log(TERM, LOG_INFO, "cxl_overflow_event store: %p\n", stmt);
+
+	sqlite3_bind_text(stmt, 1, ev->timestamp, -1, NULL);
+	sqlite3_bind_text(stmt, 2, ev->memdev, -1, NULL);
+	sqlite3_bind_text(stmt, 3, ev->host, -1, NULL);
+	sqlite3_bind_int64(stmt, 4, ev->serial);
+	sqlite3_bind_text(stmt, 5, ev->log_type, -1, NULL);
+	sqlite3_bind_int(stmt, 6, ev->count);
+	sqlite3_bind_text(stmt, 7, ev->first_ts, -1, NULL);
+	sqlite3_bind_text(stmt, 8, ev->last_ts, -1, NULL);
+
+	rc = sqlite3_step(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed to do cxl_overflow_event step on sqlite: error = %d\n", rc);
+
+	/* Reset so the prepared statement can be reused for the next event. */
+	rc = sqlite3_reset(stmt);
+	if (rc != SQLITE_OK && rc != SQLITE_DONE)
+		log(TERM, LOG_ERR,
+		    "Failed reset cxl_overflow_event on sqlite: error = %d\n",
+		    rc);
+	log(TERM, LOG_INFO, "register inserted at db\n");
+
+	return rc;
+}
#endif
/*
@@ -1087,6 +1140,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
if (rc != SQLITE_OK)
goto error;
}
+
+ rc = ras_mc_create_table(priv, &cxl_overflow_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_overflow_event,
+ &cxl_overflow_event_tab);
+ if (rc != SQLITE_OK)
+ goto error;
+ }
#endif
ras->db_priv = priv;
@@ -1232,6 +1293,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
"cpu %u: Failed to finalize cxl_aer_ce_event sqlite: error = %d\n",
cpu, rc);
}
+
+ if (priv->stmt_cxl_overflow_event) {
+ rc = sqlite3_finalize(priv->stmt_cxl_overflow_event);
+ if (rc != SQLITE_OK)
+ log(TERM, LOG_ERR,
+ "cpu %u: Failed to finalize cxl_overflow_event sqlite: error = %d\n",
+ cpu, rc);
+ }
#endif
rc = sqlite3_close_v2(db);
diff --git a/ras-record.h b/ras-record.h
index ab7153d..90db6ad 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -152,6 +152,17 @@ struct ras_cxl_aer_ce_event {
uint32_t error_status;
};
+/* Decoded cxl:cxl_overflow trace event, filled by ras_cxl_overflow_event_handler(). */
+struct ras_cxl_overflow_event {
+ char timestamp[64]; /* written by get_timestamp() */
+ const char *memdev; /* pointer returned by tep_get_field_raw() for "memdev" */
+ const char *host; /* pointer returned by tep_get_field_raw() for "host" */
+ uint64_t serial;
+ const char *log_type; /* string returned by cxl_event_log_type_str() */
+ char first_ts[64]; /* "first_ts" field formatted by convert_timestamp() */
+ char last_ts[64]; /* "last_ts" field formatted by convert_timestamp() */
+ uint16_t count;
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
@@ -164,6 +175,7 @@ struct ras_mf_event;
struct ras_cxl_poison_event;
struct ras_cxl_aer_ue_event;
struct ras_cxl_aer_ce_event;
+struct ras_cxl_overflow_event;
#ifdef HAVE_SQLITE3
@@ -200,6 +212,7 @@ struct sqlite3_priv {
sqlite3_stmt *stmt_cxl_poison_event;
sqlite3_stmt *stmt_cxl_aer_ue_event;
sqlite3_stmt *stmt_cxl_aer_ce_event;
+ sqlite3_stmt *stmt_cxl_overflow_event;
#endif
};
@@ -231,6 +244,7 @@ int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev);
+int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -247,6 +261,7 @@ static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event
static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; };
+static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; };
#endif
diff --git a/ras-report.c b/ras-report.c
index 63b47f5..dbed454 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -421,6 +421,36 @@ static int set_cxl_aer_ce_event_backtrace(char *buf, struct ras_cxl_aer_ce_event
return 0;
}
+/*
+ * Format a CXL overflow event as a "BACKTRACE=" report item and append it
+ * to buf.
+ *
+ * buf must hold a NUL-terminated string and have room for the appended text
+ * (at most MAX_BACKTRACE_SIZE - 1 characters plus the terminator).
+ *
+ * Returns 0 on success, -1 if buf or ev is NULL.
+ */
+static int set_cxl_overflow_event_backtrace(char *buf, struct ras_cxl_overflow_event *ev)
+{
+	char bt_buf[MAX_BACKTRACE_SIZE];
+
+	if (!buf || !ev)
+		return -1;
+
+	/*
+	 * Bounded formatting: an oversized field is truncated instead of
+	 * overrunning bt_buf. serial is a uint64_t, so it is cast to unsigned
+	 * long long and printed with %llx, as the trace output does; %lx is
+	 * only correct where long is 64 bits wide.
+	 */
+	snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE="
+		 "timestamp=%s\n"
+		 "memdev=%s\n"
+		 "host=%s\n"
+		 "serial=0x%llx\n"
+		 "log_type=%s\n"
+		 "count=%u\n"
+		 "first_ts=%s\n"
+		 "last_ts=%s\n",
+		 ev->timestamp,
+		 ev->memdev,
+		 ev->host,
+		 (unsigned long long)ev->serial,
+		 ev->log_type,
+		 ev->count,
+		 ev->first_ts,
+		 ev->last_ts);
+
+	strcat(buf, bt_buf);
+
+	return 0;
+}
+
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -467,6 +497,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
case CXL_AER_CE_EVENT:
rc = set_cxl_aer_ce_event_backtrace(buf, (struct ras_cxl_aer_ce_event *)ev);
break;
+ case CXL_OVERFLOW_EVENT:
+ rc = set_cxl_overflow_event_backtrace(buf, (struct ras_cxl_overflow_event *)ev);
+ break;
default:
return -1;
}
@@ -1007,3 +1040,47 @@ cxl_aer_ce_fail:
else
return -1;
}
+
+/*
+ * Report a CXL overflow event over the report socket.
+ *
+ * Sends, in order: the basic report fields, the event backtrace, an
+ * "ANALYZER=" record and a "REASON=" record. Each of the last two is
+ * written including its terminating NUL byte.
+ *
+ * @ras: not used by this function.
+ * @ev:  event handed to commit_report_backtrace().
+ *
+ * Returns 0 when every step succeeded, -1 otherwise. The socket is closed
+ * on every path once it has been opened.
+ */
+int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev)
+{
+	char buf[MAX_MESSAGE_SIZE];
+	int sockfd;
+	int done = 0;
+	size_t len;
+	ssize_t rc;
+
+	sockfd = setup_report_socket();
+	if (sockfd < 0)
+		return -1;
+
+	if (commit_report_basic(sockfd) < 0)
+		goto cxl_overflow_fail;
+
+	if (commit_report_backtrace(sockfd, CXL_OVERFLOW_EVENT, ev) < 0)
+		goto cxl_overflow_fail;
+
+	snprintf(buf, sizeof(buf), "ANALYZER=%s", "rasdaemon-cxl-overflow");
+	len = strlen(buf) + 1;
+	rc = write(sockfd, buf, len);
+	/*
+	 * Test for -1 before comparing with the unsigned length: a signed
+	 * error return promoted to size_t would look like a huge byte count
+	 * and the failure would be missed.
+	 */
+	if (rc < 0 || (size_t)rc < len)
+		goto cxl_overflow_fail;
+
+	snprintf(buf, sizeof(buf), "REASON=%s", "CXL overflow");
+	len = strlen(buf) + 1;
+	rc = write(sockfd, buf, len);
+	if (rc < 0 || (size_t)rc < len)
+		goto cxl_overflow_fail;
+
+	done = 1;
+
+cxl_overflow_fail:
+	/* sockfd is known to be valid here; the early return handled < 0. */
+	close(sockfd);
+
+	return done ? 0 : -1;
+}
diff --git a/ras-report.h b/ras-report.h
index 46155ee..204d485 100644
--- a/ras-report.h
+++ b/ras-report.h
@@ -42,6 +42,7 @@ int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev);
int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev);
int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev);
+int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev);
#else
@@ -56,6 +57,7 @@ static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_even
static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; };
static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; };
static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; };
+static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; };
#endif

View File

@ -0,0 +1,199 @@
commit 70acd500302d2db318bb0e35b551f74fd4baebc4
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Feb 12 10:27:58 2024 +0000
rasdaemon: ras-mc-ctl: Add support for CXL AER uncorrectable trace events
Add support for CXL AER uncorrectable events to the ras-mc-ctl tool.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
(cherry picked from commit f8b6da812eddc063ea739970f941fdd24fb984ae)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 1cc19b3..c0a2ec6 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -43,6 +43,7 @@ my $modprobe = find_prog ("modprobe") or exit (1);
my $has_aer = 0;
my $has_arm = 0;
+my $has_cxl = 0;
my $has_devlink = 0;
my $has_disk_errors = 0;
my $has_extlog = 0;
@@ -51,6 +52,7 @@ my $has_mce = 0;
@WITH_AER_TRUE@$has_aer = 1;
@WITH_ARM_TRUE@$has_arm = 1;
+@WITH_CXL_TRUE@$has_cxl = 1;
@WITH_DEVLINK_TRUE@$has_devlink = 1;
@WITH_DISKERROR_TRUE@$has_disk_errors = 1;
@WITH_EXTLOG_TRUE@$has_extlog = 1;
@@ -1156,6 +1158,78 @@ sub get_uuid_le
return $out;
}
+use constant { # bit masks tested against the status word in get_cxl_ue_error_status_text
+ CXL_AER_UE_CACHE_DATA_PARITY => 0x0001, # bit 0
+ CXL_AER_UE_CACHE_ADDR_PARITY => 0x0002, # bit 1
+ CXL_AER_UE_CACHE_BE_PARITY => 0x0004, # bit 2
+ CXL_AER_UE_CACHE_DATA_ECC => 0x0008, # bit 3
+ CXL_AER_UE_MEM_DATA_PARITY => 0x0010, # bit 4
+ CXL_AER_UE_MEM_ADDR_PARITY => 0x0020, # bit 5
+ CXL_AER_UE_MEM_BE_PARITY => 0x0040, # bit 6
+ CXL_AER_UE_MEM_DATA_ECC => 0x0080, # bit 7
+ CXL_AER_UE_REINIT_THRESH => 0x0100, # bit 8
+ CXL_AER_UE_RSVD_ENCODE => 0x0200, # bit 9
+ CXL_AER_UE_POISON => 0x0400, # bit 10
+ CXL_AER_UE_RECV_OVERFLOW => 0x0800, # bit 11; no masks are defined here for bits 12-13
+ CXL_AER_UE_INTERNAL_ERR => 0x4000, # bit 14
+ CXL_AER_UE_IDE_TX_ERR => 0x8000, # bit 15
+ CXL_AER_UE_IDE_RX_ERR => 0x10000, # bit 16
+};
+
+# Decode a CXL AER uncorrectable status word into readable text.
+# Takes the numeric status as its only argument and returns the quoted
+# names of all set, known bits joined with ", " (an empty string when no
+# known bit is set). Names appear in ascending bit order.
+sub get_cxl_ue_error_status_text
+{
+	my ($error_status) = @_;
+
+	# Ordered (mask, text) pairs; the order fixes the output order.
+	my @decode = (
+		[CXL_AER_UE_CACHE_DATA_PARITY, "\'Cache Data Parity Error\' "],
+		[CXL_AER_UE_CACHE_ADDR_PARITY, "\'Cache Address Parity Error\' "],
+		[CXL_AER_UE_CACHE_BE_PARITY, "\'Cache Byte Enable Parity Error\' "],
+		[CXL_AER_UE_CACHE_DATA_ECC, "\'Cache Data ECC Error\' "],
+		[CXL_AER_UE_MEM_DATA_PARITY, "\'Memory Data Parity Error\' "],
+		[CXL_AER_UE_MEM_ADDR_PARITY, "\'Memory Address Parity Error\' "],
+		[CXL_AER_UE_MEM_BE_PARITY, "\'Memory Byte Enable Parity Error\' "],
+		[CXL_AER_UE_MEM_DATA_ECC, "\'Memory Data ECC Error\' "],
+		[CXL_AER_UE_REINIT_THRESH, "\'REINIT Threshold Hit\' "],
+		[CXL_AER_UE_RSVD_ENCODE, "\'Received Unrecognized Encoding\' "],
+		[CXL_AER_UE_POISON, "\'Received Poison From Peer\' "],
+		[CXL_AER_UE_RECV_OVERFLOW, "\'Receiver Overflow\' "],
+		[CXL_AER_UE_INTERNAL_ERR, "\'Component Specific Error\' "],
+		[CXL_AER_UE_IDE_TX_ERR, "\'IDE Tx Error\' "],
+		[CXL_AER_UE_IDE_RX_ERR, "\'IDE Rx Error\' "],
+	);
+
+	my @names = map { $_->[1] } grep { $error_status & $_->[0] } @decode;
+
+	return join (", ", @names);
+}
+
sub summary
{
require DBI;
@@ -1163,7 +1237,7 @@ sub summary
my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result);
my ($etype, $severity, $etype_string, $severity_string);
my ($dev_name, $dev);
- my ($mpidr);
+ my ($mpidr, $memdev);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1219,6 +1293,25 @@ sub summary
$query_handle->finish;
}
+ # CXL errors
+ if ($has_cxl == 1) {
+ # CXL AER uncorrectable errors
+ $query = "select memdev, count(*) from cxl_aer_ue_event$conf{opt}{since} group by memdev";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($memdev, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$memdev errors: $count\n";
+ }
+ if ($out ne "") {
+ print "CXL AER uncorrectable events summary:\n$out\n";
+ } else {
+ print "No CXL AER uncorrectable errors.\n\n";
+ }
+ $query_handle->finish;
+ }
+
# extlog errors
if ($has_extlog == 1) {
$query = "select etype, severity, count(*) from extlog_event group by etype, severity";
@@ -1324,6 +1417,7 @@ sub errors
my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd);
my ($error_count, $affinity, $mpidr, $r_state, $psci_state);
my ($pfn, $page_type, $action_result);
+ my ($memdev, $host, $serial, $error_status, $first_error, $header_log);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1389,6 +1483,44 @@ sub errors
$query_handle->finish;
}
+ # CXL errors
+ if ($has_cxl == 1) {
+ # CXL AER uncorrectable errors
+ use constant SZ_512 => 0x200;
+ use constant CXL_HEADERLOG_SIZE_U32 => SZ_512/32;
+ $query = "select id, timestamp, memdev, host, serial, error_status, first_error, header_log from cxl_aer_ue_event$conf{opt}{since} order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $error_status, $first_error, $header_log));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev);
+ $out .= "host=$host, " if (defined $host && length $host);
+ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial);
+ if (defined $error_status && length $error_status) {
+ $out .= sprintf "error_status: %s, ", get_cxl_ue_error_status_text($error_status);
+ }
+ if (defined $first_error && length $first_error) {
+ $out .= sprintf "first_error: %s, ", get_cxl_ue_error_status_text($first_error);
+ }
+ if (defined $header_log && length $header_log) {
+ $out .= sprintf "header_log:\n";
+ my @bytes = unpack "C*", $header_log;
+ for (my $i = 0; $i < CXL_HEADERLOG_SIZE_U32; $i++) {
+ $out .= sprintf "%08x ", $bytes[$i];
+ }
+ }
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "CXL AER uncorrectable events:\n$out\n";
+ } else {
+ print "No CXL AER uncorrectable errors.\n\n";
+ }
+ $query_handle->finish;
+ }
+
# Extlog errors
if ($has_extlog == 1) {
$query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";

View File

@ -0,0 +1,127 @@
commit dba1c58ef5802b96b6555cb42e3cf7f75fa0da8c
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Feb 12 10:56:25 2024 +0000
rasdaemon: ras-mc-ctl: Add support for CXL generic trace events
Add support for CXL generic events to the ras-mc-ctl tool.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
(cherry picked from commit fd11670d2d35c5d939b03ba1ca80eb81c1f636b6)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 16b0589..5528021 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1270,6 +1270,34 @@ sub get_cxl_ce_error_status_text
return join (", ", @out);
}
+use constant { # bit masks tested against the header flags in get_cxl_hdr_flags_text
+ CXL_EVENT_RECORD_FLAG_PERMANENT => 0x0004, # bit 2
+ CXL_EVENT_RECORD_FLAG_MAINT_NEEDED => 0x0008, # bit 3
+ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED => 0x0010, # bit 4
+ CXL_EVENT_RECORD_FLAG_HW_REPLACE => 0x0020, # bit 5
+};
+
+# Decode the flags word of a CXL event record header into readable text.
+# Takes the numeric flags as its only argument and returns the quoted
+# names of all set, known flags joined with ", " (an empty string when
+# none is set).
+sub get_cxl_hdr_flags_text
+{
+	my ($flags) = @_;
+
+	# Ordered (mask, text) pairs; the order fixes the output order.
+	my @decode = (
+		[CXL_EVENT_RECORD_FLAG_PERMANENT, "\'PERMANENT_CONDITION\' "],
+		[CXL_EVENT_RECORD_FLAG_MAINT_NEEDED, "\'MAINTENANCE_NEEDED\' "],
+		[CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, "\'PERFORMANCE_DEGRADED\' "],
+		[CXL_EVENT_RECORD_FLAG_HW_REPLACE, "\'HARDWARE_REPLACEMENT_NEEDED\' "],
+	);
+
+	my @names = map { $_->[1] } grep { $flags & $_->[0] } @decode;
+
+	return join (", ", @names);
+}
+
sub summary
{
require DBI;
@@ -1398,6 +1426,22 @@ sub summary
print "No CXL poison errors.\n\n";
}
$query_handle->finish;
+
+ # CXL generic errors
+ $query = "select memdev, count(*) from cxl_generic_event$conf{opt}{since} group by memdev";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($memdev, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$memdev errors: $count\n";
+ }
+ if ($out ne "") {
+ print "CXL generic events summary:\n$out\n";
+ } else {
+ print "No CXL generic errors.\n\n";
+ }
+ $query_handle->finish;
}
# extlog errors
@@ -1508,6 +1552,7 @@ sub errors
my ($memdev, $host, $serial, $error_status, $first_error, $header_log);
my ($log_type, $first_ts, $last_ts);
my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts);
+ my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1681,6 +1726,44 @@ sub errors
} else {
print "No CXL poison errors.\n\n";
}
+
+ # CXL generic errors
+ use constant CXL_EVENT_RECORD_DATA_LENGTH => 0x50;
+ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, data from cxl_generic_event$conf{opt}{since} order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev);
+ $out .= "host=$host, " if (defined $host && length $host);
+ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial);
+ $out .= "log=$log_type, " if (defined $log_type && length $log_type);
+ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid);
+ $out .= sprintf "hdr_flags=0x%llx %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags);
+ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle);
+ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle);
+ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts);
+ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length);
+ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class);
+		# Hex-dump the raw event record payload, four bytes per group
+		# with an offset label at every 16-byte boundary.
+		if (defined $data && length $data) {
+		    $out .= sprintf "data:\n";
+		    my @bytes = unpack "C*", $data;
+		    # Each pass prints bytes $i .. $i+3, so step by 4. A step of 1
+		    # repeats bytes in overlapping groups and indexes up to 3
+		    # elements past CXL_EVENT_RECORD_DATA_LENGTH.
+		    for (my $i = 0; $i < CXL_EVENT_RECORD_DATA_LENGTH; $i += 4) {
+			if (($i > 0) && (($i % 16) == 0)) {
+			    $out .= sprintf "\n %08x: ", $i;
+			}
+			$out .= sprintf "%02x%02x%02x%02x ", $bytes[$i], $bytes[$i + 1], $bytes[$i + 2], $bytes[$i + 3];
+		    }
+		}
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "CXL generic events:\n$out\n";
+ } else {
+ print "No CXL generic errors.\n\n";
+ }
}
# Extlog errors

6
gating.yaml Normal file
View File

@ -0,0 +1,6 @@
--- !Policy
product_versions:
- rhel-10
decision_context: osci_compose_gate
rules:
- !PassingTestCaseRule {test_case_name: osci.brew-build.tier0.functional}

336
rasdaemon.spec Normal file
View File

@ -0,0 +1,336 @@
Name: rasdaemon
Version: 0.8.0
Release: 8%{?dist}
Summary: Utility to receive RAS error tracings
Group: Applications/System
License: GPLv2
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2
# https://github.com/mchehab/rasdaemon/pull/96
# Add support for CXL poison and AER error events (4 patches)
# rasdaemon: Move definition for BIT and BIT_ULL to a common file
Patch0: d3836aa061f677232f99c514247d3dbf80812a1b.patch
# rasdaemon: Add support for the CXL poison events
Patch1: 75c8fec559641f843345ef8fbc36d124b60b914d.patch
# rasdaemon: Add support for the CXL AER uncorrectable errors
Patch2: a7524917befe7e67c02253cc27cb0c724e5992c0.patch
# rasdaemon: Add support for the CXL AER correctable errors
Patch3: a247baf7110ab6427259eb1421a103e2021a8735.patch
# https://github.com/mchehab/rasdaemon/pull/104
# rasdaemon: Process the generic CXL trace events (7 patches)
# rasdaemon: Add common function to convert timestamp in the CXL event records to the broken-down time format
Patch4: 2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch
# rasdaemon: Add common function to get timestamp for the event
Patch5: 7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch
# rasdaemon: Add support for the CXL overflow events
Patch6: f73ed45b91244eb3986ac2574cd7d36ae1d4d22a.patch
# rasdaemon: Add support for the CXL generic events
Patch7: e0cde0edf073b939d345aeba0aed23e238dbc53b.patch
# rasdaemon: Add support for the CXL general media events
Patch8: 53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch
# rasdaemon: Add support for the CXL dram events
Patch9: 9a2f6186db2622788f8868d8ec082684d6a06d4d.patch
# rasdaemon: Add support for the CXL memory module events
Patch10: f63b4c942e19a0da1e85a88783ed6e222ad4bdba.patch
# https://github.com/mchehab/rasdaemon/pull/149
# rasdaemon: generic fixes and ras-mc-ctl: add support for CXL error events (10 patches)
# rasdaemon: Fix build warnings unused variable if AMP RAS errors is not enabled
Patch11: 8f79833e3d78424f4a594985fbeb91890f4af81c.patch
# rasdaemon: ras-memory-failure-handler: update memory failure action page types
Patch12: 31c7578ddb0fc15aa7247f2b8885956540031221.patch
# rasdaemon: ras-mc-ctl: Add support for CXL AER uncorrectable trace events
Patch13: f8b6da812eddc063ea739970f941fdd24fb984ae.patch
# rasdaemon: ras-mc-ctl: Add support for CXL AER correctable trace events
Patch14: ae1647624486fca0070b297d0e2fd4e53443c10b.patch
# rasdaemon: ras-mc-ctl: Add support for CXL overflow trace events
Patch15: b22cb067755f4604770f9864a0babed8f93a1553.patch
# rasdaemon: ras-mc-ctl: Add support for CXL poison trace events
Patch16: 93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch
# rasdaemon: ras-mc-ctl: Add support for CXL generic trace events
Patch17: fd11670d2d35c5d939b03ba1ca80eb81c1f636b6.patch
# rasdaemon: ras-mc-ctl: Add support for CXL general media trace events
Patch18: 572de9d57691be9e630abee9ffa56a2fb155d558.patch
# rasdaemon: ras-mc-ctl: Add support for CXL DRAM trace events
Patch19: c38c14afc5d7bb6c8c52d1023271d755deb23008.patch
# rasdaemon: ras-mc-ctl: Add support for CXL memory module trace events
Patch20: aee13f74266382c64128bd7367a5eeb46277f490.patch
# ras-mc-ctl: add option to exclude old events from reports
Patch21: bd27251e3d52f57be1e245dff1cf221e09c5686f.patch
ExcludeArch: s390 s390x
BuildRequires: make
BuildRequires: gcc
BuildRequires: autoconf automake libtool
BuildRequires: gettext-devel
BuildRequires: perl-generators
BuildRequires: sqlite-devel
BuildRequires: systemd
BuildRequires: libtraceevent-devel
Provides: bundled(kernel-event-lib)
Requires: hwdata
Requires: perl-DBD-SQLite
Requires: libtraceevent
%ifarch %{ix86} x86_64
Requires: dmidecode
%endif
Requires(post): systemd
Requires(preun): systemd
Requires(postun): systemd
%description
%{name} is a RAS (Reliability, Availability and Serviceability) logging tool.
It currently records memory errors, using the EDAC tracing events.
EDAC is a set of drivers in the Linux kernel that handle detection of ECC errors
from memory controllers for most chipsets on i386 and x86_64 architectures.
EDAC drivers for other architectures like arm also exist.
This userspace component consists of an init script which makes sure
EDAC drivers and DIMM labels are loaded at system startup, as well as
a utility for reporting current error counts from the EDAC sysfs files.
%prep
%setup -q
# Apply the patches in order using the -P form of the patch macro; the
# numbered shorthand (patch0, patch1, ...) is deprecated in rpm 4.18 and
# no longer accepted by rpm 4.20.
%patch -P 0 -p1
%patch -P 1 -p1
%patch -P 2 -p1
%patch -P 3 -p1
%patch -P 4 -p1
%patch -P 5 -p1
%patch -P 6 -p1
%patch -P 7 -p1
%patch -P 8 -p1
%patch -P 9 -p1
%patch -P 10 -p1
%patch -P 11 -p1
%patch -P 12 -p1
%patch -P 13 -p1
%patch -P 14 -p1
%patch -P 15 -p1
%patch -P 16 -p1
%patch -P 17 -p1
%patch -P 18 -p1
%patch -P 19 -p1
%patch -P 20 -p1
%patch -P 21 -p1
# The patches touch the autotools inputs, so regenerate the build system.
autoreconf -vfi
%build
%ifarch %{arm} aarch64
%configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm \
--enable-mce --enable-extlog --enable-devlink --enable-diskerror \
--enable-memory-failure --enable-abrt-report --enable-hisi-ns-decode \
--enable-memory-ce-pfa --enable-amp-ns-decode --enable-cpu-fault-isolation \
--enable-cxl \
--with-sysconfdefdir=%{_sysconfdir}/sysconfig
%else
%configure --enable-sqlite3 --enable-aer \
--enable-mce --enable-extlog --enable-devlink --enable-diskerror \
--enable-memory-failure --enable-abrt-report --enable-cpu-fault-isolation \
--enable-cxl \
--with-sysconfdefdir=%{_sysconfdir}/sysconfig
%endif
make %{?_smp_mflags}
%install
make install DESTDIR=%{buildroot}
install -D -p -m 0644 misc/rasdaemon.service %{buildroot}%{_unitdir}/rasdaemon.service
install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service
# The sysconfig file is plain configuration: install it 0644, not 0655
# (which made it group/other-executable but not owner-executable).
install -D -p -m 0644 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name}
# Headers are not packaged; remove them from the buildroot.
rm INSTALL %{buildroot}%{_includedir}/*.h
%files
%doc AUTHORS ChangeLog COPYING README.md TODO
%{_sbindir}/rasdaemon
%{_sbindir}/ras-mc-ctl
%{_mandir}/*/*
%{_unitdir}/*.service
%{_sysconfdir}/ras/dimm_labels.d
%config(noreplace) %{_sysconfdir}/sysconfig/%{name}
%changelog
* Thu Feb 13 2025 Joel Savitz <jsavitz@redhat.com> - 0.8.0-8
- Add option to exclude old events from reports
Resolves: RHEL-79325
* Tue Jan 14 2025 Joel Savitz <jsavitz@redhat.com> - 0.8.0-7
- Add support for CXL memory failure event logging
Resolves: RHEL-61233
* Tue Oct 29 2024 Troy Dawson <tdawson@redhat.com> - 0.8.0-6
- Bump release for October 2024 mass rebuild:
Resolves: RHEL-64018
* Mon Jun 24 2024 Troy Dawson <tdawson@redhat.com> - 0.8.0-5
- Bump release for June 2024 mass rebuild
* Fri Jan 26 2024 Fedora Release Engineering <releng@fedoraproject.org> - 0.8.0-4
- Rebuilt for https://fedoraproject.org/wiki/Fedora_40_Mass_Rebuild
* Mon Jan 22 2024 Fedora Release Engineering <releng@fedoraproject.org> - 0.8.0-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_40_Mass_Rebuild
* Fri Jul 21 2023 Fedora Release Engineering <releng@fedoraproject.org> - 0.8.0-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_39_Mass_Rebuild
* Sat Feb 18 2023 Mauro Carvalho Chehab <mchehab@kernel.org> 0.8.0
- Bump to version 0.8.0 using libtraceevent.
* Sat Jan 21 2023 Mauro Carvalho Chehab <mchehab@kernel.org> 0.7.0
- Bump to version 0.7.0 with several fixes and additions
* Fri Jan 20 2023 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.8-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_38_Mass_Rebuild
* Sat Jul 23 2022 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.8-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_37_Mass_Rebuild
* Fri Apr 01 2022 Mauro Carvalho Chehab <mchehab@kernel.org> 0.6.8-1
- Fix sysconfdir issues and upgrade to version 0.6.8
* Fri Jan 21 2022 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.7-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_36_Mass_Rebuild
* Fri Jul 23 2021 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.7-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_35_Mass_Rebuild
* Wed May 26 2021 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> 0.6.7-1
- Bump to version 0.6.7 with several fixes and additions
* Wed Jan 27 2021 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.4-4
- Rebuilt for https://fedoraproject.org/wiki/Fedora_34_Mass_Rebuild
* Wed Jul 29 2020 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.4-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_33_Mass_Rebuild
* Thu Jan 30 2020 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.4-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild
* Thu Oct 10 2019 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.4-1
- Bump to version 0.6.4 with some DB changes for hip08 and some fixes
* Fri Aug 23 2019 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.3-1
- Bump to version 0.6.3 with new ARM events, plus disk I/O and netlink support
* Fri Jul 26 2019 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.2-3
- Rebuilt for https://fedoraproject.org/wiki/Fedora_31_Mass_Rebuild
* Sat Feb 02 2019 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.2-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild
* Tue Aug 14 2018 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.2-1
- Bump to version 0.6.2 with improvements for PCIe AER parsing and at ras-mc-ctl tool
* Sat Jul 14 2018 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.1-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_29_Mass_Rebuild
* Wed Apr 25 2018 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.1-1
- Bump to version 0.6.1 adding support for Skylake Xeon MSCOD, a bug fix and some new DELL labels
* Fri Feb 09 2018 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.0-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild
* Sat Oct 14 2017 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.6.0-1
- Bump to version 0.6.0 adding support for Arm and Hisilicon events and update Dell Skylake labels
* Thu Aug 03 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-6
- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild
* Thu Jul 27 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-5
- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Mass_Rebuild
* Sat Feb 11 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-4
- Rebuilt for https://fedoraproject.org/wiki/Fedora_26_Mass_Rebuild
* Fri Apr 15 2016 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.8-3
- Add a virtual provide, per BZ#104132
* Fri Apr 15 2016 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.8-2
- Bump to version 0.5.8 with support for Broadwell EP/EX MSCOD/DE MSCOD
* Thu Feb 04 2016 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.6-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_24_Mass_Rebuild
* Fri Jul 03 2015 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.6-1
- Bump to version 0.5.6 with support for LMCE and some fixes
* Thu Jun 18 2015 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.5-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_23_Mass_Rebuild
* Wed Jun 03 2015 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.5-1
- Bump to version 0.5.5 with support for newer Intel platforms & some fixes
* Tue Sep 16 2014 Peter Robinson <pbrobinson@fedoraproject.org> 0.5.4-3
- aarch64/ppc64 have edac capabilities
- spec cleanups
- No need to run autoreconf
* Sun Aug 17 2014 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.4-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_22_Mass_Rebuild
* Fri Aug 15 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.4-1
- Bump to version 0.5.4 with some fixes, mainly for amd64
* Sun Aug 10 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.3-1
- Bump to version 0.5.3 and enable ABRT and ExtLog
* Sun Jun 08 2014 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.2-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_Mass_Rebuild
* Thu Apr 03 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.2-1
- fix and enable ABRT report support
* Fri Mar 28 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.1-1
- Do some fixes at the service files and add some documentation for --record
* Sun Feb 16 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.0-1
- Add experimental ABRT support
* Tue Sep 10 2013 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.4.2-1
- Fix ras-mc-ctl layout filling
* Sun Aug 04 2013 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.4.1-5
- Rebuilt for https://fedoraproject.org/wiki/Fedora_20_Mass_Rebuild
* Wed Jul 17 2013 Petr Pisar <ppisar@redhat.com> - 0.4.1-4
- Perl 5.18 rebuild
* Sun Jun 2 2013 Peter Robinson <pbrobinson@fedoraproject.org> 0.4.1-3
- ARM has EDMA drivers (currently supported in Calxeda highbank)
* Wed May 29 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-2
- Fix the name of perl-DBD-SQLite package
* Wed May 29 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-1
- Updated to version 0.4.1 which contains some bug fixes
* Tue May 28 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.0-1
- Updated to version 0.4.0 and added support for mce, aer and sqlite3 storage
* Mon May 20 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.3.0-1
- Package created

1
sources Normal file
View File

@ -0,0 +1 @@
SHA512 (rasdaemon-0.8.0.tar.bz2) = 3f69bf41ae1efe11c153ffdf736116a78eee7d34dac1148f938e183e5523bc7c2da60af6376a1fc29c113dfad2c4b9798ba3b700f0eca7454f213d86df31c7a9

3
tests/basic-test.sh Normal file
View File

@ -0,0 +1,3 @@
#!/bin/sh
# Smoke test: the daemon must start and report as active, and the control
# tool must be able to print its summary. Prints PASS and exits 0 when all
# steps succeed; otherwise prints FAIL and exits 1.
if systemctl start rasdaemon &&
   systemctl status rasdaemon &&
   ras-mc-ctl --summary &&
   echo "PASS"
then
    exit 0
fi

echo "FAIL"
exit 1

9
tests/tests.yml Normal file
View File

@ -0,0 +1,9 @@
- hosts: localhost
roles:
- role: standard-test-basic # this is a standard test role, it takes care of the test environment, logging, archiving results..
tags:
- classic
tests:
- simple:
dir: .
run: "./basic-test.sh" # this is your test command, its exit code is the outcome of the test