diff --git a/2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch b/2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch new file mode 100644 index 0000000..eaa9559 --- /dev/null +++ b/2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch @@ -0,0 +1,66 @@ +commit 2ff9bc453998ddb145c7bb8ba30a57c56bd18eab +Author: Shiju Jose +Date: Tue Apr 4 14:40:42 2023 +0100 + + rasdaemon: Add common function to convert timestamp in the CXL event records to the broken-down time format + + Add common function to convert the timestamp in the CXL event records + in nanoseconds to the broken-down time format. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 8f6342d..59534a4 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -23,6 +23,25 @@ + #include "ras-report.h" + #include + ++/* Common Functions */ ++static void convert_timestamp(unsigned long long ts, char *ts_ptr, uint16_t size) ++{ ++ /* CXL Specification 3.0 ++ * Overflow timestamp - The number of unsigned nanoseconds ++ * that have elapsed since midnight, 01-Jan-1970 UTC ++ */ ++ time_t ts_secs = ts / 1000000000ULL; ++ struct tm *tm; ++ ++ tm = localtime(&ts_secs); ++ if (tm) ++ strftime(ts_ptr, size, "%Y-%m-%d %H:%M:%S %z", tm); ++ ++ if (!ts || !tm) ++ strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", ++ size); ++} ++ + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) + #define CXL_POISON_FLAG_OVERFLOW BIT(1) +@@ -168,22 +187,7 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + if (ev.flags & CXL_POISON_FLAG_OVERFLOW) { + if (tep_get_field_val(s, event, "overflow_ts", record, &val, 1) < 0) + return -1; +- if (val) { +- /* CXL Specification 3.0 +- * Overflow timestamp - The number of unsigned nanoseconds +- * that have elapsed since midnight, 01-Jan-1970 UTC +- */ +- time_t ovf_ts_secs = val / 1000000000ULL; +- +- tm = localtime(&ovf_ts_secs); +- if (tm) { +- strftime(ev.overflow_ts, sizeof(ev.overflow_ts), +- "%Y-%m-%d 
%H:%M:%S %z", tm); +- } +- } +- if (!val || !tm) +- strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", +- sizeof(ev.overflow_ts)); ++ convert_timestamp(val, ev.overflow_ts, sizeof(ev.overflow_ts)); + } else + strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", sizeof(ev.overflow_ts)); + if (trace_seq_printf(s, "overflow timestamp:%s\n", ev.overflow_ts) <= 0) diff --git a/31c7578ddb0fc15aa7247f2b8885956540031221.patch b/31c7578ddb0fc15aa7247f2b8885956540031221.patch new file mode 100644 index 0000000..7ee1e3b --- /dev/null +++ b/31c7578ddb0fc15aa7247f2b8885956540031221.patch @@ -0,0 +1,54 @@ +commit 31c7578ddb0fc15aa7247f2b8885956540031221 +Author: Shiju Jose +Date: Tue Feb 6 12:08:00 2024 +0000 + + rasdaemon: ras-memory-failure-handler: update memory failure action page types + + Update memory failure action page types corresponding to the same in + mm/memory-failure.c in the kernel. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index 97e8840..a5acc08 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -26,10 +26,8 @@ enum mf_action_page_type { + MF_MSG_KERNEL_HIGH_ORDER, + MF_MSG_SLAB, + MF_MSG_DIFFERENT_COMPOUND, +- MF_MSG_POISONED_HUGE, + MF_MSG_HUGE, + MF_MSG_FREE_HUGE, +- MF_MSG_NON_PMD_HUGE, + MF_MSG_UNMAP_FAILED, + MF_MSG_DIRTY_SWAPCACHE, + MF_MSG_CLEAN_SWAPCACHE, +@@ -41,7 +39,6 @@ enum mf_action_page_type { + MF_MSG_CLEAN_LRU, + MF_MSG_TRUNCATED_LRU, + MF_MSG_BUDDY, +- MF_MSG_BUDDY_2ND, + MF_MSG_DAX, + MF_MSG_UNSPLIT_THP, + MF_MSG_UNKNOWN, +@@ -64,10 +61,8 @@ static const struct { + { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"}, + { MF_MSG_SLAB, "kernel slab page"}, + { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"}, +- { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"}, + { MF_MSG_HUGE, "huge page"}, + { MF_MSG_FREE_HUGE, "free huge page"}, +- { MF_MSG_NON_PMD_HUGE, "non-pmd-sized 
huge page"}, + { MF_MSG_UNMAP_FAILED, "unmapping failed page"}, + { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"}, + { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"}, +@@ -79,7 +74,6 @@ static const struct { + { MF_MSG_CLEAN_LRU, "clean LRU page"}, + { MF_MSG_TRUNCATED_LRU, "already truncated LRU page"}, + { MF_MSG_BUDDY, "free buddy page"}, +- { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"}, + { MF_MSG_DAX, "dax page"}, + { MF_MSG_UNSPLIT_THP, "unsplit thp"}, + { MF_MSG_UNKNOWN, "unknown page"}, diff --git a/53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch b/53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch new file mode 100644 index 0000000..cb656cc --- /dev/null +++ b/53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch @@ -0,0 +1,551 @@ +commit 53c682fb45c2909c128be4ee8f51a3e42fe2f7fd +Author: Shiju Jose +Date: Wed Apr 5 11:54:41 2023 +0100 + + rasdaemon: Add support for the CXL general media events + + Add support to log and record the CXL general media events. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 83ada56..2de96f6 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -99,6 +99,14 @@ static char *uuid_be(const char *uu) + return uuid; + } + ++static const char* get_cxl_type_str(const char** type_array, uint8_t num_elems, uint8_t type) ++{ ++ if (type >= num_elems) ++ return "Unknown"; ++ ++ return type_array[type]; ++} ++ + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) + #define CXL_POISON_FLAG_OVERFLOW BIT(1) +@@ -709,3 +717,151 @@ int ras_cxl_generic_event_handler(struct trace_seq *s, + + return 0; + } ++ ++#define CXL_DPA_VOLATILE BIT(0) ++#define CXL_DPA_NOT_REPAIRABLE BIT(1) ++ ++static const struct cxl_event_flags cxl_dpa_flags[] = { ++ { .bit = CXL_DPA_VOLATILE, .flag = "VOLATILE" }, ++ { .bit = CXL_DPA_NOT_REPAIRABLE, .flag = "NOT_REPAIRABLE" }, ++}; ++ ++/* ++ * General Media Event Record - GMER ++ * CXL rev 3.0 Section 8.2.9.2.1.1; 
Table 8-43 ++ */ ++#define CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT BIT(0) ++#define CXL_GMER_EVT_DESC_THRESHOLD_EVENT BIT(1) ++#define CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW BIT(2) ++ ++static const struct cxl_event_flags cxl_gmer_event_desc_flags[] = { ++ { .bit = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, .flag = "UNCORRECTABLE EVENT" }, ++ { .bit = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, .flag = "THRESHOLD EVENT" }, ++ { .bit = CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW, .flag = "POISON LIST OVERFLOW" }, ++}; ++ ++#define CXL_GMER_VALID_CHANNEL BIT(0) ++#define CXL_GMER_VALID_RANK BIT(1) ++#define CXL_GMER_VALID_DEVICE BIT(2) ++#define CXL_GMER_VALID_COMPONENT BIT(3) ++ ++static const char* cxl_gmer_mem_event_type[] = { ++ "ECC Error", ++ "Invalid Address", ++ "Data Path Error", ++}; ++ ++static const char* cxl_gmer_trans_type[] = { ++ "Unknown", ++ "Host Read", ++ "Host Write", ++ "Host Scan Media", ++ "Host Inject Poison", ++ "Internal Media Scrub", ++ "Internal Media Management", ++}; ++ ++int ras_cxl_general_media_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len, i; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ struct ras_cxl_general_media_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0) ++ return -1; ++ ev.dpa = val; ++ if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa_flags", record, &val, 1) < 0) ++ return -1; ++ ev.dpa_flags = val; ++ if (trace_seq_printf(s, "dpa_flags:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.dpa_flags, cxl_dpa_flags, ARRAY_SIZE(cxl_dpa_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "descriptor", record, &val, 1) < 0) ++ return -1; ++ ev.descriptor = val; ++ if (trace_seq_printf(s, "descriptor:") <= 
0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.descriptor, cxl_gmer_event_desc_flags, ++ ARRAY_SIZE(cxl_gmer_event_desc_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "type", record, &val, 1) < 0) ++ return -1; ++ ev.type = val; ++ if (trace_seq_printf(s, "type:%s ", get_cxl_type_str(cxl_gmer_mem_event_type, ++ ARRAY_SIZE(cxl_gmer_mem_event_type), ev.type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "transaction_type", record, &val, 1) < 0) ++ return -1; ++ ev.transaction_type = val; ++ if (trace_seq_printf(s, "transaction_type:%s ", ++ get_cxl_type_str(cxl_gmer_trans_type, ++ ARRAY_SIZE(cxl_gmer_trans_type), ++ ev.transaction_type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "validity_flags", record, &val, 1) < 0) ++ return -1; ++ ev.validity_flags = val; ++ ++ if (ev.validity_flags & CXL_GMER_VALID_CHANNEL) { ++ if (tep_get_field_val(s, event, "channel", record, &val, 1) < 0) ++ return -1; ++ ev.channel = val; ++ if (trace_seq_printf(s, "channel:%u ", ev.channel) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_GMER_VALID_RANK) { ++ if (tep_get_field_val(s, event, "rank", record, &val, 1) < 0) ++ return -1; ++ ev.rank = val; ++ if (trace_seq_printf(s, "rank:%u ", ev.rank) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_GMER_VALID_DEVICE) { ++ if (tep_get_field_val(s, event, "device", record, &val, 1) < 0) ++ return -1; ++ ev.device = val; ++ if (trace_seq_printf(s, "device:%x ", ev.device) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_GMER_VALID_COMPONENT) { ++ ev.comp_id = tep_get_field_raw(s, event, "comp_id", record, &len, 1); ++ if (!ev.comp_id) ++ return -1; ++ if (trace_seq_printf(s, "comp_id:") <= 0) ++ return -1; ++ for (i = 0; i < CXL_EVENT_GEN_MED_COMP_ID_SIZE; i++) { ++ if (trace_seq_printf(s, "%02x ", ev.comp_id[i]) <= 0) ++ break; ++ } ++ } ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_general_media_event(ras, &ev); ++#endif ++ 
++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_general_media_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 9f77cb7..3adca4a 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -35,4 +35,7 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s, + int ras_cxl_generic_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int ras_cxl_general_media_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index 4036933..978dee4 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -250,6 +250,7 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable); + #endif + + free_ras: +@@ -1063,6 +1064,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_generic_event"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_general_media", ++ ras_cxl_general_media_event_handler, NULL, CXL_GENERAL_MEDIA_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_general_media"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 96c299e..9b83df3 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -44,6 +44,7 @@ enum { + CXL_AER_CE_EVENT, + CXL_OVERFLOW_EVENT, + CXL_GENERIC_EVENT, ++ CXL_GENERAL_MEDIA_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index a65d9c0..507a58e 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -846,6 +846,75 @@ int ras_store_cxl_generic_event(struct ras_events 
*ras, struct ras_cxl_generic_e + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_general_media_event ++ */ ++static const struct db_fields cxl_general_media_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "hdr_uuid", .type = "TEXT" }, ++ { .name = "hdr_flags", .type = "INTEGER" }, ++ { .name = "hdr_handle", .type = "INTEGER" }, ++ { .name = "hdr_related_handle", .type = "INTEGER" }, ++ { .name = "hdr_ts", .type = "TEXT" }, ++ { .name = "hdr_length", .type = "INTEGER" }, ++ { .name = "hdr_maint_op_class", .type = "INTEGER" }, ++ { .name = "dpa", .type = "INTEGER" }, ++ { .name = "dpa_flags", .type = "INTEGER" }, ++ { .name = "descriptor", .type = "INTEGER" }, ++ { .name = "type", .type = "INTEGER" }, ++ { .name = "transaction_type", .type = "INTEGER" }, ++ { .name = "channel", .type = "INTEGER" }, ++ { .name = "rank", .type = "INTEGER" }, ++ { .name = "device", .type = "INTEGER" }, ++ { .name = "comp_id", .type = "BLOB" }, ++}; ++ ++static const struct db_table_descriptor cxl_general_media_event_tab = { ++ .name = "cxl_general_media_event", ++ .fields = cxl_general_media_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_general_media_event_fields), ++}; ++ ++int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_general_media_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_general_media_event store: %p\n", ++ priv->stmt_cxl_general_media_event); ++ ++ ras_store_cxl_common_hdr(priv->stmt_cxl_general_media_event, &ev->hdr); ++ sqlite3_bind_int64(priv->stmt_cxl_general_media_event, 13, ev->dpa); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 14, ev->dpa_flags); 
++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 15, ev->descriptor); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 16, ev->type); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 17, ev->transaction_type); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 18, ev->channel); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 19, ev->rank); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 20, ev->device); ++ sqlite3_bind_blob(priv->stmt_cxl_general_media_event, 21, ev->comp_id, ++ CXL_EVENT_GEN_MED_COMP_ID_SIZE, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_general_media_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do stmt_cxl_general_media_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_general_media_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset stmt_cxl_general_media_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1229,6 +1298,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_general_media_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_general_media_event, ++ &cxl_general_media_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1390,6 +1467,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_generic_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_general_media_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_general_media_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_general_media_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index 
9ecfcda..37c32de 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -134,6 +134,7 @@ struct ras_cxl_poison_event { + #define CXL_HEADERLOG_SIZE SZ_512 + #define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) + #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 ++#define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 + + struct ras_cxl_aer_ue_event { + char timestamp[64]; +@@ -184,6 +185,20 @@ struct ras_cxl_generic_event { + uint8_t *data; + }; + ++struct ras_cxl_general_media_event { ++ struct ras_cxl_event_common_hdr hdr; ++ uint64_t dpa; ++ uint8_t dpa_flags; ++ uint8_t descriptor; ++ uint8_t type; ++ uint8_t transaction_type; ++ uint8_t channel; ++ uint8_t rank; ++ uint32_t device; ++ uint8_t *comp_id; ++ uint16_t validity_flags; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -198,6 +213,7 @@ struct ras_cxl_aer_ue_event; + struct ras_cxl_aer_ce_event; + struct ras_cxl_overflow_event; + struct ras_cxl_generic_event; ++struct ras_cxl_general_media_event; + + #ifdef HAVE_SQLITE3 + +@@ -236,6 +252,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_aer_ce_event; + sqlite3_stmt *stmt_cxl_overflow_event; + sqlite3_stmt *stmt_cxl_generic_event; ++ sqlite3_stmt *stmt_cxl_general_media_event; + #endif + }; + +@@ -269,6 +286,7 @@ int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_eve + int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); ++int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -287,6 +305,7 @@ static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_ + static inline int ras_store_cxl_aer_ce_event(struct 
ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; ++static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 8d7b76a..725dc9b 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -489,6 +489,60 @@ static int set_cxl_generic_event_backtrace(char *buf, struct ras_cxl_generic_eve + return 0; + } + ++static int set_cxl_general_media_event_backtrace(char *buf, struct ras_cxl_general_media_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "hdr_uuid=%s\n" \ ++ "hdr_flags=0x%x\n" \ ++ "hdr_handle=0x%x\n" \ ++ "hdr_related_handle=0x%x\n" \ ++ "hdr_timestamp=%s\n" \ ++ "hdr_length=%u\n" \ ++ "hdr_maint_op_class=%u\n" \ ++ "dpa=0x%lx\n" \ ++ "dpa_flags=%u\n" \ ++ "descriptor=%u\n" \ ++ "type=%u\n" \ ++ "transaction_type=%u\n" \ ++ "channel=%u\n" \ ++ "rank=%u\n" \ ++ "device=0x%x\n", \ ++ ev->hdr.timestamp, \ ++ ev->hdr.memdev, \ ++ ev->hdr.host, \ ++ ev->hdr.serial, \ ++ ev->hdr.log_type, \ ++ ev->hdr.hdr_uuid, \ ++ ev->hdr.hdr_flags, \ ++ ev->hdr.hdr_handle, \ ++ ev->hdr.hdr_related_handle, \ ++ ev->hdr.hdr_timestamp, \ ++ ev->hdr.hdr_length, \ ++ ev->hdr.hdr_maint_op_class, \ ++ ev->dpa, \ ++ ev->dpa_flags, \ ++ ev->descriptor, \ ++ ev->type, \ ++ ev->transaction_type, \ ++ ev->channel, \ ++ ev->rank, \ ++ ev->device); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -541,6 +595,9 @@ 
static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_GENERIC_EVENT: + rc = set_cxl_generic_event_backtrace(buf, (struct ras_cxl_generic_event *)ev); + break; ++ case CXL_GENERAL_MEDIA_EVENT: ++ rc = set_cxl_general_media_event_backtrace(buf, (struct ras_cxl_general_media_event *)ev); ++ break; + default: + return -1; + } +@@ -1170,3 +1227,47 @@ cxl_generic_fail: + return -1; + + } ++ ++int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_general_media_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_GENERAL_MEDIA_EVENT, ev); ++ if (rc < 0) ++ goto cxl_general_media_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_general_media_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_general_media_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL General Media Event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_general_media_fail; ++ ++ done = 1; ++ ++cxl_general_media_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index bf591a6..d9ec7df 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -44,6 +44,7 @@ int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_ev + int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); ++int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event 
*ev); + + #else + +@@ -60,6 +61,7 @@ static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras + static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; ++static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; + + #endif + diff --git a/572de9d57691be9e630abee9ffa56a2fb155d558.patch b/572de9d57691be9e630abee9ffa56a2fb155d558.patch new file mode 100644 index 0000000..4a89c04 --- /dev/null +++ b/572de9d57691be9e630abee9ffa56a2fb155d558.patch @@ -0,0 +1,182 @@ +commit dea649c9f9a6f2941e80cade9ed311a398e267be +Author: Shiju Jose +Date: Mon Feb 12 11:14:03 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL general media trace events + + Add support for CXL general media events to the ras-mc-ctl tool. 
+ + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit 572de9d57691be9e630abee9ffa56a2fb155d558) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 5528021..99b3c10 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1298,6 +1298,84 @@ sub get_cxl_hdr_flags_text + return join (", ", @out); + } + ++use constant { ++ CXL_DPA_VOLATILE => 0x0001, ++ CXL_DPA_NOT_REPAIRABLE => 0x0002, ++}; ++ ++sub get_cxl_dpa_flags_text ++{ ++ my $flags = $_[0]; ++ my @out; ++ ++ if ($flags & CXL_DPA_VOLATILE) { ++ push @out, (sprintf "\'VOLATILE\' "); ++ } ++ if ($flags & CXL_DPA_NOT_REPAIRABLE) { ++ push @out, (sprintf "\'NOT_REPAIRABLE\' "); ++ } ++ ++ return join (", ", @out); ++} ++ ++use constant { ++ CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT => 0x0001, ++ CXL_GMER_EVT_DESC_THRESHOLD_EVENT => 0x0002, ++ CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW => 0x0004, ++}; ++ ++sub get_cxl_descriptor_flags_text ++{ ++ my $flags = $_[0]; ++ my @out; ++ ++ if ($flags & CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT) { ++ push @out, (sprintf "\'UNCORRECTABLE EVENT\' "); ++ } ++ if ($flags & CXL_GMER_EVT_DESC_THRESHOLD_EVENT) { ++ push @out, (sprintf "\'THRESHOLD EVENT\' "); ++ } ++ if ($flags & CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW) { ++ push @out, (sprintf "\'POISON LIST OVERFLOW\' "); ++ } ++ ++ return join (", ", @out); ++} ++ ++sub get_cxl_mem_event_type ++{ ++ my @types; ++ ++ if ($_[0] < 0 || $_[0] > 2) { ++ return "unknown-type"; ++ } ++ ++ @types = ("ECC Error", ++ "Invalid Address", ++ "Data Path Error"); ++ ++ return $types[$_[0]]; ++} ++ ++sub get_cxl_transaction_type ++{ ++ my @types; ++ ++ if ($_[0] < 0 || $_[0] > 6) { ++ return "unknown-type"; ++ } ++ ++ @types = ("Unknown", ++ "Host Read", ++ "Host Write", ++ "Host Scan Media", ++ "Host Inject Poison", ++ "Internal Media Scrub", ++ "Internal Media Management"); ++ ++ return $types[$_[0]]; ++} ++ + sub summary + { + require DBI; +@@ -1442,6 +1520,22 @@ sub summary + print "No CXL 
generic errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL general media errors ++ $query = "select memdev, count(*) from cxl_general_media_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL general media events summary:\n$out\n"; ++ } else { ++ print "No CXL general media errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1553,6 +1647,7 @@ sub errors + my ($log_type, $first_ts, $last_ts); + my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts); + my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data); ++ my ($dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1764,6 +1859,49 @@ sub errors + } else { + print "No CXL generic errors.\n\n"; + } ++ ++ # CXL general media errors ++ use constant CXL_EVENT_GEN_MED_COMP_ID_SIZE => 0x10; ++ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, dpa, dpa_flags, descriptor, type, transaction_type, channel, rank, device, comp_id from cxl_general_media_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $dpa, $dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= 
"memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "log=$log_type, " if (defined $log_type && length $log_type); ++ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid); ++ $out .= sprintf "hdr_flags=0x%llx %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags); ++ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle); ++ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle); ++ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts); ++ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length); ++ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class); ++ $out .= sprintf "dpa=0x%llx, ", $dpa if (defined $dpa && length $dpa); ++ $out .= sprintf "dpa_flags: %s, ", get_cxl_dpa_flags_text($dpa_flags) if (defined $dpa_flags && length $dpa_flags); ++ $out .= sprintf "descriptor_flags: %s, ", get_cxl_descriptor_flags_text($descriptor) if (defined $descriptor && length $descriptor); ++ $out .= sprintf "memory event type: %s, ", get_cxl_mem_event_type($mem_event_type) if (defined $mem_event_type && length $mem_event_type); ++ $out .= sprintf "transaction_type: %s, ", get_cxl_transaction_type($transaction_type) if (defined $transaction_type && length $transaction_type); ++ $out .= sprintf "channel=%u, ", $channel if (defined $channel && length $channel); ++ $out .= sprintf "rank=%u, ", $rank if (defined $rank && length $rank); ++ $out .= sprintf "device=0x%x, ", $device if (defined $device && length $device); ++ if (defined $comp_id && length $comp_id) { ++ $out .= sprintf "component_id:"; ++ my @bytes = unpack 
"C*", $comp_id; ++ for (my $i = 0; $i < CXL_EVENT_GEN_MED_COMP_ID_SIZE; $i++) { ++ $out .= sprintf "%02x ", $bytes[$i]; ++ } ++ } ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL general media events:\n$out\n"; ++ } else { ++ print "No CXL general media errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/75c8fec559641f843345ef8fbc36d124b60b914d.patch b/75c8fec559641f843345ef8fbc36d124b60b914d.patch new file mode 100644 index 0000000..cd0aca4 --- /dev/null +++ b/75c8fec559641f843345ef8fbc36d124b60b914d.patch @@ -0,0 +1,663 @@ +commit 75c8fec559641f843345ef8fbc36d124b60b914d +Author: Shiju Jose +Date: Fri Mar 31 13:35:13 2023 +0100 + + rasdaemon: Add support for the CXL poison events + + Add support to log and record the CXL poison events. + + The corresponding Kernel patches here: + https://lore.kernel.org/linux-cxl/64457d30bae07_2028294ac@dwillia2-xfh.jf.intel.com.notmuch/ + + Presently for logging only, could be extended for the policy + based recovery action for the frequent poison events depending on the above + kernel patches. 
+ + Signed-off-by: Shiju Jose + Reviewed-by: Jonathan Cameron + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/Makefile.am b/Makefile.am +index 56c144e..5bddeac 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -73,6 +73,11 @@ endif + if WITH_CPU_FAULT_ISOLATION + rasdaemon_SOURCES += ras-cpu-isolation.c queue.c + endif ++ ++if WITH_CXL ++ rasdaemon_SOURCES += ras-cxl-handler.c ++endif ++ + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) + +@@ -81,7 +86,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ +- ras-cpu-isolation.h queue.h ++ ras-cxl-handler.h ras-cpu-isolation.h queue.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index f588090..ab5697d 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -127,6 +127,16 @@ AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" = "xyes"], + AM_CONDITIONAL([WITH_MEMORY_FAILURE], [test x$enable_memory_failure = xyes || test x$enable_all = xyes]) + AM_COND_IF([WITH_MEMORY_FAILURE], [USE_MEMORY_FAILURE="yes"], [USE_MEMORY_FAILURE="no"]) + ++AC_ARG_ENABLE([cxl], ++ AS_HELP_STRING([--enable-cxl], [enable CXL events (currently experimental)])) ++ ++AS_IF([test "x$enable_cxl" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_CXL,1,"have CXL events collect") ++ AC_SUBST([WITH_CXL]) ++]) ++AM_CONDITIONAL([WITH_CXL], [test x$enable_cxl = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_CXL], [USE_CXL="yes"], [USE_CXL="no"]) ++ + AC_ARG_ENABLE([abrt_report], + AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently 
experimental)])) + +@@ -215,6 +225,7 @@ compile time options summary + DEVLINK : $USE_DEVLINK + Disk I/O errors : $USE_DISKERROR + Memory Failure : $USE_MEMORY_FAILURE ++ CXL events : $USE_CXL + Memory CE PFA : $USE_MEMORY_CE_PFA + AMP RAS errors : $USE_AMP_NS_DECODE + CPU fault isolation : $USE_CPU_FAULT_ISOLATION +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +new file mode 100644 +index 0000000..cb23ba2 +--- /dev/null ++++ b/ras-cxl-handler.c +@@ -0,0 +1,202 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include "ras-cxl-handler.h" ++#include "ras-record.h" ++#include "ras-logger.h" ++#include "ras-report.h" ++ ++/* Poison List: Payload out flags */ ++#define CXL_POISON_FLAG_MORE BIT(0) ++#define CXL_POISON_FLAG_OVERFLOW BIT(1) ++#define CXL_POISON_FLAG_SCANNING BIT(2) ++ ++/* CXL poison - source types */ ++enum cxl_poison_source { ++ CXL_POISON_SOURCE_UNKNOWN = 0, ++ CXL_POISON_SOURCE_EXTERNAL = 1, ++ CXL_POISON_SOURCE_INTERNAL = 2, ++ CXL_POISON_SOURCE_INJECTED = 3, ++ CXL_POISON_SOURCE_VENDOR = 7, ++}; ++ ++/* CXL poison - trace types */ ++enum cxl_poison_trace_type { ++ CXL_POISON_TRACE_LIST, ++ CXL_POISON_TRACE_INJECT, ++ CXL_POISON_TRACE_CLEAR, ++}; ++ ++int ras_cxl_poison_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ time_t now; ++ struct tm *tm; ++ struct ras_cxl_poison_event ev; ++ ++ now = record->ts / user_hz + ras->uptime_diff; ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ else ++ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) ++ return -1; ++ ++ ev.memdev = tep_get_field_raw(s, event, "memdev", ++ record, &len, 1); ++ if (!ev.memdev) ++ return -1; ++ if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0) ++ return -1; ++ ++ ev.host = tep_get_field_raw(s, event, "host", ++ record, &len, 1); ++ if (!ev.host) ++ return -1; ++ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) ++ return -1; ++ ev.serial = val; ++ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "trace_type", record, &val, 1) < 0) ++ return -1; ++ switch (val) { ++ 
case CXL_POISON_TRACE_LIST: ++ ev.trace_type = "List"; ++ break; ++ case CXL_POISON_TRACE_INJECT: ++ ev.trace_type = "Inject"; ++ break; ++ case CXL_POISON_TRACE_CLEAR: ++ ev.trace_type = "Clear"; ++ break; ++ default: ++ ev.trace_type = "Invalid"; ++ } ++ if (trace_seq_printf(s, "trace_type:%s ", ev.trace_type) <= 0) ++ return -1; ++ ++ ev.region = tep_get_field_raw(s, event, "region", ++ record, &len, 1); ++ if (!ev.region) ++ return -1; ++ if (trace_seq_printf(s, "region:%s ", ev.region) <= 0) ++ return -1; ++ ++ ev.uuid = tep_get_field_raw(s, event, "uuid", ++ record, &len, 1); ++ if (!ev.uuid) ++ return -1; ++ if (trace_seq_printf(s, "region_uuid:%s ", ev.uuid) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hpa", record, &val, 1) < 0) ++ return -1; ++ ev.hpa = val; ++ if (trace_seq_printf(s, "poison list: hpa:0x%llx ", (unsigned long long)ev.hpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0) ++ return -1; ++ ev.dpa = val; ++ if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa_length", record, &val, 1) < 0) ++ return -1; ++ ev.dpa_length = val; ++ if (trace_seq_printf(s, "dpa_length:0x%x ", ev.dpa_length) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "source", record, &val, 1) < 0) ++ return -1; ++ switch (val) { ++ case CXL_POISON_SOURCE_UNKNOWN: ++ ev.source = "Unknown"; ++ break; ++ case CXL_POISON_SOURCE_EXTERNAL: ++ ev.source = "External"; ++ break; ++ case CXL_POISON_SOURCE_INTERNAL: ++ ev.source = "Internal"; ++ break; ++ case CXL_POISON_SOURCE_INJECTED: ++ ev.source = "Injected"; ++ break; ++ case CXL_POISON_SOURCE_VENDOR: ++ ev.source = "Vendor"; ++ break; ++ default: ++ ev.source = "Invalid"; ++ } ++ if (trace_seq_printf(s, "source:%s ", ev.source) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "flags", record, &val, 1) < 0) ++ return -1; ++ ev.flags = val; ++ if (trace_seq_printf(s, "flags:%d ", 
ev.flags) <= 0) ++ return -1; ++ ++ if (ev.flags & CXL_POISON_FLAG_OVERFLOW) { ++ if (tep_get_field_val(s, event, "overflow_ts", record, &val, 1) < 0) ++ return -1; ++ if (val) { ++ /* CXL Specification 3.0 ++ * Overflow timestamp - The number of unsigned nanoseconds ++ * that have elapsed since midnight, 01-Jan-1970 UTC ++ */ ++ time_t ovf_ts_secs = val / 1000000000ULL; ++ ++ tm = localtime(&ovf_ts_secs); ++ if (tm) { ++ strftime(ev.overflow_ts, sizeof(ev.overflow_ts), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ } ++ } ++ if (!val || !tm) ++ strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", ++ sizeof(ev.overflow_ts)); ++ } else ++ strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", sizeof(ev.overflow_ts)); ++ if (trace_seq_printf(s, "overflow timestamp:%s\n", ev.overflow_ts) <= 0) ++ return -1; ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_poison_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_poison_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +new file mode 100644 +index 0000000..84d5cc6 +--- /dev/null ++++ b/ras-cxl-handler.h +@@ -0,0 +1,24 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ */ ++ ++#ifndef __RAS_CXL_HANDLER_H ++#define __RAS_CXL_HANDLER_H ++ ++#include "ras-events.h" ++#include ++ ++int ras_cxl_poison_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); ++#endif +diff --git a/ras-events.c b/ras-events.c +index 5fe8e19..f95844a 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -40,6 +40,7 @@ + #include "ras-devlink-handler.h" + #include "ras-diskerror-handler.h" + #include "ras-memory-failure-handler.h" ++#include "ras-cxl-handler.h" + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-page-isolation.h" +@@ -243,6 +244,10 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable); + #endif + ++#ifdef HAVE_CXL ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable); ++#endif ++ + free_ras: + free(ras); + return rc; +@@ -979,6 +984,16 @@ int handle_ras_events(int record_events) + "ras", "memory_failure_event"); + #endif + ++#ifdef HAVE_CXL ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_poison", ++ ras_cxl_poison_event_handler, NULL, CXL_POISON_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_poison"); ++#endif ++ + if (!num_events) { + log(ALL, LOG_INFO, + "Failed to trace all supported RAS events. 
Aborting.\n"); +diff --git a/ras-events.h b/ras-events.h +index 649b0c0..1ef3ecd 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -39,6 +39,7 @@ enum { + DEVLINK_EVENT, + DISKERROR_EVENT, + MF_EVENT, ++ CXL_POISON_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index adc97a4..c31baa0 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -559,6 +559,71 @@ int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) + } + #endif + ++#ifdef HAVE_CXL ++/* ++ * Table and functions to handle cxl:cxl_poison ++ */ ++static const struct db_fields cxl_poison_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "trace_type", .type = "TEXT" }, ++ { .name = "region", .type = "TEXT" }, ++ { .name = "region_uuid", .type = "TEXT" }, ++ { .name = "hpa", .type = "INTEGER" }, ++ { .name = "dpa", .type = "INTEGER" }, ++ { .name = "dpa_length", .type = "INTEGER" }, ++ { .name = "source", .type = "TEXT" }, ++ { .name = "flags", .type = "INTEGER" }, ++ { .name = "overflow_ts", .type = "TEXT" }, ++}; ++ ++static const struct db_table_descriptor cxl_poison_event_tab = { ++ .name = "cxl_poison_event", ++ .fields = cxl_poison_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_poison_event_fields), ++}; ++ ++int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_poison_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_poison_event store: %p\n", priv->stmt_cxl_poison_event); ++ ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 2, ev->memdev, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 3, ev->host, -1, NULL); ++ 
sqlite3_bind_int64(priv->stmt_cxl_poison_event, 4, ev->serial); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 5, ev->trace_type, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 6, ev->region, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 7, ev->uuid, -1, NULL); ++ sqlite3_bind_int64(priv->stmt_cxl_poison_event, 8, ev->hpa); ++ sqlite3_bind_int64(priv->stmt_cxl_poison_event, 9, ev->dpa); ++ sqlite3_bind_int(priv->stmt_cxl_poison_event, 10, ev->dpa_length); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 11, ev->source, -1, NULL); ++ sqlite3_bind_int(priv->stmt_cxl_poison_event, 12, ev->flags); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 13, ev->overflow_ts, -1, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_poison_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do cxl_poison_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_poison_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset cxl_poison_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} ++#endif ++ + /* + * Generic code + */ +@@ -900,6 +965,16 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + } + #endif + ++#ifdef HAVE_CXL ++ rc = ras_mc_create_table(priv, &cxl_poison_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_poison_event, ++ &cxl_poison_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } ++#endif ++ + ras->db_priv = priv; + return 0; + +@@ -1019,6 +1094,16 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + } + #endif + ++#ifdef HAVE_CXL ++ if (priv->stmt_cxl_poison_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_poison_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_poison_event sqlite: error = %d\n", ++ cpu, rc); ++ } ++#endif ++ + rc = sqlite3_close_v2(db); + if (rc != 
SQLITE_OK) + log(TERM, LOG_ERR, +diff --git a/ras-record.h b/ras-record.h +index 219f10b..fd15215 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -114,6 +114,22 @@ struct ras_mf_event { + const char *action_result; + }; + ++struct ras_cxl_poison_event { ++ char timestamp[64]; ++ const char *memdev; ++ const char *host; ++ uint64_t serial; ++ const char *trace_type; ++ const char *region; ++ const char *uuid; ++ uint64_t hpa; ++ uint64_t dpa; ++ uint32_t dpa_length; ++ const char *source; ++ uint8_t flags; ++ char overflow_ts[64]; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -123,6 +139,7 @@ struct mce_event; + struct devlink_event; + struct diskerror_event; + struct ras_mf_event; ++struct ras_cxl_poison_event; + + #ifdef HAVE_SQLITE3 + +@@ -155,6 +172,9 @@ struct sqlite3_priv { + #ifdef HAVE_MEMORY_FAILURE + sqlite3_stmt *stmt_mf_event; + #endif ++#ifdef HAVE_CXL ++ sqlite3_stmt *stmt_cxl_poison_event; ++#endif + }; + + struct db_fields { +@@ -182,6 +202,7 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev); + int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev); + int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev); + int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); ++int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -195,6 +216,7 @@ static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_ev + static inline int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; }; + static inline int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; }; + static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; ++static inline int 
ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 62d5eb7..3daecc0 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -331,6 +331,46 @@ static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev) + return 0; + } + ++static int set_cxl_poison_event_backtrace(char *buf, struct ras_cxl_poison_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "trace_type=%s\n" \ ++ "region=%s\n" \ ++ "region_uuid=%s\n" \ ++ "hpa=0x%lx\n" \ ++ "dpa=0x%lx\n" \ ++ "dpa_length=0x%x\n" \ ++ "source=%s\n" \ ++ "flags=%u\n" \ ++ "overflow_timestamp=%s\n", \ ++ ev->timestamp, \ ++ ev->memdev, \ ++ ev->host, \ ++ ev->serial, \ ++ ev->trace_type, \ ++ ev->region, \ ++ ev->uuid, \ ++ ev->hpa, \ ++ ev->dpa, \ ++ ev->dpa_length, \ ++ ev->source, \ ++ ev->flags, \ ++ ev->overflow_ts); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -368,6 +408,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case MF_EVENT: + rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev); + break; ++ case CXL_POISON_EVENT: ++ rc = set_cxl_poison_event_backtrace(buf, (struct ras_cxl_poison_event *)ev); ++ break; + default: + return -1; + } +@@ -776,3 +819,47 @@ mf_fail: + else + return -1; + } ++ ++int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_poison_fail; ++ ++ rc = commit_report_backtrace(sockfd, 
CXL_POISON_EVENT, ev); ++ if (rc < 0) ++ goto cxl_poison_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl-poison"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_poison_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL poison"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_poison_fail; ++ ++ done = 1; ++ ++cxl_poison_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index e605eb1..d1591ce 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -39,6 +39,7 @@ int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev); + int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev); + int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev); + int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); ++int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + + #else + +@@ -50,6 +51,7 @@ static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_ev + static inline int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; }; + static inline int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; }; + static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; ++static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + + #endif + diff --git a/7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch b/7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch new file mode 100644 index 0000000..b6092db --- /dev/null +++ b/7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch @@ -0,0 +1,97 @@ +commit 7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513 +Author: Shiju Jose +Date: Tue Apr 4 16:07:21 2023 +0100 + + rasdaemon: Add common function to 
get timestamp for the event + + Add common function to get the timestamp for the event + reported. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 59534a4..d540ebb 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -42,6 +42,20 @@ static void convert_timestamp(unsigned long long ts, char *ts_ptr, uint16_t size + size); + } + ++static void get_timestamp(struct trace_seq *s, struct tep_record *record, ++ struct ras_events *ras, char *ts_ptr, uint16_t size) ++{ ++ time_t now; ++ struct tm *tm; ++ ++ now = record->ts / user_hz + ras->uptime_diff; ++ tm = localtime(&now); ++ if (tm) ++ strftime(ts_ptr, size, "%Y-%m-%d %H:%M:%S %z", tm); ++ else ++ strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", size); ++} ++ + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) + #define CXL_POISON_FLAG_OVERFLOW BIT(1) +@@ -70,17 +84,9 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + int len; + unsigned long long val; + struct ras_events *ras = context; +- time_t now; +- struct tm *tm; + struct ras_cxl_poison_event ev; + +- now = record->ts / user_hz + ras->uptime_diff; +- tm = localtime(&now); +- if (tm) +- strftime(ev.timestamp, sizeof(ev.timestamp), +- "%Y-%m-%d %H:%M:%S %z", tm); +- else +- strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; + +@@ -285,19 +291,11 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s, + { + int len, i; + unsigned long long val; +- time_t now; +- struct tm *tm; + struct ras_events *ras = context; + struct ras_cxl_aer_ue_event ev; + + memset(&ev, 0, sizeof(ev)); +- now = record->ts / user_hz + ras->uptime_diff; +- tm = localtime(&now); +- if (tm) +- strftime(ev.timestamp, sizeof(ev.timestamp), +- "%Y-%m-%d %H:%M:%S %z", tm); +- else +- strncpy(ev.timestamp, 
"1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; + +@@ -380,18 +378,10 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, + { + int len; + unsigned long long val; +- time_t now; +- struct tm *tm; + struct ras_events *ras = context; + struct ras_cxl_aer_ce_event ev; + +- now = record->ts / user_hz + ras->uptime_diff; +- tm = localtime(&now); +- if (tm) +- strftime(ev.timestamp, sizeof(ev.timestamp), +- "%Y-%m-%d %H:%M:%S %z", tm); +- else +- strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; + diff --git a/8f79833e3d78424f4a594985fbeb91890f4af81c.patch b/8f79833e3d78424f4a594985fbeb91890f4af81c.patch new file mode 100644 index 0000000..b509270 --- /dev/null +++ b/8f79833e3d78424f4a594985fbeb91890f4af81c.patch @@ -0,0 +1,78 @@ +commit 8f79833e3d78424f4a594985fbeb91890f4af81c +Author: Shiju Jose +Date: Mon Mar 4 11:49:50 2024 +0000 + + rasdaemon: Fix build warnings unused variable if AMP RAS errors is not enabled + + This patch fixes following build warnings unused variable if AMP RAS errors + is not enabled(--enable-amp-ns-decode). 
+ + ================================================== + ras-aer-handler.c: In function ‘ras_aer_event_handler’: + ras-aer-handler.c:72:21: warning: unused variable ‘fn’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~ + ras-aer-handler.c:72:16: warning: unused variable ‘dev’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~~ + ras-aer-handler.c:72:11: warning: unused variable ‘bus’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~~ + ras-aer-handler.c:72:6: warning: unused variable ‘seg’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~~ + ras-aer-handler.c:71:10: warning: variable ‘sel_data’ set but not used [-Wunused-but-set-variable] + uint8_t sel_data[5]; + ^~~~~~~~ + ras-aer-handler.c:70:7: warning: unused variable ‘ipmi_add_sel’ [-Wunused-variable] + char ipmi_add_sel[105]; + ^~~~~~~~~~~~ + ================================================== + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index bb1a6f6..29f6551 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -67,9 +67,11 @@ int ras_aer_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_aer_event ev; + char buf[BUF_LEN]; ++#ifdef HAVE_AMP_NS_DECODE + char ipmi_add_sel[105]; + uint8_t sel_data[5]; + int seg, bus, dev, fn; ++#endif + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. 
+@@ -132,19 +134,27 @@ int ras_aer_event_handler(struct trace_seq *s, + switch (severity_val) { + case HW_EVENT_AER_UNCORRECTED_NON_FATAL: + ev.error_type = "Uncorrected (Non-Fatal)"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xca; ++#endif + break; + case HW_EVENT_AER_UNCORRECTED_FATAL: + ev.error_type = "Uncorrected (Fatal)"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xca; ++#endif + break; + case HW_EVENT_AER_CORRECTED: + ev.error_type = "Corrected"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xbf; ++#endif + break; + default: + ev.error_type = "Unknown severity"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xbf; ++#endif + } + trace_seq_puts(s, ev.error_type); + diff --git a/93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch b/93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch new file mode 100644 index 0000000..4952349 --- /dev/null +++ b/93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch @@ -0,0 +1,82 @@ +commit b6506f22fb2d7f44d9d633d44656dff2a94f257e +Author: Shiju Jose +Date: Mon Feb 12 10:49:10 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL poison trace events + + Add support for CXL poison events to the ras-mc-ctl tool. 
+ + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit 93ca96b66c917af37b2ae9295dc5df46a7d64dd2) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 6a319a7..16b0589 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1382,6 +1382,22 @@ sub summary + print "No CXL overflow errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL poison errors ++ $query = "select memdev, count(*) from cxl_poison_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL poison events summary:\n$out\n"; ++ } else { ++ print "No CXL poison errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1491,6 +1507,7 @@ sub errors + my ($pfn, $page_type, $action_result); + my ($memdev, $host, $serial, $error_status, $first_error, $header_log); + my ($log_type, $first_ts, $last_ts); ++ my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1636,6 +1653,34 @@ sub errors + } else { + print "No CXL overflow errors.\n\n"; + } ++ ++ # CXL poison errors ++ $query = "select id, timestamp, memdev, host, serial, trace_type, region, region_uuid, hpa, dpa, dpa_length, source, flags, overflow_ts from cxl_poison_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host 
&& length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "trace_type=$trace_type, " if (defined $trace_type && length $trace_type); ++ $out .= "region=$region, " if (defined $region && length $region); ++ $out .= "region_uuid=$region_uuid, " if (defined $region_uuid && length $region_uuid); ++ $out .= sprintf "hpa=0x%llx, ", $hpa if (defined $hpa && length $hpa); ++ $out .= sprintf "dpa=0x%llx, ", $dpa if (defined $dpa && length $dpa); ++ $out .= sprintf "dpa_length=0x%x, ", $dpa_length if (defined $dpa_length && length $dpa_length); ++ $out .= "source=$source, " if (defined $source && length $source); ++ $out .= sprintf "flags=%d, ", $flags if (defined $flags && length $flags); ++ $out .= "overflow timestamp=$overflow_ts " if (defined $overflow_ts && length $overflow_ts); ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL poison events:\n$out\n"; ++ } else { ++ print "No CXL poison errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/9a2f6186db2622788f8868d8ec082684d6a06d4d.patch b/9a2f6186db2622788f8868d8ec082684d6a06d4d.patch new file mode 100644 index 0000000..c85f54e --- /dev/null +++ b/9a2f6186db2622788f8868d8ec082684d6a06d4d.patch @@ -0,0 +1,559 @@ +commit 9a2f6186db2622788f8868d8ec082684d6a06d4d +Author: Shiju Jose +Date: Wed Apr 5 13:28:20 2023 +0100 + + rasdaemon: Add support for the CXL dram events + + Add support to log and record the CXL dram events. 
+ + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 2de96f6..64b0b50 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -865,3 +865,154 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s, + + return 0; + } ++ ++/* ++ * DRAM Event Record - DER ++ * ++ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 ++ */ ++#define CXL_DER_VALID_CHANNEL BIT(0) ++#define CXL_DER_VALID_RANK BIT(1) ++#define CXL_DER_VALID_NIBBLE BIT(2) ++#define CXL_DER_VALID_BANK_GROUP BIT(3) ++#define CXL_DER_VALID_BANK BIT(4) ++#define CXL_DER_VALID_ROW BIT(5) ++#define CXL_DER_VALID_COLUMN BIT(6) ++#define CXL_DER_VALID_CORRECTION_MASK BIT(7) ++ ++int ras_cxl_dram_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len, i; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ struct ras_cxl_dram_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0) ++ return -1; ++ ev.dpa = val; ++ if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa_flags", record, &val, 1) < 0) ++ return -1; ++ ev.dpa_flags = val; ++ if (trace_seq_printf(s, "dpa_flags:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.dpa_flags, cxl_dpa_flags, ARRAY_SIZE(cxl_dpa_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "descriptor", record, &val, 1) < 0) ++ return -1; ++ ev.descriptor = val; ++ if (trace_seq_printf(s, "descriptor:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.descriptor, cxl_gmer_event_desc_flags, ++ ARRAY_SIZE(cxl_gmer_event_desc_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "type", record, &val, 1) < 0) ++ return -1; ++ ev.type = val; ++ if (trace_seq_printf(s, "type:%s ", 
get_cxl_type_str(cxl_gmer_mem_event_type, ++ ARRAY_SIZE(cxl_gmer_mem_event_type), ev.type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "transaction_type", record, &val, 1) < 0) ++ return -1; ++ ev.transaction_type = val; ++ if (trace_seq_printf(s, "transaction_type:%s ", ++ get_cxl_type_str(cxl_gmer_trans_type, ++ ARRAY_SIZE(cxl_gmer_trans_type), ++ ev.transaction_type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "validity_flags", record, &val, 1) < 0) ++ return -1; ++ ev.validity_flags = val; ++ ++ if (ev.validity_flags & CXL_DER_VALID_CHANNEL) { ++ if (tep_get_field_val(s, event, "channel", record, &val, 1) < 0) ++ return -1; ++ ev.channel = val; ++ if (trace_seq_printf(s, "channel:%u ", ev.channel) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_RANK) { ++ if (tep_get_field_val(s, event, "rank", record, &val, 1) < 0) ++ return -1; ++ ev.rank = val; ++ if (trace_seq_printf(s, "rank:%u ", ev.rank) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_NIBBLE) { ++ if (tep_get_field_val(s, event, "nibble_mask", record, &val, 1) < 0) ++ return -1; ++ ev.nibble_mask = val; ++ if (trace_seq_printf(s, "nibble_mask:%u ", ev.nibble_mask) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_BANK_GROUP) { ++ if (tep_get_field_val(s, event, "bank_group", record, &val, 1) < 0) ++ return -1; ++ ev.bank_group = val; ++ if (trace_seq_printf(s, "bank_group:%u ", ev.bank_group) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_BANK) { ++ if (tep_get_field_val(s, event, "bank", record, &val, 1) < 0) ++ return -1; ++ ev.bank = val; ++ if (trace_seq_printf(s, "bank:%u ", ev.bank) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_ROW) { ++ if (tep_get_field_val(s, event, "row", record, &val, 1) < 0) ++ return -1; ++ ev.row = val; ++ if (trace_seq_printf(s, "row:%u ", ev.row) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_COLUMN) { ++ if 
(tep_get_field_val(s, event, "column", record, &val, 1) < 0) ++ return -1; ++ ev.column = val; ++ if (trace_seq_printf(s, "column:%u ", ev.column) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_CORRECTION_MASK) { ++ ev.cor_mask = tep_get_field_raw(s, event, "cor_mask", record, &len, 1); ++ if (!ev.cor_mask) ++ return -1; ++ if (trace_seq_printf(s, "correction_mask:") <= 0) ++ return -1; ++ for (i = 0; i < CXL_EVENT_DER_CORRECTION_MASK_SIZE; i++) { ++ if (trace_seq_printf(s, "%02x ", ev.cor_mask[i]) <= 0) ++ break; ++ } ++ } ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_dram_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_dram_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 3adca4a..35455af 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -38,4 +38,7 @@ int ras_cxl_generic_event_handler(struct trace_seq *s, + int ras_cxl_general_media_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int ras_cxl_dram_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index 978dee4..d27e0c4 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -251,6 +251,7 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_dram", enable); + #endif + + free_ras: +@@ -1072,6 +1073,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_general_media"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_dram", ++ ras_cxl_dram_event_handler, NULL, 
CXL_DRAM_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_dram"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 9b83df3..d192a6b 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -45,6 +45,7 @@ enum { + CXL_OVERFLOW_EVENT, + CXL_GENERIC_EVENT, + CXL_GENERAL_MEDIA_EVENT, ++ CXL_DRAM_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index 507a58e..fffa81c 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -915,6 +915,83 @@ int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_gen + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_dram_event ++ */ ++static const struct db_fields cxl_dram_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "hdr_uuid", .type = "TEXT" }, ++ { .name = "hdr_flags", .type = "INTEGER" }, ++ { .name = "hdr_handle", .type = "INTEGER" }, ++ { .name = "hdr_related_handle", .type = "INTEGER" }, ++ { .name = "hdr_ts", .type = "TEXT" }, ++ { .name = "hdr_length", .type = "INTEGER" }, ++ { .name = "hdr_maint_op_class", .type = "INTEGER" }, ++ { .name = "dpa", .type = "INTEGER" }, ++ { .name = "dpa_flags", .type = "INTEGER" }, ++ { .name = "descriptor", .type = "INTEGER" }, ++ { .name = "type", .type = "INTEGER" }, ++ { .name = "transaction_type", .type = "INTEGER" }, ++ { .name = "channel", .type = "INTEGER" }, ++ { .name = "rank", .type = "INTEGER" }, ++ { .name = "nibble_mask", .type = "INTEGER" }, ++ { .name = "bank_group", .type = "INTEGER" }, ++ { .name = "bank", .type = "INTEGER" }, ++ { .name = "row", .type = "INTEGER" }, ++ { .name = "column", .type = "INTEGER" }, ++ { .name = "cor_mask", .type = "BLOB" }, ++}; ++ ++static const struct 
db_table_descriptor cxl_dram_event_tab = { ++ .name = "cxl_dram_event", ++ .fields = cxl_dram_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_dram_event_fields), ++}; ++ ++int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_dram_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_dram_event store: %p\n", ++ priv->stmt_cxl_dram_event); ++ ++ ras_store_cxl_common_hdr(priv->stmt_cxl_dram_event, &ev->hdr); ++ sqlite3_bind_int64(priv->stmt_cxl_dram_event, 13, ev->dpa); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 14, ev->dpa_flags); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 15, ev->descriptor); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 16, ev->type); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 17, ev->transaction_type); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 18, ev->channel); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 19, ev->rank); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 20, ev->nibble_mask); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 21, ev->bank_group); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 22, ev->bank); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 23, ev->row); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 24, ev->column); ++ sqlite3_bind_blob(priv->stmt_cxl_dram_event, 25, ev->cor_mask, ++ CXL_EVENT_DER_CORRECTION_MASK_SIZE, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_dram_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do stmt_cxl_dram_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_dram_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset stmt_cxl_dram_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1306,6 +1383,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) 
+ goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_dram_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_dram_event, ++ &cxl_dram_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1475,6 +1560,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_general_media_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_dram_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_dram_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_dram_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index 37c32de..480ff92 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -135,6 +135,7 @@ struct ras_cxl_poison_event { + #define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) + #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 + #define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 ++#define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 + + struct ras_cxl_aer_ue_event { + char timestamp[64]; +@@ -199,6 +200,24 @@ struct ras_cxl_general_media_event { + uint16_t validity_flags; + }; + ++struct ras_cxl_dram_event { ++ struct ras_cxl_event_common_hdr hdr; ++ uint64_t dpa; ++ uint8_t dpa_flags; ++ uint8_t descriptor; ++ uint8_t type; ++ uint8_t transaction_type; ++ uint8_t channel; ++ uint8_t rank; ++ uint32_t nibble_mask; ++ uint8_t bank_group; ++ uint8_t bank; ++ uint32_t row; ++ uint16_t column; ++ uint8_t *cor_mask; ++ uint16_t validity_flags; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -214,6 +233,7 @@ struct ras_cxl_aer_ce_event; + struct ras_cxl_overflow_event; + struct ras_cxl_generic_event; + struct ras_cxl_general_media_event; ++struct ras_cxl_dram_event; + + #ifdef HAVE_SQLITE3 + +@@ -253,6 +273,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_overflow_event; + sqlite3_stmt 
*stmt_cxl_generic_event; + sqlite3_stmt *stmt_cxl_general_media_event; ++ sqlite3_stmt *stmt_cxl_dram_event; + #endif + }; + +@@ -287,6 +308,7 @@ int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_eve + int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); ++int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -306,6 +328,7 @@ static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_ + static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; ++static inline int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 725dc9b..21180b1 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -543,6 +543,68 @@ static int set_cxl_general_media_event_backtrace(char *buf, struct ras_cxl_gener + return 0; + } + ++static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "hdr_uuid=%s\n" \ ++ "hdr_flags=0x%x\n" \ ++ "hdr_handle=0x%x\n" \ ++ "hdr_related_handle=0x%x\n" \ ++ "hdr_timestamp=%s\n" \ ++ "hdr_length=%u\n" 
\ ++ "hdr_maint_op_class=%u\n" \ ++ "dpa=0x%lx\n" \ ++ "dpa_flags=%u\n" \ ++ "descriptor=%u\n" \ ++ "type=%u\n" \ ++ "transaction_type=%u\n" \ ++ "channel=%u\n" \ ++ "rank=%u\n" \ ++ "nibble_mask=%u\n" \ ++ "bank_group=%u\n" \ ++ "bank=%u\n" \ ++ "row=%u\n" \ ++ "column=%u\n", \ ++ ev->hdr.timestamp, \ ++ ev->hdr.memdev, \ ++ ev->hdr.host, \ ++ ev->hdr.serial, \ ++ ev->hdr.log_type, \ ++ ev->hdr.hdr_uuid, \ ++ ev->hdr.hdr_flags, \ ++ ev->hdr.hdr_handle, \ ++ ev->hdr.hdr_related_handle, \ ++ ev->hdr.hdr_timestamp, \ ++ ev->hdr.hdr_length, \ ++ ev->hdr.hdr_maint_op_class, \ ++ ev->dpa, \ ++ ev->dpa_flags, \ ++ ev->descriptor, \ ++ ev->type, \ ++ ev->transaction_type, \ ++ ev->channel, \ ++ ev->rank, \ ++ ev->nibble_mask, \ ++ ev->bank_group, \ ++ ev->bank, \ ++ ev->row, \ ++ ev->column); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -598,6 +660,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_GENERAL_MEDIA_EVENT: + rc = set_cxl_general_media_event_backtrace(buf, (struct ras_cxl_general_media_event *)ev); + break; ++ case CXL_DRAM_EVENT: ++ rc = set_cxl_dram_event_backtrace(buf, (struct ras_cxl_dram_event *)ev); ++ break; + default: + return -1; + } +@@ -1271,3 +1336,47 @@ cxl_general_media_fail: + else + return -1; + } ++ ++int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_dram_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_DRAM_EVENT, ev); ++ if (rc < 0) ++ goto cxl_dram_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_dram_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 
1) ++ goto cxl_dram_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL DRAM Event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_dram_fail; ++ ++ done = 1; ++ ++cxl_dram_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index d9ec7df..1ad00e0 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -45,6 +45,7 @@ int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_ev + int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); ++int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); + + #else + +@@ -62,6 +63,7 @@ static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras + static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; ++static inline int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; + + #endif + diff --git a/a247baf7110ab6427259eb1421a103e2021a8735.patch b/a247baf7110ab6427259eb1421a103e2021a8735.patch new file mode 100644 index 0000000..eb615ec --- /dev/null +++ b/a247baf7110ab6427259eb1421a103e2021a8735.patch @@ -0,0 +1,424 @@ +commit a247baf7110ab6427259eb1421a103e2021a8735 +Author: Shiju Jose +Date: Fri Mar 17 13:07:01 2023 +0000 + + rasdaemon: Add support for the CXL AER correctable errors + + Add support to log and record the CXL AER correctable errors. 
+ + The corresponding Kernel patches are here: + https://lore.kernel.org/linux-cxl/166974401763.1608150.5424589924034481387.stgit@djiang5-desk3.ch.intel.com/T/#t + https://lore.kernel.org/linux-cxl/63e5ed38d77d9_138fbc2947a@iweiny-mobl.notmuch/T/#t + + Signed-off-by: Shiju Jose + Reviewed-by: Jonathan Cameron + Reviewed-by: Dave Jiang + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 0f2c9e4..8f6342d 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -220,6 +220,14 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + #define CXL_AER_UE_IDE_TX_ERR BIT(15) + #define CXL_AER_UE_IDE_RX_ERR BIT(16) + ++#define CXL_AER_CE_CACHE_DATA_ECC BIT(0) ++#define CXL_AER_CE_MEM_DATA_ECC BIT(1) ++#define CXL_AER_CE_CRC_THRESH BIT(2) ++#define CXL_AER_CE_RETRY_THRESH BIT(3) ++#define CXL_AER_CE_CACHE_POISON BIT(4) ++#define CXL_AER_CE_MEM_POISON BIT(5) ++#define CXL_AER_CE_PHYS_LAYER_ERR BIT(6) ++ + struct cxl_error_list { + uint32_t bit; + const char *error; +@@ -243,6 +251,16 @@ static const struct cxl_error_list cxl_aer_ue[] = { + { .bit = CXL_AER_UE_IDE_RX_ERR, .error = "IDE Rx Error" }, + }; + ++static const struct cxl_error_list cxl_aer_ce[] = { ++ { .bit = CXL_AER_CE_CACHE_DATA_ECC, .error = "Cache Data ECC Error" }, ++ { .bit = CXL_AER_CE_MEM_DATA_ECC, .error = "Memory Data ECC Error" }, ++ { .bit = CXL_AER_CE_CRC_THRESH, .error = "CRC Threshold Hit" }, ++ { .bit = CXL_AER_CE_RETRY_THRESH, .error = "Retry Threshold" }, ++ { .bit = CXL_AER_CE_CACHE_POISON, .error = "Received Cache Poison From Peer" }, ++ { .bit = CXL_AER_CE_MEM_POISON, .error = "Received Memory Poison From Peer" }, ++ { .bit = CXL_AER_CE_PHYS_LAYER_ERR, .error = "Received Error From Physical Layer" }, ++}; ++ + static int decode_cxl_error_status(struct trace_seq *s, uint32_t status, + const struct cxl_error_list *cxl_error_list, + uint8_t num_elems) +@@ -351,3 +369,66 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s, + + return 0; + } ++ 
++int ras_cxl_aer_ce_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ time_t now; ++ struct tm *tm; ++ struct ras_events *ras = context; ++ struct ras_cxl_aer_ce_event ev; ++ ++ now = record->ts / user_hz + ras->uptime_diff; ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ else ++ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) ++ return -1; ++ ++ ev.memdev = tep_get_field_raw(s, event, "memdev", ++ record, &len, 1); ++ if (!ev.memdev) ++ return -1; ++ if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0) ++ return -1; ++ ++ ev.host = tep_get_field_raw(s, event, "host", ++ record, &len, 1); ++ if (!ev.host) ++ return -1; ++ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) ++ return -1; ++ ev.serial = val; ++ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "status", record, &val, 1) < 0) ++ return -1; ++ ev.error_status = val; ++ if (trace_seq_printf(s, "error status:") <= 0) ++ return -1; ++ if (decode_cxl_error_status(s, ev.error_status, ++ cxl_aer_ce, ARRAY_SIZE(cxl_aer_ce)) < 0) ++ return -1; ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_aer_ce_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_aer_ce_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 35efadd..711daf4 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -25,4 +25,8 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + int ras_cxl_aer_ue_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void 
*context); ++ ++int ras_cxl_aer_ce_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index 5d73df1..2662467 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -247,6 +247,7 @@ int toggle_ras_mc_event(int enable) + #ifdef HAVE_CXL + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); + #endif + + free_ras: +@@ -1001,6 +1002,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_aer_uncorrectable_error"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_aer_correctable_error", ++ ras_cxl_aer_ce_event_handler, NULL, CXL_AER_CE_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_aer_correctable_error"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 4acbe57..a9d67c2 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -41,6 +41,7 @@ enum { + MF_EVENT, + CXL_POISON_EVENT, + CXL_AER_UE_EVENT, ++ CXL_AER_CE_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index 97a2a37..86133c4 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -673,6 +673,53 @@ int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_eve + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_aer_correctable_error ++ */ ++static const struct db_fields cxl_aer_ce_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "error_status", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor 
cxl_aer_ce_event_tab = { ++ .name = "cxl_aer_ce_event", ++ .fields = cxl_aer_ce_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_aer_ce_event_fields), ++}; ++ ++int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_aer_ce_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_aer_ce_event store: %p\n", priv->stmt_cxl_aer_ce_event); ++ ++ sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 2, ev->memdev, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 3, ev->host, -1, NULL); ++ sqlite3_bind_int64(priv->stmt_cxl_aer_ce_event, 4, ev->serial); ++ sqlite3_bind_int(priv->stmt_cxl_aer_ce_event, 5, ev->error_status); ++ ++ rc = sqlite3_step(priv->stmt_cxl_aer_ce_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do cxl_aer_ce_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_aer_ce_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset cxl_aer_ce_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1032,6 +1079,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_aer_ce_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_aer_ce_event, ++ &cxl_aer_ce_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1169,6 +1224,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_aer_ue_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_aer_ce_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_aer_ce_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ 
"cpu %u: Failed to finalize cxl_aer_ce_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index f11985f..ab7153d 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -144,6 +144,14 @@ struct ras_cxl_aer_ue_event { + uint32_t *header_log; + }; + ++struct ras_cxl_aer_ce_event { ++ char timestamp[64]; ++ const char *memdev; ++ const char *host; ++ uint64_t serial; ++ uint32_t error_status; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -155,6 +163,7 @@ struct diskerror_event; + struct ras_mf_event; + struct ras_cxl_poison_event; + struct ras_cxl_aer_ue_event; ++struct ras_cxl_aer_ce_event; + + #ifdef HAVE_SQLITE3 + +@@ -190,6 +199,7 @@ struct sqlite3_priv { + #ifdef HAVE_CXL + sqlite3_stmt *stmt_cxl_poison_event; + sqlite3_stmt *stmt_cxl_aer_ue_event; ++ sqlite3_stmt *stmt_cxl_aer_ce_event; + #endif + }; + +@@ -220,6 +230,7 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev + int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); ++int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -235,6 +246,7 @@ static inline int ras_store_diskerror_event(struct ras_events *ras, struct diske + static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; + static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; ++static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, 
struct ras_cxl_aer_ce_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 2ebdc80..63b47f5 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -397,6 +397,30 @@ static int set_cxl_aer_ue_event_backtrace(char *buf, struct ras_cxl_aer_ue_event + return 0; + } + ++static int set_cxl_aer_ce_event_backtrace(char *buf, struct ras_cxl_aer_ce_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "error_status=%u\n", \ ++ ev->timestamp, \ ++ ev->memdev, \ ++ ev->host, \ ++ ev->serial, \ ++ ev->error_status); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -440,6 +464,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_AER_UE_EVENT: + rc = set_cxl_aer_ue_event_backtrace(buf, (struct ras_cxl_aer_ue_event *)ev); + break; ++ case CXL_AER_CE_EVENT: ++ rc = set_cxl_aer_ce_event_backtrace(buf, (struct ras_cxl_aer_ce_event *)ev); ++ break; + default: + return -1; + } +@@ -936,3 +963,47 @@ cxl_aer_ue_fail: + else + return -1; + } ++ ++int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_aer_ce_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_AER_CE_EVENT, ev); ++ if (rc < 0) ++ goto cxl_aer_ce_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl-aer-correctable-error"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_aer_ce_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL AER correctable error"); ++ rc = 
write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_aer_ce_fail; ++ ++ done = 1; ++ ++cxl_aer_ce_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index dfe89d1..46155ee 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -41,6 +41,7 @@ int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *e + int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); ++int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + + #else + +@@ -54,6 +55,7 @@ static inline int ras_report_diskerror_event(struct ras_events *ras, struct disk + static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; + static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; ++static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + + #endif + diff --git a/a7524917befe7e67c02253cc27cb0c724e5992c0.patch b/a7524917befe7e67c02253cc27cb0c724e5992c0.patch new file mode 100644 index 0000000..b5625d0 --- /dev/null +++ b/a7524917befe7e67c02253cc27cb0c724e5992c0.patch @@ -0,0 +1,503 @@ +commit a7524917befe7e67c02253cc27cb0c724e5992c0 +Author: Shiju Jose +Date: Fri Mar 17 12:51:02 2023 +0000 + + rasdaemon: Add support for the CXL AER uncorrectable errors + + Add support to log and record the CXL AER uncorrectable errors. 
+ + The corresponding Kernel patches are here: + https://lore.kernel.org/linux-cxl/166974401763.1608150.5424589924034481387.stgit@djiang5-desk3.ch.intel.com/T/#t + https://lore.kernel.org/lkml/63eeb2a8c9e3f_32d612941f@dwillia2-xfh.jf.intel.com.notmuch/T/ + + It was found that the header log data needs to be converted to the + big-endian format to be stored correctly in the SQLite DB, likely + because the SQLite database seems to use big-endian storage. + + Signed-off-by: Shiju Jose + Reviewed-by: Jonathan Cameron + Reviewed-by: Dave Jiang # + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index cb23ba2..0f2c9e4 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -21,6 +21,7 @@ + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-report.h" ++#include + + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) +@@ -200,3 +201,153 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + + return 0; + } ++ ++/* CXL AER Errors */ ++ ++#define CXL_AER_UE_CACHE_DATA_PARITY BIT(0) ++#define CXL_AER_UE_CACHE_ADDR_PARITY BIT(1) ++#define CXL_AER_UE_CACHE_BE_PARITY BIT(2) ++#define CXL_AER_UE_CACHE_DATA_ECC BIT(3) ++#define CXL_AER_UE_MEM_DATA_PARITY BIT(4) ++#define CXL_AER_UE_MEM_ADDR_PARITY BIT(5) ++#define CXL_AER_UE_MEM_BE_PARITY BIT(6) ++#define CXL_AER_UE_MEM_DATA_ECC BIT(7) ++#define CXL_AER_UE_REINIT_THRESH BIT(8) ++#define CXL_AER_UE_RSVD_ENCODE BIT(9) ++#define CXL_AER_UE_POISON BIT(10) ++#define CXL_AER_UE_RECV_OVERFLOW BIT(11) ++#define CXL_AER_UE_INTERNAL_ERR BIT(14) ++#define CXL_AER_UE_IDE_TX_ERR BIT(15) ++#define CXL_AER_UE_IDE_RX_ERR BIT(16) ++ ++struct cxl_error_list { ++ uint32_t bit; ++ const char *error; ++}; ++ ++static const struct cxl_error_list cxl_aer_ue[] = { ++ { .bit = CXL_AER_UE_CACHE_DATA_PARITY, .error = "Cache Data Parity Error" }, ++ { .bit = CXL_AER_UE_CACHE_ADDR_PARITY, .error = "Cache Address Parity Error" }, ++ { .bit = CXL_AER_UE_CACHE_BE_PARITY, .error =
"Cache Byte Enable Parity Error" }, ++ { .bit = CXL_AER_UE_CACHE_DATA_ECC, .error = "Cache Data ECC Error" }, ++ { .bit = CXL_AER_UE_MEM_DATA_PARITY, .error = "Memory Data Parity Error" }, ++ { .bit = CXL_AER_UE_MEM_ADDR_PARITY, .error = "Memory Address Parity Error" }, ++ { .bit = CXL_AER_UE_MEM_BE_PARITY, .error = "Memory Byte Enable Parity Error" }, ++ { .bit = CXL_AER_UE_MEM_DATA_ECC, .error = "Memory Data ECC Error" }, ++ { .bit = CXL_AER_UE_REINIT_THRESH, .error = "REINIT Threshold Hit" }, ++ { .bit = CXL_AER_UE_RSVD_ENCODE, .error = "Received Unrecognized Encoding" }, ++ { .bit = CXL_AER_UE_POISON, .error = "Received Poison From Peer" }, ++ { .bit = CXL_AER_UE_RECV_OVERFLOW, .error = "Receiver Overflow" }, ++ { .bit = CXL_AER_UE_INTERNAL_ERR, .error = "Component Specific Error" }, ++ { .bit = CXL_AER_UE_IDE_TX_ERR, .error = "IDE Tx Error" }, ++ { .bit = CXL_AER_UE_IDE_RX_ERR, .error = "IDE Rx Error" }, ++}; ++ ++static int decode_cxl_error_status(struct trace_seq *s, uint32_t status, ++ const struct cxl_error_list *cxl_error_list, ++ uint8_t num_elems) ++{ ++ int i; ++ ++ for (i = 0; i < num_elems; i++) { ++ if (status & cxl_error_list[i].bit) ++ if (trace_seq_printf(s, "\'%s\' ", cxl_error_list[i].error) <= 0) ++ return -1; ++ } ++ return 0; ++} ++ ++int ras_cxl_aer_ue_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len, i; ++ unsigned long long val; ++ time_t now; ++ struct tm *tm; ++ struct ras_events *ras = context; ++ struct ras_cxl_aer_ue_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ now = record->ts / user_hz + ras->uptime_diff; ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ else ++ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) ++ return -1; ++ ++ ev.memdev = tep_get_field_raw(s, event, "memdev", ++ record, &len, 1); ++ if 
(!ev.memdev) ++ return -1; ++ if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0) ++ return -1; ++ ++ ev.host = tep_get_field_raw(s, event, "host", ++ record, &len, 1); ++ if (!ev.host) ++ return -1; ++ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) ++ return -1; ++ ev.serial = val; ++ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "status", record, &val, 1) < 0) ++ return -1; ++ ev.error_status = val; ++ ++ if (trace_seq_printf(s, "error status:") <= 0) ++ return -1; ++ if (decode_cxl_error_status(s, ev.error_status, ++ cxl_aer_ue, ARRAY_SIZE(cxl_aer_ue)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "first_error", record, &val, 1) < 0) ++ return -1; ++ ev.first_error = val; ++ ++ if (trace_seq_printf(s, "first error:") <= 0) ++ return -1; ++ if (decode_cxl_error_status(s, ev.first_error, ++ cxl_aer_ue, ARRAY_SIZE(cxl_aer_ue)) < 0) ++ return -1; ++ ++ ev.header_log = tep_get_field_raw(s, event, "header_log", ++ record, &len, 1); ++ if (!ev.header_log) ++ return -1; ++ if (trace_seq_printf(s, "header log:\n") <= 0) ++ return -1; ++ for (i = 0; i < CXL_HEADERLOG_SIZE_U32; i++) { ++ if (trace_seq_printf(s, "%08x ", ev.header_log[i]) <= 0) ++ break; ++ if ((i > 0) && ((i % 20) == 0)) ++ if (trace_seq_printf(s, "\n") <= 0) ++ break; ++ /* Convert header log data to the big-endian format because ++ * the SQLite database seems uses the big-endian storage. 
++ */ ++ ev.header_log[i] = htobe32(ev.header_log[i]); ++ } ++ if (i < CXL_HEADERLOG_SIZE_U32) ++ return -1; ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_aer_ue_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_aer_ue_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 84d5cc6..35efadd 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -21,4 +21,8 @@ + int ras_cxl_poison_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++ ++int ras_cxl_aer_ue_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index f95844a..5d73df1 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -246,6 +246,7 @@ int toggle_ras_mc_event(int enable) + + #ifdef HAVE_CXL + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable); + #endif + + free_ras: +@@ -992,6 +993,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_poison"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_aer_uncorrectable_error", ++ ras_cxl_aer_ue_event_handler, NULL, CXL_AER_UE_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_aer_uncorrectable_error"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 1ef3ecd..4acbe57 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -40,6 +40,7 @@ enum { + DISKERROR_EVENT, + MF_EVENT, + CXL_POISON_EVENT, ++ CXL_AER_UE_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index c31baa0..97a2a37 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -622,6 +622,57 @@ int ras_store_cxl_poison_event(struct 
ras_events *ras, struct ras_cxl_poison_eve + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_aer_uncorrectable_error ++ */ ++static const struct db_fields cxl_aer_ue_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "error_status", .type = "INTEGER" }, ++ { .name = "first_error", .type = "INTEGER" }, ++ { .name = "header_log", .type = "BLOB" }, ++}; ++ ++static const struct db_table_descriptor cxl_aer_ue_event_tab = { ++ .name = "cxl_aer_ue_event", ++ .fields = cxl_aer_ue_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_aer_ue_event_fields), ++}; ++ ++int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_aer_ue_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_aer_ue_event store: %p\n", priv->stmt_cxl_aer_ue_event); ++ ++ sqlite3_bind_text(priv->stmt_cxl_aer_ue_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_aer_ue_event, 2, ev->memdev, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_aer_ue_event, 3, ev->host, -1, NULL); ++ sqlite3_bind_int64(priv->stmt_cxl_aer_ue_event, 4, ev->serial); ++ sqlite3_bind_int(priv->stmt_cxl_aer_ue_event, 5, ev->error_status); ++ sqlite3_bind_int(priv->stmt_cxl_aer_ue_event, 6, ev->first_error); ++ sqlite3_bind_blob(priv->stmt_cxl_aer_ue_event, 7, ev->header_log, CXL_HEADERLOG_SIZE, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_aer_ue_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do cxl_aer_ue_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_aer_ue_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset cxl_aer_ue_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, 
LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -973,6 +1024,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_aer_ue_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_aer_ue_event, ++ &cxl_aer_ue_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1102,6 +1161,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_poison_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_aer_ue_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_aer_ue_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_aer_ue_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index fd15215..f11985f 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -130,6 +130,20 @@ struct ras_cxl_poison_event { + char overflow_ts[64]; + }; + ++#define SZ_512 0x200 ++#define CXL_HEADERLOG_SIZE SZ_512 ++#define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) ++ ++struct ras_cxl_aer_ue_event { ++ char timestamp[64]; ++ const char *memdev; ++ const char *host; ++ uint64_t serial; ++ uint32_t error_status; ++ uint32_t first_error; ++ uint32_t *header_log; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -140,6 +154,7 @@ struct devlink_event; + struct diskerror_event; + struct ras_mf_event; + struct ras_cxl_poison_event; ++struct ras_cxl_aer_ue_event; + + #ifdef HAVE_SQLITE3 + +@@ -174,6 +189,7 @@ struct sqlite3_priv { + #endif + #ifdef HAVE_CXL + sqlite3_stmt *stmt_cxl_poison_event; ++ sqlite3_stmt *stmt_cxl_aer_ue_event; + #endif + }; + +@@ -203,6 +219,7 @@ int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev); + int ras_store_diskerror_event(struct ras_events 
*ras, struct diskerror_event *ev); + int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); ++int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -217,6 +234,7 @@ static inline int ras_store_devlink_event(struct ras_events *ras, struct devlink + static inline int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; }; + static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; + static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; ++static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 3daecc0..2ebdc80 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -371,6 +371,32 @@ static int set_cxl_poison_event_backtrace(char *buf, struct ras_cxl_poison_event + return 0; + } + ++static int set_cxl_aer_ue_event_backtrace(char *buf, struct ras_cxl_aer_ue_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%llx\n" \ ++ "error_status=%u\n" \ ++ "first_error=%u\n", \ ++ ev->timestamp, \ ++ ev->memdev, \ ++ ev->host, \ ++ (unsigned long long)ev->serial, \ ++ ev->error_status, \ ++ ev->first_error); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -411,6 +437,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_POISON_EVENT: + rc = set_cxl_poison_event_backtrace(buf, (struct
ras_cxl_poison_event *)ev); + break; ++ case CXL_AER_UE_EVENT: ++ rc = set_cxl_aer_ue_event_backtrace(buf, (struct ras_cxl_aer_ue_event *)ev); ++ break; + default: + return -1; + } +@@ -863,3 +892,47 @@ cxl_poison_fail: + else + return -1; + } ++ ++int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_aer_ue_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_AER_UE_EVENT, ev); ++ if (rc < 0) ++ goto cxl_aer_ue_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl-aer-uncorrectable-error"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < 0 || (size_t)rc < strlen(buf) + 1) ++ goto cxl_aer_ue_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL AER uncorrectable error"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < 0 || (size_t)rc < strlen(buf) + 1) ++ goto cxl_aer_ue_fail; ++ ++ done = 1; ++ ++cxl_aer_ue_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index d1591ce..dfe89d1 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -40,6 +40,7 @@ int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev); + int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev); + int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); ++int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + + #else + +@@ -52,6 +53,7 @@ static inline int ras_report_devlink_event(struct ras_events *ras, struct devlin + static inline int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; }; + static inline int
ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; + static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; ++static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + + #endif + diff --git a/ae1647624486fca0070b297d0e2fd4e53443c10b.patch b/ae1647624486fca0070b297d0e2fd4e53443c10b.patch new file mode 100644 index 0000000..7d5cb0b --- /dev/null +++ b/ae1647624486fca0070b297d0e2fd4e53443c10b.patch @@ -0,0 +1,116 @@ +commit 81b362f0412eb9769098c2f4317b84b9bd82cce9 +Author: Shiju Jose +Date: Mon Feb 12 10:35:25 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL AER correctable trace events + + Add support for CXL AER correctable events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit ae1647624486fca0070b297d0e2fd4e53443c10b) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index c0a2ec6..9519279 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1230,6 +1230,46 @@ sub get_cxl_ue_error_status_text + return join (", ", @out); + } + ++use constant { ++ CXL_AER_CE_CACHE_DATA_ECC => 0x0001, ++ CXL_AER_CE_MEM_DATA_ECC => 0x0002, ++ CXL_AER_CE_CRC_THRESH => 0x0004, ++ CXL_AER_CE_RETRY_THRESH => 0x0008, ++ CXL_AER_CE_CACHE_POISON => 0x0010, ++ CXL_AER_CE_MEM_POISON => 0x0020, ++ CXL_AER_CE_PHYS_LAYER_ERR => 0x0040, ++}; ++ ++sub get_cxl_ce_error_status_text ++{ ++ my $error_status = $_[0]; ++ my @out; ++ ++ if ($error_status & CXL_AER_CE_CACHE_DATA_ECC) { ++ push @out, (sprintf "\'Cache Data ECC Error\' "); ++ } ++ if ($error_status & CXL_AER_CE_MEM_DATA_ECC) { ++ push @out, (sprintf "\'Memory Data ECC Error\' "); ++ } ++ if ($error_status & CXL_AER_CE_CRC_THRESH) { ++ push @out, (sprintf "\'CRC Threshold Hit\' "); ++ } ++ if ($error_status & CXL_AER_CE_RETRY_THRESH) { ++ push @out, (sprintf "\'Retry Threshold\' "); ++ } 
++ if ($error_status & CXL_AER_CE_CACHE_POISON) { ++ push @out, (sprintf "\'Received Cache Poison From Peer\' "); ++ } ++ if ($error_status & CXL_AER_CE_MEM_POISON) { ++ push @out, (sprintf "\'Received Memory Poison From Peer\' "); ++ } ++ if ($error_status & CXL_AER_CE_PHYS_LAYER_ERR) { ++ push @out, (sprintf "\'Received Error From Physical Layer\' "); ++ } ++ ++ return join (", ", @out); ++} ++ + sub summary + { + require DBI; +@@ -1310,6 +1350,22 @@ sub summary + print "No CXL AER uncorrectable errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL AER correctable errors ++ $query = "select memdev, count(*) from cxl_aer_ce_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL AER correctable events summary:\n$out\n"; ++ } else { ++ print "No CXL AER correctable errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1519,6 +1575,29 @@ sub errors + print "No CXL AER uncorrectable errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL AER correctable errors ++ $query = "select id, timestamp, memdev, host, serial, error_status from cxl_aer_ce_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $error_status)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ if (defined $error_status && length $error_status) { ++ $out .= sprintf "error_status: %s, ", get_cxl_ce_error_status_text($error_status); ++ } ++ $out .= "\n"; ++ } ++ if 
($out ne "") { ++ print "CXL AER correctable events:\n$out\n"; ++ } else { ++ print "No CXL AER correctable errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # Extlog errors diff --git a/aee13f74266382c64128bd7367a5eeb46277f490.patch b/aee13f74266382c64128bd7367a5eeb46277f490.patch new file mode 100644 index 0000000..2f330fa --- /dev/null +++ b/aee13f74266382c64128bd7367a5eeb46277f490.patch @@ -0,0 +1,161 @@ +commit b2e5a6821fae4278cc37803a223a5a64bf50c8cc +Author: Shiju Jose +Date: Mon Feb 12 11:29:13 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL memory module trace events + + Add support for CXL memory module events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit aee13f74266382c64128bd7367a5eeb46277f490) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 5e45889..5e120d9 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1376,6 +1376,70 @@ sub get_cxl_transaction_type + return $types[$_[0]]; + } + ++sub get_cxl_dev_event_type ++{ ++ my @types; ++ ++ if ($_[0] < 0 || $_[0] > 5) { ++ return "unknown-type"; ++ } ++ ++ @types = ("Health Status Change", ++ "Media Status Change", ++ "Life Used Change", ++ "Temperature Change", ++ "Data Path Error", ++ "LSA Error"); ++ ++ return $types[$_[0]]; ++} ++ ++use constant { ++ CXL_DHI_HS_MAINTENANCE_NEEDED => 0x0001, ++ CXL_DHI_HS_PERFORMANCE_DEGRADED => 0x0002, ++ CXL_DHI_HS_HW_REPLACEMENT_NEEDED => 0x0004, ++}; ++ ++sub get_cxl_health_status_text ++{ ++ my $flags = $_[0]; ++ my @out; ++ ++ if ($flags & CXL_DHI_HS_MAINTENANCE_NEEDED) { ++ push @out, (sprintf "\'MAINTENANCE_NEEDED\' "); ++ } ++ if ($flags & CXL_DHI_HS_PERFORMANCE_DEGRADED) { ++ push @out, (sprintf "\'PERFORMANCE_DEGRADED\' "); ++ } ++ if ($flags & CXL_DHI_HS_HW_REPLACEMENT_NEEDED) { ++ push @out, (sprintf "\'REPLACEMENT_NEEDED\' "); ++ } ++ ++ return join (", ", @out); ++} ++ ++sub get_cxl_media_status ++{ ++ my @types; ++ ++ if ($_[0] < 0 || $_[0] 
> 9) { ++ return "unknown"; ++ } ++ ++ @types = ("Normal", ++ "Not Ready", ++ "Write Persistency Lost", ++ "All Data Lost", ++ "Write Persistency Loss in the Event of Power Loss", ++ "Write Persistency Loss in Event of Shutdown", ++ "Write Persistency Loss Imminent", ++ "All Data Loss in Event of Power Loss", ++ "All Data loss in the Event of Shutdown", ++ "All Data Loss Imminent"); ++ ++ return $types[$_[0]]; ++} ++ + sub summary + { + require DBI; +@@ -1552,6 +1616,22 @@ sub summary + print "No CXL DRAM errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL memory module errors ++ $query = "select memdev, count(*) from cxl_memory_module_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL memory module events summary:\n$out\n"; ++ } else { ++ print "No CXL memory module errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1665,6 +1745,7 @@ sub errors + my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data); + my ($dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id); + my ($nibble_mask, $bank_group, $row, $column, $cor_mask); ++ my ($event_type, $health_status, $media_status, $life_used, $dirty_shutdown_cnt, $cor_vol_err_cnt, $cor_per_err_cnt, $device_temp, $add_status); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1966,6 +2047,42 @@ sub errors + } else { + print "No CXL DRAM errors.\n\n"; + } ++ ++ # CXL memory module errors ++ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, event_type, health_status, media_status, life_used, dirty_shutdown_cnt, cor_vol_err_cnt, 
cor_per_err_cnt, device_temp, add_status from cxl_memory_module_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $event_type, $health_status, $media_status, $life_used, $dirty_shutdown_cnt, $cor_vol_err_cnt, $cor_per_err_cnt, $device_temp, $add_status)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "log=$log_type, " if (defined $log_type && length $log_type); ++ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid); ++ $out .= sprintf "hdr_flags=0x%llx, %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags); ++ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle); ++ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle); ++ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts); ++ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length); ++ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class); ++ $out .= sprintf "event_type: %s, ", get_cxl_dev_event_type($event_type) if (defined $event_type && length $event_type); ++ $out .= sprintf "health_status: %s, ", get_cxl_health_status_text($health_status) if (defined $health_status && length $health_status); ++ $out .= sprintf "media_status: %s, ", get_cxl_media_status($media_status) if (defined $media_status && 
length $media_status); ++ $out .= sprintf "life_used=%u, ", $life_used if (defined $life_used && length $life_used); ++ $out .= sprintf "dirty_shutdown_cnt=%u, ", $dirty_shutdown_cnt if (defined $dirty_shutdown_cnt && length $dirty_shutdown_cnt); ++ $out .= sprintf "cor_vol_err_cnt=%u, ", $cor_vol_err_cnt if (defined $cor_vol_err_cnt && length $cor_vol_err_cnt); ++ $out .= sprintf "cor_per_err_cnt=%u, ", $cor_per_err_cnt if (defined $cor_per_err_cnt && length $cor_per_err_cnt); ++ $out .= sprintf "device_temp=%u, ", $device_temp if (defined $device_temp && length $device_temp); ++ $out .= sprintf "add_status=%u ", $add_status if (defined $add_status && length $add_status); ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL memory module events:\n$out\n"; ++ } else { ++ print "No CXL memory module errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/b22cb067755f4604770f9864a0babed8f93a1553.patch b/b22cb067755f4604770f9864a0babed8f93a1553.patch new file mode 100644 index 0000000..2f7da9e --- /dev/null +++ b/b22cb067755f4604770f9864a0babed8f93a1553.patch @@ -0,0 +1,75 @@ +commit 25ef3044f38224d653d880fb9f20be9e7c9bf570 +Author: Shiju Jose +Date: Mon Feb 12 10:38:51 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL overflow trace events + + Add support for CXL overflow events to the ras-mc-ctl tool. 
+ + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit b22cb067755f4604770f9864a0babed8f93a1553) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 9519279..6a319a7 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1366,6 +1366,22 @@ sub summary + print "No CXL AER correctable errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL overflow errors ++ $query = "select memdev, count(*) from cxl_overflow_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL overflow events summary:\n$out\n"; ++ } else { ++ print "No CXL overflow errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1474,6 +1490,7 @@ sub errors + my ($error_count, $affinity, $mpidr, $r_state, $psci_state); + my ($pfn, $page_type, $action_result); + my ($memdev, $host, $serial, $error_status, $first_error, $header_log); ++ my ($log_type, $first_ts, $last_ts); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1598,6 +1615,27 @@ sub errors + print "No CXL AER correctable errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL overflow errors ++ $query = "select id, timestamp, memdev, host, serial, log_type, count, first_ts, last_ts from cxl_overflow_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $count, $first_ts, $last_ts)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length 
$serial); ++ $out .= "log=$log_type, " if (defined $log_type && length $log_type); ++ $out .= sprintf "%u records from $first_ts to $last_ts", $count if (defined $count && length $count); ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL overflow events:\n$out\n"; ++ } else { ++ print "No CXL overflow errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/c38c14afc5d7bb6c8c52d1023271d755deb23008.patch b/c38c14afc5d7bb6c8c52d1023271d755deb23008.patch new file mode 100644 index 0000000..2970075 --- /dev/null +++ b/c38c14afc5d7bb6c8c52d1023271d755deb23008.patch @@ -0,0 +1,101 @@ +commit 703e0f8eabbe1e191a8bd85632066c155ec1f4fa +Author: Shiju Jose +Date: Mon Feb 12 11:22:03 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL DRAM trace events + + Add support for CXL DRAM events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit c38c14afc5d7bb6c8c52d1023271d755deb23008) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 99b3c10..5e45889 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1536,6 +1536,22 @@ sub summary + print "No CXL general media errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL DRAM errors ++ $query = "select memdev, count(*) from cxl_dram_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL DRAM events summary:\n$out\n"; ++ } else { ++ print "No CXL DRAM errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1648,6 +1664,7 @@ sub errors + my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts); + my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data); + my ($dpa_flags, $descriptor, 
$mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id); ++ my ($nibble_mask, $bank_group, $row, $column, $cor_mask); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1902,6 +1919,53 @@ sub errors + } else { + print "No CXL general media errors.\n\n"; + } ++ ++ # CXL DRAM errors ++ use constant CXL_EVENT_DER_CORRECTION_MASK_SIZE => 0x20; ++ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, dpa, dpa_flags, descriptor, type, transaction_type, channel, rank, nibble_mask, bank_group, bank, row, column, cor_mask from cxl_dram_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $dpa, $dpa_flags, $descriptor, $type, $transaction_type, $channel, $rank, $nibble_mask, $bank_group, $bank, $row, $column, $cor_mask)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "log=$log_type, " if (defined $log_type && length $log_type); ++ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid); ++ $out .= sprintf "hdr_flags=0x%llx, %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags); ++ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle); ++ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle); ++ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length 
$hdr_ts); ++ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length); ++ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class); ++ $out .= sprintf "dpa=0x%llx, ", $dpa if (defined $dpa && length $dpa); ++ $out .= sprintf "dpa_flags: %s, ", get_cxl_dpa_flags_text($dpa_flags) if (defined $dpa_flags && length $dpa_flags); ++ $out .= sprintf "descriptor_flags: %s, ", get_cxl_descriptor_flags_text($descriptor) if (defined $descriptor && length $descriptor); ++ $out .= sprintf "memory event type: %s, ", get_cxl_mem_event_type($type) if (defined $type && length $type); ++ $out .= sprintf "transaction_type: %s, ", get_cxl_transaction_type($transaction_type) if (defined $transaction_type && length $transaction_type); ++ $out .= sprintf "channel=%u, ", $channel if (defined $channel && length $channel); ++ $out .= sprintf "rank=%u, ", $rank if (defined $rank && length $rank); ++ $out .= sprintf "nibble_mask=%u, ", $nibble_mask if (defined $nibble_mask && length $nibble_mask); ++ $out .= sprintf "bank_group=%u, ", $bank_group if (defined $bank_group && length $bank_group); ++ $out .= sprintf "bank=%u, ", $bank if (defined $bank && length $bank); ++ $out .= sprintf "row=%u, ", $row if (defined $row && length $row); ++ $out .= sprintf "column=%u, ", $column if (defined $column && length $column); ++ if (defined $cor_mask && length $cor_mask) { ++ $out .= sprintf "correction_mask:"; ++ my @bytes = unpack "C*", $cor_mask; ++ for (my $i = 0; $i < CXL_EVENT_DER_CORRECTION_MASK_SIZE; $i++) { ++ $out .= sprintf "%02x ", $bytes[$i]; ++ } ++ } ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL DRAM events:\n$out\n"; ++ } else { ++ print "No CXL DRAM errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/d3836aa061f677232f99c514247d3dbf80812a1b.patch b/d3836aa061f677232f99c514247d3dbf80812a1b.patch new file mode 100644 index 0000000..f85f264 --- /dev/null +++ 
b/d3836aa061f677232f99c514247d3dbf80812a1b.patch @@ -0,0 +1,42 @@ +commit d3836aa061f677232f99c514247d3dbf80812a1b +Author: Shiju Jose +Date: Mon Jan 16 17:13:32 2023 +0000 + + rasdaemon: Move definition for BIT and BIT_ULL to a common file + + Move definition for BIT() and BIT_ULL() to the + common file ras-record.h + + Signed-off-by: Shiju Jose + Reviewed-by: Jonathan Cameron + Reviewed-by: Dave Jiang + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h +index 4d9f938..c360eaf 100644 +--- a/ras-non-standard-handler.h ++++ b/ras-non-standard-handler.h +@@ -17,9 +17,6 @@ + #include "ras-events.h" + #include + +-#define BIT(nr) (1UL << (nr)) +-#define BIT_ULL(nr) (1ULL << (nr)) +- + struct ras_ns_ev_decoder { + struct ras_ns_ev_decoder *next; + const char *sec_type; +diff --git a/ras-record.h b/ras-record.h +index d9f7733..219f10b 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -25,6 +25,9 @@ + + #define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) + ++#define BIT(nr) (1UL << (nr)) ++#define BIT_ULL(nr) (1ULL << (nr)) ++ + extern long user_hz; + + struct ras_events; diff --git a/e0cde0edf073b939d345aeba0aed23e238dbc53b.patch b/e0cde0edf073b939d345aeba0aed23e238dbc53b.patch new file mode 100644 index 0000000..b26c6a1 --- /dev/null +++ b/e0cde0edf073b939d345aeba0aed23e238dbc53b.patch @@ -0,0 +1,575 @@ +commit e0cde0edf073b939d345aeba0aed23e238dbc53b +Author: Shiju Jose +Date: Tue Apr 4 18:49:09 2023 +0100 + + rasdaemon: Add support for the CXL generic events + + Add support to log and record the CXL generic events. 
+ + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index d4c845e..83ada56 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -56,6 +56,49 @@ static void get_timestamp(struct trace_seq *s, struct tep_record *record, + strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", size); + } + ++struct cxl_event_flags { ++ uint32_t bit; ++ const char *flag; ++}; ++ ++static int decode_cxl_event_flags(struct trace_seq *s, uint32_t flags, ++ const struct cxl_event_flags *cxl_ev_flags, ++ uint8_t num_elems) ++{ ++ int i; ++ ++ for (i = 0; i < num_elems; i++) { ++ if (flags & cxl_ev_flags[i].bit) ++ if (trace_seq_printf(s, "\'%s\' ", cxl_ev_flags[i].flag) <= 0) ++ return -1; ++ } ++ return 0; ++} ++ ++static char *uuid_be(const char *uu) ++{ ++ static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; ++ char *p = uuid; ++ int i; ++ static const unsigned char be[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; ++ ++ for (i = 0; i < 16; i++) { ++ p += sprintf(p, "%.2x", (unsigned char) uu[be[i]]); ++ switch (i) { ++ case 3: ++ case 5: ++ case 7: ++ case 9: ++ *p++ = '-'; ++ break; ++ } ++ } ++ ++ *p = 0; ++ ++ return uuid; ++} ++ + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) + #define CXL_POISON_FLAG_OVERFLOW BIT(1) +@@ -524,3 +567,145 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s, + + return 0; + } ++ ++/* ++ * Common Event Record Format ++ * CXL 3.0 section 8.2.9.2.1; Table 8-42 ++ */ ++#define CXL_EVENT_RECORD_FLAG_PERMANENT BIT(2) ++#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED BIT(3) ++#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED BIT(4) ++#define CXL_EVENT_RECORD_FLAG_HW_REPLACE BIT(5) ++ ++static const struct cxl_event_flags cxl_hdr_flags[] = { ++ { .bit = CXL_EVENT_RECORD_FLAG_PERMANENT, .flag = "PERMANENT_CONDITION" }, ++ { .bit = CXL_EVENT_RECORD_FLAG_MAINT_NEEDED, .flag = "MAINTENANCE_NEEDED" }, ++ { .bit = 
CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, .flag = "PERFORMANCE_DEGRADED" }, ++ { .bit = CXL_EVENT_RECORD_FLAG_HW_REPLACE, .flag = "HARDWARE_REPLACEMENT_NEEDED" }, ++}; ++ ++static int handle_ras_cxl_common_hdr(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context, ++ struct ras_cxl_event_common_hdr *hdr) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ ++ get_timestamp(s, record, ras, (char *)&hdr->timestamp, sizeof(hdr->timestamp)); ++ if (trace_seq_printf(s, "%s ", hdr->timestamp) <= 0) ++ return -1; ++ ++ hdr->memdev = tep_get_field_raw(s, event, "memdev", record, &len, 1); ++ if (!hdr->memdev) ++ return -1; ++ if (trace_seq_printf(s, "memdev:%s ", hdr->memdev) <= 0) ++ return -1; ++ ++ hdr->host = tep_get_field_raw(s, event, "host", record, &len, 1); ++ if (!hdr->host) ++ return -1; ++ if (trace_seq_printf(s, "host:%s ", hdr->host) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) ++ return -1; ++ hdr->serial = val; ++ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)hdr->serial) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "log", record, &val, 1) < 0) ++ return -1; ++ hdr->log_type = cxl_event_log_type_str(val); ++ if (trace_seq_printf(s, "log type:%s ", hdr->log_type) <= 0) ++ return -1; ++ ++ hdr->hdr_uuid = tep_get_field_raw(s, event, "hdr_uuid", record, &len, 1); ++ if (!hdr->hdr_uuid) ++ return -1; ++ hdr->hdr_uuid = uuid_be(hdr->hdr_uuid); ++ if (trace_seq_printf(s, "hdr_uuid:%s ", hdr->hdr_uuid) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_flags", record, &val, 1) < 0) ++ return -1; ++ hdr->hdr_flags = val; ++ if (decode_cxl_event_flags(s, hdr->hdr_flags, cxl_hdr_flags, ++ ARRAY_SIZE(cxl_hdr_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_handle", record, &val, 1) < 0) ++ return -1; ++ hdr->hdr_handle = val; ++ if (trace_seq_printf(s, "hdr_handle:0x%x ", hdr->hdr_handle) <= 0) ++ 
return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_related_handle", record, &val, 1) < 0) ++ return -1; ++ hdr->hdr_related_handle = val; ++ if (trace_seq_printf(s, "hdr_related_handle:0x%x ", hdr->hdr_related_handle) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_timestamp", record, &val, 1) < 0) ++ return -1; ++ convert_timestamp(val, hdr->hdr_timestamp, sizeof(hdr->hdr_timestamp)); ++ if (trace_seq_printf(s, "hdr_timestamp:%s ", hdr->hdr_timestamp) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_length", record, &val, 1) < 0) ++ return -1; ++ hdr->hdr_length = val; ++ if (trace_seq_printf(s, "hdr_length:%u ", hdr->hdr_length) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_maint_op_class", record, &val, 1) < 0) ++ return -1; ++ hdr->hdr_maint_op_class = val; ++ if (trace_seq_printf(s, "hdr_maint_op_class:%u ", hdr->hdr_maint_op_class) <= 0) ++ return -1; ++ ++ return 0; ++} ++ ++int ras_cxl_generic_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len, i; ++ struct ras_events *ras = context; ++ struct ras_cxl_generic_event ev; ++ const uint8_t *buf; ++ ++ memset(&ev, 0, sizeof(ev)); ++ if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) ++ return -1; ++ ++ ev.data = tep_get_field_raw(s, event, "data", record, &len, 1); ++ if (!ev.data) ++ return -1; ++ i = 0; ++ buf = ev.data; ++ if (trace_seq_printf(s, "\ndata:\n %08x: ", i) <= 0) ++ return -1; ++ for (i = 0; i < CXL_EVENT_RECORD_DATA_LENGTH; i += 4) { ++ if ((i > 0) && ((i % 16) == 0)) ++ if (trace_seq_printf(s, "\n %08x: ", i) <= 0) ++ break; ++ if (trace_seq_printf(s, "%02x%02x%02x%02x ", ++ buf[i], buf[i+1], buf[i+2], buf[i+3]) <= 0) ++ break; ++ } ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_generic_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_generic_event(ras, &ev); ++#endif ++ ++ 
return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index e7847ec..9f77cb7 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -32,4 +32,7 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, + int ras_cxl_overflow_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int ras_cxl_generic_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index f2a869a..4036933 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -249,6 +249,7 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); + #endif + + free_ras: +@@ -1054,6 +1055,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_overflow"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_generic_event", ++ ras_cxl_generic_event_handler, NULL, CXL_GENERIC_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_generic_event"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 7c869d9..96c299e 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -43,6 +43,7 @@ enum { + CXL_AER_UE_EVENT, + CXL_AER_CE_EVENT, + CXL_OVERFLOW_EVENT, ++ CXL_GENERIC_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index 7b808a5..a65d9c0 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -773,6 +773,79 @@ int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow + + return rc; + } ++ ++static int ras_store_cxl_common_hdr(sqlite3_stmt *stmt, struct 
ras_cxl_event_common_hdr *hdr) ++{ ++ if (!stmt || !hdr) ++ return 0; ++ ++ sqlite3_bind_text(stmt, 1, hdr->timestamp, -1, NULL); ++ sqlite3_bind_text(stmt, 2, hdr->memdev, -1, NULL); ++ sqlite3_bind_text(stmt, 3, hdr->host, -1, NULL); ++ sqlite3_bind_int64(stmt, 4, hdr->serial); ++ sqlite3_bind_text(stmt, 5, hdr->log_type, -1, NULL); ++ sqlite3_bind_text(stmt, 6, hdr->hdr_uuid, -1, NULL); ++ sqlite3_bind_int(stmt, 7, hdr->hdr_flags); ++ sqlite3_bind_int(stmt, 8, hdr->hdr_handle); ++ sqlite3_bind_int(stmt, 9, hdr->hdr_related_handle); ++ sqlite3_bind_text(stmt, 10, hdr->hdr_timestamp, -1, NULL); ++ sqlite3_bind_int(stmt, 11, hdr->hdr_length); ++ sqlite3_bind_int(stmt, 12, hdr->hdr_maint_op_class); ++ ++ return 0; ++} ++ ++/* ++ * Table and functions to handle cxl:cxl_generic_event ++ */ ++static const struct db_fields cxl_generic_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "hdr_uuid", .type = "TEXT" }, ++ { .name = "hdr_flags", .type = "INTEGER" }, ++ { .name = "hdr_handle", .type = "INTEGER" }, ++ { .name = "hdr_related_handle", .type = "INTEGER" }, ++ { .name = "hdr_ts", .type = "TEXT" }, ++ { .name = "hdr_length", .type = "INTEGER" }, ++ { .name = "hdr_maint_op_class", .type = "INTEGER" }, ++ { .name = "data", .type = "BLOB" }, ++}; ++ ++static const struct db_table_descriptor cxl_generic_event_tab = { ++ .name = "cxl_generic_event", ++ .fields = cxl_generic_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_generic_event_fields), ++}; ++ ++int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_generic_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_generic_event store: %p\n", 
priv->stmt_cxl_generic_event); ++ ++ ras_store_cxl_common_hdr(priv->stmt_cxl_generic_event, &ev->hdr); ++ sqlite3_bind_blob(priv->stmt_cxl_generic_event, 13, ev->data, ++ CXL_EVENT_RECORD_DATA_LENGTH, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_generic_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do stmt_cxl_generic_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_generic_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset stmt_cxl_generic_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1148,6 +1221,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_generic_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_generic_event, ++ &cxl_generic_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1301,6 +1382,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_overflow_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_generic_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_generic_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_generic_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index 90db6ad..9ecfcda 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -133,6 +133,7 @@ struct ras_cxl_poison_event { + #define SZ_512 0x200 + #define CXL_HEADERLOG_SIZE SZ_512 + #define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) ++#define CXL_EVENT_RECORD_DATA_LENGTH 0x50 + + struct ras_cxl_aer_ue_event { + char timestamp[64]; +@@ -163,6 +164,26 @@ struct ras_cxl_overflow_event { + uint16_t count; + }; + ++struct 
ras_cxl_event_common_hdr { ++ char timestamp[64]; ++ const char *memdev; ++ const char *host; ++ uint64_t serial; ++ const char *log_type; ++ const char *hdr_uuid; ++ uint32_t hdr_flags; ++ uint16_t hdr_handle; ++ uint16_t hdr_related_handle; ++ char hdr_timestamp[64]; ++ uint8_t hdr_length; ++ uint8_t hdr_maint_op_class; ++}; ++ ++struct ras_cxl_generic_event { ++ struct ras_cxl_event_common_hdr hdr; ++ uint8_t *data; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -176,6 +197,7 @@ struct ras_cxl_poison_event; + struct ras_cxl_aer_ue_event; + struct ras_cxl_aer_ce_event; + struct ras_cxl_overflow_event; ++struct ras_cxl_generic_event; + + #ifdef HAVE_SQLITE3 + +@@ -213,6 +235,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_aer_ue_event; + sqlite3_stmt *stmt_cxl_aer_ce_event; + sqlite3_stmt *stmt_cxl_overflow_event; ++ sqlite3_stmt *stmt_cxl_generic_event; + #endif + }; + +@@ -245,6 +268,7 @@ int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_eve + int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); ++int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -262,6 +286,7 @@ static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_ + static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; ++static inline int 
ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index dbed454..8d7b76a 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -451,6 +451,44 @@ static int set_cxl_overflow_event_backtrace(char *buf, struct ras_cxl_overflow_e + return 0; + } + ++static int set_cxl_generic_event_backtrace(char *buf, struct ras_cxl_generic_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "hdr_uuid=%s\n" \ ++ "hdr_flags=0x%x\n" \ ++ "hdr_handle=0x%x\n" \ ++ "hdr_related_handle=0x%x\n" \ ++ "hdr_timestamp=%s\n" \ ++ "hdr_length=%u\n" \ ++ "hdr_maint_op_class=%u\n", \ ++ ev->hdr.timestamp, \ ++ ev->hdr.memdev, \ ++ ev->hdr.host, \ ++ ev->hdr.serial, \ ++ ev->hdr.log_type, \ ++ ev->hdr.hdr_uuid, \ ++ ev->hdr.hdr_flags, \ ++ ev->hdr.hdr_handle, \ ++ ev->hdr.hdr_related_handle, \ ++ ev->hdr.hdr_timestamp, \ ++ ev->hdr.hdr_length, \ ++ ev->hdr.hdr_maint_op_class); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -500,6 +538,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_OVERFLOW_EVENT: + rc = set_cxl_overflow_event_backtrace(buf, (struct ras_cxl_overflow_event *)ev); + break; ++ case CXL_GENERIC_EVENT: ++ rc = set_cxl_generic_event_backtrace(buf, (struct ras_cxl_generic_event *)ev); ++ break; + default: + return -1; + } +@@ -1084,3 +1125,48 @@ cxl_overflow_fail: + else + return -1; + } ++ ++int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd 
< 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_generic_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_GENERIC_EVENT, ev); ++ if (rc < 0) ++ goto cxl_generic_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_generic_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_generic_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL Generic Event "); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_generic_fail; ++ ++ done = 1; ++ ++cxl_generic_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++ ++} +diff --git a/ras-report.h b/ras-report.h +index 204d485..bf591a6 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -43,6 +43,7 @@ int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_ev + int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); ++int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + + #else + +@@ -58,6 +59,7 @@ static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras + static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; ++static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + + #endif + diff --git a/f63b4c942e19a0da1e85a88783ed6e222ad4bdba.patch b/f63b4c942e19a0da1e85a88783ed6e222ad4bdba.patch new file mode 100644 
index 0000000..c5103a9 --- /dev/null +++ b/f63b4c942e19a0da1e85a88783ed6e222ad4bdba.patch @@ -0,0 +1,536 @@ +commit f63b4c942e19a0da1e85a88783ed6e222ad4bdba +Author: Shiju Jose +Date: Wed Apr 5 16:16:19 2023 +0100 + + rasdaemon: Add support for the CXL memory module events + + Add support to log and record the CXL memory module events. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 64b0b50..a0b6780 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -1016,3 +1016,159 @@ int ras_cxl_dram_event_handler(struct trace_seq *s, + + return 0; + } ++ ++/* ++ * Memory Module Event Record - MMER ++ * ++ * CXL res 3.0 section 8.2.9.2.1.3; Table 8-45 ++ */ ++static const char* cxl_dev_evt_type[] = { ++ "Health Status Change", ++ "Media Status Change", ++ "Life Used Change", ++ "Temperature Change", ++ "Data Path Error", ++ "LSA Error", ++}; ++ ++/* ++ * Device Health Information - DHI ++ * ++ * CXL res 3.0 section 8.2.9.8.3.1; Table 8-100 ++ */ ++#define CXL_DHI_HS_MAINTENANCE_NEEDED BIT(0) ++#define CXL_DHI_HS_PERFORMANCE_DEGRADED BIT(1) ++#define CXL_DHI_HS_HW_REPLACEMENT_NEEDED BIT(2) ++ ++static const struct cxl_event_flags cxl_health_status[] = { ++ { .bit = CXL_DHI_HS_MAINTENANCE_NEEDED, .flag = "MAINTENANCE_NEEDED" }, ++ { .bit = CXL_DHI_HS_PERFORMANCE_DEGRADED, .flag = "PERFORMANCE_DEGRADED" }, ++ { .bit = CXL_DHI_HS_HW_REPLACEMENT_NEEDED, .flag = "REPLACEMENT_NEEDED" }, ++}; ++ ++static const char* cxl_media_status[] = { ++ "Normal", ++ "Not Ready", ++ "Write Persistency Lost", ++ "All Data Lost", ++ "Write Persistency Loss in the Event of Power Loss", ++ "Write Persistency Loss in Event of Shutdown", ++ "Write Persistency Loss Imminent", ++ "All Data Loss in Event of Power Loss", ++ "All Data loss in the Event of Shutdown", ++ "All Data Loss Imminent", ++}; ++ ++static const char* cxl_two_bit_status[] = { ++ "Normal", ++ "Warning", ++ "Critical", ++}; ++ ++static const char* 
cxl_one_bit_status[] = { ++ "Normal", ++ "Warning", ++}; ++ ++#define CXL_DHI_AS_LIFE_USED(as) (as & 0x3) ++#define CXL_DHI_AS_DEV_TEMP(as) ((as & 0xC) >> 2) ++#define CXL_DHI_AS_COR_VOL_ERR_CNT(as) ((as & 0x10) >> 4) ++#define CXL_DHI_AS_COR_PER_ERR_CNT(as) ((as & 0x20) >> 5) ++ ++int ras_cxl_memory_module_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ unsigned long long val; ++ struct ras_events *ras = context; ++ struct ras_cxl_memory_module_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "event_type", record, &val, 1) < 0) ++ return -1; ++ ev.event_type = val; ++ if (trace_seq_printf(s, "event_type:%s ", get_cxl_type_str(cxl_dev_evt_type, ++ ARRAY_SIZE(cxl_dev_evt_type), ev.event_type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "health_status", record, &val, 1) < 0) ++ return -1; ++ ev.health_status = val; ++ if (trace_seq_printf(s, "health_status:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.health_status, cxl_health_status, ++ ARRAY_SIZE(cxl_health_status)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "media_status", record, &val, 1) < 0) ++ return -1; ++ ev.media_status = val; ++ if (trace_seq_printf(s, "media_status:%s ", get_cxl_type_str(cxl_media_status, ++ ARRAY_SIZE(cxl_media_status), ev.media_status)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "add_status", record, &val, 1) < 0) ++ return -1; ++ ev.add_status = val; ++ if (trace_seq_printf(s, "as_life_used:%s ", get_cxl_type_str(cxl_two_bit_status, ++ ARRAY_SIZE(cxl_two_bit_status), ++ CXL_DHI_AS_LIFE_USED(ev.add_status))) <= 0) ++ return -1; ++ if (trace_seq_printf(s, "as_dev_temp:%s ", get_cxl_type_str(cxl_two_bit_status, ++ ARRAY_SIZE(cxl_two_bit_status), ++ CXL_DHI_AS_DEV_TEMP(ev.add_status))) <= 0) ++ return -1; ++ if (trace_seq_printf(s, 
"as_cor_vol_err_cnt:%s ", get_cxl_type_str(cxl_one_bit_status, ++ ARRAY_SIZE(cxl_one_bit_status), ++ CXL_DHI_AS_COR_VOL_ERR_CNT(ev.add_status))) <= 0) ++ return -1; ++ if (trace_seq_printf(s, "as_cor_per_err_cnt:%s ", get_cxl_type_str(cxl_one_bit_status, ++ ARRAY_SIZE(cxl_one_bit_status), ++ CXL_DHI_AS_COR_PER_ERR_CNT(ev.add_status))) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "life_used", record, &val, 1) < 0) ++ return -1; ++ ev.life_used = val; ++ if (trace_seq_printf(s, "life_used:%u ", ev.life_used) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "device_temp", record, &val, 1) < 0) ++ return -1; ++ ev.device_temp = val; ++ if (trace_seq_printf(s, "device_temp:%u ", ev.device_temp) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dirty_shutdown_cnt", record, &val, 1) < 0) ++ return -1; ++ ev.dirty_shutdown_cnt = val; ++ if (trace_seq_printf(s, "dirty_shutdown_cnt:%u ", ev.dirty_shutdown_cnt) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "cor_vol_err_cnt", record, &val, 1) < 0) ++ return -1; ++ ev.cor_vol_err_cnt = val; ++ if (trace_seq_printf(s, "cor_vol_err_cnt:%u ", ev.cor_vol_err_cnt) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "cor_per_err_cnt", record, &val, 1) < 0) ++ return -1; ++ ev.cor_per_err_cnt = val; ++ if (trace_seq_printf(s, "cor_per_err_cnt:%u ", ev.cor_per_err_cnt) <= 0) ++ return -1; ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_memory_module_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_memory_module_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 35455af..1ea0f93 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -41,4 +41,7 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s, + int ras_cxl_dram_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int 
ras_cxl_memory_module_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index d27e0c4..a82dab2 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -252,6 +252,7 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_dram", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_memory_module", enable); + #endif + + free_ras: +@@ -1081,6 +1082,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_dram"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_memory_module", ++ ras_cxl_memory_module_event_handler, NULL, CXL_MEMORY_MODULE_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "memory_module"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index d192a6b..c4d54e3 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -46,6 +46,7 @@ enum { + CXL_GENERIC_EVENT, + CXL_GENERAL_MEDIA_EVENT, + CXL_DRAM_EVENT, ++ CXL_MEMORY_MODULE_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index fffa81c..a5f99ae 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -992,6 +992,74 @@ int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event * + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_memory_module_event ++ */ ++static const struct db_fields cxl_memory_module_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "hdr_uuid", .type = "TEXT" 
}, ++ { .name = "hdr_flags", .type = "INTEGER" }, ++ { .name = "hdr_handle", .type = "INTEGER" }, ++ { .name = "hdr_related_handle", .type = "INTEGER" }, ++ { .name = "hdr_ts", .type = "TEXT" }, ++ { .name = "hdr_length", .type = "INTEGER" }, ++ { .name = "hdr_maint_op_class", .type = "INTEGER" }, ++ { .name = "event_type", .type = "INTEGER" }, ++ { .name = "health_status", .type = "INTEGER" }, ++ { .name = "media_status", .type = "INTEGER" }, ++ { .name = "life_used", .type = "INTEGER" }, ++ { .name = "dirty_shutdown_cnt", .type = "INTEGER" }, ++ { .name = "cor_vol_err_cnt", .type = "INTEGER" }, ++ { .name = "cor_per_err_cnt", .type = "INTEGER" }, ++ { .name = "device_temp", .type = "INTEGER" }, ++ { .name = "add_status", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor cxl_memory_module_event_tab = { ++ .name = "cxl_memory_module_event", ++ .fields = cxl_memory_module_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_memory_module_event_fields), ++}; ++ ++int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_memory_module_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_memory_module_event store: %p\n", ++ priv->stmt_cxl_memory_module_event); ++ ++ ras_store_cxl_common_hdr(priv->stmt_cxl_memory_module_event, &ev->hdr); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 13, ev->event_type); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 14, ev->health_status); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 15, ev->media_status); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 16, ev->life_used); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 17, ev->dirty_shutdown_cnt); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 18, ev->cor_vol_err_cnt); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 19, ev->cor_per_err_cnt); ++ 
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 20, ev->device_temp); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 21, ev->add_status); ++ ++ rc = sqlite3_step(priv->stmt_cxl_memory_module_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do stmt_cxl_memory_module_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_memory_module_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset stmt_cxl_memory_module_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1391,6 +1459,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_memory_module_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_memory_module_event, ++ &cxl_memory_module_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1568,6 +1644,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_dram_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_memory_module_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_memory_module_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize stmt_cxl_memory_module_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index 480ff92..a7b9ab9 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -218,6 +218,19 @@ struct ras_cxl_dram_event { + uint16_t validity_flags; + }; + ++struct ras_cxl_memory_module_event { ++ struct ras_cxl_event_common_hdr hdr; ++ uint8_t event_type; ++ uint8_t health_status; ++ uint8_t media_status; ++ uint8_t life_used; ++ uint32_t dirty_shutdown_cnt; ++ uint32_t cor_vol_err_cnt; ++ uint32_t cor_per_err_cnt; ++ 
int16_t device_temp; ++ uint8_t add_status; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -234,6 +247,7 @@ struct ras_cxl_overflow_event; + struct ras_cxl_generic_event; + struct ras_cxl_general_media_event; + struct ras_cxl_dram_event; ++struct ras_cxl_memory_module_event; + + #ifdef HAVE_SQLITE3 + +@@ -274,6 +288,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_generic_event; + sqlite3_stmt *stmt_cxl_general_media_event; + sqlite3_stmt *stmt_cxl_dram_event; ++ sqlite3_stmt *stmt_cxl_memory_module_event; + #endif + }; + +@@ -309,6 +324,7 @@ int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow + int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); + int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); ++int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -329,6 +345,7 @@ static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ra + static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; + static inline int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; ++static inline int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 21180b1..a30b66d 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -605,6 +605,62 @@ static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev + 
return 0; + } + ++static int set_cxl_memory_module_event_backtrace(char *buf, struct ras_cxl_memory_module_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "hdr_uuid=%s\n" \ ++ "hdr_flags=0x%x\n" \ ++ "hdr_handle=0x%x\n" \ ++ "hdr_related_handle=0x%x\n" \ ++ "hdr_timestamp=%s\n" \ ++ "hdr_length=%u\n" \ ++ "hdr_maint_op_class=%u\n" \ ++ "event_type=%u\n" \ ++ "health_status=%u\n" \ ++ "media_status=%u\n" \ ++ "life_used=%u\n" \ ++ "dirty_shutdown_cnt=%u\n" \ ++ "cor_vol_err_cnt=%u\n" \ ++ "cor_per_err_cnt=%u\n" \ ++ "device_temp=%d\n" \ ++ "add_status=%u\n", \ ++ ev->hdr.timestamp, \ ++ ev->hdr.memdev, \ ++ ev->hdr.host, \ ++ ev->hdr.serial, \ ++ ev->hdr.log_type, \ ++ ev->hdr.hdr_uuid, \ ++ ev->hdr.hdr_flags, \ ++ ev->hdr.hdr_handle, \ ++ ev->hdr.hdr_related_handle, \ ++ ev->hdr.hdr_timestamp, \ ++ ev->hdr.hdr_length, \ ++ ev->hdr.hdr_maint_op_class, \ ++ ev->event_type, \ ++ ev->health_status, \ ++ ev->media_status, \ ++ ev->life_used, \ ++ ev->dirty_shutdown_cnt, \ ++ ev->cor_vol_err_cnt, \ ++ ev->cor_per_err_cnt, \ ++ ev->device_temp, \ ++ ev->add_status); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -663,6 +719,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_DRAM_EVENT: + rc = set_cxl_dram_event_backtrace(buf, (struct ras_cxl_dram_event *)ev); + break; ++ case CXL_MEMORY_MODULE_EVENT: ++ rc = set_cxl_memory_module_event_backtrace(buf, (struct ras_cxl_memory_module_event *)ev); ++ break; + default: + return -1; + } +@@ -1380,3 +1439,47 @@ cxl_dram_fail: + else + return -1; + } ++ ++int ras_report_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ 
int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_memory_module_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_MEMORY_MODULE_EVENT, ev); ++ if (rc < 0) ++ goto cxl_memory_module_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_memory_module_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_memory_module_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL Memory Module Event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_memory_module_fail; ++ ++ done = 1; ++ ++cxl_memory_module_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index 1ad00e0..e401850 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -46,6 +46,7 @@ int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflo + int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); + int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); ++int ras_report_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev); + + #else + +@@ -64,6 +65,7 @@ static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct r + static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; + static inline int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; ++static inline int ras_report_cxl_memory_module_event(struct 
ras_events *ras, struct ras_cxl_memory_module_event *ev) { return 0; }; + + #endif + diff --git a/f73ed45b91244eb3986ac2574cd7d36ae1d4d22a.patch b/f73ed45b91244eb3986ac2574cd7d36ae1d4d22a.patch new file mode 100644 index 0000000..2157647 --- /dev/null +++ b/f73ed45b91244eb3986ac2574cd7d36ae1d4d22a.patch @@ -0,0 +1,435 @@ +commit f73ed45b91244eb3986ac2574cd7d36ae1d4d22a +Author: Shiju Jose +Date: Tue Apr 4 16:50:50 2023 +0100 + + rasdaemon: Add support for the CXL overflow events + + Add support to log and record the CXL overflow events. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index d540ebb..d4c845e 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -426,3 +426,101 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, + + return 0; + } ++ ++/* ++ * CXL rev 3.0 section 8.2.9.2.2; Table 8-49 ++ */ ++enum cxl_event_log_type { ++ CXL_EVENT_TYPE_INFO = 0x00, ++ CXL_EVENT_TYPE_WARN, ++ CXL_EVENT_TYPE_FAIL, ++ CXL_EVENT_TYPE_FATAL, ++ CXL_EVENT_TYPE_UNKNOWN ++}; ++ ++static char *cxl_event_log_type_str(uint32_t log_type) ++{ ++ ++ switch (log_type) { ++ case CXL_EVENT_TYPE_INFO: ++ return "Informational"; ++ case CXL_EVENT_TYPE_WARN: ++ return "Warning"; ++ case CXL_EVENT_TYPE_FAIL: ++ return "Failure"; ++ case CXL_EVENT_TYPE_FATAL: ++ return "Fatal"; ++ default: ++ break; ++ } ++ ++ return "Unknown"; ++} ++ ++int ras_cxl_overflow_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ struct ras_cxl_overflow_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); ++ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) ++ return -1; ++ ++ ev.memdev = tep_get_field_raw(s, event, "memdev", record, &len, 1); ++ if (!ev.memdev) ++ return -1; ++ if (trace_seq_printf(s, "memdev:%s ", 
ev.memdev) <= 0) ++ return -1; ++ ++ ev.host = tep_get_field_raw(s, event, "host", record, &len, 1); ++ if (!ev.host) ++ return -1; ++ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) ++ return -1; ++ ev.serial = val; ++ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "log", record, &val, 1) < 0) ++ return -1; ++ ev.log_type = cxl_event_log_type_str(val); ++ if (trace_seq_printf(s, "log type:%s ", ev.log_type) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "count", record, &val, 1) < 0) ++ return -1; ++ ev.count = val; ++ ++ if (tep_get_field_val(s, event, "first_ts", record, &val, 1) < 0) ++ return -1; ++ convert_timestamp(val, ev.first_ts, sizeof(ev.first_ts)); ++ ++ if (tep_get_field_val(s, event, "last_ts", record, &val, 1) < 0) ++ return -1; ++ convert_timestamp(val, ev.last_ts, sizeof(ev.last_ts)); ++ ++ if (ev.count) { ++ if (trace_seq_printf(s, "%u errors from %s to %s\n", ++ ev.count, ev.first_ts, ev.last_ts) <= 0) ++ return -1; ++ } ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_overflow_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_overflow_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 711daf4..e7847ec 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -29,4 +29,7 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s, + int ras_cxl_aer_ce_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int ras_cxl_overflow_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index d0251e0..f2a869a 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -248,6 +248,7 @@ int 
toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); + #endif + + free_ras: +@@ -1045,6 +1046,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_aer_correctable_error"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_overflow", ++ ras_cxl_overflow_event_handler, NULL, CXL_OVERFLOW_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_overflow"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index a9d67c2..7c869d9 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -42,6 +42,7 @@ enum { + CXL_POISON_EVENT, + CXL_AER_UE_EVENT, + CXL_AER_CE_EVENT, ++ CXL_OVERFLOW_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index 86133c4..7b808a5 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -720,6 +720,59 @@ int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_eve + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_overflow ++ */ ++static const struct db_fields cxl_overflow_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "count", .type = "INTEGER" }, ++ { .name = "first_ts", .type = "TEXT" }, ++ { .name = "last_ts", .type = "TEXT" }, ++}; ++ ++static const struct db_table_descriptor cxl_overflow_event_tab = { ++ .name = "cxl_overflow_event", ++ .fields = cxl_overflow_event_fields, ++ .num_fields = 
ARRAY_SIZE(cxl_overflow_event_fields), ++}; ++ ++int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_overflow_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_overflow_event store: %p\n", priv->stmt_cxl_overflow_event); ++ ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 2, ev->memdev, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 3, ev->host, -1, NULL); ++ sqlite3_bind_int64(priv->stmt_cxl_overflow_event, 4, ev->serial); ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 5, ev->log_type, -1, NULL); ++ sqlite3_bind_int(priv->stmt_cxl_overflow_event, 6, ev->count); ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 7, ev->first_ts, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 8, ev->last_ts, -1, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_overflow_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do cxl_overflow_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_overflow_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset cxl_overflow_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1087,6 +1140,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_overflow_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_overflow_event, ++ &cxl_overflow_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1232,6 +1293,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_aer_ce_event sqlite: error = %d\n", + cpu, 
rc); + } ++ ++ if (priv->stmt_cxl_overflow_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_overflow_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_overflow_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index ab7153d..90db6ad 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -152,6 +152,17 @@ struct ras_cxl_aer_ce_event { + uint32_t error_status; + }; + ++struct ras_cxl_overflow_event { ++ char timestamp[64]; ++ const char *memdev; ++ const char *host; ++ uint64_t serial; ++ const char *log_type; ++ char first_ts[64]; ++ char last_ts[64]; ++ uint16_t count; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -164,6 +175,7 @@ struct ras_mf_event; + struct ras_cxl_poison_event; + struct ras_cxl_aer_ue_event; + struct ras_cxl_aer_ce_event; ++struct ras_cxl_overflow_event; + + #ifdef HAVE_SQLITE3 + +@@ -200,6 +212,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_poison_event; + sqlite3_stmt *stmt_cxl_aer_ue_event; + sqlite3_stmt *stmt_cxl_aer_ce_event; ++ sqlite3_stmt *stmt_cxl_overflow_event; + #endif + }; + +@@ -231,6 +244,7 @@ int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); ++int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -247,6 +261,7 @@ static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event + static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + static inline int 
ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; ++static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 63b47f5..dbed454 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -421,6 +421,36 @@ static int set_cxl_aer_ce_event_backtrace(char *buf, struct ras_cxl_aer_ce_event + return 0; + } + ++static int set_cxl_overflow_event_backtrace(char *buf, struct ras_cxl_overflow_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "count=%u\n" \ ++ "first_ts=%s\n" \ ++ "last_ts=%s\n", \ ++ ev->timestamp, \ ++ ev->memdev, \ ++ ev->host, \ ++ ev->serial, \ ++ ev->log_type, \ ++ ev->count, \ ++ ev->first_ts, \ ++ ev->last_ts); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -467,6 +497,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_AER_CE_EVENT: + rc = set_cxl_aer_ce_event_backtrace(buf, (struct ras_cxl_aer_ce_event *)ev); + break; ++ case CXL_OVERFLOW_EVENT: ++ rc = set_cxl_overflow_event_backtrace(buf, (struct ras_cxl_overflow_event *)ev); ++ break; + default: + return -1; + } +@@ -1007,3 +1040,47 @@ cxl_aer_ce_fail: + else + return -1; + } ++ ++int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = 
commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_overflow_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_OVERFLOW_EVENT, ev); ++ if (rc < 0) ++ goto cxl_overflow_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl-overflow"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_overflow_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL overflow"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_overflow_fail; ++ ++ done = 1; ++ ++cxl_overflow_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index 46155ee..204d485 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -42,6 +42,7 @@ int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); ++int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + + #else + +@@ -56,6 +57,7 @@ static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_even + static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; ++static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + + #endif + diff --git a/f8b6da812eddc063ea739970f941fdd24fb984ae.patch b/f8b6da812eddc063ea739970f941fdd24fb984ae.patch new file mode 100644 index 0000000..ee8d818 --- /dev/null +++ 
b/f8b6da812eddc063ea739970f941fdd24fb984ae.patch @@ -0,0 +1,199 @@ +commit 70acd500302d2db318bb0e35b551f74fd4baebc4 +Author: Shiju Jose +Date: Mon Feb 12 10:27:58 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL AER uncorrectable trace events + + Add support for CXL AER uncorrectable events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit f8b6da812eddc063ea739970f941fdd24fb984ae) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 1cc19b3..c0a2ec6 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -43,6 +43,7 @@ my $modprobe = find_prog ("modprobe") or exit (1); + + my $has_aer = 0; + my $has_arm = 0; ++my $has_cxl = 0; + my $has_devlink = 0; + my $has_disk_errors = 0; + my $has_extlog = 0; +@@ -51,6 +52,7 @@ my $has_mce = 0; + + @WITH_AER_TRUE@$has_aer = 1; + @WITH_ARM_TRUE@$has_arm = 1; ++@WITH_CXL_TRUE@$has_cxl = 1; + @WITH_DEVLINK_TRUE@$has_devlink = 1; + @WITH_DISKERROR_TRUE@$has_disk_errors = 1; + @WITH_EXTLOG_TRUE@$has_extlog = 1; +@@ -1156,6 +1158,78 @@ sub get_uuid_le + return $out; + } + ++use constant { ++ CXL_AER_UE_CACHE_DATA_PARITY => 0x0001, ++ CXL_AER_UE_CACHE_ADDR_PARITY => 0x0002, ++ CXL_AER_UE_CACHE_BE_PARITY => 0x0004, ++ CXL_AER_UE_CACHE_DATA_ECC => 0x0008, ++ CXL_AER_UE_MEM_DATA_PARITY => 0x0010, ++ CXL_AER_UE_MEM_ADDR_PARITY => 0x0020, ++ CXL_AER_UE_MEM_BE_PARITY => 0x0040, ++ CXL_AER_UE_MEM_DATA_ECC => 0x0080, ++ CXL_AER_UE_REINIT_THRESH => 0x0100, ++ CXL_AER_UE_RSVD_ENCODE => 0x0200, ++ CXL_AER_UE_POISON => 0x0400, ++ CXL_AER_UE_RECV_OVERFLOW => 0x0800, ++ CXL_AER_UE_INTERNAL_ERR => 0x4000, ++ CXL_AER_UE_IDE_TX_ERR => 0x8000, ++ CXL_AER_UE_IDE_RX_ERR => 0x10000, ++}; ++ ++sub get_cxl_ue_error_status_text ++{ ++ my $error_status = $_[0]; ++ my @out; ++ ++ if ($error_status & CXL_AER_UE_CACHE_DATA_PARITY) { ++ push @out, (sprintf "\'Cache Data Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_CACHE_ADDR_PARITY) { ++ push @out, 
(sprintf "\'Cache Address Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_CACHE_BE_PARITY) { ++ push @out, (sprintf "\'Cache Byte Enable Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_CACHE_DATA_ECC) { ++ push @out, (sprintf "\'Cache Data ECC Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_MEM_DATA_PARITY) { ++ push @out, (sprintf "\'Memory Data Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_MEM_ADDR_PARITY) { ++ push @out, (sprintf "\'Memory Address Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_MEM_BE_PARITY) { ++ push @out, (sprintf "\'Memory Byte Enable Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_MEM_DATA_ECC) { ++ push @out, (sprintf "\'Memory Data ECC Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_REINIT_THRESH) { ++ push @out, (sprintf "\'REINIT Threshold Hit\' "); ++ } ++ if ($error_status & CXL_AER_UE_RSVD_ENCODE) { ++ push @out, (sprintf "\'Received Unrecognized Encoding\' "); ++ } ++ if ($error_status & CXL_AER_UE_POISON) { ++ push @out, (sprintf "\'Received Poison From Peer\' "); ++ } ++ if ($error_status & CXL_AER_UE_RECV_OVERFLOW) { ++ push @out, (sprintf "\'Receiver Overflow\' "); ++ } ++ if ($error_status & CXL_AER_UE_INTERNAL_ERR) { ++ push @out, (sprintf "\'Component Specific Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_IDE_TX_ERR) { ++ push @out, (sprintf "\'IDE Tx Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_IDE_RX_ERR) { ++ push @out, (sprintf "\'IDE Rx Error\' "); ++ } ++ ++ return join (", ", @out); ++} ++ + sub summary + { + require DBI; +@@ -1163,7 +1237,7 @@ sub summary + my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result); + my ($etype, $severity, $etype_string, $severity_string); + my ($dev_name, $dev); +- my ($mpidr); ++ my ($mpidr, $memdev); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1219,6 +1293,25 @@ sub summary + $query_handle->finish; + } + ++ # CXL errors ++ if ($has_cxl == 1) { ++ # CXL AER 
uncorrectable errors ++ $query = "select memdev, count(*) from cxl_aer_ue_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL AER uncorrectable events summary:\n$out\n"; ++ } else { ++ print "No CXL AER uncorrectable errors.\n\n"; ++ } ++ $query_handle->finish; ++ } ++ + # extlog errors + if ($has_extlog == 1) { + $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; +@@ -1324,6 +1417,7 @@ sub errors + my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd); + my ($error_count, $affinity, $mpidr, $r_state, $psci_state); + my ($pfn, $page_type, $action_result); ++ my ($memdev, $host, $serial, $error_status, $first_error, $header_log); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1389,6 +1483,44 @@ sub errors + $query_handle->finish; + } + ++ # CXL errors ++ if ($has_cxl == 1) { ++ # CXL AER uncorrectable errors ++ use constant SZ_512 => 0x200; ++ use constant CXL_HEADERLOG_SIZE_U32 => SZ_512/32; ++ $query = "select id, timestamp, memdev, host, serial, error_status, first_error, header_log from cxl_aer_ue_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $error_status, $first_error, $header_log)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ if (defined $error_status && length $error_status) { ++ $out .= sprintf "error_status: %s, ", get_cxl_ue_error_status_text($error_status); ++ } 
++ if (defined $first_error && length $first_error) { ++ $out .= sprintf "first_error: %s, ", get_cxl_ue_error_status_text($first_error); ++ } ++ if (defined $header_log && length $header_log) { ++ $out .= sprintf "header_log:\n"; ++ my @bytes = unpack "C*", $header_log; ++ for (my $i = 0; $i < CXL_HEADERLOG_SIZE_U32; $i++) { ++ $out .= sprintf "%08x ", $bytes[$i]; ++ } ++ } ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL AER uncorrectable events:\n$out\n"; ++ } else { ++ print "No CXL AER uncorrectable errors.\n\n"; ++ } ++ $query_handle->finish; ++ } ++ + # Extlog errors + if ($has_extlog == 1) { + $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; diff --git a/fd11670d2d35c5d939b03ba1ca80eb81c1f636b6.patch b/fd11670d2d35c5d939b03ba1ca80eb81c1f636b6.patch new file mode 100644 index 0000000..2215c83 --- /dev/null +++ b/fd11670d2d35c5d939b03ba1ca80eb81c1f636b6.patch @@ -0,0 +1,127 @@ +commit dba1c58ef5802b96b6555cb42e3cf7f75fa0da8c +Author: Shiju Jose +Date: Mon Feb 12 10:56:25 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL generic trace events + + Add support for CXL generic events to the ras-mc-ctl tool. 
+ + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit fd11670d2d35c5d939b03ba1ca80eb81c1f636b6) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 16b0589..5528021 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1270,6 +1270,34 @@ sub get_cxl_ce_error_status_text + return join (", ", @out); + } + ++use constant { ++ CXL_EVENT_RECORD_FLAG_PERMANENT => 0x0004, ++ CXL_EVENT_RECORD_FLAG_MAINT_NEEDED => 0x0008, ++ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED => 0x0010, ++ CXL_EVENT_RECORD_FLAG_HW_REPLACE => 0x0020, ++}; ++ ++sub get_cxl_hdr_flags_text ++{ ++ my $flags = $_[0]; ++ my @out; ++ ++ if ($flags & CXL_EVENT_RECORD_FLAG_PERMANENT) { ++ push @out, (sprintf "\'PERMANENT_CONDITION\' "); ++ } ++ if ($flags & CXL_EVENT_RECORD_FLAG_MAINT_NEEDED) { ++ push @out, (sprintf "\'MAINTENANCE_NEEDED\' "); ++ } ++ if ($flags & CXL_EVENT_RECORD_FLAG_PERF_DEGRADED) { ++ push @out, (sprintf "\'PERFORMANCE_DEGRADED\' "); ++ } ++ if ($flags & CXL_EVENT_RECORD_FLAG_HW_REPLACE) { ++ push @out, (sprintf "\'HARDWARE_REPLACEMENT_NEEDED\' "); ++ } ++ ++ return join (", ", @out); ++} ++ + sub summary + { + require DBI; +@@ -1398,6 +1426,22 @@ sub summary + print "No CXL poison errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL generic errors ++ $query = "select memdev, count(*) from cxl_generic_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL generic events summary:\n$out\n"; ++ } else { ++ print "No CXL generic errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1508,6 +1552,7 @@ sub errors + my ($memdev, $host, $serial, $error_status, $first_error, $header_log); + my ($log_type, $first_ts, $last_ts); + my ($trace_type, $region, $region_uuid, $hpa, $dpa, 
$dpa_length, $source, $flags, $overflow_ts); ++ my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1681,6 +1726,44 @@ sub errors + } else { + print "No CXL poison errors.\n\n"; + } ++ ++ # CXL generic errors ++ use constant CXL_EVENT_RECORD_DATA_LENGTH => 0x50; ++ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, data from cxl_generic_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "log=$log_type, " if (defined $log_type && length $log_type); ++ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid); ++ $out .= sprintf "hdr_flags=0x%llx %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags); ++ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle); ++ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle); ++ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts); ++ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length); ++ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length 
$hdr_maint_op_class); ++ if (defined $data && length $data) { ++ $out .= sprintf "data:\n"; ++ my @bytes = unpack "C*", $data; ++ for (my $i = 0; $i < CXL_EVENT_RECORD_DATA_LENGTH; $i++) { ++ if (($i > 0) && (($i % 16) == 0)) { ++ $out .= sprintf "\n %08x: ", $i; ++ } ++ $out .= sprintf "%02x%02x%02x%02x ", $bytes[$i], $bytes[$i + 1], $bytes[$i + 2], $bytes[$i + 3]; ++ } ++ } ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL generic events:\n$out\n"; ++ } else { ++ print "No CXL generic errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/rasdaemon.spec b/rasdaemon.spec index e656119..b9f787d 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,12 +1,84 @@ Name: rasdaemon Version: 0.8.0 -Release: 6%{?dist} +Release: 7%{?dist} Summary: Utility to receive RAS error tracings Group: Applications/System License: GPLv2 URL: http://git.infradead.org/users/mchehab/rasdaemon.git Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2 +# https://github.com/mchehab/rasdaemon/pull/96 +# Add support for CXL poison and AER error events (4 patches) + +# rasdaemon: Move definition for BIT and BIT_ULL to a common file +Patch0: d3836aa061f677232f99c514247d3dbf80812a1b.patch + +# rasdaemon: Add support for the CXL poison events +Patch1: 75c8fec559641f843345ef8fbc36d124b60b914d.patch + +# rasdaemon: Add support for the CXL AER uncorrectable errors +Patch2: a7524917befe7e67c02253cc27cb0c724e5992c0.patch + +# rasdaemon: Add support for the CXL AER correctable errors +Patch3: a247baf7110ab6427259eb1421a103e2021a8735.patch + +# https://github.com/mchehab/rasdaemon/pull/104 +# rasdaemon: Process the generic CXL trace events (7 patches) + +# rasdaemon: Add common function to convert timestamp in the CXL event records to the broken-down time format +Patch4: 2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch + +# rasdaemon: Add common function to get timestamp for the event +Patch5: 7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch + +# rasdaemon: Add support for 
the CXL overflow events +Patch6: f73ed45b91244eb3986ac2574cd7d36ae1d4d22a.patch + +# rasdaemon: Add support for the CXL generic events +Patch7: e0cde0edf073b939d345aeba0aed23e238dbc53b.patch + +# rasdaemon: Add support for the CXL general media events +Patch8: 53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch + +# rasdaemon: Add support for the CXL dram events +Patch9: 9a2f6186db2622788f8868d8ec082684d6a06d4d.patch + +# rasdaemon: Add support for the CXL memory module events +Patch10: f63b4c942e19a0da1e85a88783ed6e222ad4bdba.patch + +# https://github.com/mchehab/rasdaemon/pull/149 +# rasdaemon: generic fixes and ras-mc-ctl: add support for CXL error events (10 patches) + +# rasdaemon: Fix build warnings unused variable if AMP RAS errors is not enabled +Patch11: 8f79833e3d78424f4a594985fbeb91890f4af81c.patch + +# rasdaemon: ras-memory-failure-handler: update memory failure action page types +Patch12: 31c7578ddb0fc15aa7247f2b8885956540031221.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL AER uncorrectable trace events +Patch13: f8b6da812eddc063ea739970f941fdd24fb984ae.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL AER correctable trace events +Patch14: ae1647624486fca0070b297d0e2fd4e53443c10b.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL overflow trace events +Patch15: b22cb067755f4604770f9864a0babed8f93a1553.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL poison trace events +Patch16: 93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL generic trace events +Patch17: fd11670d2d35c5d939b03ba1ca80eb81c1f636b6.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL general media trace events +Patch18: 572de9d57691be9e630abee9ffa56a2fb155d558.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL DRAM trace events +Patch19: c38c14afc5d7bb6c8c52d1023271d755deb23008.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL memory module trace events +Patch20: aee13f74266382c64128bd7367a5eeb46277f490.patch + 
ExcludeArch: s390 s390x BuildRequires: make BuildRequires: gcc @@ -40,6 +112,27 @@ an utility for reporting current error counts from the EDAC sysfs files. %prep %setup -q +%patch0 -p1 +%patch1 -p1 +%patch2 -p1 +%patch3 -p1 +%patch4 -p1 +%patch5 -p1 +%patch6 -p1 +%patch7 -p1 +%patch8 -p1 +%patch9 -p1 +%patch10 -p1 +%patch11 -p1 +%patch12 -p1 +%patch13 -p1 +%patch14 -p1 +%patch15 -p1 +%patch16 -p1 +%patch17 -p1 +%patch18 -p1 +%patch19 -p1 +%patch20 -p1 autoreconf -vfi %build @@ -48,11 +141,13 @@ autoreconf -vfi --enable-mce --enable-extlog --enable-devlink --enable-diskerror \ --enable-memory-failure --enable-abrt-report --enable-hisi-ns-decode \ --enable-memory-ce-pfa --enable-amp-ns-decode --enable-cpu-fault-isolation \ + --enable-cxl \ --with-sysconfdefdir=%{_sysconfdir}/sysconfig %else %configure --enable-sqlite3 --enable-aer \ --enable-mce --enable-extlog --enable-devlink --enable-diskerror \ --enable-memory-failure --enable-abrt-report --enable-cpu-fault-isolation \ + --enable-cxl \ --with-sysconfdefdir=%{_sysconfdir}/sysconfig %endif make %{?_smp_mflags} @@ -74,6 +169,10 @@ rm INSTALL %{buildroot}/usr/include/*.h %config(noreplace) %{_sysconfdir}/sysconfig/%{name} %changelog +* Tue Jan 14 2025 Joel Savitz - 0.8.0-7 +- Add support for CXL memory failure event logging + Resolves: RHEL-61233 + * Tue Oct 29 2024 Troy Dawson - 0.8.0-6 - Bump release for October 2024 mass rebuild: Resolves: RHEL-64018