From 5d6b3c3241a0cf458f10284bc711c693cf8e195a Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Thu, 9 Jan 2025 20:10:50 -0500 Subject: [PATCH] Add support for CXL memory failure event logging Resolves: RHEL-61233 --- ...bc453998ddb145c7bb8ba30a57c56bd18eab.patch | 66 ++ ...578ddb0fc15aa7247f2b8885956540031221.patch | 54 ++ ...82fb45c2909c128be4ee8f51a3e42fe2f7fd.patch | 551 +++++++++++++++ ...e9d57691be9e630abee9ffa56a2fb155d558.patch | 182 +++++ ...fec559641f843345ef8fbc36d124b60b914d.patch | 663 ++++++++++++++++++ ...edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch | 97 +++ ...833e3d78424f4a594985fbeb91890f4af81c.patch | 78 +++ ...96b66c917af37b2ae9295dc5df46a7d64dd2.patch | 82 +++ ...6186db2622788f8868d8ec082684d6a06d4d.patch | 559 +++++++++++++++ ...baf7110ab6427259eb1421a103e2021a8735.patch | 424 +++++++++++ ...4917befe7e67c02253cc27cb0c724e5992c0.patch | 503 +++++++++++++ ...47624486fca0070b297d0e2fd4e53443c10b.patch | 116 +++ ...3f74266382c64128bd7367a5eeb46277f490.patch | 161 +++++ ...b067755f4604770f9864a0babed8f93a1553.patch | 75 ++ ...14afc5d7bb6c8c52d1023271d755deb23008.patch | 101 +++ ...6aa061f677232f99c514247d3dbf80812a1b.patch | 42 ++ ...e0edf073b939d345aeba0aed23e238dbc53b.patch | 575 +++++++++++++++ ...4c942e19a0da1e85a88783ed6e222ad4bdba.patch | 536 ++++++++++++++ ...d45b91244eb3986ac2574cd7d36ae1d4d22a.patch | 435 ++++++++++++ ...da812eddc063ea739970f941fdd24fb984ae.patch | 199 ++++++ ...670d2d35c5d939b03ba1ca80eb81c1f636b6.patch | 127 ++++ rasdaemon.spec | 101 ++- 22 files changed, 5726 insertions(+), 1 deletion(-) create mode 100644 2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch create mode 100644 31c7578ddb0fc15aa7247f2b8885956540031221.patch create mode 100644 53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch create mode 100644 572de9d57691be9e630abee9ffa56a2fb155d558.patch create mode 100644 75c8fec559641f843345ef8fbc36d124b60b914d.patch create mode 100644 7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch create mode 100644 8f79833e3d78424f4a594985fbeb91890f4af81c.patch create mode 100644 93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch create mode 100644 9a2f6186db2622788f8868d8ec082684d6a06d4d.patch create mode 100644 a247baf7110ab6427259eb1421a103e2021a8735.patch create mode 100644 a7524917befe7e67c02253cc27cb0c724e5992c0.patch create mode 100644 ae1647624486fca0070b297d0e2fd4e53443c10b.patch create mode 100644 aee13f74266382c64128bd7367a5eeb46277f490.patch create mode 100644 b22cb067755f4604770f9864a0babed8f93a1553.patch create mode 100644 c38c14afc5d7bb6c8c52d1023271d755deb23008.patch create mode 100644 d3836aa061f677232f99c514247d3dbf80812a1b.patch create mode 100644 e0cde0edf073b939d345aeba0aed23e238dbc53b.patch create mode 100644 f63b4c942e19a0da1e85a88783ed6e222ad4bdba.patch create mode 100644 f73ed45b91244eb3986ac2574cd7d36ae1d4d22a.patch create mode 100644 f8b6da812eddc063ea739970f941fdd24fb984ae.patch create mode 100644 fd11670d2d35c5d939b03ba1ca80eb81c1f636b6.patch diff --git a/2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch b/2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch new file mode 100644 index 0000000..eaa9559 --- /dev/null +++ b/2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch @@ -0,0 +1,66 @@ +commit 2ff9bc453998ddb145c7bb8ba30a57c56bd18eab +Author: Shiju Jose +Date: Tue Apr 4 14:40:42 2023 +0100 + + rasdaemon: Add common function to convert timestamp in the CXL event records to the broken-down time format + + Add common function to convert the timestamp in the CXL event records + in nanoseconds to the broken-down time format. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 8f6342d..59534a4 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -23,6 +23,25 @@ + #include "ras-report.h" + #include + ++/* Common Functions */ ++static void convert_timestamp(unsigned long long ts, char *ts_ptr, uint16_t size) ++{ ++ /* CXL Specification 3.0 ++ * Overflow timestamp - The number of unsigned nanoseconds ++ * that have elapsed since midnight, 01-Jan-1970 UTC ++ */ ++ time_t ts_secs = ts / 1000000000ULL; ++ struct tm *tm; ++ ++ tm = localtime(&ts_secs); ++ if (tm) ++ strftime(ts_ptr, size, "%Y-%m-%d %H:%M:%S %z", tm); ++ ++ if (!ts || !tm) ++ strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", ++ size); ++} ++ + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) + #define CXL_POISON_FLAG_OVERFLOW BIT(1) +@@ -168,22 +187,7 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + if (ev.flags & CXL_POISON_FLAG_OVERFLOW) { + if (tep_get_field_val(s, event, "overflow_ts", record, &val, 1) < 0) + return -1; +- if (val) { +- /* CXL Specification 3.0 +- * Overflow timestamp - The number of unsigned nanoseconds +- * that have elapsed since midnight, 01-Jan-1970 UTC +- */ +- time_t ovf_ts_secs = val / 1000000000ULL; +- +- tm = localtime(&ovf_ts_secs); +- if (tm) { +- strftime(ev.overflow_ts, sizeof(ev.overflow_ts), +- "%Y-%m-%d %H:%M:%S %z", tm); +- } +- } +- if (!val || !tm) +- strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", +- sizeof(ev.overflow_ts)); ++ convert_timestamp(val, ev.overflow_ts, sizeof(ev.overflow_ts)); + } else + strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", sizeof(ev.overflow_ts)); + if (trace_seq_printf(s, "overflow timestamp:%s\n", ev.overflow_ts) <= 0) diff --git a/31c7578ddb0fc15aa7247f2b8885956540031221.patch b/31c7578ddb0fc15aa7247f2b8885956540031221.patch new file mode 100644 index 0000000..7ee1e3b --- /dev/null +++ b/31c7578ddb0fc15aa7247f2b8885956540031221.patch @@ -0,0 +1,54 @@ +commit 31c7578ddb0fc15aa7247f2b8885956540031221 +Author: Shiju Jose +Date: Tue Feb 6 12:08:00 2024 +0000 + + rasdaemon: ras-memory-failure-handler: update memory failure action page types + + Update memory failure action page types corresponding to the same in + mm/memory-failure.c in the kernel. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index 97e8840..a5acc08 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -26,10 +26,8 @@ enum mf_action_page_type { + MF_MSG_KERNEL_HIGH_ORDER, + MF_MSG_SLAB, + MF_MSG_DIFFERENT_COMPOUND, +- MF_MSG_POISONED_HUGE, + MF_MSG_HUGE, + MF_MSG_FREE_HUGE, +- MF_MSG_NON_PMD_HUGE, + MF_MSG_UNMAP_FAILED, + MF_MSG_DIRTY_SWAPCACHE, + MF_MSG_CLEAN_SWAPCACHE, +@@ -41,7 +39,6 @@ enum mf_action_page_type { + MF_MSG_CLEAN_LRU, + MF_MSG_TRUNCATED_LRU, + MF_MSG_BUDDY, +- MF_MSG_BUDDY_2ND, + MF_MSG_DAX, + MF_MSG_UNSPLIT_THP, + MF_MSG_UNKNOWN, +@@ -64,10 +61,8 @@ static const struct { + { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"}, + { MF_MSG_SLAB, "kernel slab page"}, + { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"}, +- { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"}, + { MF_MSG_HUGE, "huge page"}, + { MF_MSG_FREE_HUGE, "free huge page"}, +- { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"}, + { MF_MSG_UNMAP_FAILED, "unmapping failed page"}, + { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"}, + { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"}, +@@ -79,7 +74,6 @@ static const struct { + { MF_MSG_CLEAN_LRU, "clean LRU page"}, + { MF_MSG_TRUNCATED_LRU, "already truncated LRU page"}, + { MF_MSG_BUDDY, "free buddy page"}, +- { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"}, + { MF_MSG_DAX, "dax page"}, + { MF_MSG_UNSPLIT_THP, "unsplit thp"}, + { MF_MSG_UNKNOWN, "unknown page"}, diff --git a/53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch b/53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch new file mode 100644 index 0000000..cb656cc --- /dev/null +++ b/53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch @@ -0,0 +1,551 @@ +commit 53c682fb45c2909c128be4ee8f51a3e42fe2f7fd +Author: Shiju Jose +Date: Wed Apr 5 11:54:41 2023 +0100 + + rasdaemon: Add support for the CXL general media events + + Add support to log and record the CXL general media events. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 83ada56..2de96f6 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -99,6 +99,14 @@ static char *uuid_be(const char *uu) + return uuid; + } + ++static const char* get_cxl_type_str(const char** type_array, uint8_t num_elems, uint8_t type) ++{ ++ if (type >= num_elems) ++ return "Unknown"; ++ ++ return type_array[type]; ++} ++ + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) + #define CXL_POISON_FLAG_OVERFLOW BIT(1) +@@ -709,3 +717,151 @@ int ras_cxl_generic_event_handler(struct trace_seq *s, + + return 0; + } ++ ++#define CXL_DPA_VOLATILE BIT(0) ++#define CXL_DPA_NOT_REPAIRABLE BIT(1) ++ ++static const struct cxl_event_flags cxl_dpa_flags[] = { ++ { .bit = CXL_DPA_VOLATILE, .flag = "VOLATILE" }, ++ { .bit = CXL_DPA_NOT_REPAIRABLE, .flag = "NOT_REPAIRABLE" }, ++}; ++ ++/* ++ * General Media Event Record - GMER ++ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 ++ */ ++#define CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT BIT(0) ++#define CXL_GMER_EVT_DESC_THRESHOLD_EVENT BIT(1) ++#define CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW BIT(2) ++ ++static const struct cxl_event_flags cxl_gmer_event_desc_flags[] = { ++ { .bit = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, .flag = "UNCORRECTABLE EVENT" }, ++ { .bit = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, .flag = "THRESHOLD EVENT" }, ++ { .bit = CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW, .flag = "POISON LIST OVERFLOW" }, ++}; ++ ++#define CXL_GMER_VALID_CHANNEL BIT(0) ++#define CXL_GMER_VALID_RANK BIT(1) ++#define CXL_GMER_VALID_DEVICE BIT(2) ++#define CXL_GMER_VALID_COMPONENT BIT(3) ++ ++static const char* cxl_gmer_mem_event_type[] = { ++ "ECC Error", ++ "Invalid Address", ++ "Data Path Error", ++}; ++ ++static const char* cxl_gmer_trans_type[] = { ++ "Unknown", ++ "Host Read", ++ "Host Write", ++ "Host Scan Media", ++ "Host Inject Poison", ++ "Internal Media Scrub", ++ "Internal Media Management", ++}; ++ ++int ras_cxl_general_media_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len, i; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ struct ras_cxl_general_media_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0) ++ return -1; ++ ev.dpa = val; ++ if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa_flags", record, &val, 1) < 0) ++ return -1; ++ ev.dpa_flags = val; ++ if (trace_seq_printf(s, "dpa_flags:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.dpa_flags, cxl_dpa_flags, ARRAY_SIZE(cxl_dpa_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "descriptor", record, &val, 1) < 0) ++ return -1; ++ ev.descriptor = val; ++ if (trace_seq_printf(s, "descriptor:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.descriptor, cxl_gmer_event_desc_flags, ++ ARRAY_SIZE(cxl_gmer_event_desc_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "type", record, &val, 1) < 0) ++ return -1; ++ ev.type = val; ++ if (trace_seq_printf(s, "type:%s ", get_cxl_type_str(cxl_gmer_mem_event_type, ++ ARRAY_SIZE(cxl_gmer_mem_event_type), ev.type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "transaction_type", record, &val, 1) < 0) ++ return -1; ++ ev.transaction_type = val; ++ if (trace_seq_printf(s, "transaction_type:%s ", ++ get_cxl_type_str(cxl_gmer_trans_type, ++ ARRAY_SIZE(cxl_gmer_trans_type), ++ ev.transaction_type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "validity_flags", record, &val, 1) < 0) ++ return -1; ++ ev.validity_flags = val; ++ ++ if (ev.validity_flags & CXL_GMER_VALID_CHANNEL) { ++ if (tep_get_field_val(s, event, "channel", record, &val, 1) < 0) ++ return -1; ++ ev.channel = val; ++ if (trace_seq_printf(s, "channel:%u ", ev.channel) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_GMER_VALID_RANK) { ++ if (tep_get_field_val(s, event, "rank", record, &val, 1) < 0) ++ return -1; ++ ev.rank = val; ++ if (trace_seq_printf(s, "rank:%u ", ev.rank) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_GMER_VALID_DEVICE) { ++ if (tep_get_field_val(s, event, "device", record, &val, 1) < 0) ++ return -1; ++ ev.device = val; ++ if (trace_seq_printf(s, "device:%x ", ev.device) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_GMER_VALID_COMPONENT) { ++ ev.comp_id = tep_get_field_raw(s, event, "comp_id", record, &len, 1); ++ if (!ev.comp_id) ++ return -1; ++ if (trace_seq_printf(s, "comp_id:") <= 0) ++ return -1; ++ for (i = 0; i < CXL_EVENT_GEN_MED_COMP_ID_SIZE; i++) { ++ if (trace_seq_printf(s, "%02x ", ev.comp_id[i]) <= 0) ++ break; ++ } ++ } ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_general_media_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_general_media_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 9f77cb7..3adca4a 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -35,4 +35,7 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s, + int ras_cxl_generic_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int ras_cxl_general_media_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index 4036933..978dee4 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -250,6 +250,7 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable); + #endif + + free_ras: +@@ -1063,6 +1064,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_generic_event"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_general_media", ++ ras_cxl_general_media_event_handler, NULL, CXL_GENERAL_MEDIA_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_general_media"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 96c299e..9b83df3 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -44,6 +44,7 @@ enum { + CXL_AER_CE_EVENT, + CXL_OVERFLOW_EVENT, + CXL_GENERIC_EVENT, ++ CXL_GENERAL_MEDIA_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index a65d9c0..507a58e 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -846,6 +846,75 @@ int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_e + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_general_media_event ++ */ ++static const struct db_fields cxl_general_media_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "hdr_uuid", .type = "TEXT" }, ++ { .name = "hdr_flags", .type = "INTEGER" }, ++ { .name = "hdr_handle", .type = "INTEGER" }, ++ { .name = "hdr_related_handle", .type = "INTEGER" }, ++ { .name = "hdr_ts", .type = "TEXT" }, ++ { .name = "hdr_length", .type = "INTEGER" }, ++ { .name = "hdr_maint_op_class", .type = "INTEGER" }, ++ { .name = "dpa", .type = "INTEGER" }, ++ { .name = "dpa_flags", .type = "INTEGER" }, ++ { .name = "descriptor", .type = "INTEGER" }, ++ { .name = "type", .type = "INTEGER" }, ++ { .name = "transaction_type", .type = "INTEGER" }, ++ { .name = "channel", .type = "INTEGER" }, ++ { .name = "rank", .type = "INTEGER" }, ++ { .name = "device", .type = "INTEGER" }, ++ { .name = "comp_id", .type = "BLOB" }, ++}; ++ ++static const struct db_table_descriptor cxl_general_media_event_tab = { ++ .name = "cxl_general_media_event", ++ .fields = cxl_general_media_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_general_media_event_fields), ++}; ++ ++int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_general_media_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_general_media_event store: %p\n", ++ priv->stmt_cxl_general_media_event); ++ ++ ras_store_cxl_common_hdr(priv->stmt_cxl_general_media_event, &ev->hdr); ++ sqlite3_bind_int64(priv->stmt_cxl_general_media_event, 13, ev->dpa); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 14, ev->dpa_flags); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 15, ev->descriptor); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 16, ev->type); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 17, ev->transaction_type); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 18, ev->channel); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 19, ev->rank); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 20, ev->device); ++ sqlite3_bind_blob(priv->stmt_cxl_general_media_event, 21, ev->comp_id, ++ CXL_EVENT_GEN_MED_COMP_ID_SIZE, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_general_media_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do stmt_cxl_general_media_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_general_media_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset stmt_cxl_general_media_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1229,6 +1298,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_general_media_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_general_media_event, ++ &cxl_general_media_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1390,6 +1467,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_generic_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_general_media_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_general_media_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_general_media_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index 9ecfcda..37c32de 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -134,6 +134,7 @@ struct ras_cxl_poison_event { + #define CXL_HEADERLOG_SIZE SZ_512 + #define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) + #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 ++#define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 + + struct ras_cxl_aer_ue_event { + char timestamp[64]; +@@ -184,6 +185,20 @@ struct ras_cxl_generic_event { + uint8_t *data; + }; + ++struct ras_cxl_general_media_event { ++ struct ras_cxl_event_common_hdr hdr; ++ uint64_t dpa; ++ uint8_t dpa_flags; ++ uint8_t descriptor; ++ uint8_t type; ++ uint8_t transaction_type; ++ uint8_t channel; ++ uint8_t rank; ++ uint32_t device; ++ uint8_t *comp_id; ++ uint16_t validity_flags; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -198,6 +213,7 @@ struct ras_cxl_aer_ue_event; + struct ras_cxl_aer_ce_event; + struct ras_cxl_overflow_event; + struct ras_cxl_generic_event; ++struct ras_cxl_general_media_event; + + #ifdef HAVE_SQLITE3 + +@@ -236,6 +252,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_aer_ce_event; + sqlite3_stmt *stmt_cxl_overflow_event; + sqlite3_stmt *stmt_cxl_generic_event; ++ sqlite3_stmt *stmt_cxl_general_media_event; + #endif + }; + +@@ -269,6 +286,7 @@ int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_eve + int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); ++int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -287,6 +305,7 @@ static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_ + static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; ++static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 8d7b76a..725dc9b 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -489,6 +489,60 @@ static int set_cxl_generic_event_backtrace(char *buf, struct ras_cxl_generic_eve + return 0; + } + ++static int set_cxl_general_media_event_backtrace(char *buf, struct ras_cxl_general_media_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "hdr_uuid=%s\n" \ ++ "hdr_flags=0x%x\n" \ ++ "hdr_handle=0x%x\n" \ ++ "hdr_related_handle=0x%x\n" \ ++ "hdr_timestamp=%s\n" \ ++ "hdr_length=%u\n" \ ++ "hdr_maint_op_class=%u\n" \ ++ "dpa=0x%lx\n" \ ++ "dpa_flags=%u\n" \ ++ "descriptor=%u\n" \ ++ "type=%u\n" \ ++ "transaction_type=%u\n" \ ++ "channel=%u\n" \ ++ "rank=%u\n" \ ++ "device=0x%x\n", \ ++ ev->hdr.timestamp, \ ++ ev->hdr.memdev, \ ++ ev->hdr.host, \ ++ ev->hdr.serial, \ ++ ev->hdr.log_type, \ ++ ev->hdr.hdr_uuid, \ ++ ev->hdr.hdr_flags, \ ++ ev->hdr.hdr_handle, \ ++ ev->hdr.hdr_related_handle, \ ++ ev->hdr.hdr_timestamp, \ ++ ev->hdr.hdr_length, \ ++ ev->hdr.hdr_maint_op_class, \ ++ ev->dpa, \ ++ ev->dpa_flags, \ ++ ev->descriptor, \ ++ ev->type, \ ++ ev->transaction_type, \ ++ ev->channel, \ ++ ev->rank, \ ++ ev->device); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -541,6 +595,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_GENERIC_EVENT: + rc = set_cxl_generic_event_backtrace(buf, (struct ras_cxl_generic_event *)ev); + break; ++ case CXL_GENERAL_MEDIA_EVENT: ++ rc = set_cxl_general_media_event_backtrace(buf, (struct ras_cxl_general_media_event *)ev); ++ break; + default: + return -1; + } +@@ -1170,3 +1227,47 @@ cxl_generic_fail: + return -1; + + } ++ ++int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_general_media_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_GENERAL_MEDIA_EVENT, ev); ++ if (rc < 0) ++ goto cxl_general_media_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_general_media_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_general_media_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL General Media Event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_general_media_fail; ++ ++ done = 1; ++ ++cxl_general_media_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index bf591a6..d9ec7df 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -44,6 +44,7 @@ int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_ev + int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); ++int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); + + #else + +@@ -60,6 +61,7 @@ static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras + static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; ++static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; + + #endif + diff --git a/572de9d57691be9e630abee9ffa56a2fb155d558.patch b/572de9d57691be9e630abee9ffa56a2fb155d558.patch new file mode 100644 index 0000000..4a89c04 --- /dev/null +++ b/572de9d57691be9e630abee9ffa56a2fb155d558.patch @@ -0,0 +1,182 @@ +commit dea649c9f9a6f2941e80cade9ed311a398e267be +Author: Shiju Jose +Date: Mon Feb 12 11:14:03 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL general media trace events + + Add support for CXL general media events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit 572de9d57691be9e630abee9ffa56a2fb155d558) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 5528021..99b3c10 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1298,6 +1298,84 @@ sub get_cxl_hdr_flags_text + return join (", ", @out); + } + ++use constant { ++ CXL_DPA_VOLATILE => 0x0001, ++ CXL_DPA_NOT_REPAIRABLE => 0x0002, ++}; ++ ++sub get_cxl_dpa_flags_text ++{ ++ my $flags = $_[0]; ++ my @out; ++ ++ if ($flags & CXL_DPA_VOLATILE) { ++ push @out, (sprintf "\'VOLATILE\' "); ++ } ++ if ($flags & CXL_DPA_NOT_REPAIRABLE) { ++ push @out, (sprintf "\'NOT_REPAIRABLE\' "); ++ } ++ ++ return join (", ", @out); ++} ++ ++use constant { ++ CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT => 0x0001, ++ CXL_GMER_EVT_DESC_THRESHOLD_EVENT => 0x0002, ++ CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW => 0x0004, ++}; ++ ++sub get_cxl_descriptor_flags_text ++{ ++ my $flags = $_[0]; ++ my @out; ++ ++ if ($flags & CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT) { ++ push @out, (sprintf "\'UNCORRECTABLE EVENT\' "); ++ } ++ if ($flags & CXL_GMER_EVT_DESC_THRESHOLD_EVENT) { ++ push @out, (sprintf "\'THRESHOLD EVENT\' "); ++ } ++ if ($flags & CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW) { ++ push @out, (sprintf "\'POISON LIST OVERFLOW\' "); ++ } ++ ++ return join (", ", @out); ++} ++ ++sub get_cxl_mem_event_type ++{ ++ my @types; ++ ++ if ($_[0] < 0 || $_[0] > 2) { ++ return "unknown-type"; ++ } ++ ++ @types = ("ECC Error", ++ "Invalid Address", ++ "Data Path Error"); ++ ++ return $types[$_[0]]; ++} ++ ++sub get_cxl_transaction_type ++{ ++ my @types; ++ ++ if ($_[0] < 0 || $_[0] > 6) { ++ return "unknown-type"; ++ } ++ ++ @types = ("Unknown", ++ "Host Read", ++ "Host Write", ++ "Host Scan Media", ++ "Host Inject Poison", ++ "Internal Media Scrub", ++ "Internal Media Management"); ++ ++ return $types[$_[0]]; ++} ++ + sub summary + { + require DBI; +@@ -1442,6 +1520,22 @@ sub summary + print "No CXL generic errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL general media errors ++ $query = "select memdev, count(*) from cxl_general_media_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL general media events summary:\n$out\n"; ++ } else { ++ print "No CXL general media errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1553,6 +1647,7 @@ sub errors + my ($log_type, $first_ts, $last_ts); + my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts); + my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data); ++ my ($dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1764,6 +1859,49 @@ sub errors + } else { + print "No CXL generic errors.\n\n"; + } ++ ++ # CXL general media errors ++ use constant CXL_EVENT_GEN_MED_COMP_ID_SIZE => 0x10; ++ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, dpa, dpa_flags, descriptor, type, transaction_type, channel, rank, device, comp_id from cxl_general_media_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $dpa, $dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "log=$log_type, " if (defined $log_type && length $log_type); ++ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid); ++ $out .= sprintf "hdr_flags=0x%llx %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags); ++ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle); ++ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle); ++ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts); ++ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length); ++ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class); ++ $out .= sprintf "dpa=0x%llx, ", $dpa if (defined $dpa && length $dpa); ++ $out .= sprintf "dpa_flags: %s, ", get_cxl_dpa_flags_text($dpa_flags) if (defined $dpa_flags && length $dpa_flags); ++ $out .= sprintf "descriptor_flags: %s, ", get_cxl_descriptor_flags_text($descriptor) if (defined $descriptor && length $descriptor); ++ $out .= sprintf "memory event type: %s, ", get_cxl_mem_event_type($mem_event_type) if (defined $mem_event_type && length $mem_event_type); ++ $out .= sprintf "transaction_type: %s, ", get_cxl_transaction_type($transaction_type) if (defined $transaction_type && length $transaction_type); ++ $out .= sprintf "channel=%u, ", $channel if (defined $channel && length $channel); ++ $out .= sprintf "rank=%u, ", $rank if (defined $rank && length $rank); ++ $out .= sprintf "device=0x%x, ", $device if (defined $device && length $device); ++ if (defined $comp_id && length $comp_id) { ++ $out .= sprintf "component_id:"; ++ my @bytes = unpack "C*", $comp_id; ++ for (my $i = 0; $i < CXL_EVENT_GEN_MED_COMP_ID_SIZE; $i++) { ++ $out .= sprintf "%02x ", $bytes[$i]; ++ } ++ } ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL general media events:\n$out\n"; ++ } else { ++ print "No CXL general media errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/75c8fec559641f843345ef8fbc36d124b60b914d.patch b/75c8fec559641f843345ef8fbc36d124b60b914d.patch new file mode 100644 index 0000000..cd0aca4 --- /dev/null +++ b/75c8fec559641f843345ef8fbc36d124b60b914d.patch @@ -0,0 +1,663 @@ +commit 75c8fec559641f843345ef8fbc36d124b60b914d +Author: Shiju Jose +Date: Fri Mar 31 13:35:13 2023 +0100 + + rasdaemon: Add support for the CXL poison events + + Add support to log and record the CXL poison events. + + The corresponding Kernel patches here: + https://lore.kernel.org/linux-cxl/64457d30bae07_2028294ac@dwillia2-xfh.jf.intel.com.notmuch/ + + Presently for logging only, could be extended for the policy + based recovery action for the frequent poison events depending on the above + kernel patches. + + Signed-off-by: Shiju Jose + Reviewed-by: Jonathan Cameron + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/Makefile.am b/Makefile.am +index 56c144e..5bddeac 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -73,6 +73,11 @@ endif + if WITH_CPU_FAULT_ISOLATION + rasdaemon_SOURCES += ras-cpu-isolation.c queue.c + endif ++ ++if WITH_CXL ++ rasdaemon_SOURCES += ras-cxl-handler.c ++endif ++ + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) + +@@ -81,7 +86,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ +- ras-cpu-isolation.h queue.h ++ ras-cxl-handler.h ras-cpu-isolation.h queue.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index f588090..ab5697d 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -127,6 +127,16 @@ AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" = "xyes"], + AM_CONDITIONAL([WITH_MEMORY_FAILURE], [test x$enable_memory_failure = xyes || test x$enable_all = xyes]) + AM_COND_IF([WITH_MEMORY_FAILURE], [USE_MEMORY_FAILURE="yes"], [USE_MEMORY_FAILURE="no"]) + ++AC_ARG_ENABLE([cxl], ++ AS_HELP_STRING([--enable-cxl], [enable CXL events (currently experimental)])) ++ ++AS_IF([test "x$enable_cxl" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_CXL,1,"have CXL events collect") ++ AC_SUBST([WITH_CXL]) ++]) ++AM_CONDITIONAL([WITH_CXL], [test x$enable_cxl = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_CXL], [USE_CXL="yes"], [USE_CXL="no"]) ++ + AC_ARG_ENABLE([abrt_report], + AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)])) + +@@ -215,6 +225,7 @@ compile time options summary + DEVLINK : $USE_DEVLINK + Disk I/O errors : $USE_DISKERROR + Memory Failure : $USE_MEMORY_FAILURE ++ CXL events : $USE_CXL + Memory CE PFA : $USE_MEMORY_CE_PFA + AMP RAS errors : $USE_AMP_NS_DECODE + CPU fault isolation : $USE_CPU_FAULT_ISOLATION +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +new file mode 100644 +index 0000000..cb23ba2 +--- /dev/null ++++ b/ras-cxl-handler.c +@@ -0,0 +1,202 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include "ras-cxl-handler.h" ++#include "ras-record.h" ++#include "ras-logger.h" ++#include "ras-report.h" ++ ++/* Poison List: Payload out flags */ ++#define CXL_POISON_FLAG_MORE BIT(0) ++#define CXL_POISON_FLAG_OVERFLOW BIT(1) ++#define CXL_POISON_FLAG_SCANNING BIT(2) ++ ++/* CXL poison - source types */ ++enum cxl_poison_source { ++ CXL_POISON_SOURCE_UNKNOWN = 0, ++ CXL_POISON_SOURCE_EXTERNAL = 1, ++ CXL_POISON_SOURCE_INTERNAL = 2, ++ CXL_POISON_SOURCE_INJECTED = 3, ++ CXL_POISON_SOURCE_VENDOR = 7, ++}; ++ ++/* CXL poison - trace types */ ++enum cxl_poison_trace_type { ++ CXL_POISON_TRACE_LIST, ++ CXL_POISON_TRACE_INJECT, ++ CXL_POISON_TRACE_CLEAR, ++}; ++ ++int ras_cxl_poison_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ time_t now; ++ struct tm *tm; ++ struct ras_cxl_poison_event ev; ++ ++ now = record->ts / user_hz + ras->uptime_diff; ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ else ++ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) ++ return -1; ++ ++ ev.memdev = tep_get_field_raw(s, event, "memdev", ++ record, &len, 1); ++ if (!ev.memdev) ++ return -1; ++ if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0) ++ return -1; ++ ++ ev.host = tep_get_field_raw(s, event, "host", ++ record, &len, 1); ++ if (!ev.host) ++ return -1; ++ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) ++ return -1; ++ ev.serial = val; ++ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "trace_type", record, &val, 1) < 0) ++ return -1; ++ switch (val) { ++ case CXL_POISON_TRACE_LIST: ++ ev.trace_type = "List"; ++ break; ++ case CXL_POISON_TRACE_INJECT: ++ ev.trace_type = "Inject"; ++ break; ++ case CXL_POISON_TRACE_CLEAR: ++ ev.trace_type = "Clear"; ++ break; ++ default: ++ ev.trace_type = "Invalid"; ++ } ++ if (trace_seq_printf(s, "trace_type:%s ", ev.trace_type) <= 0) ++ return -1; ++ ++ ev.region = tep_get_field_raw(s, event, "region", ++ record, &len, 1); ++ if (!ev.region) ++ return -1; ++ if (trace_seq_printf(s, "region:%s ", ev.region) <= 0) ++ return -1; ++ ++ ev.uuid = tep_get_field_raw(s, event, "uuid", ++ record, &len, 1); ++ if (!ev.uuid) ++ return -1; ++ if (trace_seq_printf(s, "region_uuid:%s ", ev.uuid) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hpa", record, &val, 1) < 0) ++ return -1; ++ ev.hpa = val; ++ if (trace_seq_printf(s, "poison list: hpa:0x%llx ", (unsigned long long)ev.hpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0) ++ return -1; ++ ev.dpa = val; ++ if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa_length", record, &val, 1) < 0) ++ return -1; ++ ev.dpa_length = val; ++ if (trace_seq_printf(s, "dpa_length:0x%x ", ev.dpa_length) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "source", record, &val, 1) < 0) ++ return -1; ++ switch (val) { ++ case CXL_POISON_SOURCE_UNKNOWN: ++ ev.source = "Unknown"; ++ break; ++ case CXL_POISON_SOURCE_EXTERNAL: ++ ev.source = "External"; ++ break; ++ case CXL_POISON_SOURCE_INTERNAL: ++ ev.source = "Internal"; ++ break; ++ case CXL_POISON_SOURCE_INJECTED: ++ ev.source = "Injected"; ++ break; ++ case CXL_POISON_SOURCE_VENDOR: ++ ev.source = "Vendor"; ++ break; ++ default: ++ ev.source = "Invalid"; ++ } ++ if (trace_seq_printf(s, "source:%s ", ev.source) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "flags", record, &val, 1) < 0) ++ return -1; ++ ev.flags = val; ++ if (trace_seq_printf(s, "flags:%d ", ev.flags) <= 0) ++ return -1; ++ ++ if (ev.flags & CXL_POISON_FLAG_OVERFLOW) { ++ if (tep_get_field_val(s, event, "overflow_ts", record, &val, 1) < 0) ++ return -1; ++ if (val) { ++ /* CXL Specification 3.0 ++ * Overflow timestamp - The number of unsigned nanoseconds ++ * that have elapsed since midnight, 01-Jan-1970 UTC ++ */ ++ time_t ovf_ts_secs = val / 1000000000ULL; ++ ++ tm = localtime(&ovf_ts_secs); ++ if (tm) { ++ strftime(ev.overflow_ts, sizeof(ev.overflow_ts), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ } ++ } ++ if (!val || !tm) ++ strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", ++ sizeof(ev.overflow_ts)); ++ } else ++ strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", sizeof(ev.overflow_ts)); ++ if (trace_seq_printf(s, "overflow timestamp:%s\n", ev.overflow_ts) <= 0) ++ return -1; ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_poison_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_poison_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +new file mode 100644 +index 0000000..84d5cc6 +--- /dev/null ++++ b/ras-cxl-handler.h +@@ -0,0 +1,24 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ */ ++ ++#ifndef __RAS_CXL_HANDLER_H ++#define __RAS_CXL_HANDLER_H ++ ++#include "ras-events.h" ++#include ++ ++int ras_cxl_poison_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); ++#endif +diff --git a/ras-events.c b/ras-events.c +index 5fe8e19..f95844a 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -40,6 +40,7 @@ + #include "ras-devlink-handler.h" + #include "ras-diskerror-handler.h" + #include "ras-memory-failure-handler.h" ++#include "ras-cxl-handler.h" + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-page-isolation.h" +@@ -243,6 +244,10 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable); + #endif + ++#ifdef HAVE_CXL ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable); ++#endif ++ + free_ras: + free(ras); + return rc; +@@ -979,6 +984,16 @@ int handle_ras_events(int record_events) + "ras", "memory_failure_event"); + #endif + ++#ifdef HAVE_CXL ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_poison", ++ ras_cxl_poison_event_handler, NULL, CXL_POISON_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_poison"); ++#endif ++ + if (!num_events) { + log(ALL, LOG_INFO, + "Failed to trace all supported RAS events. Aborting.\n"); +diff --git a/ras-events.h b/ras-events.h +index 649b0c0..1ef3ecd 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -39,6 +39,7 @@ enum { + DEVLINK_EVENT, + DISKERROR_EVENT, + MF_EVENT, ++ CXL_POISON_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index adc97a4..c31baa0 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -559,6 +559,71 @@ int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) + } + #endif + ++#ifdef HAVE_CXL ++/* ++ * Table and functions to handle cxl:cxl_poison ++ */ ++static const struct db_fields cxl_poison_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "trace_type", .type = "TEXT" }, ++ { .name = "region", .type = "TEXT" }, ++ { .name = "region_uuid", .type = "TEXT" }, ++ { .name = "hpa", .type = "INTEGER" }, ++ { .name = "dpa", .type = "INTEGER" }, ++ { .name = "dpa_length", .type = "INTEGER" }, ++ { .name = "source", .type = "TEXT" }, ++ { .name = "flags", .type = "INTEGER" }, ++ { .name = "overflow_ts", .type = "TEXT" }, ++}; ++ ++static const struct db_table_descriptor cxl_poison_event_tab = { ++ .name = "cxl_poison_event", ++ .fields = cxl_poison_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_poison_event_fields), ++}; ++ ++int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_poison_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_poison_event store: %p\n", priv->stmt_cxl_poison_event); ++ ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 2, ev->memdev, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 3, ev->host, -1, NULL); ++ sqlite3_bind_int64(priv->stmt_cxl_poison_event, 4, ev->serial); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 5, ev->trace_type, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 6, ev->region, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 7, ev->uuid, -1, NULL); ++ sqlite3_bind_int64(priv->stmt_cxl_poison_event, 8, ev->hpa); ++ sqlite3_bind_int64(priv->stmt_cxl_poison_event, 9, ev->dpa); ++ sqlite3_bind_int(priv->stmt_cxl_poison_event, 10, ev->dpa_length); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 11, ev->source, -1, NULL); ++ sqlite3_bind_int(priv->stmt_cxl_poison_event, 12, ev->flags); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 13, ev->overflow_ts, -1, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_poison_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do cxl_poison_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_poison_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset cxl_poison_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} ++#endif ++ + /* + * Generic code + */ +@@ -900,6 +965,16 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + } + #endif + ++#ifdef HAVE_CXL ++ rc = ras_mc_create_table(priv, &cxl_poison_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_poison_event, ++ &cxl_poison_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } ++#endif ++ + ras->db_priv = priv; + return 0; + +@@ -1019,6 +1094,16 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + } + #endif + ++#ifdef HAVE_CXL ++ if (priv->stmt_cxl_poison_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_poison_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_poison_event sqlite: error = %d\n", ++ cpu, rc); ++ } ++#endif ++ + rc = sqlite3_close_v2(db); + if (rc != SQLITE_OK) + log(TERM, LOG_ERR, +diff --git a/ras-record.h b/ras-record.h +index 219f10b..fd15215 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -114,6 +114,22 @@ struct ras_mf_event { + const char *action_result; + }; + ++struct ras_cxl_poison_event { ++ char timestamp[64]; ++ const char *memdev; ++ const char *host; ++ uint64_t serial; ++ const char *trace_type; ++ const char *region; ++ const char *uuid; ++ uint64_t hpa; ++ uint64_t dpa; ++ uint32_t dpa_length; ++ const char *source; ++ uint8_t flags; ++ char overflow_ts[64]; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -123,6 +139,7 @@ struct mce_event; + struct devlink_event; + struct diskerror_event; + struct ras_mf_event; ++struct ras_cxl_poison_event; + + #ifdef HAVE_SQLITE3 + +@@ -155,6 +172,9 @@ struct sqlite3_priv { + #ifdef HAVE_MEMORY_FAILURE + sqlite3_stmt *stmt_mf_event; + #endif ++#ifdef HAVE_CXL ++ sqlite3_stmt *stmt_cxl_poison_event; ++#endif + }; + + struct db_fields { +@@ -182,6 +202,7 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev); + int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev); + int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev); + int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); ++int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -195,6 +216,7 @@ static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_ev + static inline int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; }; + static inline int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; }; + static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; ++static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 62d5eb7..3daecc0 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -331,6 +331,46 @@ static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev) + return 0; + } + ++static int set_cxl_poison_event_backtrace(char *buf, struct ras_cxl_poison_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "trace_type=%s\n" \ ++ "region=%s\n" \ ++ "region_uuid=%s\n" \ ++ "hpa=0x%lx\n" \ ++ "dpa=0x%lx\n" \ ++ "dpa_length=0x%x\n" \ ++ "source=%s\n" \ ++ "flags=%u\n" \ ++ "overflow_timestamp=%s\n", \ ++ ev->timestamp, \ ++ ev->memdev, \ ++ ev->host, \ ++ ev->serial, \ ++ ev->trace_type, \ ++ ev->region, \ ++ ev->uuid, \ ++ ev->hpa, \ ++ ev->dpa, \ ++ ev->dpa_length, \ ++ ev->source, \ ++ ev->flags, \ ++ ev->overflow_ts); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -368,6 +408,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case MF_EVENT: + rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev); + break; ++ case CXL_POISON_EVENT: ++ rc = set_cxl_poison_event_backtrace(buf, (struct ras_cxl_poison_event *)ev); ++ break; + default: + return -1; + } +@@ -776,3 +819,47 @@ mf_fail: + else + return -1; + } ++ ++int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_poison_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_POISON_EVENT, ev); ++ if (rc < 0) ++ goto cxl_poison_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl-poison"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_poison_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL poison"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_poison_fail; ++ ++ done = 1; ++ ++cxl_poison_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index e605eb1..d1591ce 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -39,6 +39,7 @@ int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev); + int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev); + int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev); + int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); ++int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + + #else + +@@ -50,6 +51,7 @@ static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_ev + static inline int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; }; + static inline int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; }; + static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; ++static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + + #endif + diff --git a/7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch b/7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch new file mode 100644 index 0000000..b6092db --- /dev/null +++ b/7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch @@ -0,0 +1,97 @@ +commit 7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513 +Author: Shiju Jose +Date: Tue Apr 4 16:07:21 2023 +0100 + + rasdaemon: Add common function to get timestamp for the event + + Add common function to get the timestamp for the event + reported. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 59534a4..d540ebb 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -42,6 +42,20 @@ static void convert_timestamp(unsigned long long ts, char *ts_ptr, uint16_t size + size); + } + ++static void get_timestamp(struct trace_seq *s, struct tep_record *record, ++ struct ras_events *ras, char *ts_ptr, uint16_t size) ++{ ++ time_t now; ++ struct tm *tm; ++ ++ now = record->ts / user_hz + ras->uptime_diff; ++ tm = localtime(&now); ++ if (tm) ++ strftime(ts_ptr, size, "%Y-%m-%d %H:%M:%S %z", tm); ++ else ++ strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", size); ++} ++ + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) + #define CXL_POISON_FLAG_OVERFLOW BIT(1) +@@ -70,17 +84,9 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + int len; + unsigned long long val; + struct ras_events *ras = context; +- time_t now; +- struct tm *tm; + struct ras_cxl_poison_event ev; + +- now = record->ts / user_hz + ras->uptime_diff; +- tm = localtime(&now); +- if (tm) +- strftime(ev.timestamp, sizeof(ev.timestamp), +- "%Y-%m-%d %H:%M:%S %z", tm); +- else +- strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; + +@@ -285,19 +291,11 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s, + { + int len, i; + unsigned long long val; +- time_t now; +- struct tm *tm; + struct ras_events *ras = context; + struct ras_cxl_aer_ue_event ev; + + memset(&ev, 0, sizeof(ev)); +- now = record->ts / user_hz + ras->uptime_diff; +- tm = localtime(&now); +- if (tm) +- strftime(ev.timestamp, sizeof(ev.timestamp), +- "%Y-%m-%d %H:%M:%S %z", tm); +- else +- strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; + +@@ -380,18 +378,10 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, + { + int len; + unsigned long long val; +- time_t now; +- struct tm *tm; + struct ras_events *ras = context; + struct ras_cxl_aer_ce_event ev; + +- now = record->ts / user_hz + ras->uptime_diff; +- tm = localtime(&now); +- if (tm) +- strftime(ev.timestamp, sizeof(ev.timestamp), +- "%Y-%m-%d %H:%M:%S %z", tm); +- else +- strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; + diff --git a/8f79833e3d78424f4a594985fbeb91890f4af81c.patch b/8f79833e3d78424f4a594985fbeb91890f4af81c.patch new file mode 100644 index 0000000..b509270 --- /dev/null +++ b/8f79833e3d78424f4a594985fbeb91890f4af81c.patch @@ -0,0 +1,78 @@ +commit 8f79833e3d78424f4a594985fbeb91890f4af81c +Author: Shiju Jose +Date: Mon Mar 4 11:49:50 2024 +0000 + + rasdaemon: Fix build warnings unused variable if AMP RAS errors is not enabled + + This patch fixes following build warnings unused variable if AMP RAS errors + is not enabled(--enable-amp-ns-decode). + + ================================================== + ras-aer-handler.c: In function ‘ras_aer_event_handler’: + ras-aer-handler.c:72:21: warning: unused variable ‘fn’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~ + ras-aer-handler.c:72:16: warning: unused variable ‘dev’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~~ + ras-aer-handler.c:72:11: warning: unused variable ‘bus’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~~ + ras-aer-handler.c:72:6: warning: unused variable ‘seg’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~~ + ras-aer-handler.c:71:10: warning: variable ‘sel_data’ set but not used [-Wunused-but-set-variable] + uint8_t sel_data[5]; + ^~~~~~~~ + ras-aer-handler.c:70:7: warning: unused variable ‘ipmi_add_sel’ [-Wunused-variable] + char ipmi_add_sel[105]; + ^~~~~~~~~~~~ + ================================================== + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index bb1a6f6..29f6551 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -67,9 +67,11 @@ int ras_aer_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_aer_event ev; + char buf[BUF_LEN]; ++#ifdef HAVE_AMP_NS_DECODE + char ipmi_add_sel[105]; + uint8_t sel_data[5]; + int seg, bus, dev, fn; ++#endif + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. +@@ -132,19 +134,27 @@ int ras_aer_event_handler(struct trace_seq *s, + switch (severity_val) { + case HW_EVENT_AER_UNCORRECTED_NON_FATAL: + ev.error_type = "Uncorrected (Non-Fatal)"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xca; ++#endif + break; + case HW_EVENT_AER_UNCORRECTED_FATAL: + ev.error_type = "Uncorrected (Fatal)"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xca; ++#endif + break; + case HW_EVENT_AER_CORRECTED: + ev.error_type = "Corrected"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xbf; ++#endif + break; + default: + ev.error_type = "Unknown severity"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xbf; ++#endif + } + trace_seq_puts(s, ev.error_type); + diff --git a/93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch b/93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch new file mode 100644 index 0000000..4952349 --- /dev/null +++ b/93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch @@ -0,0 +1,82 @@ +commit b6506f22fb2d7f44d9d633d44656dff2a94f257e +Author: Shiju Jose +Date: Mon Feb 12 10:49:10 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL poison trace events + + Add support for CXL poison events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit 93ca96b66c917af37b2ae9295dc5df46a7d64dd2) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 6a319a7..16b0589 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1382,6 +1382,22 @@ sub summary + print "No CXL overflow errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL poison errors ++ $query = "select memdev, count(*) from cxl_poison_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL poison events summary:\n$out\n"; ++ } else { ++ print "No CXL poison errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1491,6 +1507,7 @@ sub errors + my ($pfn, $page_type, $action_result); + my ($memdev, $host, $serial, $error_status, $first_error, $header_log); + my ($log_type, $first_ts, $last_ts); ++ my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1636,6 +1653,34 @@ sub errors + } else { + print "No CXL overflow errors.\n\n"; + } ++ ++ # CXL poison errors ++ $query = "select id, timestamp, memdev, host, serial, trace_type, region, region_uuid, hpa, dpa, dpa_length, source, flags, overflow_ts from cxl_poison_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "trace_type=$trace_type, " if (defined $trace_type && length $trace_type); ++ $out .= "region=$region, " if (defined $region && length $region); ++ $out .= "region_uuid=$region_uuid, " if (defined $region_uuid && length $region_uuid); ++ $out .= sprintf "hpa=0x%llx, ", $hpa if (defined $hpa && length $hpa); ++ $out .= sprintf "dpa=0x%llx, ", $dpa if (defined $dpa && length $dpa); ++ $out .= sprintf "dpa_length=0x%x, ", $dpa_length if (defined $dpa_length && length $dpa_length); ++ $out .= "source=$source, " if (defined $source && length $source); ++ $out .= sprintf "flags=%d, ", $flags if (defined $flags && length $flags); ++ $out .= "overflow timestamp=$overflow_ts " if (defined $overflow_ts && length $overflow_ts); ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL poison events:\n$out\n"; ++ } else { ++ print "No CXL poison errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/9a2f6186db2622788f8868d8ec082684d6a06d4d.patch b/9a2f6186db2622788f8868d8ec082684d6a06d4d.patch new file mode 100644 index 0000000..c85f54e --- /dev/null +++ b/9a2f6186db2622788f8868d8ec082684d6a06d4d.patch @@ -0,0 +1,559 @@ +commit 9a2f6186db2622788f8868d8ec082684d6a06d4d +Author: Shiju Jose +Date: Wed Apr 5 13:28:20 2023 +0100 + + rasdaemon: Add support for the CXL dram events + + Add support to log and record the CXL dram events. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 2de96f6..64b0b50 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -865,3 +865,154 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s, + + return 0; + } ++ ++/* ++ * DRAM Event Record - DER ++ * ++ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 ++ */ ++#define CXL_DER_VALID_CHANNEL BIT(0) ++#define CXL_DER_VALID_RANK BIT(1) ++#define CXL_DER_VALID_NIBBLE BIT(2) ++#define CXL_DER_VALID_BANK_GROUP BIT(3) ++#define CXL_DER_VALID_BANK BIT(4) ++#define CXL_DER_VALID_ROW BIT(5) ++#define CXL_DER_VALID_COLUMN BIT(6) ++#define CXL_DER_VALID_CORRECTION_MASK BIT(7) ++ ++int ras_cxl_dram_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len, i; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ struct ras_cxl_dram_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0) ++ return -1; ++ ev.dpa = val; ++ if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa_flags", record, &val, 1) < 0) ++ return -1; ++ ev.dpa_flags = val; ++ if (trace_seq_printf(s, "dpa_flags:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.dpa_flags, cxl_dpa_flags, ARRAY_SIZE(cxl_dpa_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "descriptor", record, &val, 1) < 0) ++ return -1; ++ ev.descriptor = val; ++ if (trace_seq_printf(s, "descriptor:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.descriptor, cxl_gmer_event_desc_flags, ++ ARRAY_SIZE(cxl_gmer_event_desc_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "type", record, &val, 1) < 0) ++ return -1; ++ ev.type = val; ++ if (trace_seq_printf(s, "type:%s ", get_cxl_type_str(cxl_gmer_mem_event_type, ++ ARRAY_SIZE(cxl_gmer_mem_event_type), ev.type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "transaction_type", record, &val, 1) < 0) ++ return -1; ++ ev.transaction_type = val; ++ if (trace_seq_printf(s, "transaction_type:%s ", ++ get_cxl_type_str(cxl_gmer_trans_type, ++ ARRAY_SIZE(cxl_gmer_trans_type), ++ ev.transaction_type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "validity_flags", record, &val, 1) < 0) ++ return -1; ++ ev.validity_flags = val; ++ ++ if (ev.validity_flags & CXL_DER_VALID_CHANNEL) { ++ if (tep_get_field_val(s, event, "channel", record, &val, 1) < 0) ++ return -1; ++ ev.channel = val; ++ if (trace_seq_printf(s, "channel:%u ", ev.channel) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_RANK) { ++ if (tep_get_field_val(s, event, "rank", record, &val, 1) < 0) ++ return -1; ++ ev.rank = val; ++ if (trace_seq_printf(s, "rank:%u ", ev.rank) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_NIBBLE) { ++ if (tep_get_field_val(s, event, "nibble_mask", record, &val, 1) < 0) ++ return -1; ++ ev.nibble_mask = val; ++ if (trace_seq_printf(s, "nibble_mask:%u ", ev.nibble_mask) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_BANK_GROUP) { ++ if (tep_get_field_val(s, event, "bank_group", record, &val, 1) < 0) ++ return -1; ++ ev.bank_group = val; ++ if (trace_seq_printf(s, "bank_group:%u ", ev.bank_group) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_BANK) { ++ if (tep_get_field_val(s, event, "bank", record, &val, 1) < 0) ++ return -1; ++ ev.bank = val; ++ if (trace_seq_printf(s, "bank:%u ", ev.bank) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_ROW) { ++ if (tep_get_field_val(s, event, "row", record, &val, 1) < 0) ++ return -1; ++ ev.row = val; ++ if (trace_seq_printf(s, "row:%u ", ev.row) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_COLUMN) { ++ if (tep_get_field_val(s, event, "column", record, &val, 1) < 0) ++ return -1; ++ ev.column = val; ++ if (trace_seq_printf(s, "column:%u ", ev.column) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_CORRECTION_MASK) { ++ ev.cor_mask = tep_get_field_raw(s, event, "cor_mask", record, &len, 1); ++ if (!ev.cor_mask) ++ return -1; ++ if (trace_seq_printf(s, "correction_mask:") <= 0) ++ return -1; ++ for (i = 0; i < CXL_EVENT_DER_CORRECTION_MASK_SIZE; i++) { ++ if (trace_seq_printf(s, "%02x ", ev.cor_mask[i]) <= 0) ++ break; ++ } ++ } ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_dram_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_dram_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 3adca4a..35455af 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -38,4 +38,7 @@ int ras_cxl_generic_event_handler(struct trace_seq *s, + int ras_cxl_general_media_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int ras_cxl_dram_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index 978dee4..d27e0c4 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -251,6 +251,7 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_dram", enable); + #endif + + free_ras: +@@ -1072,6 +1073,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_general_media"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_dram", ++ ras_cxl_dram_event_handler, NULL, CXL_DRAM_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_dram"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 9b83df3..d192a6b 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -45,6 +45,7 @@ enum { + CXL_OVERFLOW_EVENT, + CXL_GENERIC_EVENT, + CXL_GENERAL_MEDIA_EVENT, ++ CXL_DRAM_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index 507a58e..fffa81c 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -915,6 +915,83 @@ int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_gen + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_dram_event ++ */ ++static const struct db_fields cxl_dram_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "hdr_uuid", .type = "TEXT" }, ++ { .name = "hdr_flags", .type = "INTEGER" }, ++ { .name = "hdr_handle", .type = "INTEGER" }, ++ { .name = "hdr_related_handle", .type = "INTEGER" }, ++ { .name = "hdr_ts", .type = "TEXT" }, ++ { .name = "hdr_length", .type = "INTEGER" }, ++ { .name = "hdr_maint_op_class", .type = "INTEGER" }, ++ { .name = "dpa", .type = "INTEGER" }, ++ { .name = "dpa_flags", .type = "INTEGER" }, ++ { .name = "descriptor", .type = "INTEGER" }, ++ { .name = "type", .type = "INTEGER" }, ++ { .name = "transaction_type", .type = "INTEGER" }, ++ { .name = "channel", .type = "INTEGER" }, ++ { .name = "rank", .type = "INTEGER" }, ++ { .name = "nibble_mask", .type = "INTEGER" }, ++ { .name = "bank_group", .type = "INTEGER" }, ++ { .name = "bank", .type = "INTEGER" }, ++ { .name = "row", .type = "INTEGER" }, ++ { .name = "column", .type = "INTEGER" }, ++ { .name = "cor_mask", .type = "BLOB" }, ++}; ++ ++static const struct db_table_descriptor cxl_dram_event_tab = { ++ .name = "cxl_dram_event", ++ .fields = cxl_dram_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_dram_event_fields), ++}; ++ ++int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_dram_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_dram_event store: %p\n", ++ priv->stmt_cxl_dram_event); ++ ++ ras_store_cxl_common_hdr(priv->stmt_cxl_dram_event, &ev->hdr); ++ sqlite3_bind_int64(priv->stmt_cxl_dram_event, 13, ev->dpa); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 14, ev->dpa_flags); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 15, ev->descriptor); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 16, ev->type); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 17, ev->transaction_type); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 18, ev->channel); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 19, ev->rank); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 20, ev->nibble_mask); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 21, ev->bank_group); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 22, ev->bank); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 23, ev->row); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 24, ev->column); ++ sqlite3_bind_blob(priv->stmt_cxl_dram_event, 25, ev->cor_mask, ++ CXL_EVENT_DER_CORRECTION_MASK_SIZE, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_dram_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do stmt_cxl_dram_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_dram_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset stmt_cxl_dram_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1306,6 +1383,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_dram_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_dram_event, ++ &cxl_dram_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1475,6 +1560,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_general_media_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_dram_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_dram_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_dram_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index 37c32de..480ff92 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -135,6 +135,7 @@ struct ras_cxl_poison_event { + #define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) + #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 + #define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 ++#define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 + + struct ras_cxl_aer_ue_event { + char timestamp[64]; +@@ -199,6 +200,24 @@ struct ras_cxl_general_media_event { + uint16_t validity_flags; + }; + ++struct ras_cxl_dram_event { ++ struct ras_cxl_event_common_hdr hdr; ++ uint64_t dpa; ++ uint8_t dpa_flags; ++ uint8_t descriptor; ++ uint8_t type; ++ uint8_t transaction_type; ++ uint8_t channel; ++ uint8_t rank; ++ uint32_t nibble_mask; ++ uint8_t bank_group; ++ uint8_t bank; ++ uint32_t row; ++ uint16_t column; ++ uint8_t *cor_mask; ++ uint16_t validity_flags; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -214,6 +233,7 @@ struct ras_cxl_aer_ce_event; + struct ras_cxl_overflow_event; + struct ras_cxl_generic_event; + struct ras_cxl_general_media_event; ++struct ras_cxl_dram_event; + + #ifdef HAVE_SQLITE3 + +@@ -253,6 +273,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_overflow_event; + sqlite3_stmt *stmt_cxl_generic_event; + sqlite3_stmt *stmt_cxl_general_media_event; ++ sqlite3_stmt *stmt_cxl_dram_event; + #endif + }; + +@@ -287,6 +308,7 @@ int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_eve + int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); ++int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -306,6 +328,7 @@ static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_ + static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; ++static inline int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 725dc9b..21180b1 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -543,6 +543,68 @@ static int set_cxl_general_media_event_backtrace(char *buf, struct ras_cxl_gener + return 0; + } + ++static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "hdr_uuid=%s\n" \ ++ "hdr_flags=0x%x\n" \ ++ "hdr_handle=0x%x\n" \ ++ "hdr_related_handle=0x%x\n" \ ++ "hdr_timestamp=%s\n" \ ++ "hdr_length=%u\n" \ ++ "hdr_maint_op_class=%u\n" \ ++ "dpa=0x%lx\n" \ ++ "dpa_flags=%u\n" \ ++ "descriptor=%u\n" \ ++ "type=%u\n" \ ++ "transaction_type=%u\n" \ ++ "channel=%u\n" \ ++ "rank=%u\n" \ ++ "nibble_mask=%u\n" \ ++ "bank_group=%u\n" \ ++ "bank=%u\n" \ ++ "row=%u\n" \ ++ "column=%u\n", \ ++ ev->hdr.timestamp, \ ++ ev->hdr.memdev, \ ++ ev->hdr.host, \ ++ ev->hdr.serial, \ ++ ev->hdr.log_type, \ ++ ev->hdr.hdr_uuid, \ ++ ev->hdr.hdr_flags, \ ++ ev->hdr.hdr_handle, \ ++ ev->hdr.hdr_related_handle, \ ++ ev->hdr.hdr_timestamp, \ ++ ev->hdr.hdr_length, \ ++ ev->hdr.hdr_maint_op_class, \ ++ ev->dpa, \ ++ ev->dpa_flags, \ ++ ev->descriptor, \ ++ ev->type, \ ++ ev->transaction_type, \ ++ ev->channel, \ ++ ev->rank, \ ++ ev->nibble_mask, \ ++ ev->bank_group, \ ++ ev->bank, \ ++ ev->row, \ ++ ev->column); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -598,6 +660,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_GENERAL_MEDIA_EVENT: + rc = set_cxl_general_media_event_backtrace(buf, (struct ras_cxl_general_media_event *)ev); + break; ++ case CXL_DRAM_EVENT: ++ rc = set_cxl_dram_event_backtrace(buf, (struct ras_cxl_dram_event *)ev); ++ break; + default: + return -1; + } +@@ -1271,3 +1336,47 @@ cxl_general_media_fail: + else + return -1; + } ++ ++int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_dram_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_DRAM_EVENT, ev); ++ if (rc < 0) ++ goto cxl_dram_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_dram_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_dram_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL DRAM Event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_dram_fail; ++ ++ done = 1; ++ ++cxl_dram_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index d9ec7df..1ad00e0 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -45,6 +45,7 @@ int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_ev + int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); ++int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); + + #else + +@@ -62,6 +63,7 @@ static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras + static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; ++static inline int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; + + #endif + diff --git a/a247baf7110ab6427259eb1421a103e2021a8735.patch b/a247baf7110ab6427259eb1421a103e2021a8735.patch new file mode 100644 index 0000000..eb615ec --- /dev/null +++ b/a247baf7110ab6427259eb1421a103e2021a8735.patch @@ -0,0 +1,424 @@ +commit a247baf7110ab6427259eb1421a103e2021a8735 +Author: Shiju Jose +Date: Fri Mar 17 13:07:01 2023 +0000 + + rasdaemon: Add support for the CXL AER correctable errors + + Add support to log and record the CXL AER correctable errors. + + The corresponding Kernel patches are here: + https://lore.kernel.org/linux-cxl/166974401763.1608150.5424589924034481387.stgit@djiang5-desk3.ch.intel.com/T/#t + https://lore.kernel.org/linux-cxl/63e5ed38d77d9_138fbc2947a@iweiny-mobl.notmuch/T/#t + + Signed-off-by: Shiju Jose + Reviewed-by: Jonathan Cameron + Reviewed-by: Dave Jiang + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 0f2c9e4..8f6342d 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -220,6 +220,14 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + #define CXL_AER_UE_IDE_TX_ERR BIT(15) + #define CXL_AER_UE_IDE_RX_ERR BIT(16) + ++#define CXL_AER_CE_CACHE_DATA_ECC BIT(0) ++#define CXL_AER_CE_MEM_DATA_ECC BIT(1) ++#define CXL_AER_CE_CRC_THRESH BIT(2) ++#define CXL_AER_CE_RETRY_THRESH BIT(3) ++#define CXL_AER_CE_CACHE_POISON BIT(4) ++#define CXL_AER_CE_MEM_POISON BIT(5) ++#define CXL_AER_CE_PHYS_LAYER_ERR BIT(6) ++ + struct cxl_error_list { + uint32_t bit; + const char *error; +@@ -243,6 +251,16 @@ static const struct cxl_error_list cxl_aer_ue[] = { + { .bit = CXL_AER_UE_IDE_RX_ERR, .error = "IDE Rx Error" }, + }; + ++static const struct cxl_error_list cxl_aer_ce[] = { ++ { .bit = CXL_AER_CE_CACHE_DATA_ECC, .error = "Cache Data ECC Error" }, ++ { .bit = CXL_AER_CE_MEM_DATA_ECC, .error = "Memory Data ECC Error" }, ++ { .bit = CXL_AER_CE_CRC_THRESH, .error = "CRC Threshold Hit" }, ++ { .bit = CXL_AER_CE_RETRY_THRESH, .error = "Retry Threshold" }, ++ { .bit = CXL_AER_CE_CACHE_POISON, .error = "Received Cache Poison From Peer" }, ++ { .bit = CXL_AER_CE_MEM_POISON, .error = "Received Memory Poison From Peer" }, ++ { .bit = CXL_AER_CE_PHYS_LAYER_ERR, .error = "Received Error From Physical Layer" }, ++}; ++ + static int decode_cxl_error_status(struct trace_seq *s, uint32_t status, + const struct cxl_error_list *cxl_error_list, + uint8_t num_elems) +@@ -351,3 +369,66 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s, + + return 0; + } ++ ++int ras_cxl_aer_ce_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ time_t now; ++ struct tm *tm; ++ struct ras_events *ras = context; ++ struct ras_cxl_aer_ce_event ev; ++ ++ now = record->ts / user_hz + ras->uptime_diff; ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ else ++ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) ++ return -1; ++ ++ ev.memdev = tep_get_field_raw(s, event, "memdev", ++ record, &len, 1); ++ if (!ev.memdev) ++ return -1; ++ if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0) ++ return -1; ++ ++ ev.host = tep_get_field_raw(s, event, "host", ++ record, &len, 1); ++ if (!ev.host) ++ return -1; ++ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) ++ return -1; ++ ev.serial = val; ++ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "status", record, &val, 1) < 0) ++ return -1; ++ ev.error_status = val; ++ if (trace_seq_printf(s, "error status:") <= 0) ++ return -1; ++ if (decode_cxl_error_status(s, ev.error_status, ++ cxl_aer_ce, ARRAY_SIZE(cxl_aer_ce)) < 0) ++ return -1; ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_aer_ce_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_aer_ce_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 35efadd..711daf4 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -25,4 +25,8 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + int ras_cxl_aer_ue_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++ ++int ras_cxl_aer_ce_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index 5d73df1..2662467 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -247,6 +247,7 @@ int toggle_ras_mc_event(int enable) + #ifdef HAVE_CXL + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); + #endif + + free_ras: +@@ -1001,6 +1002,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_aer_uncorrectable_error"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_aer_correctable_error", ++ ras_cxl_aer_ce_event_handler, NULL, CXL_AER_CE_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_aer_correctable_error"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 4acbe57..a9d67c2 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -41,6 +41,7 @@ enum { + MF_EVENT, + CXL_POISON_EVENT, + CXL_AER_UE_EVENT, ++ CXL_AER_CE_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index 97a2a37..86133c4 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -673,6 +673,53 @@ int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_eve + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_aer_correctable_error ++ */ ++static const struct db_fields cxl_aer_ce_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "error_status", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor cxl_aer_ce_event_tab = { ++ .name = "cxl_aer_ce_event", ++ .fields = cxl_aer_ce_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_aer_ce_event_fields), ++}; ++ ++int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_aer_ce_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_aer_ce_event store: %p\n", priv->stmt_cxl_aer_ce_event); ++ ++ sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 2, ev->memdev, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_aer_ce_event, 3, ev->host, -1, NULL); ++ sqlite3_bind_int64(priv->stmt_cxl_aer_ce_event, 4, ev->serial); ++ sqlite3_bind_int(priv->stmt_cxl_aer_ce_event, 5, ev->error_status); ++ ++ rc = sqlite3_step(priv->stmt_cxl_aer_ce_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do cxl_aer_ce_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_aer_ce_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset cxl_aer_ce_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1032,6 +1079,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_aer_ce_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_aer_ce_event, ++ &cxl_aer_ce_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1169,6 +1224,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_aer_ue_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_aer_ce_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_aer_ce_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_aer_ce_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index f11985f..ab7153d 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -144,6 +144,14 @@ struct ras_cxl_aer_ue_event { + uint32_t *header_log; + }; + ++struct ras_cxl_aer_ce_event { ++ char timestamp[64]; ++ const char *memdev; ++ const char *host; ++ uint64_t serial; ++ uint32_t error_status; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -155,6 +163,7 @@ struct diskerror_event; + struct ras_mf_event; + struct ras_cxl_poison_event; + struct ras_cxl_aer_ue_event; ++struct ras_cxl_aer_ce_event; + + #ifdef HAVE_SQLITE3 + +@@ -190,6 +199,7 @@ struct sqlite3_priv { + #ifdef HAVE_CXL + sqlite3_stmt *stmt_cxl_poison_event; + sqlite3_stmt *stmt_cxl_aer_ue_event; ++ sqlite3_stmt *stmt_cxl_aer_ce_event; + #endif + }; + +@@ -220,6 +230,7 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev + int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); ++int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -235,6 +246,7 @@ static inline int ras_store_diskerror_event(struct ras_events *ras, struct diske + static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; + static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; ++static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 2ebdc80..63b47f5 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -397,6 +397,30 @@ static int set_cxl_aer_ue_event_backtrace(char *buf, struct ras_cxl_aer_ue_event + return 0; + } + ++static int set_cxl_aer_ce_event_backtrace(char *buf, struct ras_cxl_aer_ce_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "error_status=%u\n", \ ++ ev->timestamp, \ ++ ev->memdev, \ ++ ev->host, \ ++ ev->serial, \ ++ ev->error_status); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -440,6 +464,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_AER_UE_EVENT: + rc = set_cxl_aer_ue_event_backtrace(buf, (struct ras_cxl_aer_ue_event *)ev); + break; ++ case CXL_AER_CE_EVENT: ++ rc = set_cxl_aer_ce_event_backtrace(buf, (struct ras_cxl_aer_ce_event *)ev); ++ break; + default: + return -1; + } +@@ -936,3 +963,47 @@ cxl_aer_ue_fail: + else + return -1; + } ++ ++int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_aer_ce_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_AER_CE_EVENT, ev); ++ if (rc < 0) ++ goto cxl_aer_ce_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl-aer-correctable-error"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_aer_ce_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL AER correctable error"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_aer_ce_fail; ++ ++ done = 1; ++ ++cxl_aer_ce_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index dfe89d1..46155ee 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -41,6 +41,7 @@ int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *e + int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); ++int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + + #else + +@@ -54,6 +55,7 @@ static inline int ras_report_diskerror_event(struct ras_events *ras, struct disk + static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; + static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; ++static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + + #endif + diff --git a/a7524917befe7e67c02253cc27cb0c724e5992c0.patch b/a7524917befe7e67c02253cc27cb0c724e5992c0.patch new file mode 100644 index 0000000..b5625d0 --- /dev/null +++ b/a7524917befe7e67c02253cc27cb0c724e5992c0.patch @@ -0,0 +1,503 @@ +commit a7524917befe7e67c02253cc27cb0c724e5992c0 +Author: Shiju Jose +Date: Fri Mar 17 12:51:02 2023 +0000 + + rasdaemon: Add support for the CXL AER uncorrectable errors + + Add support to log and record the CXL AER uncorrectable errors. + + The corresponding Kernel patches are here: + https://lore.kernel.org/linux-cxl/166974401763.1608150.5424589924034481387.stgit@djiang5-desk3.ch.intel.com/T/#t + https://lore.kernel.org/lkml/63eeb2a8c9e3f_32d612941f@dwillia2-xfh.jf.intel.com.notmuch/T/ + + It was found that the header log data to be converted to the + big-endian format to correctly store in the SQLite DB likely + because the SQLite database seems uses the big-endian storage. + + Signed-off-by: Shiju Jose + Reviewed-by: Jonathan Cameron + Reviewed-by: Dave Jiang # + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index cb23ba2..0f2c9e4 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -21,6 +21,7 @@ + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-report.h" ++#include + + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) +@@ -200,3 +201,153 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + + return 0; + } ++ ++/* CXL AER Errors */ ++ ++#define CXL_AER_UE_CACHE_DATA_PARITY BIT(0) ++#define CXL_AER_UE_CACHE_ADDR_PARITY BIT(1) ++#define CXL_AER_UE_CACHE_BE_PARITY BIT(2) ++#define CXL_AER_UE_CACHE_DATA_ECC BIT(3) ++#define CXL_AER_UE_MEM_DATA_PARITY BIT(4) ++#define CXL_AER_UE_MEM_ADDR_PARITY BIT(5) ++#define CXL_AER_UE_MEM_BE_PARITY BIT(6) ++#define CXL_AER_UE_MEM_DATA_ECC BIT(7) ++#define CXL_AER_UE_REINIT_THRESH BIT(8) ++#define CXL_AER_UE_RSVD_ENCODE BIT(9) ++#define CXL_AER_UE_POISON BIT(10) ++#define CXL_AER_UE_RECV_OVERFLOW BIT(11) ++#define CXL_AER_UE_INTERNAL_ERR BIT(14) ++#define CXL_AER_UE_IDE_TX_ERR BIT(15) ++#define CXL_AER_UE_IDE_RX_ERR BIT(16) ++ ++struct cxl_error_list { ++ uint32_t bit; ++ const char *error; ++}; ++ ++static const struct cxl_error_list cxl_aer_ue[] = { ++ { .bit = CXL_AER_UE_CACHE_DATA_PARITY, .error = "Cache Data Parity Error" }, ++ { .bit = CXL_AER_UE_CACHE_ADDR_PARITY, .error = "Cache Address Parity Error" }, ++ { .bit = CXL_AER_UE_CACHE_BE_PARITY, .error = "Cache Byte Enable Parity Error" }, ++ { .bit = CXL_AER_UE_CACHE_DATA_ECC, .error = "Cache Data ECC Error" }, ++ { .bit = CXL_AER_UE_MEM_DATA_PARITY, .error = "Memory Data Parity Error" }, ++ { .bit = CXL_AER_UE_MEM_ADDR_PARITY, .error = "Memory Address Parity Error" }, ++ { .bit = CXL_AER_UE_MEM_BE_PARITY, .error = "Memory Byte Enable Parity Error" }, ++ { .bit = CXL_AER_UE_MEM_DATA_ECC, .error = "Memory Data ECC Error" }, ++ { .bit = CXL_AER_UE_REINIT_THRESH, .error = "REINIT Threshold Hit" }, ++ { .bit = CXL_AER_UE_RSVD_ENCODE, .error = "Received Unrecognized Encoding" }, ++ { .bit = CXL_AER_UE_POISON, .error = "Received Poison From Peer" }, ++ { .bit = CXL_AER_UE_RECV_OVERFLOW, .error = "Receiver Overflow" }, ++ { .bit = CXL_AER_UE_INTERNAL_ERR, .error = "Component Specific Error" }, ++ { .bit = CXL_AER_UE_IDE_TX_ERR, .error = "IDE Tx Error" }, ++ { .bit = CXL_AER_UE_IDE_RX_ERR, .error = "IDE Rx Error" }, ++}; ++ ++static int decode_cxl_error_status(struct trace_seq *s, uint32_t status, ++ const struct cxl_error_list *cxl_error_list, ++ uint8_t num_elems) ++{ ++ int i; ++ ++ for (i = 0; i < num_elems; i++) { ++ if (status & cxl_error_list[i].bit) ++ if (trace_seq_printf(s, "\'%s\' ", cxl_error_list[i].error) <= 0) ++ return -1; ++ } ++ return 0; ++} ++ ++int ras_cxl_aer_ue_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len, i; ++ unsigned long long val; ++ time_t now; ++ struct tm *tm; ++ struct ras_events *ras = context; ++ struct ras_cxl_aer_ue_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ now = record->ts / user_hz + ras->uptime_diff; ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ else ++ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) ++ return -1; ++ ++ ev.memdev = tep_get_field_raw(s, event, "memdev", ++ record, &len, 1); ++ if (!ev.memdev) ++ return -1; ++ if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0) ++ return -1; ++ ++ ev.host = tep_get_field_raw(s, event, "host", ++ record, &len, 1); ++ if (!ev.host) ++ return -1; ++ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) ++ return -1; ++ ev.serial = val; ++ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "status", record, &val, 1) < 0) ++ return -1; ++ ev.error_status = val; ++ ++ if (trace_seq_printf(s, "error status:") <= 0) ++ return -1; ++ if (decode_cxl_error_status(s, ev.error_status, ++ cxl_aer_ue, ARRAY_SIZE(cxl_aer_ue)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "first_error", record, &val, 1) < 0) ++ return -1; ++ ev.first_error = val; ++ ++ if (trace_seq_printf(s, "first error:") <= 0) ++ return -1; ++ if (decode_cxl_error_status(s, ev.first_error, ++ cxl_aer_ue, ARRAY_SIZE(cxl_aer_ue)) < 0) ++ return -1; ++ ++ ev.header_log = tep_get_field_raw(s, event, "header_log", ++ record, &len, 1); ++ if (!ev.header_log) ++ return -1; ++ if (trace_seq_printf(s, "header log:\n") <= 0) ++ return -1; ++ for (i = 0; i < CXL_HEADERLOG_SIZE_U32; i++) { ++ if (trace_seq_printf(s, "%08x ", ev.header_log[i]) <= 0) ++ break; ++ if ((i > 0) && ((i % 20) == 0)) ++ if (trace_seq_printf(s, "\n") <= 0) ++ break; ++ /* Convert header log data to the big-endian format because ++ * the SQLite database seems uses the big-endian storage. ++ */ ++ ev.header_log[i] = htobe32(ev.header_log[i]); ++ } ++ if (i < CXL_HEADERLOG_SIZE_U32) ++ return -1; ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_aer_ue_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_aer_ue_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 84d5cc6..35efadd 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -21,4 +21,8 @@ + int ras_cxl_poison_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++ ++int ras_cxl_aer_ue_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index f95844a..5d73df1 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -246,6 +246,7 @@ int toggle_ras_mc_event(int enable) + + #ifdef HAVE_CXL + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable); + #endif + + free_ras: +@@ -992,6 +993,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_poison"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_aer_uncorrectable_error", ++ ras_cxl_aer_ue_event_handler, NULL, CXL_AER_UE_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_aer_uncorrectable_error"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 1ef3ecd..4acbe57 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -40,6 +40,7 @@ enum { + DISKERROR_EVENT, + MF_EVENT, + CXL_POISON_EVENT, ++ CXL_AER_UE_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index c31baa0..97a2a37 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -622,6 +622,57 @@ int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_eve + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_aer_uncorrectable_error ++ */ ++static const struct db_fields cxl_aer_ue_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "error_status", .type = "INTEGER" }, ++ { .name = "first_error", .type = "INTEGER" }, ++ { .name = "header_log", .type = "BLOB" }, ++}; ++ ++static const struct db_table_descriptor cxl_aer_ue_event_tab = { ++ .name = "cxl_aer_ue_event", ++ .fields = cxl_aer_ue_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_aer_ue_event_fields), ++}; ++ ++int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_aer_ue_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_aer_ue_event store: %p\n", priv->stmt_cxl_aer_ue_event); ++ ++ sqlite3_bind_text(priv->stmt_cxl_aer_ue_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_aer_ue_event, 2, ev->memdev, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_aer_ue_event, 3, ev->host, -1, NULL); ++ sqlite3_bind_int64(priv->stmt_cxl_aer_ue_event, 4, ev->serial); ++ sqlite3_bind_int(priv->stmt_cxl_aer_ue_event, 5, ev->error_status); ++ sqlite3_bind_int(priv->stmt_cxl_aer_ue_event, 6, ev->first_error); ++ sqlite3_bind_blob(priv->stmt_cxl_aer_ue_event, 7, ev->header_log, CXL_HEADERLOG_SIZE, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_aer_ue_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do cxl_aer_ue_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_aer_ue_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset cxl_aer_ue_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -973,6 +1024,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_aer_ue_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_aer_ue_event, ++ &cxl_aer_ue_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1102,6 +1161,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_poison_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_aer_ue_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_aer_ue_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_aer_ue_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index fd15215..f11985f 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -130,6 +130,20 @@ struct ras_cxl_poison_event { + char overflow_ts[64]; + }; + ++#define SZ_512 0x200 ++#define CXL_HEADERLOG_SIZE SZ_512 ++#define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) ++ ++struct ras_cxl_aer_ue_event { ++ char timestamp[64]; ++ const char *memdev; ++ const char *host; ++ uint64_t serial; ++ uint32_t error_status; ++ uint32_t first_error; ++ uint32_t *header_log; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -140,6 +154,7 @@ struct devlink_event; + struct diskerror_event; + struct ras_mf_event; + struct ras_cxl_poison_event; ++struct ras_cxl_aer_ue_event; + + #ifdef HAVE_SQLITE3 + +@@ -174,6 +189,7 @@ struct sqlite3_priv { + #endif + #ifdef HAVE_CXL + sqlite3_stmt *stmt_cxl_poison_event; ++ sqlite3_stmt *stmt_cxl_aer_ue_event; + #endif + }; + +@@ -203,6 +219,7 @@ int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev); + int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev); + int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); ++int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -217,6 +234,7 @@ static inline int ras_store_devlink_event(struct ras_events *ras, struct devlink + static inline int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; }; + static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; + static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; ++static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 3daecc0..2ebdc80 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -371,6 +371,32 @@ static int set_cxl_poison_event_backtrace(char *buf, struct ras_cxl_poison_event + return 0; + } + ++static int set_cxl_aer_ue_event_backtrace(char *buf, struct ras_cxl_aer_ue_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "error_status=%u\n" \ ++ "first_error=%u\n", \ ++ ev->timestamp, \ ++ ev->memdev, \ ++ ev->host, \ ++ ev->serial, \ ++ ev->error_status, \ ++ ev->first_error); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -411,6 +437,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_POISON_EVENT: + rc = set_cxl_poison_event_backtrace(buf, (struct ras_cxl_poison_event *)ev); + break; ++ case CXL_AER_UE_EVENT: ++ rc = set_cxl_aer_ue_event_backtrace(buf, (struct ras_cxl_aer_ue_event *)ev); ++ break; + default: + return -1; + } +@@ -863,3 +892,47 @@ cxl_poison_fail: + else + return -1; + } ++ ++int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_aer_ue_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_AER_UE_EVENT, ev); ++ if (rc < 0) ++ goto cxl_aer_ue_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl-aer-uncorrectable-error"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_aer_ue_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL AER uncorrectable error"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_aer_ue_fail; ++ ++ done = 1; ++ ++cxl_aer_ue_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index d1591ce..dfe89d1 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -40,6 +40,7 @@ int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev); + int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev); + int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); ++int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + + #else + +@@ -52,6 +53,7 @@ static inline int ras_report_devlink_event(struct ras_events *ras, struct devlin + static inline int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; }; + static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; + static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; ++static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + + #endif + diff --git a/ae1647624486fca0070b297d0e2fd4e53443c10b.patch b/ae1647624486fca0070b297d0e2fd4e53443c10b.patch new file mode 100644 index 0000000..7d5cb0b --- /dev/null +++ b/ae1647624486fca0070b297d0e2fd4e53443c10b.patch @@ -0,0 +1,116 @@ +commit 81b362f0412eb9769098c2f4317b84b9bd82cce9 +Author: Shiju Jose +Date: Mon Feb 12 10:35:25 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL AER correctable trace events + + Add support for CXL AER correctable events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit ae1647624486fca0070b297d0e2fd4e53443c10b) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index c0a2ec6..9519279 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1230,6 +1230,46 @@ sub get_cxl_ue_error_status_text + return join (", ", @out); + } + ++use constant { ++ CXL_AER_CE_CACHE_DATA_ECC => 0x0001, ++ CXL_AER_CE_MEM_DATA_ECC => 0x0002, ++ CXL_AER_CE_CRC_THRESH => 0x0004, ++ CXL_AER_CE_RETRY_THRESH => 0x0008, ++ CXL_AER_CE_CACHE_POISON => 0x0010, ++ CXL_AER_CE_MEM_POISON => 0x0020, ++ CXL_AER_CE_PHYS_LAYER_ERR => 0x0040, ++}; ++ ++sub get_cxl_ce_error_status_text ++{ ++ my $error_status = $_[0]; ++ my @out; ++ ++ if ($error_status & CXL_AER_CE_CACHE_DATA_ECC) { ++ push @out, (sprintf "\'Cache Data ECC Error\' "); ++ } ++ if ($error_status & CXL_AER_CE_MEM_DATA_ECC) { ++ push @out, (sprintf "\'Memory Data ECC Error\' "); ++ } ++ if ($error_status & CXL_AER_CE_CRC_THRESH) { ++ push @out, (sprintf "\'CRC Threshold Hit\' "); ++ } ++ if ($error_status & CXL_AER_CE_RETRY_THRESH) { ++ push @out, (sprintf "\'Retry Threshold\' "); ++ } ++ if ($error_status & CXL_AER_CE_CACHE_POISON) { ++ push @out, (sprintf "\'Received Cache Poison From Peer\' "); ++ } ++ if ($error_status & CXL_AER_CE_MEM_POISON) { ++ push @out, (sprintf "\'Received Memory Poison From Peer\' "); ++ } ++ if ($error_status & CXL_AER_CE_PHYS_LAYER_ERR) { ++ push @out, (sprintf "\'Received Error From Physical Layer\' "); ++ } ++ ++ return join (", ", @out); ++} ++ + sub summary + { + require DBI; +@@ -1310,6 +1350,22 @@ sub summary + print "No CXL AER uncorrectable errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL AER correctable errors ++ $query = "select memdev, count(*) from cxl_aer_ce_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL AER correctable events summary:\n$out\n"; ++ } else { ++ print "No CXL AER correctable errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1519,6 +1575,29 @@ sub errors + print "No CXL AER uncorrectable errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL AER correctable errors ++ $query = "select id, timestamp, memdev, host, serial, error_status from cxl_aer_ce_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $error_status)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ if (defined $error_status && length $error_status) { ++ $out .= sprintf "error_status: %s, ", get_cxl_ce_error_status_text($error_status); ++ } ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL AER correctable events:\n$out\n"; ++ } else { ++ print "No CXL AER correctable errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # Extlog errors diff --git a/aee13f74266382c64128bd7367a5eeb46277f490.patch b/aee13f74266382c64128bd7367a5eeb46277f490.patch new file mode 100644 index 0000000..2f330fa --- /dev/null +++ b/aee13f74266382c64128bd7367a5eeb46277f490.patch @@ -0,0 +1,161 @@ +commit b2e5a6821fae4278cc37803a223a5a64bf50c8cc +Author: Shiju Jose +Date: Mon Feb 12 11:29:13 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL memory module trace events + + Add support for CXL memory module events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit aee13f74266382c64128bd7367a5eeb46277f490) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 5e45889..5e120d9 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1376,6 +1376,70 @@ sub get_cxl_transaction_type + return $types[$_[0]]; + } + ++sub get_cxl_dev_event_type ++{ ++ my @types; ++ ++ if ($_[0] < 0 || $_[0] > 5) { ++ return "unknown-type"; ++ } ++ ++ @types = ("Health Status Change", ++ "Media Status Change", ++ "Life Used Change", ++ "Temperature Change", ++ "Data Path Error", ++ "LSA Error"); ++ ++ return $types[$_[0]]; ++} ++ ++use constant { ++ CXL_DHI_HS_MAINTENANCE_NEEDED => 0x0001, ++ CXL_DHI_HS_PERFORMANCE_DEGRADED => 0x0002, ++ CXL_DHI_HS_HW_REPLACEMENT_NEEDED => 0x0004, ++}; ++ ++sub get_cxl_health_status_text ++{ ++ my $flags = $_[0]; ++ my @out; ++ ++ if ($flags & CXL_DHI_HS_MAINTENANCE_NEEDED) { ++ push @out, (sprintf "\'MAINTENANCE_NEEDED\' "); ++ } ++ if ($flags & CXL_DHI_HS_PERFORMANCE_DEGRADED) { ++ push @out, (sprintf "\'PERFORMANCE_DEGRADED\' "); ++ } ++ if ($flags & CXL_DHI_HS_HW_REPLACEMENT_NEEDED) { ++ push @out, (sprintf "\'REPLACEMENT_NEEDED\' "); ++ } ++ ++ return join (", ", @out); ++} ++ ++sub get_cxl_media_status ++{ ++ my @types; ++ ++ if ($_[0] < 0 || $_[0] > 9) { ++ return "unknown"; ++ } ++ ++ @types = ("Normal", ++ "Not Ready", ++ "Write Persistency Lost", ++ "All Data Lost", ++ "Write Persistency Loss in the Event of Power Loss", ++ "Write Persistency Loss in Event of Shutdown", ++ "Write Persistency Loss Imminent", ++ "All Data Loss in Event of Power Loss", ++ "All Data loss in the Event of Shutdown", ++ "All Data Loss Imminent"); ++ ++ return $types[$_[0]]; ++} ++ + sub summary + { + require DBI; +@@ -1552,6 +1616,22 @@ sub summary + print "No CXL DRAM errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL memory module errors ++ $query = "select memdev, count(*) from cxl_memory_module_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL memory module events summary:\n$out\n"; ++ } else { ++ print "No CXL memory module errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1665,6 +1745,7 @@ sub errors + my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data); + my ($dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id); + my ($nibble_mask, $bank_group, $row, $column, $cor_mask); ++ my ($event_type, $health_status, $media_status, $life_used, $dirty_shutdown_cnt, $cor_vol_err_cnt, $cor_per_err_cnt, $device_temp, $add_status); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1966,6 +2047,42 @@ sub errors + } else { + print "No CXL DRAM errors.\n\n"; + } ++ ++ # CXL memory module errors ++ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, event_type, health_status, media_status, life_used, dirty_shutdown_cnt, cor_vol_err_cnt, cor_per_err_cnt, device_temp, add_status from cxl_memory_module_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $event_type, $health_status, $media_status, $life_used, $dirty_shutdown_cnt, $cor_vol_err_cnt, $cor_per_err_cnt, $device_temp, $add_status)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "log=$log_type, " if (defined $log_type && length $log_type); ++ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid); ++ $out .= sprintf "hdr_flags=0x%llx, %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags); ++ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle); ++ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle); ++ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts); ++ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length); ++ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class); ++ $out .= sprintf "event_type: %s, ", get_cxl_dev_event_type($event_type) if (defined $event_type && length $event_type); ++ $out .= sprintf "health_status: %s, ", get_cxl_health_status_text($health_status) if (defined $health_status && length $health_status); ++ $out .= sprintf "media_status: %s, ", get_cxl_media_status($media_status) if (defined $media_status && length $media_status); ++ $out .= sprintf "life_used=%u, ", $life_used if (defined $life_used && length $life_used); ++ $out .= sprintf "dirty_shutdown_cnt=%u, ", $dirty_shutdown_cnt if (defined $dirty_shutdown_cnt && length $dirty_shutdown_cnt); ++ $out .= sprintf "cor_vol_err_cnt=%u, ", $cor_vol_err_cnt if (defined $cor_vol_err_cnt && length $cor_vol_err_cnt); ++ $out .= sprintf "cor_per_err_cnt=%u, ", $cor_per_err_cnt if (defined $cor_per_err_cnt && length $cor_per_err_cnt); ++ $out .= sprintf "device_temp=%u, ", $device_temp if (defined $device_temp && length $device_temp); ++ $out .= sprintf "add_status=%u ", $add_status if (defined $add_status && length $add_status); ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL memory module events:\n$out\n"; ++ } else { ++ print "No CXL memory module errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/b22cb067755f4604770f9864a0babed8f93a1553.patch b/b22cb067755f4604770f9864a0babed8f93a1553.patch new file mode 100644 index 0000000..2f7da9e --- /dev/null +++ b/b22cb067755f4604770f9864a0babed8f93a1553.patch @@ -0,0 +1,75 @@ +commit 25ef3044f38224d653d880fb9f20be9e7c9bf570 +Author: Shiju Jose +Date: Mon Feb 12 10:38:51 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL overflow trace events + + Add support for CXL overflow events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit b22cb067755f4604770f9864a0babed8f93a1553) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 9519279..6a319a7 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1366,6 +1366,22 @@ sub summary + print "No CXL AER correctable errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL overflow errors ++ $query = "select memdev, count(*) from cxl_overflow_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL overflow events summary:\n$out\n"; ++ } else { ++ print "No CXL overflow errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1474,6 +1490,7 @@ sub errors + my ($error_count, $affinity, $mpidr, $r_state, $psci_state); + my ($pfn, $page_type, $action_result); + my ($memdev, $host, $serial, $error_status, $first_error, $header_log); ++ my ($log_type, $first_ts, $last_ts); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1598,6 +1615,27 @@ sub errors + print "No CXL AER correctable errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL overflow errors ++ $query = "select id, timestamp, memdev, host, serial, log_type, count, first_ts, last_ts from cxl_overflow_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $count, $first_ts, $last_ts)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "log=$log_type, " if (defined $log_type && length $log_type); ++ $out .= sprintf "%u records from $first_ts to $last_ts", $count if (defined $count && length $count); ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL overflow events:\n$out\n"; ++ } else { ++ print "No CXL overflow errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/c38c14afc5d7bb6c8c52d1023271d755deb23008.patch b/c38c14afc5d7bb6c8c52d1023271d755deb23008.patch new file mode 100644 index 0000000..2970075 --- /dev/null +++ b/c38c14afc5d7bb6c8c52d1023271d755deb23008.patch @@ -0,0 +1,101 @@ +commit 703e0f8eabbe1e191a8bd85632066c155ec1f4fa +Author: Shiju Jose +Date: Mon Feb 12 11:22:03 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL DRAM trace events + + Add support for CXL DRAM events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit c38c14afc5d7bb6c8c52d1023271d755deb23008) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 99b3c10..5e45889 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1536,6 +1536,22 @@ sub summary + print "No CXL general media errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL DRAM errors ++ $query = "select memdev, count(*) from cxl_dram_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL DRAM events summary:\n$out\n"; ++ } else { ++ print "No CXL DRAM errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1648,6 +1664,7 @@ sub errors + my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts); + my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data); + my ($dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id); ++ my ($nibble_mask, $bank_group, $row, $column, $cor_mask); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1902,6 +1919,53 @@ sub errors + } else { + print "No CXL general media errors.\n\n"; + } ++ ++ # CXL DRAM errors ++ use constant CXL_EVENT_DER_CORRECTION_MASK_SIZE => 0x20; ++ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, dpa, dpa_flags, descriptor, type, transaction_type, channel, rank, nibble_mask, bank_group, bank, row, column, cor_mask from cxl_dram_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $dpa, $dpa_flags, $descriptor, $type, $transaction_type, $channel, $rank, $nibble_mask, $bank_group, $bank, $row, $column, $cor_mask)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "log=$log_type, " if (defined $log_type && length $log_type); ++ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid); ++ $out .= sprintf "hdr_flags=0x%llx, %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags); ++ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle); ++ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle); ++ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts); ++ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length); ++ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class); ++ $out .= sprintf "dpa=0x%llx, ", $dpa if (defined $dpa && length $dpa); ++ $out .= sprintf "dpa_flags: %s, ", get_cxl_dpa_flags_text($dpa_flags) if (defined $dpa_flags && length $dpa_flags); ++ $out .= sprintf "descriptor_flags: %s, ", get_cxl_descriptor_flags_text($descriptor) if (defined $descriptor && length $descriptor); ++ $out .= sprintf "memory event type: %s, ", get_cxl_mem_event_type($type) if (defined $type && length $type); ++ $out .= sprintf "transaction_type: %s, ", get_cxl_transaction_type($transaction_type) if (defined $transaction_type && length $transaction_type); ++ $out .= sprintf "channel=%u, ", $channel if (defined $channel && length $channel); ++ $out .= sprintf "rank=%u, ", $rank if (defined $rank && length $rank); ++ $out .= sprintf "nibble_mask=%u, ", $nibble_mask if (defined $nibble_mask && length $nibble_mask); ++ $out .= sprintf "bank_group=%u, ", $bank_group if (defined $bank_group && length $bank_group); ++ $out .= sprintf "bank=%u, ", $bank if (defined $bank && length $bank); ++ $out .= sprintf "row=%u, ", $row if (defined $row && length $row); ++ $out .= sprintf "column=%u, ", $column if (defined $column && length $column); ++ if (defined $cor_mask && length $cor_mask) { ++ $out .= sprintf "correction_mask:"; ++ my @bytes = unpack "C*", $cor_mask; ++ for (my $i = 0; $i < CXL_EVENT_DER_CORRECTION_MASK_SIZE; $i++) { ++ $out .= sprintf "%02x ", $bytes[$i]; ++ } ++ } ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL DRAM events:\n$out\n"; ++ } else { ++ print "No CXL DRAM errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/d3836aa061f677232f99c514247d3dbf80812a1b.patch b/d3836aa061f677232f99c514247d3dbf80812a1b.patch new file mode 100644 index 0000000..f85f264 --- /dev/null +++ b/d3836aa061f677232f99c514247d3dbf80812a1b.patch @@ -0,0 +1,42 @@ +commit d3836aa061f677232f99c514247d3dbf80812a1b +Author: Shiju Jose +Date: Mon Jan 16 17:13:32 2023 +0000 + + rasdaemon: Move definition for BIT and BIT_ULL to a common file + + Move definition for BIT() and BIT_ULL() to the + common file ras-record.h + + Signed-off-by: Shiju Jose + Reviewed-by: Jonathan Cameron + Reviewed-by: Dave Jiang + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h +index 4d9f938..c360eaf 100644 +--- a/ras-non-standard-handler.h ++++ b/ras-non-standard-handler.h +@@ -17,9 +17,6 @@ + #include "ras-events.h" + #include + +-#define BIT(nr) (1UL << (nr)) +-#define BIT_ULL(nr) (1ULL << (nr)) +- + struct ras_ns_ev_decoder { + struct ras_ns_ev_decoder *next; + const char *sec_type; +diff --git a/ras-record.h b/ras-record.h +index d9f7733..219f10b 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -25,6 +25,9 @@ + + #define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) + ++#define BIT(nr) (1UL << (nr)) ++#define BIT_ULL(nr) (1ULL << (nr)) ++ + extern long user_hz; + + struct ras_events; diff --git a/e0cde0edf073b939d345aeba0aed23e238dbc53b.patch b/e0cde0edf073b939d345aeba0aed23e238dbc53b.patch new file mode 100644 index 0000000..b26c6a1 --- /dev/null +++ b/e0cde0edf073b939d345aeba0aed23e238dbc53b.patch @@ -0,0 +1,575 @@ +commit e0cde0edf073b939d345aeba0aed23e238dbc53b +Author: Shiju Jose +Date: Tue Apr 4 18:49:09 2023 +0100 + + rasdaemon: Add support for the CXL generic events + + Add support to log and record the CXL generic events. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index d4c845e..83ada56 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -56,6 +56,49 @@ static void get_timestamp(struct trace_seq *s, struct tep_record *record, + strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", size); + } + ++struct cxl_event_flags { ++ uint32_t bit; ++ const char *flag; ++}; ++ ++static int decode_cxl_event_flags(struct trace_seq *s, uint32_t flags, ++ const struct cxl_event_flags *cxl_ev_flags, ++ uint8_t num_elems) ++{ ++ int i; ++ ++ for (i = 0; i < num_elems; i++) { ++ if (flags & cxl_ev_flags[i].bit) ++ if (trace_seq_printf(s, "\'%s\' ", cxl_ev_flags[i].flag) <= 0) ++ return -1; ++ } ++ return 0; ++} ++ ++static char *uuid_be(const char *uu) ++{ ++ static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; ++ char *p = uuid; ++ int i; ++ static const unsigned char be[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; ++ ++ for (i = 0; i < 16; i++) { ++ p += sprintf(p, "%.2x", (unsigned char) uu[be[i]]); ++ switch (i) { ++ case 3: ++ case 5: ++ case 7: ++ case 9: ++ *p++ = '-'; ++ break; ++ } ++ } ++ ++ *p = 0; ++ ++ return uuid; ++} ++ + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) + #define CXL_POISON_FLAG_OVERFLOW BIT(1) +@@ -524,3 +567,145 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s, + + return 0; + } ++ ++/* ++ * Common Event Record Format ++ * CXL 3.0 section 8.2.9.2.1; Table 8-42 ++ */ ++#define CXL_EVENT_RECORD_FLAG_PERMANENT BIT(2) ++#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED BIT(3) ++#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED BIT(4) ++#define CXL_EVENT_RECORD_FLAG_HW_REPLACE BIT(5) ++ ++static const struct cxl_event_flags cxl_hdr_flags[] = { ++ { .bit = CXL_EVENT_RECORD_FLAG_PERMANENT, .flag = "PERMANENT_CONDITION" }, ++ { .bit = CXL_EVENT_RECORD_FLAG_MAINT_NEEDED, .flag = "MAINTENANCE_NEEDED" }, ++ { .bit = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, .flag = "PERFORMANCE_DEGRADED" }, ++ { .bit = CXL_EVENT_RECORD_FLAG_HW_REPLACE, .flag = "HARDWARE_REPLACEMENT_NEEDED" }, ++}; ++ ++static int handle_ras_cxl_common_hdr(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context, ++ struct ras_cxl_event_common_hdr *hdr) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ ++ get_timestamp(s, record, ras, (char *)&hdr->timestamp, sizeof(hdr->timestamp)); ++ if (trace_seq_printf(s, "%s ", hdr->timestamp) <= 0) ++ return -1; ++ ++ hdr->memdev = tep_get_field_raw(s, event, "memdev", record, &len, 1); ++ if (!hdr->memdev) ++ return -1; ++ if (trace_seq_printf(s, "memdev:%s ", hdr->memdev) <= 0) ++ return -1; ++ ++ hdr->host = tep_get_field_raw(s, event, "host", record, &len, 1); ++ if (!hdr->host) ++ return -1; ++ if (trace_seq_printf(s, "host:%s ", hdr->host) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) ++ return -1; ++ hdr->serial = val; ++ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)hdr->serial) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "log", record, &val, 1) < 0) ++ return -1; ++ hdr->log_type = cxl_event_log_type_str(val); ++ if (trace_seq_printf(s, "log type:%s ", hdr->log_type) <= 0) ++ return -1; ++ ++ hdr->hdr_uuid = tep_get_field_raw(s, event, "hdr_uuid", record, &len, 1); ++ if (!hdr->hdr_uuid) ++ return -1; ++ hdr->hdr_uuid = uuid_be(hdr->hdr_uuid); ++ if (trace_seq_printf(s, "hdr_uuid:%s ", hdr->hdr_uuid) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_flags", record, &val, 1) < 0) ++ return -1; ++ hdr->hdr_flags = val; ++ if (decode_cxl_event_flags(s, hdr->hdr_flags, cxl_hdr_flags, ++ ARRAY_SIZE(cxl_hdr_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_handle", record, &val, 1) < 0) ++ return -1; ++ hdr->hdr_handle = val; ++ if (trace_seq_printf(s, "hdr_handle:0x%x ", hdr->hdr_handle) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_related_handle", record, &val, 1) < 0) ++ return -1; ++ hdr->hdr_related_handle = val; ++ if (trace_seq_printf(s, "hdr_related_handle:0x%x ", hdr->hdr_related_handle) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_timestamp", record, &val, 1) < 0) ++ return -1; ++ convert_timestamp(val, hdr->hdr_timestamp, sizeof(hdr->hdr_timestamp)); ++ if (trace_seq_printf(s, "hdr_timestamp:%s ", hdr->hdr_timestamp) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_length", record, &val, 1) < 0) ++ return -1; ++ hdr->hdr_length = val; ++ if (trace_seq_printf(s, "hdr_length:%u ", hdr->hdr_length) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hdr_maint_op_class", record, &val, 1) < 0) ++ return -1; ++ hdr->hdr_maint_op_class = val; ++ if (trace_seq_printf(s, "hdr_maint_op_class:%u ", hdr->hdr_maint_op_class) <= 0) ++ return -1; ++ ++ return 0; ++} ++ ++int ras_cxl_generic_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len, i; ++ struct ras_events *ras = context; ++ struct ras_cxl_generic_event ev; ++ const uint8_t *buf; ++ ++ memset(&ev, 0, sizeof(ev)); ++ if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) ++ return -1; ++ ++ ev.data = tep_get_field_raw(s, event, "data", record, &len, 1); ++ if (!ev.data) ++ return -1; ++ i = 0; ++ buf = ev.data; ++ if (trace_seq_printf(s, "\ndata:\n %08x: ", i) <= 0) ++ return -1; ++ for (i = 0; i < CXL_EVENT_RECORD_DATA_LENGTH; i += 4) { ++ if ((i > 0) && ((i % 16) == 0)) ++ if (trace_seq_printf(s, "\n %08x: ", i) <= 0) ++ break; ++ if (trace_seq_printf(s, "%02x%02x%02x%02x ", ++ buf[i], buf[i+1], buf[i+2], buf[i+3]) <= 0) ++ break; ++ } ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_generic_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_generic_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index e7847ec..9f77cb7 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -32,4 +32,7 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, + int ras_cxl_overflow_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int ras_cxl_generic_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index f2a869a..4036933 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -249,6 +249,7 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); + #endif + + free_ras: +@@ -1054,6 +1055,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_overflow"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_generic_event", ++ ras_cxl_generic_event_handler, NULL, CXL_GENERIC_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_generic_event"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 7c869d9..96c299e 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -43,6 +43,7 @@ enum { + CXL_AER_UE_EVENT, + CXL_AER_CE_EVENT, + CXL_OVERFLOW_EVENT, ++ CXL_GENERIC_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index 7b808a5..a65d9c0 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -773,6 +773,79 @@ int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow + + return rc; + } ++ ++static int ras_store_cxl_common_hdr(sqlite3_stmt *stmt, struct ras_cxl_event_common_hdr *hdr) ++{ ++ if (!stmt || !hdr) ++ return 0; ++ ++ sqlite3_bind_text(stmt, 1, hdr->timestamp, -1, NULL); ++ sqlite3_bind_text(stmt, 2, hdr->memdev, -1, NULL); ++ sqlite3_bind_text(stmt, 3, hdr->host, -1, NULL); ++ sqlite3_bind_int64(stmt, 4, hdr->serial); ++ sqlite3_bind_text(stmt, 5, hdr->log_type, -1, NULL); ++ sqlite3_bind_text(stmt, 6, hdr->hdr_uuid, -1, NULL); ++ sqlite3_bind_int(stmt, 7, hdr->hdr_flags); ++ sqlite3_bind_int(stmt, 8, hdr->hdr_handle); ++ sqlite3_bind_int(stmt, 9, hdr->hdr_related_handle); ++ sqlite3_bind_text(stmt, 10, hdr->hdr_timestamp, -1, NULL); ++ sqlite3_bind_int(stmt, 11, hdr->hdr_length); ++ sqlite3_bind_int(stmt, 12, hdr->hdr_maint_op_class); ++ ++ return 0; ++} ++ ++/* ++ * Table and functions to handle cxl:cxl_generic_event ++ */ ++static const struct db_fields cxl_generic_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "hdr_uuid", .type = "TEXT" }, ++ { .name = "hdr_flags", .type = "INTEGER" }, ++ { .name = "hdr_handle", .type = "INTEGER" }, ++ { .name = "hdr_related_handle", .type = "INTEGER" }, ++ { .name = "hdr_ts", .type = "TEXT" }, ++ { .name = "hdr_length", .type = "INTEGER" }, ++ { .name = "hdr_maint_op_class", .type = "INTEGER" }, ++ { .name = "data", .type = "BLOB" }, ++}; ++ ++static const struct db_table_descriptor cxl_generic_event_tab = { ++ .name = "cxl_generic_event", ++ .fields = cxl_generic_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_generic_event_fields), ++}; ++ ++int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_generic_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_generic_event store: %p\n", priv->stmt_cxl_generic_event); ++ ++ ras_store_cxl_common_hdr(priv->stmt_cxl_generic_event, &ev->hdr); ++ sqlite3_bind_blob(priv->stmt_cxl_generic_event, 13, ev->data, ++ CXL_EVENT_RECORD_DATA_LENGTH, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_generic_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do stmt_cxl_generic_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_generic_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset stmt_cxl_generic_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1148,6 +1221,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_generic_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_generic_event, ++ &cxl_generic_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1301,6 +1382,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_overflow_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_generic_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_generic_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_generic_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index 90db6ad..9ecfcda 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -133,6 +133,7 @@ struct ras_cxl_poison_event { + #define SZ_512 0x200 + #define CXL_HEADERLOG_SIZE SZ_512 + #define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) ++#define CXL_EVENT_RECORD_DATA_LENGTH 0x50 + + struct ras_cxl_aer_ue_event { + char timestamp[64]; +@@ -163,6 +164,26 @@ struct ras_cxl_overflow_event { + uint16_t count; + }; + ++struct ras_cxl_event_common_hdr { ++ char timestamp[64]; ++ const char *memdev; ++ const char *host; ++ uint64_t serial; ++ const char *log_type; ++ const char *hdr_uuid; ++ uint32_t hdr_flags; ++ uint16_t hdr_handle; ++ uint16_t hdr_related_handle; ++ char hdr_timestamp[64]; ++ uint8_t hdr_length; ++ uint8_t hdr_maint_op_class; ++}; ++ ++struct ras_cxl_generic_event { ++ struct ras_cxl_event_common_hdr hdr; ++ uint8_t *data; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -176,6 +197,7 @@ struct ras_cxl_poison_event; + struct ras_cxl_aer_ue_event; + struct ras_cxl_aer_ce_event; + struct ras_cxl_overflow_event; ++struct ras_cxl_generic_event; + + #ifdef HAVE_SQLITE3 + +@@ -213,6 +235,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_aer_ue_event; + sqlite3_stmt *stmt_cxl_aer_ce_event; + sqlite3_stmt *stmt_cxl_overflow_event; ++ sqlite3_stmt *stmt_cxl_generic_event; + #endif + }; + +@@ -245,6 +268,7 @@ int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_eve + int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); ++int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -262,6 +286,7 @@ static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_ + static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; ++static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index dbed454..8d7b76a 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -451,6 +451,44 @@ static int set_cxl_overflow_event_backtrace(char *buf, struct ras_cxl_overflow_e + return 0; + } + ++static int set_cxl_generic_event_backtrace(char *buf, struct ras_cxl_generic_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "hdr_uuid=%s\n" \ ++ "hdr_flags=0x%x\n" \ ++ "hdr_handle=0x%x\n" \ ++ "hdr_related_handle=0x%x\n" \ ++ "hdr_timestamp=%s\n" \ ++ "hdr_length=%u\n" \ ++ "hdr_maint_op_class=%u\n", \ ++ ev->hdr.timestamp, \ ++ ev->hdr.memdev, \ ++ ev->hdr.host, \ ++ ev->hdr.serial, \ ++ ev->hdr.log_type, \ ++ ev->hdr.hdr_uuid, \ ++ ev->hdr.hdr_flags, \ ++ ev->hdr.hdr_handle, \ ++ ev->hdr.hdr_related_handle, \ ++ ev->hdr.hdr_timestamp, \ ++ ev->hdr.hdr_length, \ ++ ev->hdr.hdr_maint_op_class); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -500,6 +538,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_OVERFLOW_EVENT: + rc = set_cxl_overflow_event_backtrace(buf, (struct ras_cxl_overflow_event *)ev); + break; ++ case CXL_GENERIC_EVENT: ++ rc = set_cxl_generic_event_backtrace(buf, (struct ras_cxl_generic_event *)ev); ++ break; + default: + return -1; + } +@@ -1084,3 +1125,48 @@ cxl_overflow_fail: + else + return -1; + } ++ ++int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_generic_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_GENERIC_EVENT, ev); ++ if (rc < 0) ++ goto cxl_generic_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_generic_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_generic_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL Generic Event "); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_generic_fail; ++ ++ done = 1; ++ ++cxl_generic_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++ ++} +diff --git a/ras-report.h b/ras-report.h +index 204d485..bf591a6 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -43,6 +43,7 @@ int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_ev + int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); ++int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + + #else + +@@ -58,6 +59,7 @@ static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras + static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; ++static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + + #endif + diff --git a/f63b4c942e19a0da1e85a88783ed6e222ad4bdba.patch b/f63b4c942e19a0da1e85a88783ed6e222ad4bdba.patch new file mode 100644 index 0000000..c5103a9 --- /dev/null +++ b/f63b4c942e19a0da1e85a88783ed6e222ad4bdba.patch @@ -0,0 +1,536 @@ +commit f63b4c942e19a0da1e85a88783ed6e222ad4bdba +Author: Shiju Jose +Date: Wed Apr 5 16:16:19 2023 +0100 + + rasdaemon: Add support for the CXL memory module events + + Add support to log and record the CXL memory module events. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 64b0b50..a0b6780 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -1016,3 +1016,159 @@ int ras_cxl_dram_event_handler(struct trace_seq *s, + + return 0; + } ++ ++/* ++ * Memory Module Event Record - MMER ++ * ++ * CXL res 3.0 section 8.2.9.2.1.3; Table 8-45 ++ */ ++static const char* cxl_dev_evt_type[] = { ++ "Health Status Change", ++ "Media Status Change", ++ "Life Used Change", ++ "Temperature Change", ++ "Data Path Error", ++ "LSA Error", ++}; ++ ++/* ++ * Device Health Information - DHI ++ * ++ * CXL res 3.0 section 8.2.9.8.3.1; Table 8-100 ++ */ ++#define CXL_DHI_HS_MAINTENANCE_NEEDED BIT(0) ++#define CXL_DHI_HS_PERFORMANCE_DEGRADED BIT(1) ++#define CXL_DHI_HS_HW_REPLACEMENT_NEEDED BIT(2) ++ ++static const struct cxl_event_flags cxl_health_status[] = { ++ { .bit = CXL_DHI_HS_MAINTENANCE_NEEDED, .flag = "MAINTENANCE_NEEDED" }, ++ { .bit = CXL_DHI_HS_PERFORMANCE_DEGRADED, .flag = "PERFORMANCE_DEGRADED" }, ++ { .bit = CXL_DHI_HS_HW_REPLACEMENT_NEEDED, .flag = "REPLACEMENT_NEEDED" }, ++}; ++ ++static const char* cxl_media_status[] = { ++ "Normal", ++ "Not Ready", ++ "Write Persistency Lost", ++ "All Data Lost", ++ "Write Persistency Loss in the Event of Power Loss", ++ "Write Persistency Loss in Event of Shutdown", ++ "Write Persistency Loss Imminent", ++ "All Data Loss in Event of Power Loss", ++ "All Data loss in the Event of Shutdown", ++ "All Data Loss Imminent", ++}; ++ ++static const char* cxl_two_bit_status[] = { ++ "Normal", ++ "Warning", ++ "Critical", ++}; ++ ++static const char* cxl_one_bit_status[] = { ++ "Normal", ++ "Warning", ++}; ++ ++#define CXL_DHI_AS_LIFE_USED(as) (as & 0x3) ++#define CXL_DHI_AS_DEV_TEMP(as) ((as & 0xC) >> 2) ++#define CXL_DHI_AS_COR_VOL_ERR_CNT(as) ((as & 0x10) >> 4) ++#define CXL_DHI_AS_COR_PER_ERR_CNT(as) ((as & 0x20) >> 5) ++ ++int ras_cxl_memory_module_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ unsigned long long val; ++ struct ras_events *ras = context; ++ struct ras_cxl_memory_module_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "event_type", record, &val, 1) < 0) ++ return -1; ++ ev.event_type = val; ++ if (trace_seq_printf(s, "event_type:%s ", get_cxl_type_str(cxl_dev_evt_type, ++ ARRAY_SIZE(cxl_dev_evt_type), ev.event_type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "health_status", record, &val, 1) < 0) ++ return -1; ++ ev.health_status = val; ++ if (trace_seq_printf(s, "health_status:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.health_status, cxl_health_status, ++ ARRAY_SIZE(cxl_health_status)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "media_status", record, &val, 1) < 0) ++ return -1; ++ ev.media_status = val; ++ if (trace_seq_printf(s, "media_status:%s ", get_cxl_type_str(cxl_media_status, ++ ARRAY_SIZE(cxl_media_status), ev.media_status)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "add_status", record, &val, 1) < 0) ++ return -1; ++ ev.add_status = val; ++ if (trace_seq_printf(s, "as_life_used:%s ", get_cxl_type_str(cxl_two_bit_status, ++ ARRAY_SIZE(cxl_two_bit_status), ++ CXL_DHI_AS_LIFE_USED(ev.add_status))) <= 0) ++ return -1; ++ if (trace_seq_printf(s, "as_dev_temp:%s ", get_cxl_type_str(cxl_two_bit_status, ++ ARRAY_SIZE(cxl_two_bit_status), ++ CXL_DHI_AS_DEV_TEMP(ev.add_status))) <= 0) ++ return -1; ++ if (trace_seq_printf(s, "as_cor_vol_err_cnt:%s ", get_cxl_type_str(cxl_one_bit_status, ++ ARRAY_SIZE(cxl_one_bit_status), ++ CXL_DHI_AS_COR_VOL_ERR_CNT(ev.add_status))) <= 0) ++ return -1; ++ if (trace_seq_printf(s, "as_cor_per_err_cnt:%s ", get_cxl_type_str(cxl_one_bit_status, ++ ARRAY_SIZE(cxl_one_bit_status), ++ CXL_DHI_AS_COR_PER_ERR_CNT(ev.add_status))) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "life_used", record, &val, 1) < 0) ++ return -1; ++ ev.life_used = val; ++ if (trace_seq_printf(s, "life_used:%u ", ev.life_used) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "device_temp", record, &val, 1) < 0) ++ return -1; ++ ev.device_temp = val; ++ if (trace_seq_printf(s, "device_temp:%u ", ev.device_temp) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dirty_shutdown_cnt", record, &val, 1) < 0) ++ return -1; ++ ev.dirty_shutdown_cnt = val; ++ if (trace_seq_printf(s, "dirty_shutdown_cnt:%u ", ev.dirty_shutdown_cnt) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "cor_vol_err_cnt", record, &val, 1) < 0) ++ return -1; ++ ev.cor_vol_err_cnt = val; ++ if (trace_seq_printf(s, "cor_vol_err_cnt:%u ", ev.cor_vol_err_cnt) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "cor_per_err_cnt", record, &val, 1) < 0) ++ return -1; ++ ev.cor_per_err_cnt = val; ++ if (trace_seq_printf(s, "cor_per_err_cnt:%u ", ev.cor_per_err_cnt) <= 0) ++ return -1; ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_memory_module_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_memory_module_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 35455af..1ea0f93 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -41,4 +41,7 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s, + int ras_cxl_dram_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int ras_cxl_memory_module_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index d27e0c4..a82dab2 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -252,6 +252,7 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_dram", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_memory_module", enable); + #endif + + free_ras: +@@ -1081,6 +1082,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_dram"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_memory_module", ++ ras_cxl_memory_module_event_handler, NULL, CXL_MEMORY_MODULE_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "memory_module"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index d192a6b..c4d54e3 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -46,6 +46,7 @@ enum { + CXL_GENERIC_EVENT, + CXL_GENERAL_MEDIA_EVENT, + CXL_DRAM_EVENT, ++ CXL_MEMORY_MODULE_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index fffa81c..a5f99ae 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -992,6 +992,74 @@ int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event * + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_memory_module_event ++ */ ++static const struct db_fields cxl_memory_module_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "hdr_uuid", .type = "TEXT" }, ++ { .name = "hdr_flags", .type = "INTEGER" }, ++ { .name = "hdr_handle", .type = "INTEGER" }, ++ { .name = "hdr_related_handle", .type = "INTEGER" }, ++ { .name = "hdr_ts", .type = "TEXT" }, ++ { .name = "hdr_length", .type = "INTEGER" }, ++ { .name = "hdr_maint_op_class", .type = "INTEGER" }, ++ { .name = "event_type", .type = "INTEGER" }, ++ { .name = "health_status", .type = "INTEGER" }, ++ { .name = "media_status", .type = "INTEGER" }, ++ { .name = "life_used", .type = "INTEGER" }, ++ { .name = "dirty_shutdown_cnt", .type = "INTEGER" }, ++ { .name = "cor_vol_err_cnt", .type = "INTEGER" }, ++ { .name = "cor_per_err_cnt", .type = "INTEGER" }, ++ { .name = "device_temp", .type = "INTEGER" }, ++ { .name = "add_status", .type = "INTEGER" }, ++}; ++ ++static const struct db_table_descriptor cxl_memory_module_event_tab = { ++ .name = "cxl_memory_module_event", ++ .fields = cxl_memory_module_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_memory_module_event_fields), ++}; ++ ++int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_memory_module_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_memory_module_event store: %p\n", ++ priv->stmt_cxl_memory_module_event); ++ ++ ras_store_cxl_common_hdr(priv->stmt_cxl_memory_module_event, &ev->hdr); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 13, ev->event_type); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 14, ev->health_status); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 15, ev->media_status); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 16, ev->life_used); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 17, ev->dirty_shutdown_cnt); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 18, ev->cor_vol_err_cnt); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 19, ev->cor_per_err_cnt); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 20, ev->device_temp); ++ sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 21, ev->add_status); ++ ++ rc = sqlite3_step(priv->stmt_cxl_memory_module_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do stmt_cxl_memory_module_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_memory_module_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset stmt_cxl_memory_module_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1391,6 +1459,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_memory_module_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_memory_module_event, ++ &cxl_memory_module_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1568,6 +1644,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_dram_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_memory_module_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_memory_module_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize stmt_cxl_memory_module_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index 480ff92..a7b9ab9 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -218,6 +218,19 @@ struct ras_cxl_dram_event { + uint16_t validity_flags; + }; + ++struct ras_cxl_memory_module_event { ++ struct ras_cxl_event_common_hdr hdr; ++ uint8_t event_type; ++ uint8_t health_status; ++ uint8_t media_status; ++ uint8_t life_used; ++ uint32_t dirty_shutdown_cnt; ++ uint32_t cor_vol_err_cnt; ++ uint32_t cor_per_err_cnt; ++ int16_t device_temp; ++ uint8_t add_status; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -234,6 +247,7 @@ struct ras_cxl_overflow_event; + struct ras_cxl_generic_event; + struct ras_cxl_general_media_event; + struct ras_cxl_dram_event; ++struct ras_cxl_memory_module_event; + + #ifdef HAVE_SQLITE3 + +@@ -274,6 +288,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_generic_event; + sqlite3_stmt *stmt_cxl_general_media_event; + sqlite3_stmt *stmt_cxl_dram_event; ++ sqlite3_stmt *stmt_cxl_memory_module_event; + #endif + }; + +@@ -309,6 +324,7 @@ int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow + int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); + int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); ++int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -329,6 +345,7 @@ static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ra + static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; + static inline int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; ++static inline int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 21180b1..a30b66d 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -605,6 +605,62 @@ static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev + return 0; + } + ++static int set_cxl_memory_module_event_backtrace(char *buf, struct ras_cxl_memory_module_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "hdr_uuid=%s\n" \ ++ "hdr_flags=0x%x\n" \ ++ "hdr_handle=0x%x\n" \ ++ "hdr_related_handle=0x%x\n" \ ++ "hdr_timestamp=%s\n" \ ++ "hdr_length=%u\n" \ ++ "hdr_maint_op_class=%u\n" \ ++ "event_type=%u\n" \ ++ "health_status=%u\n" \ ++ "media_status=%u\n" \ ++ "life_used=%u\n" \ ++ "dirty_shutdown_cnt=%u\n" \ ++ "cor_vol_err_cnt=%u\n" \ ++ "cor_per_err_cnt=%u\n" \ ++ "device_temp=%d\n" \ ++ "add_status=%u\n", \ ++ ev->hdr.timestamp, \ ++ ev->hdr.memdev, \ ++ ev->hdr.host, \ ++ ev->hdr.serial, \ ++ ev->hdr.log_type, \ ++ ev->hdr.hdr_uuid, \ ++ ev->hdr.hdr_flags, \ ++ ev->hdr.hdr_handle, \ ++ ev->hdr.hdr_related_handle, \ ++ ev->hdr.hdr_timestamp, \ ++ ev->hdr.hdr_length, \ ++ ev->hdr.hdr_maint_op_class, \ ++ ev->event_type, \ ++ ev->health_status, \ ++ ev->media_status, \ ++ ev->life_used, \ ++ ev->dirty_shutdown_cnt, \ ++ ev->cor_vol_err_cnt, \ ++ ev->cor_per_err_cnt, \ ++ ev->device_temp, \ ++ ev->add_status); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -663,6 +719,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_DRAM_EVENT: + rc = set_cxl_dram_event_backtrace(buf, (struct ras_cxl_dram_event *)ev); + break; ++ case CXL_MEMORY_MODULE_EVENT: ++ rc = set_cxl_memory_module_event_backtrace(buf, (struct ras_cxl_memory_module_event *)ev); ++ break; + default: + return -1; + } +@@ -1380,3 +1439,47 @@ cxl_dram_fail: + else + return -1; + } ++ ++int ras_report_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_memory_module_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_MEMORY_MODULE_EVENT, ev); ++ if (rc < 0) ++ goto cxl_memory_module_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_memory_module_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_memory_module_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL Memory Module Event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_memory_module_fail; ++ ++ done = 1; ++ ++cxl_memory_module_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index 1ad00e0..e401850 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -46,6 +46,7 @@ int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflo + int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); + int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); ++int ras_report_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev); + + #else + +@@ -64,6 +65,7 @@ static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct r + static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; + static inline int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; ++static inline int ras_report_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev) { return 0; }; + + #endif + diff --git a/f73ed45b91244eb3986ac2574cd7d36ae1d4d22a.patch b/f73ed45b91244eb3986ac2574cd7d36ae1d4d22a.patch new file mode 100644 index 0000000..2157647 --- /dev/null +++ b/f73ed45b91244eb3986ac2574cd7d36ae1d4d22a.patch @@ -0,0 +1,435 @@ +commit f73ed45b91244eb3986ac2574cd7d36ae1d4d22a +Author: Shiju Jose +Date: Tue Apr 4 16:50:50 2023 +0100 + + rasdaemon: Add support for the CXL overflow events + + Add support to log and record the CXL overflow events. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index d540ebb..d4c845e 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -426,3 +426,101 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, + + return 0; + } ++ ++/* ++ * CXL rev 3.0 section 8.2.9.2.2; Table 8-49 ++ */ ++enum cxl_event_log_type { ++ CXL_EVENT_TYPE_INFO = 0x00, ++ CXL_EVENT_TYPE_WARN, ++ CXL_EVENT_TYPE_FAIL, ++ CXL_EVENT_TYPE_FATAL, ++ CXL_EVENT_TYPE_UNKNOWN ++}; ++ ++static char *cxl_event_log_type_str(uint32_t log_type) ++{ ++ ++ switch (log_type) { ++ case CXL_EVENT_TYPE_INFO: ++ return "Informational"; ++ case CXL_EVENT_TYPE_WARN: ++ return "Warning"; ++ case CXL_EVENT_TYPE_FAIL: ++ return "Failure"; ++ case CXL_EVENT_TYPE_FATAL: ++ return "Fatal"; ++ default: ++ break; ++ } ++ ++ return "Unknown"; ++} ++ ++int ras_cxl_overflow_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ struct ras_cxl_overflow_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); ++ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) ++ return -1; ++ ++ ev.memdev = tep_get_field_raw(s, event, "memdev", record, &len, 1); ++ if (!ev.memdev) ++ return -1; ++ if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0) ++ return -1; ++ ++ ev.host = tep_get_field_raw(s, event, "host", record, &len, 1); ++ if (!ev.host) ++ return -1; ++ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) ++ return -1; ++ ev.serial = val; ++ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "log", record, &val, 1) < 0) ++ return -1; ++ ev.log_type = cxl_event_log_type_str(val); ++ if (trace_seq_printf(s, "log type:%s ", ev.log_type) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "count", record, &val, 1) < 0) ++ return -1; ++ ev.count = val; ++ ++ if (tep_get_field_val(s, event, "first_ts", record, &val, 1) < 0) ++ return -1; ++ convert_timestamp(val, ev.first_ts, sizeof(ev.first_ts)); ++ ++ if (tep_get_field_val(s, event, "last_ts", record, &val, 1) < 0) ++ return -1; ++ convert_timestamp(val, ev.last_ts, sizeof(ev.last_ts)); ++ ++ if (ev.count) { ++ if (trace_seq_printf(s, "%u errors from %s to %s\n", ++ ev.count, ev.first_ts, ev.last_ts) <= 0) ++ return -1; ++ } ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_overflow_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_overflow_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 711daf4..e7847ec 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -29,4 +29,7 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s, + int ras_cxl_aer_ce_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int ras_cxl_overflow_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index d0251e0..f2a869a 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -248,6 +248,7 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_uncorrectable_error", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); + #endif + + free_ras: +@@ -1045,6 +1046,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_aer_correctable_error"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_overflow", ++ ras_cxl_overflow_event_handler, NULL, CXL_OVERFLOW_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_overflow"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index a9d67c2..7c869d9 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -42,6 +42,7 @@ enum { + CXL_POISON_EVENT, + CXL_AER_UE_EVENT, + CXL_AER_CE_EVENT, ++ CXL_OVERFLOW_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index 86133c4..7b808a5 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -720,6 +720,59 @@ int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_eve + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_overflow ++ */ ++static const struct db_fields cxl_overflow_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "count", .type = "INTEGER" }, ++ { .name = "first_ts", .type = "TEXT" }, ++ { .name = "last_ts", .type = "TEXT" }, ++}; ++ ++static const struct db_table_descriptor cxl_overflow_event_tab = { ++ .name = "cxl_overflow_event", ++ .fields = cxl_overflow_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_overflow_event_fields), ++}; ++ ++int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_overflow_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_overflow_event store: %p\n", priv->stmt_cxl_overflow_event); ++ ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 2, ev->memdev, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 3, ev->host, -1, NULL); ++ sqlite3_bind_int64(priv->stmt_cxl_overflow_event, 4, ev->serial); ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 5, ev->log_type, -1, NULL); ++ sqlite3_bind_int(priv->stmt_cxl_overflow_event, 6, ev->count); ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 7, ev->first_ts, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_overflow_event, 8, ev->last_ts, -1, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_overflow_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do cxl_overflow_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_overflow_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset cxl_overflow_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1087,6 +1140,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_overflow_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_overflow_event, ++ &cxl_overflow_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1232,6 +1293,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_aer_ce_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_overflow_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_overflow_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_overflow_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index ab7153d..90db6ad 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -152,6 +152,17 @@ struct ras_cxl_aer_ce_event { + uint32_t error_status; + }; + ++struct ras_cxl_overflow_event { ++ char timestamp[64]; ++ const char *memdev; ++ const char *host; ++ uint64_t serial; ++ const char *log_type; ++ char first_ts[64]; ++ char last_ts[64]; ++ uint16_t count; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -164,6 +175,7 @@ struct ras_mf_event; + struct ras_cxl_poison_event; + struct ras_cxl_aer_ue_event; + struct ras_cxl_aer_ce_event; ++struct ras_cxl_overflow_event; + + #ifdef HAVE_SQLITE3 + +@@ -200,6 +212,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_poison_event; + sqlite3_stmt *stmt_cxl_aer_ue_event; + sqlite3_stmt *stmt_cxl_aer_ce_event; ++ sqlite3_stmt *stmt_cxl_overflow_event; + #endif + }; + +@@ -231,6 +244,7 @@ int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); ++int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -247,6 +261,7 @@ static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event + static inline int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; ++static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 63b47f5..dbed454 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -421,6 +421,36 @@ static int set_cxl_aer_ce_event_backtrace(char *buf, struct ras_cxl_aer_ce_event + return 0; + } + ++static int set_cxl_overflow_event_backtrace(char *buf, struct ras_cxl_overflow_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "count=%u\n" \ ++ "first_ts=%s\n" \ ++ "last_ts=%s\n", \ ++ ev->timestamp, \ ++ ev->memdev, \ ++ ev->host, \ ++ ev->serial, \ ++ ev->log_type, \ ++ ev->count, \ ++ ev->first_ts, \ ++ ev->last_ts); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -467,6 +497,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_AER_CE_EVENT: + rc = set_cxl_aer_ce_event_backtrace(buf, (struct ras_cxl_aer_ce_event *)ev); + break; ++ case CXL_OVERFLOW_EVENT: ++ rc = set_cxl_overflow_event_backtrace(buf, (struct ras_cxl_overflow_event *)ev); ++ break; + default: + return -1; + } +@@ -1007,3 +1040,47 @@ cxl_aer_ce_fail: + else + return -1; + } ++ ++int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_overflow_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_OVERFLOW_EVENT, ev); ++ if (rc < 0) ++ goto cxl_overflow_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl-overflow"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_overflow_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL overflow"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_overflow_fail; ++ ++ done = 1; ++ ++cxl_overflow_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index 46155ee..204d485 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -42,6 +42,7 @@ int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); + int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev); + int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); ++int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + + #else + +@@ -56,6 +57,7 @@ static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_even + static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_event *ev) { return 0; }; + static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; ++static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + + #endif + diff --git a/f8b6da812eddc063ea739970f941fdd24fb984ae.patch b/f8b6da812eddc063ea739970f941fdd24fb984ae.patch new file mode 100644 index 0000000..ee8d818 --- /dev/null +++ b/f8b6da812eddc063ea739970f941fdd24fb984ae.patch @@ -0,0 +1,199 @@ +commit 70acd500302d2db318bb0e35b551f74fd4baebc4 +Author: Shiju Jose +Date: Mon Feb 12 10:27:58 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL AER uncorrectable trace events + + Add support for CXL AER uncorrectable events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit f8b6da812eddc063ea739970f941fdd24fb984ae) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 1cc19b3..c0a2ec6 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -43,6 +43,7 @@ my $modprobe = find_prog ("modprobe") or exit (1); + + my $has_aer = 0; + my $has_arm = 0; ++my $has_cxl = 0; + my $has_devlink = 0; + my $has_disk_errors = 0; + my $has_extlog = 0; +@@ -51,6 +52,7 @@ my $has_mce = 0; + + @WITH_AER_TRUE@$has_aer = 1; + @WITH_ARM_TRUE@$has_arm = 1; ++@WITH_CXL_TRUE@$has_cxl = 1; + @WITH_DEVLINK_TRUE@$has_devlink = 1; + @WITH_DISKERROR_TRUE@$has_disk_errors = 1; + @WITH_EXTLOG_TRUE@$has_extlog = 1; +@@ -1156,6 +1158,78 @@ sub get_uuid_le + return $out; + } + ++use constant { ++ CXL_AER_UE_CACHE_DATA_PARITY => 0x0001, ++ CXL_AER_UE_CACHE_ADDR_PARITY => 0x0002, ++ CXL_AER_UE_CACHE_BE_PARITY => 0x0004, ++ CXL_AER_UE_CACHE_DATA_ECC => 0x0008, ++ CXL_AER_UE_MEM_DATA_PARITY => 0x0010, ++ CXL_AER_UE_MEM_ADDR_PARITY => 0x0020, ++ CXL_AER_UE_MEM_BE_PARITY => 0x0040, ++ CXL_AER_UE_MEM_DATA_ECC => 0x0080, ++ CXL_AER_UE_REINIT_THRESH => 0x0100, ++ CXL_AER_UE_RSVD_ENCODE => 0x0200, ++ CXL_AER_UE_POISON => 0x0400, ++ CXL_AER_UE_RECV_OVERFLOW => 0x0800, ++ CXL_AER_UE_INTERNAL_ERR => 0x4000, ++ CXL_AER_UE_IDE_TX_ERR => 0x8000, ++ CXL_AER_UE_IDE_RX_ERR => 0x10000, ++}; ++ ++sub get_cxl_ue_error_status_text ++{ ++ my $error_status = $_[0]; ++ my @out; ++ ++ if ($error_status & CXL_AER_UE_CACHE_DATA_PARITY) { ++ push @out, (sprintf "\'Cache Data Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_CACHE_ADDR_PARITY) { ++ push @out, (sprintf "\'Cache Address Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_CACHE_BE_PARITY) { ++ push @out, (sprintf "\'Cache Byte Enable Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_CACHE_DATA_ECC) { ++ push @out, (sprintf "\'Cache Data ECC Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_MEM_DATA_PARITY) { ++ push @out, (sprintf "\'Memory Data Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_MEM_ADDR_PARITY) { ++ push @out, (sprintf "\'Memory Address Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_MEM_BE_PARITY) { ++ push @out, (sprintf "\'Memory Byte Enable Parity Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_MEM_DATA_ECC) { ++ push @out, (sprintf "\'Memory Data ECC Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_REINIT_THRESH) { ++ push @out, (sprintf "\'REINIT Threshold Hit\' "); ++ } ++ if ($error_status & CXL_AER_UE_RSVD_ENCODE) { ++ push @out, (sprintf "\'Received Unrecognized Encoding\' "); ++ } ++ if ($error_status & CXL_AER_UE_POISON) { ++ push @out, (sprintf "\'Received Poison From Peer\' "); ++ } ++ if ($error_status & CXL_AER_UE_RECV_OVERFLOW) { ++ push @out, (sprintf "\'Receiver Overflow\' "); ++ } ++ if ($error_status & CXL_AER_UE_INTERNAL_ERR) { ++ push @out, (sprintf "\'Component Specific Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_IDE_TX_ERR) { ++ push @out, (sprintf "\'IDE Tx Error\' "); ++ } ++ if ($error_status & CXL_AER_UE_IDE_RX_ERR) { ++ push @out, (sprintf "\'IDE Rx Error\' "); ++ } ++ ++ return join (", ", @out); ++} ++ + sub summary + { + require DBI; +@@ -1163,7 +1237,7 @@ sub summary + my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result); + my ($etype, $severity, $etype_string, $severity_string); + my ($dev_name, $dev); +- my ($mpidr); ++ my ($mpidr, $memdev); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1219,6 +1293,25 @@ sub summary + $query_handle->finish; + } + ++ # CXL errors ++ if ($has_cxl == 1) { ++ # CXL AER uncorrectable errors ++ $query = "select memdev, count(*) from cxl_aer_ue_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL AER uncorrectable events summary:\n$out\n"; ++ } else { ++ print "No CXL AER uncorrectable errors.\n\n"; ++ } ++ $query_handle->finish; ++ } ++ + # extlog errors + if ($has_extlog == 1) { + $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; +@@ -1324,6 +1417,7 @@ sub errors + my ($dev, $sector, $nr_sector, $error, $rwbs, $cmd); + my ($error_count, $affinity, $mpidr, $r_state, $psci_state); + my ($pfn, $page_type, $action_result); ++ my ($memdev, $host, $serial, $error_status, $first_error, $header_log); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1389,6 +1483,44 @@ sub errors + $query_handle->finish; + } + ++ # CXL errors ++ if ($has_cxl == 1) { ++ # CXL AER uncorrectable errors ++ use constant SZ_512 => 0x200; ++ use constant CXL_HEADERLOG_SIZE_U32 => SZ_512/32; ++ $query = "select id, timestamp, memdev, host, serial, error_status, first_error, header_log from cxl_aer_ue_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $error_status, $first_error, $header_log)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ if (defined $error_status && length $error_status) { ++ $out .= sprintf "error_status: %s, ", get_cxl_ue_error_status_text($error_status); ++ } ++ if (defined $first_error && length $first_error) { ++ $out .= sprintf "first_error: %s, ", get_cxl_ue_error_status_text($first_error); ++ } ++ if (defined $header_log && length $header_log) { ++ $out .= sprintf "header_log:\n"; ++ my @bytes = unpack "C*", $header_log; ++ for (my $i = 0; $i < CXL_HEADERLOG_SIZE_U32; $i++) { ++ $out .= sprintf "%08x ", $bytes[$i]; ++ } ++ } ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL AER uncorrectable events:\n$out\n"; ++ } else { ++ print "No CXL AER uncorrectable errors.\n\n"; ++ } ++ $query_handle->finish; ++ } ++ + # Extlog errors + if ($has_extlog == 1) { + $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; diff --git a/fd11670d2d35c5d939b03ba1ca80eb81c1f636b6.patch b/fd11670d2d35c5d939b03ba1ca80eb81c1f636b6.patch new file mode 100644 index 0000000..2215c83 --- /dev/null +++ b/fd11670d2d35c5d939b03ba1ca80eb81c1f636b6.patch @@ -0,0 +1,127 @@ +commit dba1c58ef5802b96b6555cb42e3cf7f75fa0da8c +Author: Shiju Jose +Date: Mon Feb 12 10:56:25 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL generic trace events + + Add support for CXL generic events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit fd11670d2d35c5d939b03ba1ca80eb81c1f636b6) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 16b0589..5528021 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1270,6 +1270,34 @@ sub get_cxl_ce_error_status_text + return join (", ", @out); + } + ++use constant { ++ CXL_EVENT_RECORD_FLAG_PERMANENT => 0x0004, ++ CXL_EVENT_RECORD_FLAG_MAINT_NEEDED => 0x0008, ++ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED => 0x0010, ++ CXL_EVENT_RECORD_FLAG_HW_REPLACE => 0x0020, ++}; ++ ++sub get_cxl_hdr_flags_text ++{ ++ my $flags = $_[0]; ++ my @out; ++ ++ if ($flags & CXL_EVENT_RECORD_FLAG_PERMANENT) { ++ push @out, (sprintf "\'PERMANENT_CONDITION\' "); ++ } ++ if ($flags & CXL_EVENT_RECORD_FLAG_MAINT_NEEDED) { ++ push @out, (sprintf "\'MAINTENANCE_NEEDED\' "); ++ } ++ if ($flags & CXL_EVENT_RECORD_FLAG_PERF_DEGRADED) { ++ push @out, (sprintf "\'PERFORMANCE_DEGRADED\' "); ++ } ++ if ($flags & CXL_EVENT_RECORD_FLAG_HW_REPLACE) { ++ push @out, (sprintf "\'HARDWARE_REPLACEMENT_NEEDED\' "); ++ } ++ ++ return join (", ", @out); ++} ++ + sub summary + { + require DBI; +@@ -1398,6 +1426,22 @@ sub summary + print "No CXL poison errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL generic errors ++ $query = "select memdev, count(*) from cxl_generic_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL generic events summary:\n$out\n"; ++ } else { ++ print "No CXL generic errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1508,6 +1552,7 @@ sub errors + my ($memdev, $host, $serial, $error_status, $first_error, $header_log); + my ($log_type, $first_ts, $last_ts); + my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts); ++ my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1681,6 +1726,44 @@ sub errors + } else { + print "No CXL poison errors.\n\n"; + } ++ ++ # CXL generic errors ++ use constant CXL_EVENT_RECORD_DATA_LENGTH => 0x50; ++ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, hdr_maint_op_class, data from cxl_generic_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "log=$log_type, " if (defined $log_type && length $log_type); ++ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid); ++ $out .= sprintf "hdr_flags=0x%llx %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags); ++ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle); ++ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle); ++ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts); ++ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length); ++ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class); ++ if (defined $data && length $data) { ++ $out .= sprintf "data:\n"; ++ my @bytes = unpack "C*", $data; ++ for (my $i = 0; $i < CXL_EVENT_RECORD_DATA_LENGTH; $i++) { ++ if (($i > 0) && (($i % 16) == 0)) { ++ $out .= sprintf "\n %08x: ", $i; ++ } ++ $out .= sprintf "%02x%02x%02x%02x ", $bytes[$i], $bytes[$i + 1], $bytes[$i + 2], $bytes[$i + 3]; ++ } ++ } ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL generic events:\n$out\n"; ++ } else { ++ print "No CXL generic errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/rasdaemon.spec b/rasdaemon.spec index e656119..b9f787d 100644 --- a/rasdaemon.spec +++ b/rasdaemon.spec @@ -1,12 +1,84 @@ Name: rasdaemon Version: 0.8.0 -Release: 6%{?dist} +Release: 7%{?dist} Summary: Utility to receive RAS error tracings Group: Applications/System License: GPLv2 URL: http://git.infradead.org/users/mchehab/rasdaemon.git Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2 +# https://github.com/mchehab/rasdaemon/pull/96 +# Add support for CXL poison and AER error events (4 patches) + +# rasdaemon: Move definition for BIT and BIT_ULL to a common file +Patch0: d3836aa061f677232f99c514247d3dbf80812a1b.patch + +# rasdaemon: Add support for the CXL poison events +Patch1: 75c8fec559641f843345ef8fbc36d124b60b914d.patch + +# rasdaemon: Add support for the CXL AER uncorrectable errors +Patch2: a7524917befe7e67c02253cc27cb0c724e5992c0.patch + +# rasdaemon: Add support for the CXL AER correctable errors +Patch3: a247baf7110ab6427259eb1421a103e2021a8735.patch + +# https://github.com/mchehab/rasdaemon/pull/104 +# rasdaemon: Process the generic CXL trace events (7 patches) + +# rasdaemon: Add common function to convert timestamp in the CXL event records to the broken-down time format +Patch4: 2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch + +# rasdaemon: Add common function to get timestamp for the event +Patch5: 7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch + +# rasdaemon: Add support for the CXL overflow events +Patch6: f73ed45b91244eb3986ac2574cd7d36ae1d4d22a.patch + +# rasdaemon: Add support for the CXL generic events +Patch7: e0cde0edf073b939d345aeba0aed23e238dbc53b.patch + +# rasdaemon: Add support for the CXL general media events +Patch8: 53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch + +# rasdaemon: Add support for the CXL dram events +Patch9: 9a2f6186db2622788f8868d8ec082684d6a06d4d.patch + +# rasdaemon: Add support for the CXL memory module events +Patch10: f63b4c942e19a0da1e85a88783ed6e222ad4bdba.patch + +# https://github.com/mchehab/rasdaemon/pull/149 +# rasdaemon: generic fixes and ras-mc-ctl: add support for CXL error events (10 patches) + +# rasdaemon: Fix build warnings unused variable if AMP RAS errors is not enabled +Patch11: 8f79833e3d78424f4a594985fbeb91890f4af81c.patch + +# rasdaemon: ras-memory-failure-handler: update memory failure action page types +Patch12: 31c7578ddb0fc15aa7247f2b8885956540031221.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL AER uncorrectable trace events +Patch13: f8b6da812eddc063ea739970f941fdd24fb984ae.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL AER correctable trace events +Patch14: ae1647624486fca0070b297d0e2fd4e53443c10b.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL overflow trace events +Patch15: b22cb067755f4604770f9864a0babed8f93a1553.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL poison trace events +Patch16: 93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL generic trace events +Patch17: fd11670d2d35c5d939b03ba1ca80eb81c1f636b6.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL general media trace events +Patch18: 572de9d57691be9e630abee9ffa56a2fb155d558.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL DRAM trace events +Patch19: c38c14afc5d7bb6c8c52d1023271d755deb23008.patch + +# rasdaemon: ras-mc-ctl: Add support for CXL memory module trace events +Patch20: aee13f74266382c64128bd7367a5eeb46277f490.patch + ExcludeArch: s390 s390x BuildRequires: make BuildRequires: gcc @@ -40,6 +112,27 @@ an utility for reporting current error counts from the EDAC sysfs files. %prep %setup -q +%patch0 -p1 +%patch1 -p1 +%patch2 -p1 +%patch3 -p1 +%patch4 -p1 +%patch5 -p1 +%patch6 -p1 +%patch7 -p1 +%patch8 -p1 +%patch9 -p1 +%patch10 -p1 +%patch11 -p1 +%patch12 -p1 +%patch13 -p1 +%patch14 -p1 +%patch15 -p1 +%patch16 -p1 +%patch17 -p1 +%patch18 -p1 +%patch19 -p1 +%patch20 -p1 autoreconf -vfi %build @@ -48,11 +141,13 @@ autoreconf -vfi --enable-mce --enable-extlog --enable-devlink --enable-diskerror \ --enable-memory-failure --enable-abrt-report --enable-hisi-ns-decode \ --enable-memory-ce-pfa --enable-amp-ns-decode --enable-cpu-fault-isolation \ + --enable-cxl \ --with-sysconfdefdir=%{_sysconfdir}/sysconfig %else %configure --enable-sqlite3 --enable-aer \ --enable-mce --enable-extlog --enable-devlink --enable-diskerror \ --enable-memory-failure --enable-abrt-report --enable-cpu-fault-isolation \ + --enable-cxl \ --with-sysconfdefdir=%{_sysconfdir}/sysconfig %endif make %{?_smp_mflags} @@ -74,6 +169,10 @@ rm INSTALL %{buildroot}/usr/include/*.h %config(noreplace) %{_sysconfdir}/sysconfig/%{name} %changelog +* Tue Jan 14 2025 Joel Savitz - 0.8.0-7 +- Add support for CXL memory failure event logging + Resolves: RHEL-61233 + * Tue Oct 29 2024 Troy Dawson - 0.8.0-6 - Bump release for October 2024 mass rebuild: Resolves: RHEL-64018