From a80e20f7526c77e8ef6ee734581311bb3a673850 Mon Sep 17 00:00:00 2001 From: eabdullin Date: Tue, 1 Apr 2025 10:05:31 +0000 Subject: [PATCH] import CS rasdaemon-0.8.0-8.el10 --- .gitignore | 2 +- .rasdaemon.metadata | 1 - ...bc453998ddb145c7bb8ba30a57c56bd18eab.patch | 66 ++ ...578ddb0fc15aa7247f2b8885956540031221.patch | 54 ++ ...82fb45c2909c128be4ee8f51a3e42fe2f7fd.patch | 551 ++++++++++++++ ...e9d57691be9e630abee9ffa56a2fb155d558.patch | 182 +++++ ...fec559641f843345ef8fbc36d124b60b914d.patch | 663 +++++++++++++++++ ...edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch | 97 +++ ...833e3d78424f4a594985fbeb91890f4af81c.patch | 78 ++ ...96b66c917af37b2ae9295dc5df46a7d64dd2.patch | 82 +++ ...6186db2622788f8868d8ec082684d6a06d4d.patch | 559 +++++++++++++++ ...a096c3a1d0f993703ab3299f1ddfadf53d7f.patch | 85 --- ...29b024c31d54a7f8a72eab094376c7be27f5.patch | 32 - ...d65b97311dd5736838f1e285355f7f357046.patch | 538 -------------- ...956acc2dab7c18b4701f9657afb9ab3ddc79.patch | 28 - ...217660351c08eb2f8bccebf939abba2f7e69.patch | 66 -- ...f713f667437fb6e283cc3dc090679eb47d08.patch | 372 ---------- ...1e4da4f2daf2b10143fc148a8043312b61e5.patch | 149 ---- ...f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch | 24 - ...64ba44aee9bc5646f6537fc744b0b54aff37.patch | 38 - ...a85d8dc3483423ec2934fee8132f85f8fdb6.patch | 207 ------ ...cc2cf21c86b5462c8f4441cd9c92b3d75f7d.patch | 71 -- ...f39f13833f7d53ef96abc5a72e79384260f4.patch | 230 ------ ...a0711001957ee98f2c124abce0fa1f801529.patch | 670 ------------------ ...76ed94f68ae31d7b5f74e19545698898c13c.patch | 138 ---- SOURCES/add_upstream_labels.patch | 159 ----- ...33aa70331670c06db6b652712b476e24051c.patch | 107 --- ...e68453b2497e86cbd273b9cd56fadc5859e3.patch | 37 - ...a3d6a39d402c41065e9284d49114b97e3bfe.patch | 148 ---- ...e5c65ed5a42eaa97aa3659854add6d808da5.patch | 94 --- ...041e0abfa20054ff5d6874ffbd1ab592558d.patch | 28 - ...7864f11f709c4f803828fbc8e507d115d03b.patch | 611 ---------------- ...7ec14a11764fedfea50bd4d96ddda43c7fc1.patch 
| 24 - ...c-ctl-Fix-script-to-parse-dimm-sizes.patch | 47 -- SPECS/rasdaemon.spec | 236 ------ ...baf7110ab6427259eb1421a103e2021a8735.patch | 424 +++++++++++ ...4917befe7e67c02253cc27cb0c724e5992c0.patch | 503 +++++++++++++ ...47624486fca0070b297d0e2fd4e53443c10b.patch | 116 +++ ...3f74266382c64128bd7367a5eeb46277f490.patch | 161 +++++ ...b067755f4604770f9864a0babed8f93a1553.patch | 75 ++ ...251e3d52f57be1e245dff1cf221e09c5686f.patch | 267 +++++++ ...14afc5d7bb6c8c52d1023271d755deb23008.patch | 101 +++ ...6aa061f677232f99c514247d3dbf80812a1b.patch | 42 ++ ...e0edf073b939d345aeba0aed23e238dbc53b.patch | 575 +++++++++++++++ ...4c942e19a0da1e85a88783ed6e222ad4bdba.patch | 536 ++++++++++++++ ...d45b91244eb3986ac2574cd7d36ae1d4d22a.patch | 435 ++++++++++++ ...da812eddc063ea739970f941fdd24fb984ae.patch | 199 ++++++ ...670d2d35c5d939b03ba1ca80eb81c1f636b6.patch | 127 ++++ rasdaemon.spec | 336 +++++++++ sources | 1 + 50 files changed, 6231 insertions(+), 4141 deletions(-) delete mode 100644 .rasdaemon.metadata create mode 100644 2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch create mode 100644 31c7578ddb0fc15aa7247f2b8885956540031221.patch create mode 100644 53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch create mode 100644 572de9d57691be9e630abee9ffa56a2fb155d558.patch create mode 100644 75c8fec559641f843345ef8fbc36d124b60b914d.patch create mode 100644 7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch create mode 100644 8f79833e3d78424f4a594985fbeb91890f4af81c.patch create mode 100644 93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch create mode 100644 9a2f6186db2622788f8868d8ec082684d6a06d4d.patch delete mode 100644 SOURCES/0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch delete mode 100644 SOURCES/16d929b024c31d54a7f8a72eab094376c7be27f5.patch delete mode 100644 SOURCES/2290d65b97311dd5736838f1e285355f7f357046.patch delete mode 100644 SOURCES/28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch delete mode 100644 SOURCES/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch delete mode 
100644 SOURCES/546cf713f667437fb6e283cc3dc090679eb47d08.patch delete mode 100644 SOURCES/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch delete mode 100644 SOURCES/7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch delete mode 100644 SOURCES/854364ba44aee9bc5646f6537fc744b0b54aff37.patch delete mode 100644 SOURCES/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch delete mode 100644 SOURCES/899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d.patch delete mode 100644 SOURCES/9acef39f13833f7d53ef96abc5a72e79384260f4.patch delete mode 100644 SOURCES/a16ca0711001957ee98f2c124abce0fa1f801529.patch delete mode 100644 SOURCES/a8c776ed94f68ae31d7b5f74e19545698898c13c.patch delete mode 100644 SOURCES/add_upstream_labels.patch delete mode 100644 SOURCES/aecf33aa70331670c06db6b652712b476e24051c.patch delete mode 100644 SOURCES/b22be68453b2497e86cbd273b9cd56fadc5859e3.patch delete mode 100644 SOURCES/b497a3d6a39d402c41065e9284d49114b97e3bfe.patch delete mode 100644 SOURCES/cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch delete mode 100644 SOURCES/ce33041e0abfa20054ff5d6874ffbd1ab592558d.patch delete mode 100644 SOURCES/ce6e7864f11f709c4f803828fbc8e507d115d03b.patch delete mode 100644 SOURCES/e8b97ec14a11764fedfea50bd4d96ddda43c7fc1.patch delete mode 100644 SOURCES/rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch delete mode 100644 SPECS/rasdaemon.spec create mode 100644 a247baf7110ab6427259eb1421a103e2021a8735.patch create mode 100644 a7524917befe7e67c02253cc27cb0c724e5992c0.patch create mode 100644 ae1647624486fca0070b297d0e2fd4e53443c10b.patch create mode 100644 aee13f74266382c64128bd7367a5eeb46277f490.patch create mode 100644 b22cb067755f4604770f9864a0babed8f93a1553.patch create mode 100644 bd27251e3d52f57be1e245dff1cf221e09c5686f.patch create mode 100644 c38c14afc5d7bb6c8c52d1023271d755deb23008.patch create mode 100644 d3836aa061f677232f99c514247d3dbf80812a1b.patch create mode 100644 e0cde0edf073b939d345aeba0aed23e238dbc53b.patch create mode 100644 
f63b4c942e19a0da1e85a88783ed6e222ad4bdba.patch create mode 100644 f73ed45b91244eb3986ac2574cd7d36ae1d4d22a.patch create mode 100644 f8b6da812eddc063ea739970f941fdd24fb984ae.patch create mode 100644 fd11670d2d35c5d939b03ba1ca80eb81c1f636b6.patch create mode 100644 rasdaemon.spec create mode 100644 sources diff --git a/.gitignore b/.gitignore index e69cfd0..a9f11d1 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -SOURCES/rasdaemon-0.6.1.tar.bz2 +rasdaemon-0.8.0.tar.bz2 diff --git a/.rasdaemon.metadata b/.rasdaemon.metadata deleted file mode 100644 index e6215b6..0000000 --- a/.rasdaemon.metadata +++ /dev/null @@ -1 +0,0 @@ -742eda555cccb8ca8f9b6a18bab1f4a732c11318 SOURCES/rasdaemon-0.6.1.tar.bz2 diff --git a/2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch b/2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch new file mode 100644 index 0000000..eaa9559 --- /dev/null +++ b/2ff9bc453998ddb145c7bb8ba30a57c56bd18eab.patch @@ -0,0 +1,66 @@ +commit 2ff9bc453998ddb145c7bb8ba30a57c56bd18eab +Author: Shiju Jose +Date: Tue Apr 4 14:40:42 2023 +0100 + + rasdaemon: Add common function to convert timestamp in the CXL event records to the broken-down time format + + Add common function to convert the timestamp in the CXL event records + in nanoseconds to the broken-down time format. 
+ + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 8f6342d..59534a4 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -23,6 +23,25 @@ + #include "ras-report.h" + #include + ++/* Common Functions */ ++static void convert_timestamp(unsigned long long ts, char *ts_ptr, uint16_t size) ++{ ++ /* CXL Specification 3.0 ++ * Overflow timestamp - The number of unsigned nanoseconds ++ * that have elapsed since midnight, 01-Jan-1970 UTC ++ */ ++ time_t ts_secs = ts / 1000000000ULL; ++ struct tm *tm; ++ ++ tm = localtime(&ts_secs); ++ if (tm) ++ strftime(ts_ptr, size, "%Y-%m-%d %H:%M:%S %z", tm); ++ ++ if (!ts || !tm) ++ strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", ++ size); ++} ++ + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) + #define CXL_POISON_FLAG_OVERFLOW BIT(1) +@@ -168,22 +187,7 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + if (ev.flags & CXL_POISON_FLAG_OVERFLOW) { + if (tep_get_field_val(s, event, "overflow_ts", record, &val, 1) < 0) + return -1; +- if (val) { +- /* CXL Specification 3.0 +- * Overflow timestamp - The number of unsigned nanoseconds +- * that have elapsed since midnight, 01-Jan-1970 UTC +- */ +- time_t ovf_ts_secs = val / 1000000000ULL; +- +- tm = localtime(&ovf_ts_secs); +- if (tm) { +- strftime(ev.overflow_ts, sizeof(ev.overflow_ts), +- "%Y-%m-%d %H:%M:%S %z", tm); +- } +- } +- if (!val || !tm) +- strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", +- sizeof(ev.overflow_ts)); ++ convert_timestamp(val, ev.overflow_ts, sizeof(ev.overflow_ts)); + } else + strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", sizeof(ev.overflow_ts)); + if (trace_seq_printf(s, "overflow timestamp:%s\n", ev.overflow_ts) <= 0) diff --git a/31c7578ddb0fc15aa7247f2b8885956540031221.patch b/31c7578ddb0fc15aa7247f2b8885956540031221.patch new file mode 100644 index 0000000..7ee1e3b --- /dev/null +++ b/31c7578ddb0fc15aa7247f2b8885956540031221.patch 
@@ -0,0 +1,54 @@ +commit 31c7578ddb0fc15aa7247f2b8885956540031221 +Author: Shiju Jose +Date: Tue Feb 6 12:08:00 2024 +0000 + + rasdaemon: ras-memory-failure-handler: update memory failure action page types + + Update memory failure action page types corresponding to the same in + mm/memory-failure.c in the kernel. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-memory-failure-handler.c b/ras-memory-failure-handler.c +index 97e8840..a5acc08 100644 +--- a/ras-memory-failure-handler.c ++++ b/ras-memory-failure-handler.c +@@ -26,10 +26,8 @@ enum mf_action_page_type { + MF_MSG_KERNEL_HIGH_ORDER, + MF_MSG_SLAB, + MF_MSG_DIFFERENT_COMPOUND, +- MF_MSG_POISONED_HUGE, + MF_MSG_HUGE, + MF_MSG_FREE_HUGE, +- MF_MSG_NON_PMD_HUGE, + MF_MSG_UNMAP_FAILED, + MF_MSG_DIRTY_SWAPCACHE, + MF_MSG_CLEAN_SWAPCACHE, +@@ -41,7 +39,6 @@ enum mf_action_page_type { + MF_MSG_CLEAN_LRU, + MF_MSG_TRUNCATED_LRU, + MF_MSG_BUDDY, +- MF_MSG_BUDDY_2ND, + MF_MSG_DAX, + MF_MSG_UNSPLIT_THP, + MF_MSG_UNKNOWN, +@@ -64,10 +61,8 @@ static const struct { + { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"}, + { MF_MSG_SLAB, "kernel slab page"}, + { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"}, +- { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"}, + { MF_MSG_HUGE, "huge page"}, + { MF_MSG_FREE_HUGE, "free huge page"}, +- { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"}, + { MF_MSG_UNMAP_FAILED, "unmapping failed page"}, + { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"}, + { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"}, +@@ -79,7 +74,6 @@ static const struct { + { MF_MSG_CLEAN_LRU, "clean LRU page"}, + { MF_MSG_TRUNCATED_LRU, "already truncated LRU page"}, + { MF_MSG_BUDDY, "free buddy page"}, +- { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"}, + { MF_MSG_DAX, "dax page"}, + { MF_MSG_UNSPLIT_THP, "unsplit thp"}, + { MF_MSG_UNKNOWN, "unknown page"}, diff --git a/53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch 
b/53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch new file mode 100644 index 0000000..cb656cc --- /dev/null +++ b/53c682fb45c2909c128be4ee8f51a3e42fe2f7fd.patch @@ -0,0 +1,551 @@ +commit 53c682fb45c2909c128be4ee8f51a3e42fe2f7fd +Author: Shiju Jose +Date: Wed Apr 5 11:54:41 2023 +0100 + + rasdaemon: Add support for the CXL general media events + + Add support to log and record the CXL general media events. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 83ada56..2de96f6 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -99,6 +99,14 @@ static char *uuid_be(const char *uu) + return uuid; + } + ++static const char* get_cxl_type_str(const char** type_array, uint8_t num_elems, uint8_t type) ++{ ++ if (type >= num_elems) ++ return "Unknown"; ++ ++ return type_array[type]; ++} ++ + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) + #define CXL_POISON_FLAG_OVERFLOW BIT(1) +@@ -709,3 +717,151 @@ int ras_cxl_generic_event_handler(struct trace_seq *s, + + return 0; + } ++ ++#define CXL_DPA_VOLATILE BIT(0) ++#define CXL_DPA_NOT_REPAIRABLE BIT(1) ++ ++static const struct cxl_event_flags cxl_dpa_flags[] = { ++ { .bit = CXL_DPA_VOLATILE, .flag = "VOLATILE" }, ++ { .bit = CXL_DPA_NOT_REPAIRABLE, .flag = "NOT_REPAIRABLE" }, ++}; ++ ++/* ++ * General Media Event Record - GMER ++ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 ++ */ ++#define CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT BIT(0) ++#define CXL_GMER_EVT_DESC_THRESHOLD_EVENT BIT(1) ++#define CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW BIT(2) ++ ++static const struct cxl_event_flags cxl_gmer_event_desc_flags[] = { ++ { .bit = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, .flag = "UNCORRECTABLE EVENT" }, ++ { .bit = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, .flag = "THRESHOLD EVENT" }, ++ { .bit = CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW, .flag = "POISON LIST OVERFLOW" }, ++}; ++ ++#define CXL_GMER_VALID_CHANNEL BIT(0) ++#define 
CXL_GMER_VALID_RANK BIT(1) ++#define CXL_GMER_VALID_DEVICE BIT(2) ++#define CXL_GMER_VALID_COMPONENT BIT(3) ++ ++static const char* cxl_gmer_mem_event_type[] = { ++ "ECC Error", ++ "Invalid Address", ++ "Data Path Error", ++}; ++ ++static const char* cxl_gmer_trans_type[] = { ++ "Unknown", ++ "Host Read", ++ "Host Write", ++ "Host Scan Media", ++ "Host Inject Poison", ++ "Internal Media Scrub", ++ "Internal Media Management", ++}; ++ ++int ras_cxl_general_media_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len, i; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ struct ras_cxl_general_media_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0) ++ return -1; ++ ev.dpa = val; ++ if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa_flags", record, &val, 1) < 0) ++ return -1; ++ ev.dpa_flags = val; ++ if (trace_seq_printf(s, "dpa_flags:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.dpa_flags, cxl_dpa_flags, ARRAY_SIZE(cxl_dpa_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "descriptor", record, &val, 1) < 0) ++ return -1; ++ ev.descriptor = val; ++ if (trace_seq_printf(s, "descriptor:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.descriptor, cxl_gmer_event_desc_flags, ++ ARRAY_SIZE(cxl_gmer_event_desc_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "type", record, &val, 1) < 0) ++ return -1; ++ ev.type = val; ++ if (trace_seq_printf(s, "type:%s ", get_cxl_type_str(cxl_gmer_mem_event_type, ++ ARRAY_SIZE(cxl_gmer_mem_event_type), ev.type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "transaction_type", record, &val, 1) < 0) ++ return -1; ++ ev.transaction_type = val; ++ if (trace_seq_printf(s, 
"transaction_type:%s ", ++ get_cxl_type_str(cxl_gmer_trans_type, ++ ARRAY_SIZE(cxl_gmer_trans_type), ++ ev.transaction_type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "validity_flags", record, &val, 1) < 0) ++ return -1; ++ ev.validity_flags = val; ++ ++ if (ev.validity_flags & CXL_GMER_VALID_CHANNEL) { ++ if (tep_get_field_val(s, event, "channel", record, &val, 1) < 0) ++ return -1; ++ ev.channel = val; ++ if (trace_seq_printf(s, "channel:%u ", ev.channel) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_GMER_VALID_RANK) { ++ if (tep_get_field_val(s, event, "rank", record, &val, 1) < 0) ++ return -1; ++ ev.rank = val; ++ if (trace_seq_printf(s, "rank:%u ", ev.rank) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_GMER_VALID_DEVICE) { ++ if (tep_get_field_val(s, event, "device", record, &val, 1) < 0) ++ return -1; ++ ev.device = val; ++ if (trace_seq_printf(s, "device:%x ", ev.device) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_GMER_VALID_COMPONENT) { ++ ev.comp_id = tep_get_field_raw(s, event, "comp_id", record, &len, 1); ++ if (!ev.comp_id) ++ return -1; ++ if (trace_seq_printf(s, "comp_id:") <= 0) ++ return -1; ++ for (i = 0; i < CXL_EVENT_GEN_MED_COMP_ID_SIZE; i++) { ++ if (trace_seq_printf(s, "%02x ", ev.comp_id[i]) <= 0) ++ break; ++ } ++ } ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_general_media_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_general_media_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 9f77cb7..3adca4a 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -35,4 +35,7 @@ int ras_cxl_overflow_event_handler(struct trace_seq *s, + int ras_cxl_generic_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int ras_cxl_general_media_event_handler(struct trace_seq *s, ++ struct tep_record 
*record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index 4036933..978dee4 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -250,6 +250,7 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_aer_correctable_error", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable); + #endif + + free_ras: +@@ -1063,6 +1064,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_generic_event"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_general_media", ++ ras_cxl_general_media_event_handler, NULL, CXL_GENERAL_MEDIA_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_general_media"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 96c299e..9b83df3 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -44,6 +44,7 @@ enum { + CXL_AER_CE_EVENT, + CXL_OVERFLOW_EVENT, + CXL_GENERIC_EVENT, ++ CXL_GENERAL_MEDIA_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index a65d9c0..507a58e 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -846,6 +846,75 @@ int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_e + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_general_media_event ++ */ ++static const struct db_fields cxl_general_media_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "hdr_uuid", .type = "TEXT" }, ++ { .name = "hdr_flags", .type = "INTEGER" }, ++ { 
.name = "hdr_handle", .type = "INTEGER" }, ++ { .name = "hdr_related_handle", .type = "INTEGER" }, ++ { .name = "hdr_ts", .type = "TEXT" }, ++ { .name = "hdr_length", .type = "INTEGER" }, ++ { .name = "hdr_maint_op_class", .type = "INTEGER" }, ++ { .name = "dpa", .type = "INTEGER" }, ++ { .name = "dpa_flags", .type = "INTEGER" }, ++ { .name = "descriptor", .type = "INTEGER" }, ++ { .name = "type", .type = "INTEGER" }, ++ { .name = "transaction_type", .type = "INTEGER" }, ++ { .name = "channel", .type = "INTEGER" }, ++ { .name = "rank", .type = "INTEGER" }, ++ { .name = "device", .type = "INTEGER" }, ++ { .name = "comp_id", .type = "BLOB" }, ++}; ++ ++static const struct db_table_descriptor cxl_general_media_event_tab = { ++ .name = "cxl_general_media_event", ++ .fields = cxl_general_media_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_general_media_event_fields), ++}; ++ ++int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_general_media_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_general_media_event store: %p\n", ++ priv->stmt_cxl_general_media_event); ++ ++ ras_store_cxl_common_hdr(priv->stmt_cxl_general_media_event, &ev->hdr); ++ sqlite3_bind_int64(priv->stmt_cxl_general_media_event, 13, ev->dpa); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 14, ev->dpa_flags); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 15, ev->descriptor); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 16, ev->type); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 17, ev->transaction_type); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 18, ev->channel); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 19, ev->rank); ++ sqlite3_bind_int(priv->stmt_cxl_general_media_event, 20, ev->device); ++ sqlite3_bind_blob(priv->stmt_cxl_general_media_event, 21, ev->comp_id, ++ 
CXL_EVENT_GEN_MED_COMP_ID_SIZE, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_general_media_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do stmt_cxl_general_media_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_general_media_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset stmt_cxl_general_media_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1229,6 +1298,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) + goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_general_media_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_general_media_event, ++ &cxl_general_media_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1390,6 +1467,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_generic_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_general_media_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_general_media_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_general_media_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index 9ecfcda..37c32de 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -134,6 +134,7 @@ struct ras_cxl_poison_event { + #define CXL_HEADERLOG_SIZE SZ_512 + #define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) + #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 ++#define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 + + struct ras_cxl_aer_ue_event { + char timestamp[64]; +@@ -184,6 +185,20 @@ struct ras_cxl_generic_event { + uint8_t *data; + }; + ++struct ras_cxl_general_media_event { ++ struct ras_cxl_event_common_hdr hdr; ++ uint64_t dpa; ++ 
uint8_t dpa_flags; ++ uint8_t descriptor; ++ uint8_t type; ++ uint8_t transaction_type; ++ uint8_t channel; ++ uint8_t rank; ++ uint32_t device; ++ uint8_t *comp_id; ++ uint16_t validity_flags; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -198,6 +213,7 @@ struct ras_cxl_aer_ue_event; + struct ras_cxl_aer_ce_event; + struct ras_cxl_overflow_event; + struct ras_cxl_generic_event; ++struct ras_cxl_general_media_event; + + #ifdef HAVE_SQLITE3 + +@@ -236,6 +252,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_aer_ce_event; + sqlite3_stmt *stmt_cxl_overflow_event; + sqlite3_stmt *stmt_cxl_generic_event; ++ sqlite3_stmt *stmt_cxl_general_media_event; + #endif + }; + +@@ -269,6 +286,7 @@ int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_eve + int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); ++int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -287,6 +305,7 @@ static inline int ras_store_cxl_aer_ue_event(struct ras_events *ras, struct ras_ + static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; ++static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 8d7b76a..725dc9b 100644 +--- 
a/ras-report.c ++++ b/ras-report.c +@@ -489,6 +489,60 @@ static int set_cxl_generic_event_backtrace(char *buf, struct ras_cxl_generic_eve + return 0; + } + ++static int set_cxl_general_media_event_backtrace(char *buf, struct ras_cxl_general_media_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "hdr_uuid=%s\n" \ ++ "hdr_flags=0x%x\n" \ ++ "hdr_handle=0x%x\n" \ ++ "hdr_related_handle=0x%x\n" \ ++ "hdr_timestamp=%s\n" \ ++ "hdr_length=%u\n" \ ++ "hdr_maint_op_class=%u\n" \ ++ "dpa=0x%lx\n" \ ++ "dpa_flags=%u\n" \ ++ "descriptor=%u\n" \ ++ "type=%u\n" \ ++ "transaction_type=%u\n" \ ++ "channel=%u\n" \ ++ "rank=%u\n" \ ++ "device=0x%x\n", \ ++ ev->hdr.timestamp, \ ++ ev->hdr.memdev, \ ++ ev->hdr.host, \ ++ ev->hdr.serial, \ ++ ev->hdr.log_type, \ ++ ev->hdr.hdr_uuid, \ ++ ev->hdr.hdr_flags, \ ++ ev->hdr.hdr_handle, \ ++ ev->hdr.hdr_related_handle, \ ++ ev->hdr.hdr_timestamp, \ ++ ev->hdr.hdr_length, \ ++ ev->hdr.hdr_maint_op_class, \ ++ ev->dpa, \ ++ ev->dpa_flags, \ ++ ev->descriptor, \ ++ ev->type, \ ++ ev->transaction_type, \ ++ ev->channel, \ ++ ev->rank, \ ++ ev->device); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -541,6 +595,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_GENERIC_EVENT: + rc = set_cxl_generic_event_backtrace(buf, (struct ras_cxl_generic_event *)ev); + break; ++ case CXL_GENERAL_MEDIA_EVENT: ++ rc = set_cxl_general_media_event_backtrace(buf, (struct ras_cxl_general_media_event *)ev); ++ break; + default: + return -1; + } +@@ -1170,3 +1227,47 @@ cxl_generic_fail: + return -1; + + } ++ ++int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) ++{ ++ char 
buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_general_media_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_GENERAL_MEDIA_EVENT, ev); ++ if (rc < 0) ++ goto cxl_general_media_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_general_media_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_general_media_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL General Media Event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_general_media_fail; ++ ++ done = 1; ++ ++cxl_general_media_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index bf591a6..d9ec7df 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -44,6 +44,7 @@ int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras_cxl_aer_ue_ev + int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev); + int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); ++int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); + + #else + +@@ -60,6 +61,7 @@ static inline int ras_report_cxl_aer_ue_event(struct ras_events *ras, struct ras + static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_event *ev) { return 0; }; + static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; ++static inline int 
ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; + + #endif + diff --git a/572de9d57691be9e630abee9ffa56a2fb155d558.patch b/572de9d57691be9e630abee9ffa56a2fb155d558.patch new file mode 100644 index 0000000..4a89c04 --- /dev/null +++ b/572de9d57691be9e630abee9ffa56a2fb155d558.patch @@ -0,0 +1,182 @@ +commit dea649c9f9a6f2941e80cade9ed311a398e267be +Author: Shiju Jose +Date: Mon Feb 12 11:14:03 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL general media trace events + + Add support for CXL general media events to the ras-mc-ctl tool. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit 572de9d57691be9e630abee9ffa56a2fb155d558) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 5528021..99b3c10 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1298,6 +1298,84 @@ sub get_cxl_hdr_flags_text + return join (", ", @out); + } + ++use constant { ++ CXL_DPA_VOLATILE => 0x0001, ++ CXL_DPA_NOT_REPAIRABLE => 0x0002, ++}; ++ ++sub get_cxl_dpa_flags_text ++{ ++ my $flags = $_[0]; ++ my @out; ++ ++ if ($flags & CXL_DPA_VOLATILE) { ++ push @out, (sprintf "\'VOLATILE\' "); ++ } ++ if ($flags & CXL_DPA_NOT_REPAIRABLE) { ++ push @out, (sprintf "\'NOT_REPAIRABLE\' "); ++ } ++ ++ return join (", ", @out); ++} ++ ++use constant { ++ CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT => 0x0001, ++ CXL_GMER_EVT_DESC_THRESHOLD_EVENT => 0x0002, ++ CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW => 0x0004, ++}; ++ ++sub get_cxl_descriptor_flags_text ++{ ++ my $flags = $_[0]; ++ my @out; ++ ++ if ($flags & CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT) { ++ push @out, (sprintf "\'UNCORRECTABLE EVENT\' "); ++ } ++ if ($flags & CXL_GMER_EVT_DESC_THRESHOLD_EVENT) { ++ push @out, (sprintf "\'THRESHOLD EVENT\' "); ++ } ++ if ($flags & CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW) { ++ push @out, (sprintf "\'POISON LIST OVERFLOW\' "); ++ } ++ ++ return join (", ", @out); ++} ++ ++sub 
get_cxl_mem_event_type ++{ ++ my @types; ++ ++ if ($_[0] < 0 || $_[0] > 2) { ++ return "unknown-type"; ++ } ++ ++ @types = ("ECC Error", ++ "Invalid Address", ++ "Data Path Error"); ++ ++ return $types[$_[0]]; ++} ++ ++sub get_cxl_transaction_type ++{ ++ my @types; ++ ++ if ($_[0] < 0 || $_[0] > 6) { ++ return "unknown-type"; ++ } ++ ++ @types = ("Unknown", ++ "Host Read", ++ "Host Write", ++ "Host Scan Media", ++ "Host Inject Poison", ++ "Internal Media Scrub", ++ "Internal Media Management"); ++ ++ return $types[$_[0]]; ++} ++ + sub summary + { + require DBI; +@@ -1442,6 +1520,22 @@ sub summary + print "No CXL generic errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL general media errors ++ $query = "select memdev, count(*) from cxl_general_media_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL general media events summary:\n$out\n"; ++ } else { ++ print "No CXL general media errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1553,6 +1647,7 @@ sub errors + my ($log_type, $first_ts, $last_ts); + my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts); + my ($hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $data); ++ my ($dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1764,6 +1859,49 @@ sub errors + } else { + print "No CXL generic errors.\n\n"; + } ++ ++ # CXL general media errors ++ use constant CXL_EVENT_GEN_MED_COMP_ID_SIZE => 0x10; ++ $query = "select id, timestamp, memdev, host, serial, log_type, hdr_uuid, hdr_flags, hdr_handle, hdr_related_handle, hdr_ts, hdr_length, 
hdr_maint_op_class, dpa, dpa_flags, descriptor, type, transaction_type, channel, rank, device, comp_id from cxl_general_media_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $log_type, $hdr_uuid, $hdr_flags, $hdr_handle, $hdr_related_handle, $hdr_ts, $hdr_length, $hdr_maint_op_class, $dpa, $dpa_flags, $descriptor, $mem_event_type, $transaction_type, $channel, $rank, $device, $comp_id)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host && length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "log=$log_type, " if (defined $log_type && length $log_type); ++ $out .= "hdr_uuid=$hdr_uuid, " if (defined $hdr_uuid && length $hdr_uuid); ++ $out .= sprintf "hdr_flags=0x%llx %s, ", $hdr_flags, get_cxl_hdr_flags_text($hdr_flags) if (defined $hdr_flags && length $hdr_flags); ++ $out .= sprintf "hdr_handle=0x%x, ", $hdr_handle if (defined $hdr_handle && length $hdr_handle); ++ $out .= sprintf "hdr_related_handle=0x%x, ", $hdr_related_handle if (defined $hdr_related_handle && length $hdr_related_handle); ++ $out .= "hdr_timestamp=$hdr_ts, " if (defined $hdr_ts && length $hdr_ts); ++ $out .= sprintf "hdr_length=%u, ", $hdr_length if (defined $hdr_length && length $hdr_length); ++ $out .= sprintf "hdr_maint_op_class=%u, ", $hdr_maint_op_class if (defined $hdr_maint_op_class && length $hdr_maint_op_class); ++ $out .= sprintf "dpa=0x%llx, ", $dpa if (defined $dpa && length $dpa); ++ $out .= sprintf "dpa_flags: %s, ", get_cxl_dpa_flags_text($dpa_flags) if (defined $dpa_flags && length $dpa_flags); ++ $out .= sprintf "descriptor_flags: %s, ", get_cxl_descriptor_flags_text($descriptor) if (defined $descriptor && length $descriptor); ++ $out .= 
sprintf "memory event type: %s, ", get_cxl_mem_event_type($mem_event_type) if (defined $mem_event_type && length $mem_event_type); ++ $out .= sprintf "transaction_type: %s, ", get_cxl_transaction_type($transaction_type) if (defined $transaction_type && length $transaction_type); ++ $out .= sprintf "channel=%u, ", $channel if (defined $channel && length $channel); ++ $out .= sprintf "rank=%u, ", $rank if (defined $rank && length $rank); ++ $out .= sprintf "device=0x%x, ", $device if (defined $device && length $device); ++ if (defined $comp_id && length $comp_id) { ++ $out .= sprintf "component_id:"; ++ my @bytes = unpack "C*", $comp_id; ++ for (my $i = 0; $i < CXL_EVENT_GEN_MED_COMP_ID_SIZE; $i++) { ++ $out .= sprintf "%02x ", $bytes[$i]; ++ } ++ } ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL general media events:\n$out\n"; ++ } else { ++ print "No CXL general media errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/75c8fec559641f843345ef8fbc36d124b60b914d.patch b/75c8fec559641f843345ef8fbc36d124b60b914d.patch new file mode 100644 index 0000000..cd0aca4 --- /dev/null +++ b/75c8fec559641f843345ef8fbc36d124b60b914d.patch @@ -0,0 +1,663 @@ +commit 75c8fec559641f843345ef8fbc36d124b60b914d +Author: Shiju Jose +Date: Fri Mar 31 13:35:13 2023 +0100 + + rasdaemon: Add support for the CXL poison events + + Add support to log and record the CXL poison events. + + The corresponding Kernel patches here: + https://lore.kernel.org/linux-cxl/64457d30bae07_2028294ac@dwillia2-xfh.jf.intel.com.notmuch/ + + Presently for logging only, could be extended for the policy + based recovery action for the frequent poison events depending on the above + kernel patches. 
+ + Signed-off-by: Shiju Jose + Reviewed-by: Jonathan Cameron + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/Makefile.am b/Makefile.am +index 56c144e..5bddeac 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -73,6 +73,11 @@ endif + if WITH_CPU_FAULT_ISOLATION + rasdaemon_SOURCES += ras-cpu-isolation.c queue.c + endif ++ ++if WITH_CXL ++ rasdaemon_SOURCES += ras-cxl-handler.c ++endif ++ + rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) $(LIBTRACEEVENT_LIBS) + rasdaemon_CFLAGS = $(SQLITE3_CFLAGS) $(LIBTRACEEVENT_CFLAGS) + +@@ -81,7 +86,7 @@ include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ + ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ + ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h \ + non-standard-hisilicon.h non-standard-ampere.h ras-memory-failure-handler.h \ +- ras-cpu-isolation.h queue.h ++ ras-cxl-handler.h ras-cpu-isolation.h queue.h + + # This rule can't be called with more than one Makefile job (like make -j8) + # I can't figure out a way to fix that +diff --git a/configure.ac b/configure.ac +index f588090..ab5697d 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -127,6 +127,16 @@ AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" = "xyes"], + AM_CONDITIONAL([WITH_MEMORY_FAILURE], [test x$enable_memory_failure = xyes || test x$enable_all = xyes]) + AM_COND_IF([WITH_MEMORY_FAILURE], [USE_MEMORY_FAILURE="yes"], [USE_MEMORY_FAILURE="no"]) + ++AC_ARG_ENABLE([cxl], ++ AS_HELP_STRING([--enable-cxl], [enable CXL events (currently experimental)])) ++ ++AS_IF([test "x$enable_cxl" = "xyes" || test "x$enable_all" == "xyes"], [ ++ AC_DEFINE(HAVE_CXL,1,"have CXL events collect") ++ AC_SUBST([WITH_CXL]) ++]) ++AM_CONDITIONAL([WITH_CXL], [test x$enable_cxl = xyes || test x$enable_all == xyes]) ++AM_COND_IF([WITH_CXL], [USE_CXL="yes"], [USE_CXL="no"]) ++ + AC_ARG_ENABLE([abrt_report], + AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently 
experimental)])) + +@@ -215,6 +225,7 @@ compile time options summary + DEVLINK : $USE_DEVLINK + Disk I/O errors : $USE_DISKERROR + Memory Failure : $USE_MEMORY_FAILURE ++ CXL events : $USE_CXL + Memory CE PFA : $USE_MEMORY_CE_PFA + AMP RAS errors : $USE_AMP_NS_DECODE + CPU fault isolation : $USE_CPU_FAULT_ISOLATION +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +new file mode 100644 +index 0000000..cb23ba2 +--- /dev/null ++++ b/ras-cxl-handler.c +@@ -0,0 +1,202 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <string.h> ++#include <unistd.h> ++#include <traceevent/kbuffer.h> ++#include "ras-cxl-handler.h" ++#include "ras-record.h" ++#include "ras-logger.h" ++#include "ras-report.h" ++ ++/* Poison List: Payload out flags */ ++#define CXL_POISON_FLAG_MORE BIT(0) ++#define CXL_POISON_FLAG_OVERFLOW BIT(1) ++#define CXL_POISON_FLAG_SCANNING BIT(2) ++ ++/* CXL poison - source types */ ++enum cxl_poison_source { ++ CXL_POISON_SOURCE_UNKNOWN = 0, ++ CXL_POISON_SOURCE_EXTERNAL = 1, ++ CXL_POISON_SOURCE_INTERNAL = 2, ++ CXL_POISON_SOURCE_INJECTED = 3, ++ CXL_POISON_SOURCE_VENDOR = 7, ++}; ++ ++/* CXL poison - trace types */ ++enum cxl_poison_trace_type { ++ CXL_POISON_TRACE_LIST, ++ CXL_POISON_TRACE_INJECT, ++ CXL_POISON_TRACE_CLEAR, ++}; ++ ++int ras_cxl_poison_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ time_t now; ++ struct tm *tm; ++ struct ras_cxl_poison_event ev; ++ ++ now = record->ts / user_hz + ras->uptime_diff; ++ tm = localtime(&now); ++ if (tm) ++ strftime(ev.timestamp, sizeof(ev.timestamp), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ else ++ strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) ++ return -1; ++ ++ ev.memdev = tep_get_field_raw(s, event, "memdev", ++ record, &len, 1); ++ if (!ev.memdev) ++ return -1; ++ if (trace_seq_printf(s, "memdev:%s ", ev.memdev) <= 0) ++ return -1; ++ ++ ev.host = tep_get_field_raw(s, event, "host", ++ record, &len, 1); ++ if (!ev.host) ++ return -1; ++ if (trace_seq_printf(s, "host:%s ", ev.host) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "serial", record, &val, 1) < 0) ++ return -1; ++ ev.serial = val; ++ if (trace_seq_printf(s, "serial:0x%llx ", (unsigned long long)ev.serial) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "trace_type", record, &val, 1) < 0) ++ return -1; ++ switch (val) { ++
case CXL_POISON_TRACE_LIST: ++ ev.trace_type = "List"; ++ break; ++ case CXL_POISON_TRACE_INJECT: ++ ev.trace_type = "Inject"; ++ break; ++ case CXL_POISON_TRACE_CLEAR: ++ ev.trace_type = "Clear"; ++ break; ++ default: ++ ev.trace_type = "Invalid"; ++ } ++ if (trace_seq_printf(s, "trace_type:%s ", ev.trace_type) <= 0) ++ return -1; ++ ++ ev.region = tep_get_field_raw(s, event, "region", ++ record, &len, 1); ++ if (!ev.region) ++ return -1; ++ if (trace_seq_printf(s, "region:%s ", ev.region) <= 0) ++ return -1; ++ ++ ev.uuid = tep_get_field_raw(s, event, "uuid", ++ record, &len, 1); ++ if (!ev.uuid) ++ return -1; ++ if (trace_seq_printf(s, "region_uuid:%s ", ev.uuid) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "hpa", record, &val, 1) < 0) ++ return -1; ++ ev.hpa = val; ++ if (trace_seq_printf(s, "poison list: hpa:0x%llx ", (unsigned long long)ev.hpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0) ++ return -1; ++ ev.dpa = val; ++ if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa_length", record, &val, 1) < 0) ++ return -1; ++ ev.dpa_length = val; ++ if (trace_seq_printf(s, "dpa_length:0x%x ", ev.dpa_length) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "source", record, &val, 1) < 0) ++ return -1; ++ switch (val) { ++ case CXL_POISON_SOURCE_UNKNOWN: ++ ev.source = "Unknown"; ++ break; ++ case CXL_POISON_SOURCE_EXTERNAL: ++ ev.source = "External"; ++ break; ++ case CXL_POISON_SOURCE_INTERNAL: ++ ev.source = "Internal"; ++ break; ++ case CXL_POISON_SOURCE_INJECTED: ++ ev.source = "Injected"; ++ break; ++ case CXL_POISON_SOURCE_VENDOR: ++ ev.source = "Vendor"; ++ break; ++ default: ++ ev.source = "Invalid"; ++ } ++ if (trace_seq_printf(s, "source:%s ", ev.source) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "flags", record, &val, 1) < 0) ++ return -1; ++ ev.flags = val; ++ if (trace_seq_printf(s, "flags:%d ", 
ev.flags) <= 0) ++ return -1; ++ ++ if (ev.flags & CXL_POISON_FLAG_OVERFLOW) { ++ if (tep_get_field_val(s, event, "overflow_ts", record, &val, 1) < 0) ++ return -1; ++ if (val) { ++ /* CXL Specification 3.0 ++ * Overflow timestamp - The number of unsigned nanoseconds ++ * that have elapsed since midnight, 01-Jan-1970 UTC ++ */ ++ time_t ovf_ts_secs = val / 1000000000ULL; ++ ++ tm = localtime(&ovf_ts_secs); ++ if (tm) { ++ strftime(ev.overflow_ts, sizeof(ev.overflow_ts), ++ "%Y-%m-%d %H:%M:%S %z", tm); ++ } ++ } ++ if (!val || !tm) ++ strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", ++ sizeof(ev.overflow_ts)); ++ } else ++ strncpy(ev.overflow_ts, "1970-01-01 00:00:00 +0000", sizeof(ev.overflow_ts)); ++ if (trace_seq_printf(s, "overflow timestamp:%s\n", ev.overflow_ts) <= 0) ++ return -1; ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_poison_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_poison_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +new file mode 100644 +index 0000000..84d5cc6 +--- /dev/null ++++ b/ras-cxl-handler.h +@@ -0,0 +1,24 @@ ++/* ++ * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ */ ++ ++#ifndef __RAS_CXL_HANDLER_H ++#define __RAS_CXL_HANDLER_H ++ ++#include "ras-events.h" ++#include <traceevent/event-parse.h> ++ ++int ras_cxl_poison_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); ++#endif +diff --git a/ras-events.c b/ras-events.c +index 5fe8e19..f95844a 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -40,6 +40,7 @@ + #include "ras-devlink-handler.h" + #include "ras-diskerror-handler.h" + #include "ras-memory-failure-handler.h" ++#include "ras-cxl-handler.h" + #include "ras-record.h" + #include "ras-logger.h" + #include "ras-page-isolation.h" +@@ -243,6 +244,10 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable); + #endif + ++#ifdef HAVE_CXL ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_poison", enable); ++#endif ++ + free_ras: + free(ras); + return rc; +@@ -979,6 +984,16 @@ int handle_ras_events(int record_events) + "ras", "memory_failure_event"); + #endif + ++#ifdef HAVE_CXL ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_poison", ++ ras_cxl_poison_event_handler, NULL, CXL_POISON_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_poison"); ++#endif ++ + if (!num_events) { + log(ALL, LOG_INFO, + "Failed to trace all supported RAS events.
Aborting.\n"); +diff --git a/ras-events.h b/ras-events.h +index 649b0c0..1ef3ecd 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -39,6 +39,7 @@ enum { + DEVLINK_EVENT, + DISKERROR_EVENT, + MF_EVENT, ++ CXL_POISON_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index adc97a4..c31baa0 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -559,6 +559,71 @@ int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) + } + #endif + ++#ifdef HAVE_CXL ++/* ++ * Table and functions to handle cxl:cxl_poison ++ */ ++static const struct db_fields cxl_poison_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "trace_type", .type = "TEXT" }, ++ { .name = "region", .type = "TEXT" }, ++ { .name = "region_uuid", .type = "TEXT" }, ++ { .name = "hpa", .type = "INTEGER" }, ++ { .name = "dpa", .type = "INTEGER" }, ++ { .name = "dpa_length", .type = "INTEGER" }, ++ { .name = "source", .type = "TEXT" }, ++ { .name = "flags", .type = "INTEGER" }, ++ { .name = "overflow_ts", .type = "TEXT" }, ++}; ++ ++static const struct db_table_descriptor cxl_poison_event_tab = { ++ .name = "cxl_poison_event", ++ .fields = cxl_poison_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_poison_event_fields), ++}; ++ ++int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_poison_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_poison_event store: %p\n", priv->stmt_cxl_poison_event); ++ ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 1, ev->timestamp, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 2, ev->memdev, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 3, ev->host, -1, NULL); ++ 
sqlite3_bind_int64(priv->stmt_cxl_poison_event, 4, ev->serial); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 5, ev->trace_type, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 6, ev->region, -1, NULL); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 7, ev->uuid, -1, NULL); ++ sqlite3_bind_int64(priv->stmt_cxl_poison_event, 8, ev->hpa); ++ sqlite3_bind_int64(priv->stmt_cxl_poison_event, 9, ev->dpa); ++ sqlite3_bind_int(priv->stmt_cxl_poison_event, 10, ev->dpa_length); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 11, ev->source, -1, NULL); ++ sqlite3_bind_int(priv->stmt_cxl_poison_event, 12, ev->flags); ++ sqlite3_bind_text(priv->stmt_cxl_poison_event, 13, ev->overflow_ts, -1, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_poison_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do cxl_poison_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_poison_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset cxl_poison_event on sqlite: error = %d\n", ++ rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} ++#endif ++ + /* + * Generic code + */ +@@ -900,6 +965,16 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + } + #endif + ++#ifdef HAVE_CXL ++ rc = ras_mc_create_table(priv, &cxl_poison_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_poison_event, ++ &cxl_poison_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } ++#endif ++ + ras->db_priv = priv; + return 0; + +@@ -1019,6 +1094,16 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + } + #endif + ++#ifdef HAVE_CXL ++ if (priv->stmt_cxl_poison_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_poison_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_poison_event sqlite: error = %d\n", ++ cpu, rc); ++ } ++#endif ++ + rc = sqlite3_close_v2(db); + if (rc != 
SQLITE_OK) + log(TERM, LOG_ERR, +diff --git a/ras-record.h b/ras-record.h +index 219f10b..fd15215 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -114,6 +114,22 @@ struct ras_mf_event { + const char *action_result; + }; + ++struct ras_cxl_poison_event { ++ char timestamp[64]; ++ const char *memdev; ++ const char *host; ++ uint64_t serial; ++ const char *trace_type; ++ const char *region; ++ const char *uuid; ++ uint64_t hpa; ++ uint64_t dpa; ++ uint32_t dpa_length; ++ const char *source; ++ uint8_t flags; ++ char overflow_ts[64]; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -123,6 +139,7 @@ struct mce_event; + struct devlink_event; + struct diskerror_event; + struct ras_mf_event; ++struct ras_cxl_poison_event; + + #ifdef HAVE_SQLITE3 + +@@ -155,6 +172,9 @@ struct sqlite3_priv { + #ifdef HAVE_MEMORY_FAILURE + sqlite3_stmt *stmt_mf_event; + #endif ++#ifdef HAVE_CXL ++ sqlite3_stmt *stmt_cxl_poison_event; ++#endif + }; + + struct db_fields { +@@ -182,6 +202,7 @@ int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev); + int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev); + int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev); + int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); ++int ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -195,6 +216,7 @@ static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_ev + static inline int ras_store_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; }; + static inline int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; }; + static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; ++static inline int 
ras_store_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 62d5eb7..3daecc0 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -331,6 +331,46 @@ static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev) + return 0; + } + ++static int set_cxl_poison_event_backtrace(char *buf, struct ras_cxl_poison_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "trace_type=%s\n" \ ++ "region=%s\n" \ ++ "region_uuid=%s\n" \ ++ "hpa=0x%lx\n" \ ++ "dpa=0x%lx\n" \ ++ "dpa_length=0x%x\n" \ ++ "source=%s\n" \ ++ "flags=%u\n" \ ++ "overflow_timestamp=%s\n", \ ++ ev->timestamp, \ ++ ev->memdev, \ ++ ev->host, \ ++ ev->serial, \ ++ ev->trace_type, \ ++ ev->region, \ ++ ev->uuid, \ ++ ev->hpa, \ ++ ev->dpa, \ ++ ev->dpa_length, \ ++ ev->source, \ ++ ev->flags, \ ++ ev->overflow_ts); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -368,6 +408,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case MF_EVENT: + rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev); + break; ++ case CXL_POISON_EVENT: ++ rc = set_cxl_poison_event_backtrace(buf, (struct ras_cxl_poison_event *)ev); ++ break; + default: + return -1; + } +@@ -776,3 +819,47 @@ mf_fail: + else + return -1; + } ++ ++int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_poison_fail; ++ ++ rc = commit_report_backtrace(sockfd, 
CXL_POISON_EVENT, ev); ++ if (rc < 0) ++ goto cxl_poison_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl-poison"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_poison_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL poison"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_poison_fail; ++ ++ done = 1; ++ ++cxl_poison_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index e605eb1..d1591ce 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -39,6 +39,7 @@ int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev); + int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev); + int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev); + int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); ++int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev); + + #else + +@@ -50,6 +51,7 @@ static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_ev + static inline int ras_report_devlink_event(struct ras_events *ras, struct devlink_event *ev) { return 0; }; + static inline int ras_report_diskerror_event(struct ras_events *ras, struct diskerror_event *ev) { return 0; }; + static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; ++static inline int ras_report_cxl_poison_event(struct ras_events *ras, struct ras_cxl_poison_event *ev) { return 0; }; + + #endif + diff --git a/7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch b/7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch new file mode 100644 index 0000000..b6092db --- /dev/null +++ b/7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513.patch @@ -0,0 +1,97 @@ +commit 7be2edbf863b7acf7e1cab10c2b9f9bf51b3d513 +Author: Shiju Jose +Date: Tue Apr 4 16:07:21 2023 +0100 + + rasdaemon: Add common function to 
get timestamp for the event + + Add common function to get the timestamp for the event + reported. + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 59534a4..d540ebb 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -42,6 +42,20 @@ static void convert_timestamp(unsigned long long ts, char *ts_ptr, uint16_t size + size); + } + ++static void get_timestamp(struct trace_seq *s, struct tep_record *record, ++ struct ras_events *ras, char *ts_ptr, uint16_t size) ++{ ++ time_t now; ++ struct tm *tm; ++ ++ now = record->ts / user_hz + ras->uptime_diff; ++ tm = localtime(&now); ++ if (tm) ++ strftime(ts_ptr, size, "%Y-%m-%d %H:%M:%S %z", tm); ++ else ++ strncpy(ts_ptr, "1970-01-01 00:00:00 +0000", size); ++} ++ + /* Poison List: Payload out flags */ + #define CXL_POISON_FLAG_MORE BIT(0) + #define CXL_POISON_FLAG_OVERFLOW BIT(1) +@@ -70,17 +84,9 @@ int ras_cxl_poison_event_handler(struct trace_seq *s, + int len; + unsigned long long val; + struct ras_events *ras = context; +- time_t now; +- struct tm *tm; + struct ras_cxl_poison_event ev; + +- now = record->ts / user_hz + ras->uptime_diff; +- tm = localtime(&now); +- if (tm) +- strftime(ev.timestamp, sizeof(ev.timestamp), +- "%Y-%m-%d %H:%M:%S %z", tm); +- else +- strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; + +@@ -285,19 +291,11 @@ int ras_cxl_aer_ue_event_handler(struct trace_seq *s, + { + int len, i; + unsigned long long val; +- time_t now; +- struct tm *tm; + struct ras_events *ras = context; + struct ras_cxl_aer_ue_event ev; + + memset(&ev, 0, sizeof(ev)); +- now = record->ts / user_hz + ras->uptime_diff; +- tm = localtime(&now); +- if (tm) +- strftime(ev.timestamp, sizeof(ev.timestamp), +- "%Y-%m-%d %H:%M:%S %z", tm); +- else +- strncpy(ev.timestamp, 
"1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; + +@@ -380,18 +378,10 @@ int ras_cxl_aer_ce_event_handler(struct trace_seq *s, + { + int len; + unsigned long long val; +- time_t now; +- struct tm *tm; + struct ras_events *ras = context; + struct ras_cxl_aer_ce_event ev; + +- now = record->ts / user_hz + ras->uptime_diff; +- tm = localtime(&now); +- if (tm) +- strftime(ev.timestamp, sizeof(ev.timestamp), +- "%Y-%m-%d %H:%M:%S %z", tm); +- else +- strncpy(ev.timestamp, "1970-01-01 00:00:00 +0000", sizeof(ev.timestamp)); ++ get_timestamp(s, record, ras, (char *)&ev.timestamp, sizeof(ev.timestamp)); + if (trace_seq_printf(s, "%s ", ev.timestamp) <= 0) + return -1; + diff --git a/8f79833e3d78424f4a594985fbeb91890f4af81c.patch b/8f79833e3d78424f4a594985fbeb91890f4af81c.patch new file mode 100644 index 0000000..b509270 --- /dev/null +++ b/8f79833e3d78424f4a594985fbeb91890f4af81c.patch @@ -0,0 +1,78 @@ +commit 8f79833e3d78424f4a594985fbeb91890f4af81c +Author: Shiju Jose +Date: Mon Mar 4 11:49:50 2024 +0000 + + rasdaemon: Fix build warnings unused variable if AMP RAS errors is not enabled + + This patch fixes following build warnings unused variable if AMP RAS errors + is not enabled(--enable-amp-ns-decode). 
+ + ================================================== + ras-aer-handler.c: In function ‘ras_aer_event_handler’: + ras-aer-handler.c:72:21: warning: unused variable ‘fn’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~ + ras-aer-handler.c:72:16: warning: unused variable ‘dev’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~~ + ras-aer-handler.c:72:11: warning: unused variable ‘bus’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~~ + ras-aer-handler.c:72:6: warning: unused variable ‘seg’ [-Wunused-variable] + int seg, bus, dev, fn; + ^~~ + ras-aer-handler.c:71:10: warning: variable ‘sel_data’ set but not used [-Wunused-but-set-variable] + uint8_t sel_data[5]; + ^~~~~~~~ + ras-aer-handler.c:70:7: warning: unused variable ‘ipmi_add_sel’ [-Wunused-variable] + char ipmi_add_sel[105]; + ^~~~~~~~~~~~ + ================================================== + + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-aer-handler.c b/ras-aer-handler.c +index bb1a6f6..29f6551 100644 +--- a/ras-aer-handler.c ++++ b/ras-aer-handler.c +@@ -67,9 +67,11 @@ int ras_aer_event_handler(struct trace_seq *s, + struct tm *tm; + struct ras_aer_event ev; + char buf[BUF_LEN]; ++#ifdef HAVE_AMP_NS_DECODE + char ipmi_add_sel[105]; + uint8_t sel_data[5]; + int seg, bus, dev, fn; ++#endif + + /* + * Newer kernels (3.10-rc1 or upper) provide an uptime clock. 
+@@ -132,19 +134,27 @@ int ras_aer_event_handler(struct trace_seq *s, + switch (severity_val) { + case HW_EVENT_AER_UNCORRECTED_NON_FATAL: + ev.error_type = "Uncorrected (Non-Fatal)"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xca; ++#endif + break; + case HW_EVENT_AER_UNCORRECTED_FATAL: + ev.error_type = "Uncorrected (Fatal)"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xca; ++#endif + break; + case HW_EVENT_AER_CORRECTED: + ev.error_type = "Corrected"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xbf; ++#endif + break; + default: + ev.error_type = "Unknown severity"; ++#ifdef HAVE_AMP_NS_DECODE + sel_data[0] = 0xbf; ++#endif + } + trace_seq_puts(s, ev.error_type); + diff --git a/93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch b/93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch new file mode 100644 index 0000000..4952349 --- /dev/null +++ b/93ca96b66c917af37b2ae9295dc5df46a7d64dd2.patch @@ -0,0 +1,82 @@ +commit b6506f22fb2d7f44d9d633d44656dff2a94f257e +Author: Shiju Jose +Date: Mon Feb 12 10:49:10 2024 +0000 + + rasdaemon: ras-mc-ctl: Add support for CXL poison trace events + + Add support for CXL poison events to the ras-mc-ctl tool. 
+ + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + (cherry picked from commit 93ca96b66c917af37b2ae9295dc5df46a7d64dd2) + +diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in +index 6a319a7..16b0589 100755 +--- a/util/ras-mc-ctl.in ++++ b/util/ras-mc-ctl.in +@@ -1382,6 +1382,22 @@ sub summary + print "No CXL overflow errors.\n\n"; + } + $query_handle->finish; ++ ++ # CXL poison errors ++ $query = "select memdev, count(*) from cxl_poison_event$conf{opt}{since} group by memdev"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($memdev, $count)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "\t$memdev errors: $count\n"; ++ } ++ if ($out ne "") { ++ print "CXL poison events summary:\n$out\n"; ++ } else { ++ print "No CXL poison errors.\n\n"; ++ } ++ $query_handle->finish; + } + + # extlog errors +@@ -1491,6 +1507,7 @@ sub errors + my ($pfn, $page_type, $action_result); + my ($memdev, $host, $serial, $error_status, $first_error, $header_log); + my ($log_type, $first_ts, $last_ts); ++ my ($trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts); + + my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); + +@@ -1636,6 +1653,34 @@ sub errors + } else { + print "No CXL overflow errors.\n\n"; + } ++ ++ # CXL poison errors ++ $query = "select id, timestamp, memdev, host, serial, trace_type, region, region_uuid, hpa, dpa, dpa_length, source, flags, overflow_ts from cxl_poison_event$conf{opt}{since} order by id"; ++ $query_handle = $dbh->prepare($query); ++ $query_handle->execute(); ++ $query_handle->bind_columns(\($id, $timestamp, $memdev, $host, $serial, $trace_type, $region, $region_uuid, $hpa, $dpa, $dpa_length, $source, $flags, $overflow_ts)); ++ $out = ""; ++ while($query_handle->fetch()) { ++ $out .= "$id $timestamp error: "; ++ $out .= "memdev=$memdev, " if (defined $memdev && length $memdev); ++ $out .= "host=$host, " if (defined $host 
&& length $host); ++ $out .= sprintf "serial=0x%llx, ", $serial if (defined $serial && length $serial); ++ $out .= "trace_type=$trace_type, " if (defined $trace_type && length $trace_type); ++ $out .= "region=$region, " if (defined $region && length $region); ++ $out .= "region_uuid=$region_uuid, " if (defined $region_uuid && length $region_uuid); ++ $out .= sprintf "hpa=0x%llx, ", $hpa if (defined $hpa && length $hpa); ++ $out .= sprintf "dpa=0x%llx, ", $dpa if (defined $dpa && length $dpa); ++ $out .= sprintf "dpa_length=0x%x, ", $dpa_length if (defined $dpa_length && length $dpa_length); ++ $out .= "source=$source, " if (defined $source && length $source); ++ $out .= sprintf "flags=%d, ", $flags if (defined $flags && length $flags); ++ $out .= "overflow timestamp=$overflow_ts " if (defined $overflow_ts && length $overflow_ts); ++ $out .= "\n"; ++ } ++ if ($out ne "") { ++ print "CXL poison events:\n$out\n"; ++ } else { ++ print "No CXL poison errors.\n\n"; ++ } + } + + # Extlog errors diff --git a/9a2f6186db2622788f8868d8ec082684d6a06d4d.patch b/9a2f6186db2622788f8868d8ec082684d6a06d4d.patch new file mode 100644 index 0000000..c85f54e --- /dev/null +++ b/9a2f6186db2622788f8868d8ec082684d6a06d4d.patch @@ -0,0 +1,559 @@ +commit 9a2f6186db2622788f8868d8ec082684d6a06d4d +Author: Shiju Jose +Date: Wed Apr 5 13:28:20 2023 +0100 + + rasdaemon: Add support for the CXL dram events + + Add support to log and record the CXL dram events. 
+ + Signed-off-by: Shiju Jose + Signed-off-by: Mauro Carvalho Chehab + +diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c +index 2de96f6..64b0b50 100644 +--- a/ras-cxl-handler.c ++++ b/ras-cxl-handler.c +@@ -865,3 +865,154 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s, + + return 0; + } ++ ++/* ++ * DRAM Event Record - DER ++ * ++ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 ++ */ ++#define CXL_DER_VALID_CHANNEL BIT(0) ++#define CXL_DER_VALID_RANK BIT(1) ++#define CXL_DER_VALID_NIBBLE BIT(2) ++#define CXL_DER_VALID_BANK_GROUP BIT(3) ++#define CXL_DER_VALID_BANK BIT(4) ++#define CXL_DER_VALID_ROW BIT(5) ++#define CXL_DER_VALID_COLUMN BIT(6) ++#define CXL_DER_VALID_CORRECTION_MASK BIT(7) ++ ++int ras_cxl_dram_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context) ++{ ++ int len, i; ++ unsigned long long val; ++ struct ras_events *ras = context; ++ struct ras_cxl_dram_event ev; ++ ++ memset(&ev, 0, sizeof(ev)); ++ if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa", record, &val, 1) < 0) ++ return -1; ++ ev.dpa = val; ++ if (trace_seq_printf(s, "dpa:0x%llx ", (unsigned long long)ev.dpa) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "dpa_flags", record, &val, 1) < 0) ++ return -1; ++ ev.dpa_flags = val; ++ if (trace_seq_printf(s, "dpa_flags:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.dpa_flags, cxl_dpa_flags, ARRAY_SIZE(cxl_dpa_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "descriptor", record, &val, 1) < 0) ++ return -1; ++ ev.descriptor = val; ++ if (trace_seq_printf(s, "descriptor:") <= 0) ++ return -1; ++ if (decode_cxl_event_flags(s, ev.descriptor, cxl_gmer_event_desc_flags, ++ ARRAY_SIZE(cxl_gmer_event_desc_flags)) < 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "type", record, &val, 1) < 0) ++ return -1; ++ ev.type = val; ++ if (trace_seq_printf(s, "type:%s ", 
get_cxl_type_str(cxl_gmer_mem_event_type, ++ ARRAY_SIZE(cxl_gmer_mem_event_type), ev.type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "transaction_type", record, &val, 1) < 0) ++ return -1; ++ ev.transaction_type = val; ++ if (trace_seq_printf(s, "transaction_type:%s ", ++ get_cxl_type_str(cxl_gmer_trans_type, ++ ARRAY_SIZE(cxl_gmer_trans_type), ++ ev.transaction_type)) <= 0) ++ return -1; ++ ++ if (tep_get_field_val(s, event, "validity_flags", record, &val, 1) < 0) ++ return -1; ++ ev.validity_flags = val; ++ ++ if (ev.validity_flags & CXL_DER_VALID_CHANNEL) { ++ if (tep_get_field_val(s, event, "channel", record, &val, 1) < 0) ++ return -1; ++ ev.channel = val; ++ if (trace_seq_printf(s, "channel:%u ", ev.channel) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_RANK) { ++ if (tep_get_field_val(s, event, "rank", record, &val, 1) < 0) ++ return -1; ++ ev.rank = val; ++ if (trace_seq_printf(s, "rank:%u ", ev.rank) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_NIBBLE) { ++ if (tep_get_field_val(s, event, "nibble_mask", record, &val, 1) < 0) ++ return -1; ++ ev.nibble_mask = val; ++ if (trace_seq_printf(s, "nibble_mask:%u ", ev.nibble_mask) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_BANK_GROUP) { ++ if (tep_get_field_val(s, event, "bank_group", record, &val, 1) < 0) ++ return -1; ++ ev.bank_group = val; ++ if (trace_seq_printf(s, "bank_group:%u ", ev.bank_group) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_BANK) { ++ if (tep_get_field_val(s, event, "bank", record, &val, 1) < 0) ++ return -1; ++ ev.bank = val; ++ if (trace_seq_printf(s, "bank:%u ", ev.bank) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_ROW) { ++ if (tep_get_field_val(s, event, "row", record, &val, 1) < 0) ++ return -1; ++ ev.row = val; ++ if (trace_seq_printf(s, "row:%u ", ev.row) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_COLUMN) { ++ if 
(tep_get_field_val(s, event, "column", record, &val, 1) < 0) ++ return -1; ++ ev.column = val; ++ if (trace_seq_printf(s, "column:%u ", ev.column) <= 0) ++ return -1; ++ } ++ ++ if (ev.validity_flags & CXL_DER_VALID_CORRECTION_MASK) { ++ ev.cor_mask = tep_get_field_raw(s, event, "cor_mask", record, &len, 1); ++ if (!ev.cor_mask) ++ return -1; ++ if (trace_seq_printf(s, "correction_mask:") <= 0) ++ return -1; ++ for (i = 0; i < CXL_EVENT_DER_CORRECTION_MASK_SIZE; i++) { ++ if (trace_seq_printf(s, "%02x ", ev.cor_mask[i]) <= 0) ++ break; ++ } ++ } ++ ++ /* Insert data into the SGBD */ ++#ifdef HAVE_SQLITE3 ++ ras_store_cxl_dram_event(ras, &ev); ++#endif ++ ++#ifdef HAVE_ABRT_REPORT ++ /* Report event to ABRT */ ++ ras_report_cxl_dram_event(ras, &ev); ++#endif ++ ++ return 0; ++} +diff --git a/ras-cxl-handler.h b/ras-cxl-handler.h +index 3adca4a..35455af 100644 +--- a/ras-cxl-handler.h ++++ b/ras-cxl-handler.h +@@ -38,4 +38,7 @@ int ras_cxl_generic_event_handler(struct trace_seq *s, + int ras_cxl_general_media_event_handler(struct trace_seq *s, + struct tep_record *record, + struct tep_event *event, void *context); ++int ras_cxl_dram_event_handler(struct trace_seq *s, ++ struct tep_record *record, ++ struct tep_event *event, void *context); + #endif +diff --git a/ras-events.c b/ras-events.c +index 978dee4..d27e0c4 100644 +--- a/ras-events.c ++++ b/ras-events.c +@@ -251,6 +251,7 @@ int toggle_ras_mc_event(int enable) + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_overflow", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable); + rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable); ++ rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_dram", enable); + #endif + + free_ras: +@@ -1072,6 +1073,14 @@ int handle_ras_events(int record_events) + else + log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", + "cxl", "cxl_general_media"); ++ ++ rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_dram", ++ ras_cxl_dram_event_handler, NULL, 
CXL_DRAM_EVENT); ++ if (!rc) ++ num_events++; ++ else ++ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", ++ "cxl", "cxl_dram"); + #endif + + if (!num_events) { +diff --git a/ras-events.h b/ras-events.h +index 9b83df3..d192a6b 100644 +--- a/ras-events.h ++++ b/ras-events.h +@@ -45,6 +45,7 @@ enum { + CXL_OVERFLOW_EVENT, + CXL_GENERIC_EVENT, + CXL_GENERAL_MEDIA_EVENT, ++ CXL_DRAM_EVENT, + NR_EVENTS + }; + +diff --git a/ras-record.c b/ras-record.c +index 507a58e..fffa81c 100644 +--- a/ras-record.c ++++ b/ras-record.c +@@ -915,6 +915,83 @@ int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_gen + + return rc; + } ++ ++/* ++ * Table and functions to handle cxl:cxl_dram_event ++ */ ++static const struct db_fields cxl_dram_event_fields[] = { ++ { .name = "id", .type = "INTEGER PRIMARY KEY" }, ++ { .name = "timestamp", .type = "TEXT" }, ++ { .name = "memdev", .type = "TEXT" }, ++ { .name = "host", .type = "TEXT" }, ++ { .name = "serial", .type = "INTEGER" }, ++ { .name = "log_type", .type = "TEXT" }, ++ { .name = "hdr_uuid", .type = "TEXT" }, ++ { .name = "hdr_flags", .type = "INTEGER" }, ++ { .name = "hdr_handle", .type = "INTEGER" }, ++ { .name = "hdr_related_handle", .type = "INTEGER" }, ++ { .name = "hdr_ts", .type = "TEXT" }, ++ { .name = "hdr_length", .type = "INTEGER" }, ++ { .name = "hdr_maint_op_class", .type = "INTEGER" }, ++ { .name = "dpa", .type = "INTEGER" }, ++ { .name = "dpa_flags", .type = "INTEGER" }, ++ { .name = "descriptor", .type = "INTEGER" }, ++ { .name = "type", .type = "INTEGER" }, ++ { .name = "transaction_type", .type = "INTEGER" }, ++ { .name = "channel", .type = "INTEGER" }, ++ { .name = "rank", .type = "INTEGER" }, ++ { .name = "nibble_mask", .type = "INTEGER" }, ++ { .name = "bank_group", .type = "INTEGER" }, ++ { .name = "bank", .type = "INTEGER" }, ++ { .name = "row", .type = "INTEGER" }, ++ { .name = "column", .type = "INTEGER" }, ++ { .name = "cor_mask", .type = "BLOB" }, ++}; ++ ++static const struct 
db_table_descriptor cxl_dram_event_tab = { ++ .name = "cxl_dram_event", ++ .fields = cxl_dram_event_fields, ++ .num_fields = ARRAY_SIZE(cxl_dram_event_fields), ++}; ++ ++int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) ++{ ++ int rc; ++ struct sqlite3_priv *priv = ras->db_priv; ++ ++ if (!priv || !priv->stmt_cxl_dram_event) ++ return 0; ++ log(TERM, LOG_INFO, "cxl_dram_event store: %p\n", ++ priv->stmt_cxl_dram_event); ++ ++ ras_store_cxl_common_hdr(priv->stmt_cxl_dram_event, &ev->hdr); ++ sqlite3_bind_int64(priv->stmt_cxl_dram_event, 13, ev->dpa); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 14, ev->dpa_flags); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 15, ev->descriptor); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 16, ev->type); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 17, ev->transaction_type); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 18, ev->channel); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 19, ev->rank); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 20, ev->nibble_mask); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 21, ev->bank_group); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 22, ev->bank); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 23, ev->row); ++ sqlite3_bind_int(priv->stmt_cxl_dram_event, 24, ev->column); ++ sqlite3_bind_blob(priv->stmt_cxl_dram_event, 25, ev->cor_mask, ++ CXL_EVENT_DER_CORRECTION_MASK_SIZE, NULL); ++ ++ rc = sqlite3_step(priv->stmt_cxl_dram_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed to do stmt_cxl_dram_event step on sqlite: error = %d\n", rc); ++ rc = sqlite3_reset(priv->stmt_cxl_dram_event); ++ if (rc != SQLITE_OK && rc != SQLITE_DONE) ++ log(TERM, LOG_ERR, ++ "Failed reset stmt_cxl_dram_event on sqlite: error = %d\n", rc); ++ log(TERM, LOG_INFO, "register inserted at db\n"); ++ ++ return rc; ++} + #endif + + /* +@@ -1306,6 +1383,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) + if (rc != SQLITE_OK) 
+ goto error; + } ++ ++ rc = ras_mc_create_table(priv, &cxl_dram_event_tab); ++ if (rc == SQLITE_OK) { ++ rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_dram_event, ++ &cxl_dram_event_tab); ++ if (rc != SQLITE_OK) ++ goto error; ++ } + #endif + + ras->db_priv = priv; +@@ -1475,6 +1560,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras) + "cpu %u: Failed to finalize cxl_general_media_event sqlite: error = %d\n", + cpu, rc); + } ++ ++ if (priv->stmt_cxl_dram_event) { ++ rc = sqlite3_finalize(priv->stmt_cxl_dram_event); ++ if (rc != SQLITE_OK) ++ log(TERM, LOG_ERR, ++ "cpu %u: Failed to finalize cxl_dram_event sqlite: error = %d\n", ++ cpu, rc); ++ } + #endif + + rc = sqlite3_close_v2(db); +diff --git a/ras-record.h b/ras-record.h +index 37c32de..480ff92 100644 +--- a/ras-record.h ++++ b/ras-record.h +@@ -135,6 +135,7 @@ struct ras_cxl_poison_event { + #define CXL_HEADERLOG_SIZE_U32 (SZ_512 / sizeof(uint32_t)) + #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 + #define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 ++#define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 + + struct ras_cxl_aer_ue_event { + char timestamp[64]; +@@ -199,6 +200,24 @@ struct ras_cxl_general_media_event { + uint16_t validity_flags; + }; + ++struct ras_cxl_dram_event { ++ struct ras_cxl_event_common_hdr hdr; ++ uint64_t dpa; ++ uint8_t dpa_flags; ++ uint8_t descriptor; ++ uint8_t type; ++ uint8_t transaction_type; ++ uint8_t channel; ++ uint8_t rank; ++ uint32_t nibble_mask; ++ uint8_t bank_group; ++ uint8_t bank; ++ uint32_t row; ++ uint16_t column; ++ uint8_t *cor_mask; ++ uint16_t validity_flags; ++}; ++ + struct ras_mc_event; + struct ras_aer_event; + struct ras_extlog_event; +@@ -214,6 +233,7 @@ struct ras_cxl_aer_ce_event; + struct ras_cxl_overflow_event; + struct ras_cxl_generic_event; + struct ras_cxl_general_media_event; ++struct ras_cxl_dram_event; + + #ifdef HAVE_SQLITE3 + +@@ -253,6 +273,7 @@ struct sqlite3_priv { + sqlite3_stmt *stmt_cxl_overflow_event; + sqlite3_stmt 
*stmt_cxl_generic_event; + sqlite3_stmt *stmt_cxl_general_media_event; ++ sqlite3_stmt *stmt_cxl_dram_event; + #endif + }; + +@@ -287,6 +308,7 @@ int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_eve + int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); ++int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); + + #else + static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; +@@ -306,6 +328,7 @@ static inline int ras_store_cxl_aer_ce_event(struct ras_events *ras, struct ras_ + static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; ++static inline int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; + + #endif + +diff --git a/ras-report.c b/ras-report.c +index 725dc9b..21180b1 100644 +--- a/ras-report.c ++++ b/ras-report.c +@@ -543,6 +543,68 @@ static int set_cxl_general_media_event_backtrace(char *buf, struct ras_cxl_gener + return 0; + } + ++static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev) ++{ ++ char bt_buf[MAX_BACKTRACE_SIZE]; ++ ++ if (!buf || !ev) ++ return -1; ++ ++ sprintf(bt_buf, "BACKTRACE=" \ ++ "timestamp=%s\n" \ ++ "memdev=%s\n" \ ++ "host=%s\n" \ ++ "serial=0x%lx\n" \ ++ "log_type=%s\n" \ ++ "hdr_uuid=%s\n" \ ++ "hdr_flags=0x%x\n" \ ++ "hdr_handle=0x%x\n" \ ++ "hdr_related_handle=0x%x\n" \ ++ "hdr_timestamp=%s\n" \ ++ "hdr_length=%u\n" 
\ ++ "hdr_maint_op_class=%u\n" \ ++ "dpa=0x%lx\n" \ ++ "dpa_flags=%u\n" \ ++ "descriptor=%u\n" \ ++ "type=%u\n" \ ++ "transaction_type=%u\n" \ ++ "channel=%u\n" \ ++ "rank=%u\n" \ ++ "nibble_mask=%u\n" \ ++ "bank_group=%u\n" \ ++ "bank=%u\n" \ ++ "row=%u\n" \ ++ "column=%u\n", \ ++ ev->hdr.timestamp, \ ++ ev->hdr.memdev, \ ++ ev->hdr.host, \ ++ ev->hdr.serial, \ ++ ev->hdr.log_type, \ ++ ev->hdr.hdr_uuid, \ ++ ev->hdr.hdr_flags, \ ++ ev->hdr.hdr_handle, \ ++ ev->hdr.hdr_related_handle, \ ++ ev->hdr.hdr_timestamp, \ ++ ev->hdr.hdr_length, \ ++ ev->hdr.hdr_maint_op_class, \ ++ ev->dpa, \ ++ ev->dpa_flags, \ ++ ev->descriptor, \ ++ ev->type, \ ++ ev->transaction_type, \ ++ ev->channel, \ ++ ev->rank, \ ++ ev->nibble_mask, \ ++ ev->bank_group, \ ++ ev->bank, \ ++ ev->row, \ ++ ev->column); ++ ++ strcat(buf, bt_buf); ++ ++ return 0; ++} ++ + static int commit_report_backtrace(int sockfd, int type, void *ev){ + char buf[MAX_BACKTRACE_SIZE]; + char *pbuf = buf; +@@ -598,6 +660,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){ + case CXL_GENERAL_MEDIA_EVENT: + rc = set_cxl_general_media_event_backtrace(buf, (struct ras_cxl_general_media_event *)ev); + break; ++ case CXL_DRAM_EVENT: ++ rc = set_cxl_dram_event_backtrace(buf, (struct ras_cxl_dram_event *)ev); ++ break; + default: + return -1; + } +@@ -1271,3 +1336,47 @@ cxl_general_media_fail: + else + return -1; + } ++ ++int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) ++{ ++ char buf[MAX_MESSAGE_SIZE]; ++ int sockfd = 0; ++ int done = 0; ++ int rc = -1; ++ ++ memset(buf, 0, sizeof(buf)); ++ ++ sockfd = setup_report_socket(); ++ if (sockfd < 0) ++ return -1; ++ ++ rc = commit_report_basic(sockfd); ++ if (rc < 0) ++ goto cxl_dram_fail; ++ ++ rc = commit_report_backtrace(sockfd, CXL_DRAM_EVENT, ev); ++ if (rc < 0) ++ goto cxl_dram_fail; ++ ++ sprintf(buf, "ANALYZER=%s", "rasdaemon-cxl_dram_event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 
1) ++ goto cxl_dram_fail; ++ ++ sprintf(buf, "REASON=%s", "CXL DRAM Event"); ++ rc = write(sockfd, buf, strlen(buf) + 1); ++ if (rc < strlen(buf) + 1) ++ goto cxl_dram_fail; ++ ++ done = 1; ++ ++cxl_dram_fail: ++ ++ if (sockfd >= 0) ++ close(sockfd); ++ ++ if (done) ++ return 0; ++ else ++ return -1; ++} +diff --git a/ras-report.h b/ras-report.h +index d9ec7df..1ad00e0 100644 +--- a/ras-report.h ++++ b/ras-report.h +@@ -45,6 +45,7 @@ int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras_cxl_aer_ce_ev + int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev); + int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev); + int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev); ++int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev); + + #else + +@@ -62,6 +63,7 @@ static inline int ras_report_cxl_aer_ce_event(struct ras_events *ras, struct ras + static inline int ras_report_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow_event *ev) { return 0; }; + static inline int ras_report_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; }; + static inline int ras_report_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; }; ++static inline int ras_report_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; }; + + #endif + diff --git a/SOURCES/0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch b/SOURCES/0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch deleted file mode 100644 index 852eb4f..0000000 --- a/SOURCES/0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch +++ /dev/null @@ -1,85 +0,0 @@ -commit 0862a096c3a1d0f993703ab3299f1ddfadf53d7f -Author: Shiju Jose -Date: Tue Aug 11 13:31:46 2020 +0100 - - rasdaemon: ras-mc-ctl: Add ARM processor error information - - Add supporting ARM processor error 
in the ras-mc-ctl tool. - - Signed-off-by: Shiju Jose - Signed-off-by: Mauro Carvalho Chehab - ---- - util/ras-mc-ctl.in | 40 ++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 40 insertions(+) - ---- rasdaemon-0.6.1.orig/util/ras-mc-ctl.in 2021-10-06 14:14:25.000440090 -0400 -+++ rasdaemon-0.6.1/util/ras-mc-ctl.in 2021-10-06 14:15:59.995598590 -0400 -@@ -1124,6 +1124,7 @@ sub summary - my ($query, $query_handle, $out); - my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg); - my ($etype, $severity, $etype_string, $severity_string); -+ my ($affinity, $mpidr); - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -@@ -1159,6 +1160,22 @@ sub summary - } - $query_handle->finish; - -+ # ARM processor arm_event errors -+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($affinity, $mpidr, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count errors\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events summary:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; -+ - # extlog errors - $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; - $query_handle = $dbh->prepare($query); -@@ -1202,6 +1219,7 @@ sub errors - my ($query, $query_handle, $id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out); - my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location); - my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data); -+ my ($error_count, $affinity, $mpidr, $r_state, $psci_state); - - my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {}); - -@@ -1241,6 +1259,28 @@ sub 
errors - } - $query_handle->finish; - -+ # ARM processor arm_event errors -+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $timestamp error: "; -+ $out .= "error_count=$error_count, " if ($error_count); -+ $out .= "affinity_level=$affinity, "; -+ $out .= sprintf "mpidr=0x%x, ", $mpidr; -+ $out .= sprintf "running_state=0x%x, ", $r_state; -+ $out .= sprintf "psci_state=0x%x", $psci_state; -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; -+ - # Extlog errors - $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; - $query_handle = $dbh->prepare($query); diff --git a/SOURCES/16d929b024c31d54a7f8a72eab094376c7be27f5.patch b/SOURCES/16d929b024c31d54a7f8a72eab094376c7be27f5.patch deleted file mode 100644 index ab66f52..0000000 --- a/SOURCES/16d929b024c31d54a7f8a72eab094376c7be27f5.patch +++ /dev/null @@ -1,32 +0,0 @@ -commit 16d929b024c31d54a7f8a72eab094376c7be27f5 -Author: Mauro Carvalho Chehab -Date: Wed May 26 10:20:39 2021 +0200 - - Makefile.am: fix build header rules - - non-standard-hisilicon.h was added twice; - ras-memory-failure-handler.h is missing. - - Due to that, the tarball becomes incomplete, causing build - errors. - - While here, also adjust .travis.yml to use --enable-all. 
- - Signed-off-by: Mauro Carvalho Chehab - ---- - Makefile.am | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - ---- a/Makefile.am 2021-10-13 13:27:53.402685179 -0400 -+++ b/Makefile.am 2021-10-13 13:28:11.664525173 -0400 -@@ -54,7 +54,8 @@ rasdaemon_LDADD = -lpthread $(SQLITE3_LI - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ - ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \ -- ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h -+ ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \ -+ ras-memory-failure-handler.h - - # This rule can't be called with more than one Makefile job (like make -j8) - # I can't figure out a way to fix that diff --git a/SOURCES/2290d65b97311dd5736838f1e285355f7f357046.patch b/SOURCES/2290d65b97311dd5736838f1e285355f7f357046.patch deleted file mode 100644 index 0710974..0000000 --- a/SOURCES/2290d65b97311dd5736838f1e285355f7f357046.patch +++ /dev/null @@ -1,538 +0,0 @@ -commit 2290d65b97311dd5736838f1e285355f7f357046 -Author: Shiju Jose -Date: Mon Mar 8 16:57:26 2021 +0000 - - rasdaemon: add support for memory_failure events - - Add support to log the memory_failure kernel trace - events. 
- - Example rasdaemon log and SQLite DB output for the - memory_failure event, - ================================================= - rasdaemon: memory_failure_event store: 0x126ce8f8 - rasdaemon: register inserted at db - <...>-785 [000] 0.000024: memory_failure_event: 2020-10-02 13:27:13 -0400 pfn=0x204000000 page_type=free buddy page action_result=Delayed - - CREATE TABLE memory_failure_event (id INTEGER PRIMARY KEY, timestamp TEXT, pfn TEXT, page_type TEXT, action_result TEXT); - INSERT INTO memory_failure_event VALUES(1,'2020-10-02 13:27:13 -0400','0x204000000','free buddy page','Delayed'); - ================================================== - - Signed-off-by: Shiju Jose - Signed-off-by: Mauro Carvalho Chehab - ---- - Makefile.am | 4 - ras-events.c | 15 +++ - ras-memory-failure-handler.c | 179 +++++++++++++++++++++++++++++++++++++++++++ - ras-memory-failure-handler.h | 25 ++++++ - ras-record.c | 56 +++++++++++++ - ras-record.h | 13 +++ - ras-report.c | 68 ++++++++++++++++ - ras-report.h | 5 - - 8 files changed, 364 insertions(+), 1 deletion(-) - ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ b/ras-memory-failure-handler.c 2021-10-14 16:31:36.840657728 -0400 -@@ -0,0 +1,179 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ */ -+ -+#include -+#include -+#include -+#include "libtrace/kbuffer.h" -+#include "ras-memory-failure-handler.h" -+#include "ras-record.h" -+#include "ras-logger.h" -+#include "ras-report.h" -+ -+/* Memory failure - various types of pages */ -+enum mf_action_page_type { -+ MF_MSG_KERNEL, -+ MF_MSG_KERNEL_HIGH_ORDER, -+ MF_MSG_SLAB, -+ MF_MSG_DIFFERENT_COMPOUND, -+ MF_MSG_POISONED_HUGE, -+ MF_MSG_HUGE, -+ MF_MSG_FREE_HUGE, -+ MF_MSG_NON_PMD_HUGE, -+ MF_MSG_UNMAP_FAILED, -+ MF_MSG_DIRTY_SWAPCACHE, -+ MF_MSG_CLEAN_SWAPCACHE, -+ MF_MSG_DIRTY_MLOCKED_LRU, -+ MF_MSG_CLEAN_MLOCKED_LRU, -+ MF_MSG_DIRTY_UNEVICTABLE_LRU, -+ MF_MSG_CLEAN_UNEVICTABLE_LRU, -+ MF_MSG_DIRTY_LRU, -+ MF_MSG_CLEAN_LRU, -+ MF_MSG_TRUNCATED_LRU, -+ MF_MSG_BUDDY, -+ MF_MSG_BUDDY_2ND, -+ MF_MSG_DAX, -+ MF_MSG_UNSPLIT_THP, -+ MF_MSG_UNKNOWN, -+}; -+ -+/* Action results for various types of pages */ -+enum mf_action_result { -+ MF_IGNORED, /* Error: cannot be handled */ -+ MF_FAILED, /* Error: handling failed */ -+ MF_DELAYED, /* Will be handled later */ -+ MF_RECOVERED, /* Successfully recovered */ -+}; -+ -+/* memory failure page types */ -+static const struct { -+ int type; -+ const char *page_type; -+} mf_page_type[] = { -+ { MF_MSG_KERNEL, "reserved kernel page" }, -+ { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"}, -+ { MF_MSG_SLAB, "kernel slab page"}, -+ { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"}, -+ { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"}, -+ { MF_MSG_HUGE, "huge page"}, -+ { MF_MSG_FREE_HUGE, "free huge page"}, -+ { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"}, -+ { MF_MSG_UNMAP_FAILED, "unmapping failed page"}, -+ { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"}, -+ { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"}, -+ { MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page"}, -+ { MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page"}, -+ { MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page"}, -+ { MF_MSG_CLEAN_UNEVICTABLE_LRU, 
"clean unevictable LRU page"}, -+ { MF_MSG_DIRTY_LRU, "dirty LRU page"}, -+ { MF_MSG_CLEAN_LRU, "clean LRU page"}, -+ { MF_MSG_TRUNCATED_LRU, "already truncated LRU page"}, -+ { MF_MSG_BUDDY, "free buddy page"}, -+ { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"}, -+ { MF_MSG_DAX, "dax page"}, -+ { MF_MSG_UNSPLIT_THP, "unsplit thp"}, -+ { MF_MSG_UNKNOWN, "unknown page"}, -+}; -+ -+/* memory failure action results */ -+static const struct { -+ int result; -+ const char *action_result; -+} mf_action_result[] = { -+ { MF_IGNORED, "Ignored" }, -+ { MF_FAILED, "Failed" }, -+ { MF_DELAYED, "Delayed" }, -+ { MF_RECOVERED, "Recovered" }, -+}; -+ -+static const char *get_page_type(int page_type) -+{ -+ int i; -+ -+ for (i = 0; i < ARRAY_SIZE(mf_page_type); i++) -+ if (mf_page_type[i].type == page_type) -+ return mf_page_type[i].page_type; -+ -+ return "unknown page"; -+} -+ -+static const char *get_action_result(int result) -+{ -+ int i; -+ -+ for (i = 0; i < ARRAY_SIZE(mf_action_result); i++) -+ if (mf_action_result[i].result == result) -+ return mf_action_result[i].action_result; -+ -+ return "unknown"; -+} -+ -+ -+int ras_memory_failure_event_handler(struct trace_seq *s, -+ struct pevent_record *record, -+ struct event_format *event, void *context) -+{ -+ unsigned long long val; -+ struct ras_events *ras = context; -+ time_t now; -+ struct tm *tm; -+ struct ras_mf_event ev; -+ -+ /* -+ * Newer kernels (3.10-rc1 or upper) provide an uptime clock. -+ * On previous kernels, the way to properly generate an event would -+ * be to inject a fake one, measure its timestamp and diff it against -+ * gettimeofday. We won't do it here. Instead, let's use uptime, -+ * falling-back to the event report's time, if "uptime" clock is -+ * not available (legacy kernels). 
-+ */ -+ -+ if (ras->use_uptime) -+ now = record->ts/user_hz + ras->uptime_diff; -+ else -+ now = time(NULL); -+ -+ tm = localtime(&now); -+ if (tm) -+ strftime(ev.timestamp, sizeof(ev.timestamp), -+ "%Y-%m-%d %H:%M:%S %z", tm); -+ trace_seq_printf(s, "%s ", ev.timestamp); -+ -+ if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0) -+ return -1; -+ sprintf(ev.pfn, "0x%llx", val); -+ trace_seq_printf(s, "pfn=0x%llx ", val); -+ -+ if (pevent_get_field_val(s, event, "type", record, &val, 1) < 0) -+ return -1; -+ ev.page_type = get_page_type(val); -+ trace_seq_printf(s, "page_type=%s ", ev.page_type); -+ -+ if (pevent_get_field_val(s, event, "result", record, &val, 1) < 0) -+ return -1; -+ ev.action_result = get_action_result(val); -+ trace_seq_printf(s, "action_result=%s ", ev.action_result); -+ -+ /* Store data into the SQLite DB */ -+#ifdef HAVE_SQLITE3 -+ ras_store_mf_event(ras, &ev); -+#endif -+ -+#ifdef HAVE_ABRT_REPORT -+ /* Report event to ABRT */ -+ ras_report_mf_event(ras, &ev); -+#endif -+ -+ return 0; -+} ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ b/ras-memory-failure-handler.h 2021-10-14 16:31:36.840657728 -0400 -@@ -0,0 +1,25 @@ -+/* -+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+*/ -+ -+#ifndef __RAS_MEMORY_FAILURE_HANDLER_H -+#define __RAS_MEMORY_FAILURE_HANDLER_H -+ -+#include "ras-events.h" -+#include "libtrace/event-parse.h" -+ -+int ras_memory_failure_event_handler(struct trace_seq *s, -+ struct pevent_record *record, -+ struct event_format *event, void *context); -+ -+#endif ---- a/ras-record.c 2018-04-25 06:19:03.000000000 -0400 -+++ b/ras-record.c 2021-10-14 16:31:36.840657728 -0400 -@@ -404,6 +404,55 @@ sqlite3_bind_text(priv->stmt_mce_record, - } - #endif - -+/* -+ * Table and functions to handle ras:memory_failure -+ */ -+ -+#ifdef HAVE_MEMORY_FAILURE -+static const struct db_fields mf_event_fields[] = { -+ { .name="id", .type="INTEGER PRIMARY KEY" }, -+ { .name="timestamp", .type="TEXT" }, -+ { .name="pfn", .type="TEXT" }, -+ { .name="page_type", .type="TEXT" }, -+ { .name="action_result", .type="TEXT" }, -+}; -+ -+static const struct db_table_descriptor mf_event_tab = { -+ .name = "memory_failure_event", -+ .fields = mf_event_fields, -+ .num_fields = ARRAY_SIZE(mf_event_fields), -+}; -+ -+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) -+{ -+ int rc; -+ struct sqlite3_priv *priv = ras->db_priv; -+ -+ if (!priv || !priv->stmt_mf_event) -+ return 0; -+ log(TERM, LOG_INFO, "memory_failure_event store: %p\n", priv->stmt_mf_event); -+ -+ sqlite3_bind_text(priv->stmt_mf_event, 1, ev->timestamp, -1, NULL); -+ sqlite3_bind_text(priv->stmt_mf_event, 2, ev->pfn, -1, NULL); -+ sqlite3_bind_text(priv->stmt_mf_event, 3, ev->page_type, -1, NULL); -+ sqlite3_bind_text(priv->stmt_mf_event, 4, ev->action_result, -1, NULL); -+ -+ rc = sqlite3_step(priv->stmt_mf_event); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed to do memory_failure_event step on sqlite: error = %d\n", rc); -+ -+ rc = sqlite3_reset(priv->stmt_mf_event); -+ if (rc != SQLITE_OK && rc != SQLITE_DONE) -+ log(TERM, LOG_ERR, -+ "Failed reset memory_failure_event on sqlite: error = %d\n", -+ rc); -+ -+ log(TERM, LOG_INFO, 
"register inserted at db\n"); -+ -+ return rc; -+} -+#endif - - /* - * Generic code -@@ -567,6 +616,13 @@ usleep(10000); - rc = ras_mc_prepare_stmt(priv, &priv->stmt_arm_record, - &arm_event_tab); - #endif -+#ifdef HAVE_MEMORY_FAILURE -+ rc = ras_mc_create_table(priv, &mf_event_tab); -+ if (rc == SQLITE_OK) { -+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mf_event, -+ &mf_event_tab); -+ } -+#endif - - ras->db_priv = priv; - return 0; ---- a/ras-record.h 2018-04-25 06:19:03.000000000 -0400 -+++ b/ras-record.h 2021-10-14 16:31:36.840657728 -0400 -@@ -75,12 +75,20 @@ struct ras_arm_event { - int32_t psci_state; - }; - -+struct ras_mf_event { -+ char timestamp[64]; -+ char pfn[30]; -+ const char *page_type; -+ const char *action_result; -+}; -+ - struct ras_mc_event; - struct ras_aer_event; - struct ras_extlog_event; - struct ras_non_standard_event; - struct ras_arm_event; - struct mce_event; -+struct ras_mf_event; - - #ifdef HAVE_SQLITE3 - -@@ -104,6 +112,9 @@ struct sqlite3_priv { - #ifdef HAVE_ARM - sqlite3_stmt *stmt_arm_record; - #endif -+#ifdef HAVE_MEMORY_FAILURE -+ sqlite3_stmt *stmt_mf_event; -+#endif - }; - - int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras); -@@ -113,6 +124,7 @@ int ras_store_mce_record(struct ras_even - int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev); - int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev); - int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev); -+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev); - - #else - static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; }; -@@ -122,6 +134,7 @@ static inline int ras_store_mce_record(s - static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; }; - static inline int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; }; 
- static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) { return 0; }; -+static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; - - #endif - ---- a/ras-report.c 2017-10-14 05:11:34.000000000 -0400 -+++ b/ras-report.c 2021-10-14 16:31:36.840657728 -0400 -@@ -255,6 +255,28 @@ "midr=0x%lx\n" \ - return 0; - } - -+static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev) -+{ -+ char bt_buf[MAX_BACKTRACE_SIZE]; -+ -+ if (!buf || !ev) -+ return -1; -+ -+ sprintf(bt_buf, "BACKTRACE=" \ -+ "timestamp=%s\n" \ -+ "pfn=%s\n" \ -+ "page_type=%s\n" \ -+ "action_result=%s\n", \ -+ ev->timestamp, \ -+ ev->pfn, \ -+ ev->page_type, \ -+ ev->action_result); -+ -+ strcat(buf, bt_buf); -+ -+ return 0; -+} -+ - static int commit_report_backtrace(int sockfd, int type, void *ev){ - char buf[MAX_BACKTRACE_SIZE]; - char *pbuf = buf; -@@ -283,6 +305,9 @@ memset(buf, 0, MAX_BACKTRACE_SIZE); - case ARM_EVENT: - rc = set_arm_event_backtrace(buf, (struct ras_arm_event *)ev); - break; -+ case MF_EVENT: -+ rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev); -+ break; - default: - return -1; - } -@@ -549,3 +574,46 @@ return 0; - return -1; - } - } -+ -+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) -+{ -+ char buf[MAX_MESSAGE_SIZE]; -+ int sockfd = 0; -+ int done = 0; -+ int rc = -1; -+ -+ memset(buf, 0, sizeof(buf)); -+ -+ sockfd = setup_report_socket(); -+ if (sockfd < 0) -+ return -1; -+ -+ rc = commit_report_basic(sockfd); -+ if (rc < 0) -+ goto mf_fail; -+ -+ rc = commit_report_backtrace(sockfd, MF_EVENT, ev); -+ if (rc < 0) -+ goto mf_fail; -+ -+ sprintf(buf, "ANALYZER=%s", "rasdaemon-memory_failure"); -+ rc = write(sockfd, buf, strlen(buf) + 1); -+ if (rc < strlen(buf) + 1) -+ goto mf_fail; -+ -+ sprintf(buf, "REASON=%s", "memory failure problem"); -+ rc = write(sockfd, buf, strlen(buf) + 1); -+ if (rc < strlen(buf) + 1) -+ goto mf_fail; -+ -+ done = 1; -+ 
-+mf_fail: -+ if (sockfd > 0) -+ close(sockfd); -+ -+ if (done) -+ return 0; -+ else -+ return -1; -+} ---- a/ras-report.h 2017-10-14 05:11:34.000000000 -0400 -+++ b/ras-report.h 2021-10-14 16:31:36.840657728 -0400 -@@ -34,7 +34,8 @@ enum { - MCE_EVENT, - AER_EVENT, - NON_STANDARD_EVENT, -- ARM_EVENT -+ ARM_EVENT, -+ MF_EVENT, - }; - - #ifdef HAVE_ABRT_REPORT -@@ -44,6 +45,7 @@ int ras_report_aer_event(struct ras_even - int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev); - int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev); - int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev); -+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev); - - #else - -@@ -52,6 +54,7 @@ static inline int ras_report_aer_event(s - static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; }; - static inline int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; }; - static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) { return 0; }; -+static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; }; - - #endif - ---- a/Makefile.am 2018-04-25 06:21:56.000000000 -0400 -+++ b/Makefile.am 2021-10-14 16:37:42.423639762 -0400 -@@ -41,12 +41,16 @@ endif - if WITH_EXTLOG - rasdaemon_SOURCES += ras-extlog-handler.c - endif -+if WITH_MEMORY_FAILURE -+ rasdaemon_SOURCES += ras-memory-failure-handler.c -+endif - if WITH_ABRT_REPORT - rasdaemon_SOURCES += ras-report.c - endif - if WITH_HISI_NS_DECODE - rasdaemon_SOURCES += non-standard-hisi_hip07.c - endif -+ - rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a - - include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \ ---- a/ras-events.c 2021-10-14 16:31:36.730658636 -0400 -+++ b/ras-events.c 2021-10-14 16:37:11.043898809 -0400 -@@ -33,6 +33,7 @@ * Foundation, 
Inc., 51 Franklin Street, - #include "ras-arm-handler.h" - #include "ras-mce-handler.h" - #include "ras-extlog-handler.h" -+#include "ras-memory-failure-handler.h" - #include "ras-record.h" - #include "ras-logger.h" - -@@ -218,6 +219,10 @@ if (rc < 0) { - rc |= __toggle_ras_mc_event(ras, "ras", "arm_event", enable); - #endif - -+#ifdef HAVE_MEMORY_FAILURE -+ rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable); -+#endif -+ - free_ras: - free(ras); - return rc; -@@ -736,6 +741,16 @@ (void)open("/sys/kernel/debug/ras/daemon - "ras", "aer_event"); - #endif - -+#ifdef HAVE_MEMORY_FAILURE -+ rc = add_event_handler(ras, pevent, page_size, "ras", "memory_failure_event", -+ ras_memory_failure_event_handler); -+ if (!rc) -+ num_events++; -+ else -+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n", -+ "ras", "memory_failure_event"); -+#endif -+ - if (!num_events) { - log(ALL, LOG_INFO, - "Failed to trace all supported RAS events. Aborting.\n"); diff --git a/SOURCES/28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch b/SOURCES/28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch deleted file mode 100644 index fdc509b..0000000 --- a/SOURCES/28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch +++ /dev/null @@ -1,28 +0,0 @@ -commit 28ea956acc2dab7c18b4701f9657afb9ab3ddc79 -Author: Muralidhara M K -Date: Mon Jul 12 05:18:43 2021 -0500 - - rasdaemon: set SMCA maximum number of banks to 64 - - Newer AMD systems with SMCA banks support up to 64 MCA banks per CPU. - - This patch is based on the commit below upstremed into the kernel: - a0bc32b3cacf ("x86/mce: Increase maximum number of banks to 64") - - Signed-off-by: Muralidhara M K - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index e0cf512..3c346f4 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -75,6 +75,9 @@ enum smca_bank_types { - N_SMCA_BANK_TYPES - }; - -+/* Maximum number of MCA banks per CPU. 
*/ -+#define MAX_NR_BANKS 64 -+ - /* SMCA Extended error strings */ - /* Load Store */ - static const char * const smca_ls_mce_desc[] = { diff --git a/SOURCES/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch b/SOURCES/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch deleted file mode 100644 index 1b5844d..0000000 --- a/SOURCES/2a1d217660351c08eb2f8bccebf939abba2f7e69.patch +++ /dev/null @@ -1,66 +0,0 @@ -commit 2a1d217660351c08eb2f8bccebf939abba2f7e69 -Author: Brian WoodsGhannam, Yazen -Date: Fri Nov 1 15:48:13 2019 +0100 - - rasdaemon: rename CPU_NAPLES cputype - - Change CPU_NAPLES to CPU_AMD_SMCA to reflect that it isn't just NAPLES - that is supported, but AMD's Scalable Machine Check Architecture (SMCA). - - [ Yazen: change family check to feature check, and change CPU name. ] - - CC: "mchehab+samsung@kernel.org" , "Namburu, Chandu-babu" # Thread-Topic: [PATCH 1/2] rasdaemon: rename CPU_NAPLES cputype - Signed-off-by: Brian Woods - Signed-off-by: Yazen Ghannam - Cc: Chandu-babu Namburu - Signed-off-by: Mauro Carvalho Chehab - ---- - ras-mce-handler.c | 10 ++++++---- - ras-mce-handler.h | 2 +- - 2 files changed, 7 insertions(+), 5 deletions(-) - ---- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-05-26 15:16:24.699096556 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-05-26 15:18:06.543162745 -0400 -@@ -55,7 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series - [CPU_KNIGHTS_LANDING] = "Knights Landing", - [CPU_KNIGHTS_MILL] = "Knights Mill", - [CPU_SKYLAKE_XEON] = "Skylake server", -- [CPU_NAPLES] = "AMD Family 17h Zen1" -+ [CPU_AMD_SMCA] = "AMD Scalable MCA", - }; - - static enum cputype select_intel_cputype(struct ras_events *ras) -@@ -191,8 +191,10 @@ ret = 0; - if (!strcmp(mce->vendor, "AuthenticAMD")) { - if (mce->family == 15) - mce->cputype = CPU_K8; -- if (mce->family == 23) -- mce->cputype = CPU_NAPLES; -+ if (strstr(mce->processor_flags, "smca")) { -+ mce->cputype = CPU_AMD_SMCA; -+ goto ret; -+ } - if (mce->family > 23) { - log(ALL, LOG_INFO, - "Can't parse 
MCE for this AMD CPU yet %d\n", -@@ -435,7 +437,7 @@ if (pevent_get_field_val(s, event, "ipid - case CPU_K8: - rc = parse_amd_k8_event(ras, &e); - break; -- case CPU_NAPLES: -+ case CPU_AMD_SMCA: - rc = parse_amd_smca_event(ras, &e); - break; - default: /* All other CPU types are Intel */ ---- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-05-26 15:17:15.409631590 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-05-26 15:18:20.102038424 -0400 -@@ -50,7 +50,7 @@ enum cputype { - CPU_KNIGHTS_LANDING, - CPU_KNIGHTS_MILL, - CPU_SKYLAKE_XEON, -- CPU_NAPLES, -+ CPU_AMD_SMCA, - }; - - struct mce_event { diff --git a/SOURCES/546cf713f667437fb6e283cc3dc090679eb47d08.patch b/SOURCES/546cf713f667437fb6e283cc3dc090679eb47d08.patch deleted file mode 100644 index 448b1f6..0000000 --- a/SOURCES/546cf713f667437fb6e283cc3dc090679eb47d08.patch +++ /dev/null @@ -1,372 +0,0 @@ -commit 546cf713f667437fb6e283cc3dc090679eb47d08 -Author: Subhendu Saha -Date: Tue Jan 12 03:29:55 2021 -0500 - - Fix ras-mc-ctl script. - - When rasdaemon is compiled without enabling aer, mce, devlink, - etc., those tables are not created in the database file. Then - ras-mc-ctl script breaks trying to query data from non-existent - tables. 
- - Signed-off-by: Subhendu Saha subhends@akamai.com - Signed-off-by: Mauro Carvalho Chehab - ---- - util/ras-mc-ctl.in | 310 ++++++++++++++++++++++++++++------------------------- - 1 file changed, 168 insertions(+), 142 deletions(-) - ---- a/util/ras-mc-ctl.in 2021-10-12 13:45:43.260646935 -0400 -+++ b/util/ras-mc-ctl.in 2021-10-12 13:46:38.610158949 -0400 -@@ -41,6 +41,16 @@ my $sysconfdir = "@sysconfdir@"; - my $dmidecode = find_prog ("dmidecode"); - my $modprobe = find_prog ("modprobe") or exit (1); - -+my $has_aer = 0; -+my $has_arm = 0; -+my $has_extlog = 0; -+my $has_mce = 0; -+ -+@WITH_AER_TRUE@$has_aer = 1; -+@WITH_ARM_TRUE@$has_arm = 1; -+@WITH_EXTLOG_TRUE@$has_extlog = 1; -+@WITH_MCE_TRUE@$has_mce = 1; -+ - my %conf = (); - my %bus = (); - my %dimm_size = (); -@@ -1145,70 +1155,78 @@ sub summary - $query_handle->finish; - - # PCIe AER aer_event errors -- $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($err_type, $msg, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$count $err_type errors: $msg\n"; -- } -- if ($out ne "") { -- print "PCIe AER events summary:\n$out\n"; -- } else { -- print "No PCIe AER errors.\n\n"; -+ if ($has_aer == 1) { -+ $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($err_type, $msg, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count $err_type errors: $msg\n"; -+ } -+ if ($out ne "") { -+ print "PCIe AER events summary:\n$out\n"; -+ } else { -+ print "No PCIe AER errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # ARM processor arm_event errors -- $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr"; -- $query_handle = 
$dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($affinity, $mpidr, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$count errors\n"; -- } -- if ($out ne "") { -- print "ARM processor events summary:\n$out\n"; -- } else { -- print "No ARM processor errors.\n\n"; -+ if ($has_arm == 1) { -+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($affinity, $mpidr, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count errors\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events summary:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # extlog errors -- $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($etype, $severity, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $etype_string = get_extlog_type($etype); -- $severity_string = get_extlog_severity($severity); -- $out .= "\t$count $etype_string $severity_string errors\n"; -- } -- if ($out ne "") { -- print "Extlog records summary:\n$out"; -- } else { -- print "No Extlog errors.\n"; -+ if ($has_extlog == 1) { -+ $query = "select etype, severity, count(*) from extlog_event group by etype, severity"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($etype, $severity, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $etype_string = get_extlog_type($etype); -+ $severity_string = get_extlog_severity($severity); -+ $out .= "\t$count $etype_string $severity_string errors\n"; -+ } -+ if ($out ne "") { -+ print "Extlog records summary:\n$out"; -+ } else { -+ print "No Extlog errors.\n"; -+ } -+ 
$query_handle->finish; - } -- $query_handle->finish; - - # MCE mce_record errors -- $query = "select error_msg, count(*) from mce_record group by error_msg"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($msg, $count)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "\t$count $msg errors\n"; -- } -- if ($out ne "") { -- print "MCE records summary:\n$out"; -- } else { -- print "No MCE errors.\n"; -+ if ($has_mce == 1) { -+ $query = "select error_msg, count(*) from mce_record group by error_msg"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($msg, $count)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "\t$count $msg errors\n"; -+ } -+ if ($out ne "") { -+ print "MCE records summary:\n$out"; -+ } else { -+ print "No MCE errors.\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - undef($dbh); - } -@@ -1244,105 +1262,113 @@ sub errors - $query_handle->finish; - - # PCIe AER aer_event errors -- $query = "select id, timestamp, err_type, err_msg from aer_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $time, $type, $msg)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $time $type error: $msg\n"; -- } -- if ($out ne "") { -- print "PCIe AER events:\n$out\n"; -- } else { -- print "No PCIe AER errors.\n\n"; -+ if ($has_aer == 1) { -+ $query = "select id, timestamp, err_type, err_msg from aer_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $time, $type, $msg)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $time $type error: $msg\n"; -+ } -+ if ($out ne "") { -+ print "PCIe AER events:\n$out\n"; -+ } else { -+ print "No PCIe AER errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # ARM processor 
arm_event errors -- $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $timestamp error: "; -- $out .= "error_count=$error_count, " if ($error_count); -- $out .= "affinity_level=$affinity, "; -- $out .= sprintf "mpidr=0x%x, ", $mpidr; -- $out .= sprintf "running_state=0x%x, ", $r_state; -- $out .= sprintf "psci_state=0x%x", $psci_state; -- $out .= "\n"; -- } -- if ($out ne "") { -- print "ARM processor events:\n$out\n"; -- } else { -- print "No ARM processor errors.\n\n"; -+ if ($has_arm == 1) { -+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $timestamp error: "; -+ $out .= "error_count=$error_count, " if ($error_count); -+ $out .= "affinity_level=$affinity, "; -+ $out .= sprintf "mpidr=0x%x, ", $mpidr; -+ $out .= sprintf "running_state=0x%x, ", $r_state; -+ $out .= sprintf "psci_state=0x%x", $psci_state; -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "ARM processor events:\n$out\n"; -+ } else { -+ print "No ARM processor errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # Extlog errors -- $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data)); -- $out = ""; -- 
while($query_handle->fetch()) { -- $etype_string = get_extlog_type($etype); -- $severity_string = get_extlog_severity($severity); -- $out .= "$id $timestamp error: "; -- $out .= "type=$etype_string, "; -- $out .= "severity=$severity_string, "; -- $out .= sprintf "address=0x%08x, ", $addr; -- $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id); -- $out .= "fru_text='$fru_text', "; -- $out .= get_cper_data_text($cper_data) if ($cper_data); -- $out .= "\n"; -- } -- if ($out ne "") { -- print "Extlog events:\n$out\n"; -- } else { -- print "No Extlog errors.\n\n"; -+ if ($has_extlog) { -+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $etype_string = get_extlog_type($etype); -+ $severity_string = get_extlog_severity($severity); -+ $out .= "$id $timestamp error: "; -+ $out .= "type=$etype_string, "; -+ $out .= "severity=$severity_string, "; -+ $out .= sprintf "address=0x%08x, ", $addr; -+ $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id); -+ $out .= "fru_text='$fru_text', "; -+ $out .= get_cper_data_text($cper_data) if ($cper_data); -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "Extlog events:\n$out\n"; -+ } else { -+ print "No Extlog errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - # MCE mce_record errors -- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id"; -- $query_handle = $dbh->prepare($query); -- $query_handle->execute(); -- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, 
$cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location)); -- $out = ""; -- while($query_handle->fetch()) { -- $out .= "$id $time error: $msg"; -- $out .= ", CPU $cpuvendor" if ($cpuvendor); -- $out .= ", bank $bank_name" if ($bank_name); -- $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg); -- $out .= ", mci $mcistatus_msg" if ($mcistatus_msg); -- $out .= ", $mc_location" if ($mc_location); -- $out .= ", $user_action" if ($user_action); -- $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap); -- $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus); -- $out .= sprintf ", status=0x%08x", $status if ($status); -- $out .= sprintf ", addr=0x%08x", $addr if ($addr); -- $out .= sprintf ", misc=0x%08x", $misc if ($misc); -- $out .= sprintf ", ip=0x%08x", $ip if ($ip); -- $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc); -- $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime); -- $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu); -- $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid); -- $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid); -- $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid); -- $out .= sprintf ", cs=0x%08x", $cs if ($cs); -- $out .= sprintf ", bank=0x%08x", $bank if ($bank); -+ if ($has_mce == 1) { -+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id"; -+ $query_handle = $dbh->prepare($query); -+ $query_handle->execute(); -+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location)); -+ $out = ""; -+ while($query_handle->fetch()) { -+ $out .= "$id $time error: $msg"; -+ 
$out .= ", CPU $cpuvendor" if ($cpuvendor); -+ $out .= ", bank $bank_name" if ($bank_name); -+ $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg); -+ $out .= ", mci $mcistatus_msg" if ($mcistatus_msg); -+ $out .= ", $mc_location" if ($mc_location); -+ $out .= ", $user_action" if ($user_action); -+ $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap); -+ $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus); -+ $out .= sprintf ", status=0x%08x", $status if ($status); -+ $out .= sprintf ", addr=0x%08x", $addr if ($addr); -+ $out .= sprintf ", misc=0x%08x", $misc if ($misc); -+ $out .= sprintf ", ip=0x%08x", $ip if ($ip); -+ $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc); -+ $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime); -+ $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu); -+ $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid); -+ $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid); -+ $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid); -+ $out .= sprintf ", cs=0x%08x", $cs if ($cs); -+ $out .= sprintf ", bank=0x%08x", $bank if ($bank); - -- $out .= "\n"; -- } -- if ($out ne "") { -- print "MCE events:\n$out\n"; -- } else { -- print "No MCE errors.\n\n"; -+ $out .= "\n"; -+ } -+ if ($out ne "") { -+ print "MCE events:\n$out\n"; -+ } else { -+ print "No MCE errors.\n\n"; -+ } -+ $query_handle->finish; - } -- $query_handle->finish; - - undef($dbh); - } diff --git a/SOURCES/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch b/SOURCES/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch deleted file mode 100644 index 57a4e46..0000000 --- a/SOURCES/60a91e4da4f2daf2b10143fc148a8043312b61e5.patch +++ /dev/null @@ -1,149 +0,0 @@ -commit 60a91e4da4f2daf2b10143fc148a8043312b61e5 -Author: Aristeu Rozanski -Date: Wed Aug 1 16:29:58 2018 -0400 - - rasdaemon: ras-mc-ctl: add option to show error counts - - In some scenarios it might not be desirable to have a daemon running - to parse and store the errors provided by EDAC and only having the - 
number of CEs and UEs is enough. This patch implements this feature - as an ras-mc-ctl option. - - Signed-off-by: Aristeu Rozanski - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in -index 38b7824..aee431a 100755 ---- a/util/ras-mc-ctl.in -+++ b/util/ras-mc-ctl.in -@@ -50,6 +50,8 @@ my %dimm_location = (); - my %csrow_size = (); - my %rank_size = (); - my %csrow_ranks = (); -+my %dimm_ce_count = (); -+my %dimm_ue_count = (); - - my @layers; - my @max_pos; -@@ -76,6 +78,7 @@ Usage: $prog [OPTIONS...] - --layout Display the memory layout. - --summary Presents a summary of the logged errors. - --errors Shows the errors stored at the error database. -+ --error-count Shows the corrected and uncorrected error counts using sysfs. - --help This help message. - EOF - -@@ -83,7 +86,7 @@ parse_cmdline(); - - if ( $conf{opt}{mainboard} || $conf{opt}{print_labels} - || $conf{opt}{register_labels} || $conf{opt}{display_memory_layout} -- || $conf{opt}{guess_dimm_label}) { -+ || $conf{opt}{guess_dimm_label} || $conf{opt}{error_count}) { - - get_mainboard_info(); - -@@ -105,6 +108,9 @@ if ( $conf{opt}{mainboard} || $conf{opt}{print_labels} - if ($conf{opt}{guess_dimm_label}) { - guess_dimm_label (); - } -+ if ($conf{opt}{error_count}) { -+ display_error_count (); -+ } - } - - if ($conf{opt}{status}) { -@@ -134,6 +140,7 @@ sub parse_cmdline - $conf{opt}{guess_dimm_label} = 0; - $conf{opt}{summary} = 0; - $conf{opt}{errors} = 0; -+ $conf{opt}{error_count} = 0; - - my $rref = \$conf{opt}{report}; - my $mref = \$conf{opt}{mainboard}; -@@ -150,7 +157,8 @@ sub parse_cmdline - "status" => \$conf{opt}{status}, - "layout" => \$conf{opt}{display_memory_layout}, - "summary" => \$conf{opt}{summary}, -- "errors" => \$conf{opt}{errors} -+ "errors" => \$conf{opt}{errors}, -+ "error-count" => \$conf{opt}{error_count} - ); - - usage(1) if !$rc; -@@ -284,6 +292,30 @@ sub parse_dimm_nodes - $dimm_label_file{$str_loc} = $file; - $dimm_location{$str_loc} = 
$location; - -+ my $count; -+ -+ $file =~s/dimm_label/dimm_ce_count/; -+ if (-e $file) { -+ open IN, $file; -+ chomp($count = ); -+ close IN; -+ } else { -+ log_error ("dimm_ce_count not found in sysfs. Old kernel?\n"); -+ exit -1; -+ } -+ $dimm_ce_count{$str_loc} = $count; -+ -+ $file =~s/dimm_ce_count/dimm_ue_count/; -+ if (-e $file) { -+ open IN, $file; -+ chomp($count = ); -+ close IN; -+ } else { -+ log_error ("dimm_ue_count not found in sysfs. Old kernel?\n"); -+ exit -1; -+ } -+ $dimm_ue_count{$str_loc} = $count; -+ - return; - } - } -@@ -906,6 +938,45 @@ sub display_memory_layout - dimm_display_mem(); - } - -+sub display_error_count -+{ -+ my $sysfs_dir = "/sys/devices/system/edac/mc"; -+ my $key; -+ my $max_width = 0; -+ my %dimm_labels = (); -+ -+ find ({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir); -+ -+ if (!scalar(keys %dimm_node)) { -+ log_error ("No DIMMs found in /sys or new sysfs EDAC interface not found.\n"); -+ exit -1; -+ } -+ -+ foreach $key (keys %dimm_node) { -+ my $label_width; -+ -+ open IN, $dimm_label_file{$key}; -+ chomp(my $label = ); -+ close IN; -+ $label_width = length $label; -+ -+ if ($label_width > $max_width) { -+ $max_width = $label_width; -+ } -+ $dimm_labels{$key} = $label; -+ } -+ my $string = "Label"; -+ $string .= " " x ($max_width - length $string); -+ print($string . 
"\tCE\tUE\n"); -+ -+ foreach $key (keys %dimm_node) { -+ my $ce_count = $dimm_ce_count{$key}; -+ my $ue_count = $dimm_ue_count{$key}; -+ -+ print("$dimm_labels{$key}\t$ce_count\t$ue_count\n"); -+ } -+} -+ - sub find_prog - { - my ($file) = @_; diff --git a/SOURCES/7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch b/SOURCES/7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch deleted file mode 100644 index 76afc8e..0000000 --- a/SOURCES/7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch +++ /dev/null @@ -1,24 +0,0 @@ -commit 7937f0d6c2aaaed096f3a3d306416743c0dcb7a4 -Author: Muralidhara M K -Date: Wed Jul 28 01:52:12 2021 -0500 - - rasdaemon: Support MCE for AMD CPU family 19h - - Add support for family 19h x86 CPUs from AMD. - - Signed-off-by: Muralidhara M K - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/ras-mce-handler.c b/ras-mce-handler.c -index 805004a..f2b53d4 100644 ---- a/ras-mce-handler.c -+++ b/ras-mce-handler.c -@@ -208,7 +208,7 @@ static int detect_cpu(struct ras_events *ras) - mce->cputype = CPU_AMD_SMCA; - goto ret; - } -- if (mce->family > 23) { -+ if (mce->family > 25) { - log(ALL, LOG_INFO, - "Can't parse MCE for this AMD CPU yet %d\n", - mce->family); diff --git a/SOURCES/854364ba44aee9bc5646f6537fc744b0b54aff37.patch b/SOURCES/854364ba44aee9bc5646f6537fc744b0b54aff37.patch deleted file mode 100644 index 91bad1b..0000000 --- a/SOURCES/854364ba44aee9bc5646f6537fc744b0b54aff37.patch +++ /dev/null @@ -1,38 +0,0 @@ -commit 854364ba44aee9bc5646f6537fc744b0b54aff37 -Author: Muralidhara M K -Date: Thu Aug 20 21:00:57 2020 +0530 - - rasdaemon: Add 8 channel decoding for SMCA systems - - Current Scalable Machine Check Architecture (SMCA) systems support up - to 8 UMC channels. - - To find the UMC channel represented by a bank, look at the 6th nibble - in the MCA_IPID[InstanceId] field. - - Signed-off-by: Muralidhara M K - [ Adjust commit message. 
] - Signed-off-by: Yazen Ghannam - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index d0b6cb6..7c619fd 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -438,15 +438,7 @@ static void amd_decode_errcode(struct mce_event *e) - */ - static int find_umc_channel(struct mce_event *e) - { -- uint32_t umc_instance_id[] = {0x50f00, 0x150f00}; -- uint32_t instance_id = EXTRACT(e->ipid, 0, 31); -- int i, channel = -1; -- -- for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++) -- if (umc_instance_id[i] == instance_id) -- channel = i; -- -- return channel; -+ return EXTRACT(e->ipid, 0, 31) >> 20; - } - /* Decode extended errors according to Scalable MCA specification */ - static void decode_smca_error(struct mce_event *e) diff --git a/SOURCES/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch b/SOURCES/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch deleted file mode 100644 index e3617fc..0000000 --- a/SOURCES/8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch +++ /dev/null @@ -1,207 +0,0 @@ -commit 8704a85d8dc3483423ec2934fee8132f85f8fdb6 -Author: Brian WoodsGhannam, Yazen -Date: Fri Nov 1 15:48:14 2019 +0100 - - rasdaemon: add support for new AMD SMCA bank types - - Going forward, the Scalable Machine Check Architecture (SMCA) has some - updated and additional bank types which show up in Zen2. The differing - bank types include: CS_V2, PSP_V2, SMU_V2, MP5, NBIO, and PCIE. The V2 - bank types replace the original bank types but have unique HWID/MCAtype - IDs from the originals so there's no conflicts between different - versions or other bank types. All of the differing bank types have new - MCE descriptions which have been added as well. 
- - CC: "mchehab+samsung@kernel.org" , "Namburu, Chandu-babu" # Thread-Topic: [PATCH 2/2] rasdaemon: add support for new AMD SMCA bank types - Signed-off-by: Brian Woods - Signed-off-by: Yazen Ghannam - Cc: Chandu-babu Namburu - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index 6c3e8a5..114e786 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -49,11 +49,17 @@ enum smca_bank_types { - SMCA_FP, /* Floating Point */ - SMCA_L3_CACHE, /* L3 Cache */ - SMCA_CS, /* Coherent Slave */ -+ SMCA_CS_V2, /* Coherent Slave V2 */ - SMCA_PIE, /* Power, Interrupts, etc. */ - SMCA_UMC, /* Unified Memory Controller */ - SMCA_PB, /* Parameter Block */ - SMCA_PSP, /* Platform Security Processor */ -+ SMCA_PSP_V2, /* Platform Security Processor V2 */ - SMCA_SMU, /* System Management Unit */ -+ SMCA_SMU_V2, /* System Management Unit V2 */ -+ SMCA_MP5, /* Microprocessor 5 Unit */ -+ SMCA_NBIO, /* Northbridge IO Unit */ -+ SMCA_PCIE, /* PCI Express Unit */ - N_SMCA_BANK_TYPES - }; - -@@ -165,6 +171,23 @@ static const char * const smca_cs_mce_desc[] = { - "Atomic request parity", - "ECC error on probe filter access", - }; -+/* Coherent Slave Unit V2 */ -+static const char * const smca_cs2_mce_desc[] = { -+ "Illegal Request", -+ "Address Violation", -+ "Security Violation", -+ "Illegal Response", -+ "Unexpected Response", -+ "Request or Probe Parity Error", -+ "Read Response Parity Error", -+ "Atomic Request Parity Error", -+ "SDP read response had no match in the CS queue", -+ "Probe Filter Protocol Error", -+ "Probe Filter ECC Error", -+ "SDP read response had an unexpected RETRY error", -+ "Counter overflow error", -+ "Counter underflow error", -+}; - /* Power, Interrupt, etc.. 
*/ - static const char * const smca_pie_mce_desc[] = { - "HW assert", -@@ -189,10 +212,75 @@ static const char * const smca_pb_mce_desc[] = { - static const char * const smca_psp_mce_desc[] = { - "PSP RAM ECC or parity error", - }; -+/* Platform Security Processor V2 */ -+static const char * const smca_psp2_mce_desc[] = { -+ "High SRAM ECC or parity error", -+ "Low SRAM ECC or parity error", -+ "Instruction Cache Bank 0 ECC or parity error", -+ "Instruction Cache Bank 1 ECC or parity error", -+ "Instruction Tag Ram 0 parity error", -+ "Instruction Tag Ram 1 parity error", -+ "Data Cache Bank 0 ECC or parity error", -+ "Data Cache Bank 1 ECC or parity error", -+ "Data Cache Bank 2 ECC or parity error", -+ "Data Cache Bank 3 ECC or parity error", -+ "Data Tag Bank 0 parity error", -+ "Data Tag Bank 1 parity error", -+ "Data Tag Bank 2 parity error", -+ "Data Tag Bank 3 parity error", -+ "Dirty Data Ram parity error", -+ "TLB Bank 0 parity error", -+ "TLB Bank 1 parity error", -+ "System Hub Read Buffer ECC or parity error", -+}; - /* System Management Unit */ - static const char * const smca_smu_mce_desc[] = { - "SMU RAM ECC or parity error", - }; -+/* System Management Unit V2 */ -+static const char * const smca_smu2_mce_desc[] = { -+ "High SRAM ECC or parity error", -+ "Low SRAM ECC or parity error", -+ "Data Cache Bank A ECC or parity error", -+ "Data Cache Bank B ECC or parity error", -+ "Data Tag Cache Bank A ECC or parity error", -+ "Data Tag Cache Bank B ECC or parity error", -+ "Instruction Cache Bank A ECC or parity error", -+ "Instruction Cache Bank B ECC or parity error", -+ "Instruction Tag Cache Bank A ECC or parity error", -+ "Instruction Tag Cache Bank B ECC or parity error", -+ "System Hub Read Buffer ECC or parity error", -+}; -+/* Microprocessor 5 Unit */ -+static const char * const smca_mp5_mce_desc[] = { -+ "High SRAM ECC or parity error", -+ "Low SRAM ECC or parity error", -+ "Data Cache Bank A ECC or parity error", -+ "Data Cache Bank B ECC or 
parity error", -+ "Data Tag Cache Bank A ECC or parity error", -+ "Data Tag Cache Bank B ECC or parity error", -+ "Instruction Cache Bank A ECC or parity error", -+ "Instruction Cache Bank B ECC or parity error", -+ "Instruction Tag Cache Bank A ECC or parity error", -+ "Instruction Tag Cache Bank B ECC or parity error", -+}; -+/* Northbridge IO Unit */ -+static const char * const smca_nbio_mce_desc[] = { -+ "ECC or Parity error", -+ "PCIE error", -+ "SDP ErrEvent error", -+ "SDP Egress Poison Error", -+ "IOHC Internal Poison Error", -+}; -+/* PCI Express Unit */ -+static const char * const smca_pcie_mce_desc[] = { -+ "CCIX PER Message logging", -+ "CCIX Read Response with Status: Non-Data Error", -+ "CCIX Write Response with Status: Non-Data Error", -+ "CCIX Read Response with Status: Data Error", -+ "CCIX Non-okay write response with data error", -+}; -+ - - struct smca_mce_desc { - const char * const *descs; -@@ -208,11 +296,17 @@ static struct smca_mce_desc smca_mce_descs[] = { - [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, - [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, - [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, -+ [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, - [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, - [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, - [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, - [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, -+ [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)}, - [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, -+ [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)}, -+ [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, -+ [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)}, -+ [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)}, - }; - - struct 
smca_hwid { -@@ -235,6 +329,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { - - /* Data Fabric MCA types */ - { SMCA_CS, 0x0000002E }, -+ { SMCA_CS_V2, 0x0002002E }, - { SMCA_PIE, 0x0001002E }, - - /* Unified Memory Controller MCA type */ -@@ -245,9 +340,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = { - - /* Platform Security Processor MCA type */ - { SMCA_PSP, 0x000000FF }, -+ { SMCA_PSP_V2, 0x000100FF }, - - /* System Management Unit MCA type */ - { SMCA_SMU, 0x00000001 }, -+ { SMCA_SMU_V2, 0x00010001 }, -+ -+ /* Microprocessor 5 Unit MCA type */ -+ { SMCA_MP5, 0x00020001 }, -+ -+ /* Northbridge IO Unit MCA type */ -+ { SMCA_NBIO, 0x00000018 }, -+ -+ /* PCI Express Unit MCA type */ -+ { SMCA_PCIE, 0x00000046 }, - }; - - struct smca_bank_name { -@@ -264,11 +370,17 @@ static struct smca_bank_name smca_names[] = { - [SMCA_FP] = { "Floating Point Unit" }, - [SMCA_L3_CACHE] = { "L3 Cache" }, - [SMCA_CS] = { "Coherent Slave" }, -+ [SMCA_CS_V2] = { "Coherent Slave" }, - [SMCA_PIE] = { "Power, Interrupts, etc." }, - [SMCA_UMC] = { "Unified Memory Controller" }, - [SMCA_PB] = { "Parameter Block" }, - [SMCA_PSP] = { "Platform Security Processor" }, -+ [SMCA_PSP_V2] = { "Platform Security Processor" }, - [SMCA_SMU] = { "System Management Unit" }, -+ [SMCA_SMU_V2] = { "System Management Unit" }, -+ [SMCA_MP5] = { "Microprocessor 5 Unit" }, -+ [SMCA_NBIO] = { "Northbridge IO Unit" }, -+ [SMCA_PCIE] = { "PCI Express Unit" }, - }; - - static void amd_decode_errcode(struct mce_event *e) diff --git a/SOURCES/899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d.patch b/SOURCES/899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d.patch deleted file mode 100644 index 8f26b51..0000000 --- a/SOURCES/899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d.patch +++ /dev/null @@ -1,71 +0,0 @@ -commit 899fcc2cf21c86b5462c8f4441cd9c92b3d75f7d -Author: Aristeu Rozanski -Date: Thu Jan 19 08:45:57 2023 -0500 - - rasdaemon: ras-report: fix possible but unlikely file descriptor leak - - Found with covscan. 
- - Signed-off-by: Aristeu Rozanski - Signed-off-by: Mauro Carvalho Chehab - ---- - ras-report.c | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - ---- rasdaemon-0.6.1.orig/ras-report.c 2023-01-23 11:36:20.972368760 -0500 -+++ rasdaemon-0.6.1/ras-report.c 2023-01-23 11:36:23.236343267 -0500 -@@ -374,7 +374,7 @@ if(rc < 0){ - - mc_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -424,7 +424,7 @@ if(rc < 0){ - - aer_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -473,7 +473,7 @@ rc = 0; - - non_standard_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -518,7 +518,7 @@ rc = 0; - - arm_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -564,7 +564,7 @@ if(rc < 0){ - - mce_fail: - -- if(sockfd > 0){ -+ if(sockfd >= 0){ - close(sockfd); - } - -@@ -609,7 +609,7 @@ if (rc < 0) - done = 1; - - mf_fail: -- if (sockfd > 0) -+ if (sockfd >= 0) - close(sockfd); - - if (done) diff --git a/SOURCES/9acef39f13833f7d53ef96abc5a72e79384260f4.patch b/SOURCES/9acef39f13833f7d53ef96abc5a72e79384260f4.patch deleted file mode 100644 index c4c8af1..0000000 --- a/SOURCES/9acef39f13833f7d53ef96abc5a72e79384260f4.patch +++ /dev/null @@ -1,230 +0,0 @@ -commit 9acef39f13833f7d53ef96abc5a72e79384260f4 -Author: Naveen Krishna Chatradhi -Date: Tue Jun 1 11:01:17 2021 +0530 - - rasdaemon: Add new SMCA bank types with error decoding - - Upcoming systems with Scalable Machine Check Architecture (SMCA) have - new MCA banks added. - - This patch adds the (HWID, MCATYPE) tuple, name and error decoding for - those new SMCA banks. - While at it, optimize the string names in smca_bank_name[]. 
- - Signed-off-by: Muralidhara M K - Signed-off-by: Naveen Krishna Chatradhi - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/mce-amd-smca.c b/mce-amd-smca.c -index 7c619fd..e0cf512 100644 ---- a/mce-amd-smca.c -+++ b/mce-amd-smca.c -@@ -47,7 +47,7 @@ - /* These may be used by multiple smca_hwid_mcatypes */ - enum smca_bank_types { - SMCA_LS = 0, /* Load Store */ -- SMCA_LS_V2, /* Load Store */ -+ SMCA_LS_V2, - SMCA_IF, /* Instruction Fetch */ - SMCA_L2_CACHE, /* L2 Cache */ - SMCA_DE, /* Decoder Unit */ -@@ -56,17 +56,22 @@ enum smca_bank_types { - SMCA_FP, /* Floating Point */ - SMCA_L3_CACHE, /* L3 Cache */ - SMCA_CS, /* Coherent Slave */ -- SMCA_CS_V2, /* Coherent Slave V2 */ -+ SMCA_CS_V2, - SMCA_PIE, /* Power, Interrupts, etc. */ - SMCA_UMC, /* Unified Memory Controller */ -+ SMCA_UMC_V2, - SMCA_PB, /* Parameter Block */ - SMCA_PSP, /* Platform Security Processor */ -- SMCA_PSP_V2, /* Platform Security Processor V2 */ -+ SMCA_PSP_V2, - SMCA_SMU, /* System Management Unit */ -- SMCA_SMU_V2, /* System Management Unit V2 */ -+ SMCA_SMU_V2, - SMCA_MP5, /* Microprocessor 5 Unit */ - SMCA_NBIO, /* Northbridge IO Unit */ - SMCA_PCIE, /* PCI Express Unit */ -+ SMCA_PCIE_V2, -+ SMCA_XGMI_PCS, /* xGMI PCS Unit */ -+ SMCA_XGMI_PHY, /* xGMI PHY Unit */ -+ SMCA_WAFL_PHY, /* WAFL PHY Unit */ - N_SMCA_BANK_TYPES - }; - -@@ -237,6 +242,22 @@ static const char * const smca_umc_mce_desc[] = { - "Command/address parity error", - "Write data CRC error", - }; -+ -+static const char * const smca_umc2_mce_desc[] = { -+ "DRAM ECC error", -+ "Data poison error", -+ "SDP parity error", -+ "Reserved", -+ "Address/Command parity error", -+ "Write data parity error", -+ "DCQ SRAM ECC error", -+ "Reserved", -+ "Read data parity error", -+ "Rdb SRAM ECC error", -+ "RdRsp SRAM ECC error", -+ "LM32 MP errors", -+}; -+ - /* Parameter Block */ - static const char * const smca_pb_mce_desc[] = { - "Parameter Block RAM ECC error", -@@ -314,6 +335,55 @@ static const char * const 
smca_pcie_mce_desc[] = { - "CCIX Non-okay write response with data error", - }; - -+static const char * const smca_pcie2_mce_desc[] = { -+ "SDP Parity Error logging", -+}; -+ -+static const char * const smca_xgmipcs_mce_desc[] = { -+ "Data Loss Error", -+ "Training Error", -+ "Flow Control Acknowledge Error", -+ "Rx Fifo Underflow Error", -+ "Rx Fifo Overflow Error", -+ "CRC Error", -+ "BER Exceeded Error", -+ "Tx Vcid Data Error", -+ "Replay Buffer Parity Error", -+ "Data Parity Error", -+ "Replay Fifo Overflow Error", -+ "Replay Fifo Underflow Error", -+ "Elastic Fifo Overflow Error", -+ "Deskew Error", -+ "Flow Control CRC Error", -+ "Data Startup Limit Error", -+ "FC Init Timeout Error", -+ "Recovery Timeout Error", -+ "Ready Serial Timeout Error", -+ "Ready Serial Attempt Error", -+ "Recovery Attempt Error", -+ "Recovery Relock Attempt Error", -+ "Replay Attempt Error", -+ "Sync Header Error", -+ "Tx Replay Timeout Error", -+ "Rx Replay Timeout Error", -+ "LinkSub Tx Timeout Error", -+ "LinkSub Rx Timeout Error", -+ "Rx CMD Pocket Error", -+}; -+ -+static const char * const smca_xgmiphy_mce_desc[] = { -+ "RAM ECC Error", -+ "ARC instruction buffer parity error", -+ "ARC data buffer parity error", -+ "PHY APB error", -+}; -+ -+static const char * const smca_waflphy_mce_desc[] = { -+ "RAM ECC Error", -+ "ARC instruction buffer parity error", -+ "ARC data buffer parity error", -+ "PHY APB error", -+}; - - struct smca_mce_desc { - const char * const *descs; -@@ -333,6 +403,7 @@ static struct smca_mce_desc smca_mce_descs[] = { - [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, - [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, - [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, -+ [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) }, - [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, - [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, - [SMCA_PSP_V2] = { 
smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)}, -@@ -341,6 +412,10 @@ static struct smca_mce_desc smca_mce_descs[] = { - [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, - [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)}, - [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)}, -+ [SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) }, -+ [SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) }, -+ [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, -+ [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) }, - }; - - struct smca_hwid { -@@ -369,6 +444,8 @@ static struct smca_hwid smca_hwid_mcatypes[] = { - - /* Unified Memory Controller MCA type */ - { SMCA_UMC, 0x00000096 }, -+ /* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */ -+ { SMCA_UMC_V2, 0x00010096 }, - - /* Parameter Block MCA type */ - { SMCA_PB, 0x00000005 }, -@@ -389,6 +466,16 @@ static struct smca_hwid smca_hwid_mcatypes[] = { - - /* PCI Express Unit MCA type */ - { SMCA_PCIE, 0x00000046 }, -+ { SMCA_PCIE_V2, 0x00010046 }, -+ -+ /* Ext Global Memory Interconnect PCS MCA type */ -+ { SMCA_XGMI_PCS, 0x00000050 }, -+ -+ /* Ext Global Memory Interconnect PHY MCA type */ -+ { SMCA_XGMI_PHY, 0x00000259 }, -+ -+ /* WAFL PHY MCA type */ -+ { SMCA_WAFL_PHY, 0x00000267 }, - }; - - struct smca_bank_name { -@@ -396,27 +483,28 @@ struct smca_bank_name { - }; - - static struct smca_bank_name smca_names[] = { -- [SMCA_LS] = { "Load Store Unit" }, -- [SMCA_LS_V2] = { "Load Store Unit" }, -- [SMCA_IF] = { "Instruction Fetch Unit" }, -- [SMCA_L2_CACHE] = { "L2 Cache" }, -- [SMCA_DE] = { "Decode Unit" }, -- [SMCA_RESERVED] = { "Reserved" }, -- [SMCA_EX] = { "Execution Unit" }, -- [SMCA_FP] = { "Floating Point Unit" }, -- [SMCA_L3_CACHE] = { "L3 Cache" }, -- [SMCA_CS] = { "Coherent Slave" }, -- [SMCA_CS_V2] = { "Coherent Slave" }, -- [SMCA_PIE] = 
{ "Power, Interrupts, etc." }, -- [SMCA_UMC] = { "Unified Memory Controller" }, -- [SMCA_PB] = { "Parameter Block" }, -- [SMCA_PSP] = { "Platform Security Processor" }, -- [SMCA_PSP_V2] = { "Platform Security Processor" }, -- [SMCA_SMU] = { "System Management Unit" }, -- [SMCA_SMU_V2] = { "System Management Unit" }, -- [SMCA_MP5] = { "Microprocessor 5 Unit" }, -- [SMCA_NBIO] = { "Northbridge IO Unit" }, -- [SMCA_PCIE] = { "PCI Express Unit" }, -+ [SMCA_LS ... SMCA_LS_V2] = { "Load Store Unit" }, -+ [SMCA_IF] = { "Instruction Fetch Unit" }, -+ [SMCA_L2_CACHE] = { "L2 Cache" }, -+ [SMCA_DE] = { "Decode Unit" }, -+ [SMCA_RESERVED] = { "Reserved" }, -+ [SMCA_EX] = { "Execution Unit" }, -+ [SMCA_FP] = { "Floating Point Unit" }, -+ [SMCA_L3_CACHE] = { "L3 Cache" }, -+ [SMCA_CS ... SMCA_CS_V2] = { "Coherent Slave" }, -+ [SMCA_PIE] = { "Power, Interrupts, etc." }, -+ [SMCA_UMC] = { "Unified Memory Controller" }, -+ [SMCA_UMC_V2] = { "Unified Memory Controller V2" }, -+ [SMCA_PB] = { "Parameter Block" }, -+ [SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" }, -+ [SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" }, -+ [SMCA_MP5] = { "Microprocessor 5 Unit" }, -+ [SMCA_NBIO] = { "Northbridge IO Unit" }, -+ [SMCA_PCIE ... 
SMCA_PCIE_V2] = { "PCI Express Unit" }, -+ [SMCA_XGMI_PCS] = { "Ext Global Memory Interconnect PCS Unit" }, -+ [SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" }, -+ [SMCA_WAFL_PHY] = { "WAFL PHY Unit" }, -+ - }; - - static void amd_decode_errcode(struct mce_event *e) diff --git a/SOURCES/a16ca0711001957ee98f2c124abce0fa1f801529.patch b/SOURCES/a16ca0711001957ee98f2c124abce0fa1f801529.patch deleted file mode 100644 index 3a96263..0000000 --- a/SOURCES/a16ca0711001957ee98f2c124abce0fa1f801529.patch +++ /dev/null @@ -1,670 +0,0 @@ -commit a16ca0711001957ee98f2c124abce0fa1f801529 -Author: Chandu-babu Namburu -Date: Wed Jan 30 20:36:45 2019 +0530 - - rasdaemon: add support for AMD Scalable MCA - - Add logic here to decode errors from all known IP blocks for - AMD Scalable MCA supported processors - - Reviewed-by: Yazen Ghannam - Signed-off-by: Chandu-babu Namburu - ---- - mce-amd-smca.c | 371 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ - mce-amd.c | 122 +++++++++++++++++ - ras-mce-handler.c | 24 +++ - ras-mce-handler.h | 15 ++ - 4 files changed, 530 insertions(+), 2 deletions(-) - ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ rasdaemon-0.6.1/mce-amd-smca.c 2019-07-12 11:35:04.836470461 -0400 -@@ -0,0 +1,371 @@ -+/* -+ * Copyright (c) 2018, AMD, Inc. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 and -+ * only version 2 as published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ */ -+ -+#include -+#include -+ -+#include "ras-mce-handler.h" -+#include "bitfield.h" -+ -+/* MCA_STATUS REGISTER FOR FAMILY 17H -+ *********************** Higher 32-bits ***************************** -+ * 63: VALIDERROR, 62: OVERFLOW, 61: UC, 60: Err ENABLE, -+ * 59: Misc Valid, 58: Addr Valid, 57: PCC, 56: ErrCoreID Valid, -+ * 55: TCC, 54: RES, 53: Syndrom Valid, 52: Transparanet, -+ * 51: RES, 50: RES, 49: RES, 48: RES, -+ * 47: RES, 46: CECC, 45: UECC, 44: Deferred, -+ * 43: Poison, 42: RES, 41: RES, 40: RES, -+ * 39: RES, 38: RES, 37: ErrCoreID[5], 36: ErrCoreID[4], -+ * 35: ErrCoreID[3], 34: ErrCoreID[2] 33: ErrCoreID[1] 32: ErrCoreID[0] -+ *********************** Lower 32-bits ****************************** -+ * 31: RES, 30: RES, 29: RES, 28: RES, -+ * 27: RES, 26: RES, 25: RES, 24: RES -+ * 23: RES, 22: RES, 21: XEC[5], 20: XEC[4], -+ * 19: XEC[3], 18: XEC[2], 17: XEC[1], 16: XEC[0] -+ * 15: EC[15], 14: EC[14], 13: EC[13], 12: EC[12], -+ * 11: EC[11], 10: EC[10], 09: EC[9], 08: EC[8], -+ * 07: EC[7], 06: EC[6], 05: EC[5], 04: EC[4], -+ * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0] -+ */ -+ -+/* These may be used by multiple smca_hwid_mcatypes */ -+enum smca_bank_types { -+ SMCA_LS = 0, /* Load Store */ -+ SMCA_IF, /* Instruction Fetch */ -+ SMCA_L2_CACHE, /* L2 Cache */ -+ SMCA_DE, /* Decoder Unit */ -+ SMCA_RESERVED, /* Reserved */ -+ SMCA_EX, /* Execution Unit */ -+ SMCA_FP, /* Floating Point */ -+ SMCA_L3_CACHE, /* L3 Cache */ -+ SMCA_CS, /* Coherent Slave */ -+ SMCA_PIE, /* Power, Interrupts, etc. 
*/ -+ SMCA_UMC, /* Unified Memory Controller */ -+ SMCA_PB, /* Parameter Block */ -+ SMCA_PSP, /* Platform Security Processor */ -+ SMCA_SMU, /* System Management Unit */ -+ N_SMCA_BANK_TYPES -+}; -+ -+/* SMCA Extended error strings */ -+/* Load Store */ -+static const char * const smca_ls_mce_desc[] = { -+ "Load queue parity", -+ "Store queue parity", -+ "Miss address buffer payload parity", -+ "L1 TLB parity", -+ "Reserved", -+ "DC tag error type 6", -+ "DC tag error type 1", -+ "Internal error type 1", -+ "Internal error type 2", -+ "Sys Read data error thread 0", -+ "Sys read data error thread 1", -+ "DC tag error type 2", -+ "DC data error type 1 (poison consumption)", -+ "DC data error type 2", -+ "DC data error type 3", -+ "DC tag error type 4", -+ "L2 TLB parity", -+ "PDC parity error", -+ "DC tag error type 3", -+ "DC tag error type 5", -+ "L2 fill data error", -+}; -+/* Instruction Fetch */ -+static const char * const smca_if_mce_desc[] = { -+ "microtag probe port parity error", -+ "IC microtag or full tag multi-hit error", -+ "IC full tag parity", -+ "IC data array parity", -+ "Decoupling queue phys addr parity error", -+ "L0 ITLB parity error", -+ "L1 ITLB parity error", -+ "L2 ITLB parity error", -+ "BPQ snoop parity on Thread 0", -+ "BPQ snoop parity on Thread 1", -+ "L1 BTB multi-match error", -+ "L2 BTB multi-match error", -+ "L2 Cache Response Poison error", -+ "System Read Data error", -+}; -+/* L2 Cache */ -+static const char * const smca_l2_mce_desc[] = { -+ "L2M tag multi-way-hit error", -+ "L2M tag ECC error", -+ "L2M data ECC error", -+ "HW assert", -+}; -+/* Decoder Unit */ -+static const char * const smca_de_mce_desc[] = { -+ "uop cache tag parity error", -+ "uop cache data parity error", -+ "Insn buffer parity error", -+ "uop queue parity error", -+ "Insn dispatch queue parity error", -+ "Fetch address FIFO parity", -+ "Patch RAM data parity", -+ "Patch RAM sequencer parity", -+ "uop buffer parity" -+}; -+/* Execution Unit */ -+static 
const char * const smca_ex_mce_desc[] = { -+ "Watchdog timeout error", -+ "Phy register file parity", -+ "Flag register file parity", -+ "Immediate displacement register file parity", -+ "Address generator payload parity", -+ "EX payload parity", -+ "Checkpoint queue parity", -+ "Retire dispatch queue parity", -+ "Retire status queue parity error", -+ "Scheduling queue parity error", -+ "Branch buffer queue parity error", -+}; -+/* Floating Point Unit */ -+static const char * const smca_fp_mce_desc[] = { -+ "Physical register file parity", -+ "Freelist parity error", -+ "Schedule queue parity", -+ "NSQ parity error", -+ "Retire queue parity", -+ "Status register file parity", -+ "Hardware assertion", -+}; -+/* L3 Cache */ -+static const char * const smca_l3_mce_desc[] = { -+ "Shadow tag macro ECC error", -+ "Shadow tag macro multi-way-hit error", -+ "L3M tag ECC error", -+ "L3M tag multi-way-hit error", -+ "L3M data ECC error", -+ "XI parity, L3 fill done channel error", -+ "L3 victim queue parity", -+ "L3 HW assert", -+}; -+/* Coherent Slave Unit */ -+static const char * const smca_cs_mce_desc[] = { -+ "Illegal request from transport layer", -+ "Address violation", -+ "Security violation", -+ "Illegal response from transport layer", -+ "Unexpected response", -+ "Parity error on incoming request or probe response data", -+ "Parity error on incoming read response data", -+ "Atomic request parity", -+ "ECC error on probe filter access", -+}; -+/* Power, Interrupt, etc.. 
*/ -+static const char * const smca_pie_mce_desc[] = { -+ "HW assert", -+ "Internal PIE register security violation", -+ "Error on GMI link", -+ "Poison data written to internal PIE register", -+}; -+/* Unified Memory Controller */ -+static const char * const smca_umc_mce_desc[] = { -+ "DRAM ECC error", -+ "Data poison error on DRAM", -+ "SDP parity error", -+ "Advanced peripheral bus error", -+ "Command/address parity error", -+ "Write data CRC error", -+}; -+/* Parameter Block */ -+static const char * const smca_pb_mce_desc[] = { -+ "Parameter Block RAM ECC error", -+}; -+/* Platform Security Processor */ -+static const char * const smca_psp_mce_desc[] = { -+ "PSP RAM ECC or parity error", -+}; -+/* System Management Unit */ -+static const char * const smca_smu_mce_desc[] = { -+ "SMU RAM ECC or parity error", -+}; -+ -+struct smca_mce_desc { -+ const char * const *descs; -+ unsigned int num_descs; -+}; -+ -+static struct smca_mce_desc smca_mce_descs[] = { -+ [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, -+ [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, -+ [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, -+ [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, -+ [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) }, -+ [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, -+ [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, -+ [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, -+ [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, -+ [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, -+ [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, -+ [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, -+ [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, -+}; -+ -+struct smca_hwid { -+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/ -+ uint32_t mcatype_hwid; 
/* mcatype,hwid bit 63-32 in MCx_IPID Register*/ -+}; -+ -+static struct smca_hwid smca_hwid_mcatypes[] = { -+ /* { bank_type, mcatype_hwid } */ -+ -+ /* ZN Core (HWID=0xB0) MCA types */ -+ { SMCA_LS, 0x000000B0 }, -+ { SMCA_IF, 0x000100B0 }, -+ { SMCA_L2_CACHE, 0x000200B0 }, -+ { SMCA_DE, 0x000300B0 }, -+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */ -+ { SMCA_EX, 0x000500B0 }, -+ { SMCA_FP, 0x000600B0 }, -+ { SMCA_L3_CACHE, 0x000700B0 }, -+ -+ /* Data Fabric MCA types */ -+ { SMCA_CS, 0x0000002E }, -+ { SMCA_PIE, 0x0001002E }, -+ -+ /* Unified Memory Controller MCA type */ -+ { SMCA_UMC, 0x00000096 }, -+ -+ /* Parameter Block MCA type */ -+ { SMCA_PB, 0x00000005 }, -+ -+ /* Platform Security Processor MCA type */ -+ { SMCA_PSP, 0x000000FF }, -+ -+ /* System Management Unit MCA type */ -+ { SMCA_SMU, 0x00000001 }, -+}; -+ -+struct smca_bank_name { -+ const char *name; -+}; -+ -+static struct smca_bank_name smca_names[] = { -+ [SMCA_LS] = { "Load Store Unit" }, -+ [SMCA_IF] = { "Instruction Fetch Unit" }, -+ [SMCA_L2_CACHE] = { "L2 Cache" }, -+ [SMCA_DE] = { "Decode Unit" }, -+ [SMCA_RESERVED] = { "Reserved" }, -+ [SMCA_EX] = { "Execution Unit" }, -+ [SMCA_FP] = { "Floating Point Unit" }, -+ [SMCA_L3_CACHE] = { "L3 Cache" }, -+ [SMCA_CS] = { "Coherent Slave" }, -+ [SMCA_PIE] = { "Power, Interrupts, etc." }, -+ [SMCA_UMC] = { "Unified Memory Controller" }, -+ [SMCA_PB] = { "Parameter Block" }, -+ [SMCA_PSP] = { "Platform Security Processor" }, -+ [SMCA_SMU] = { "System Management Unit" }, -+}; -+ -+static void amd_decode_errcode(struct mce_event *e) -+{ -+ -+ decode_amd_errcode(e); -+ -+ if (e->status & MCI_STATUS_POISON) -+ mce_snprintf(e->mcistatus_msg, "Poison consumed"); -+ -+ if (e->status & MCI_STATUS_TCC) -+ mce_snprintf(e->mcistatus_msg, "Task_context_corrupt"); -+ -+} -+/* -+ * To find the UMC channel represented by this bank we need to match on its -+ * instance_id. The instance_id of a bank is held in the lower 32 bits of its -+ * IPID. 
-+ */ -+static int find_umc_channel(struct mce_event *e) -+{ -+ uint32_t umc_instance_id[] = {0x50f00, 0x150f00}; -+ uint32_t instance_id = EXTRACT(e->ipid, 0, 31); -+ int i, channel = -1; -+ -+ for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++) -+ if (umc_instance_id[i] == instance_id) -+ channel = i; -+ -+ return channel; -+} -+/* Decode extended errors according to Scalable MCA specification */ -+static void decode_smca_error(struct mce_event *e) -+{ -+ enum smca_bank_types bank_type; -+ const char *ip_name; -+ unsigned short xec = (e->status >> 16) & 0x3f; -+ const struct smca_hwid *s_hwid; -+ uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63); -+ unsigned int csrow = -1, channel = -1; -+ unsigned int i; -+ -+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { -+ s_hwid = &smca_hwid_mcatypes[i]; -+ if (mcatype_hwid == s_hwid->mcatype_hwid) { -+ bank_type = s_hwid->bank_type; -+ break; -+ } -+ } -+ -+ if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) { -+ strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID"); -+ return; -+ } -+ -+ if (bank_type >= N_SMCA_BANK_TYPES) { -+ strcpy(e->mcastatus_msg, "Don't know how to decode this bank"); -+ return; -+ } -+ -+ if (bank_type == SMCA_RESERVED) { -+ strcpy(e->mcastatus_msg, "Bank 4 is reserved.\n"); -+ return; -+ } -+ -+ ip_name = smca_names[bank_type].name; -+ -+ mce_snprintf(e->bank_name, "%s (bank=%d)", ip_name, e->bank); -+ -+ /* Only print the descriptor of valid extended error code */ -+ if (xec < smca_mce_descs[bank_type].num_descs) -+ mce_snprintf(e->mcastatus_msg, -+ " %s.\n", smca_mce_descs[bank_type].descs[xec]); -+ -+ if (bank_type == SMCA_UMC && xec == 0) { -+ channel = find_umc_channel(e); -+ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */ -+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d", -+ channel, csrow); -+ } -+} -+ -+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e) -+{ -+ uint64_t mcgstatus = e->mcgstatus; -+ -+ mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld", -+ (long 
long)e->mcgstatus); -+ -+ if (mcgstatus & MCG_STATUS_RIPV) -+ mce_snprintf(e->mcgstatus_msg, "RIPV"); -+ if (mcgstatus & MCG_STATUS_EIPV) -+ mce_snprintf(e->mcgstatus_msg, "EIPV"); -+ if (mcgstatus & MCG_STATUS_MCIP) -+ mce_snprintf(e->mcgstatus_msg, "MCIP"); -+ -+ decode_smca_error(e); -+ amd_decode_errcode(e); -+ return 0; -+} ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ rasdaemon-0.6.1/mce-amd.c 2019-07-12 11:35:04.836470461 -0400 -@@ -0,0 +1,122 @@ -+/* -+ * Copyright (c) 2018, The AMD, Inc. All rights reserved. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 and -+ * only version 2 as published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ */ -+ -+#include -+#include -+ -+#include "ras-mce-handler.h" -+ -+/* Error Code Types */ -+#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) -+#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) -+#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800) -+#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400) -+ -+/* Error code: transaction type (TT) */ -+static char *transaction[] = { -+ "instruction", "data", "generic", "reserved" -+}; -+/* Error codes: cache level (LL) */ -+static char *cachelevel[] = { -+ "reserved", "L1", "L2", "L3/generic" -+}; -+/* Error codes: memory transaction type (RRRR) */ -+static char *memtrans[] = { -+ "generic", "generic read", "generic write", "data read", -+ "data write", "instruction fetch", "prefetch", "evict", "snoop", -+ "?", "?", "?", "?", "?", "?", "?" 
-+}; -+/* Participation Processor */ -+static char *partproc[] = { -+ "local node origin", "local node response", -+ "local node observed", "generic participation" -+}; -+/* Timeout */ -+static char *timeout[] = { -+ "request didn't time out", -+ "request timed out" -+}; -+/* internal unclassified error code */ -+static char *internal[] = { "reserved", -+ "reserved", -+ "hardware assert", -+ "reserved" }; -+ -+#define TT(x) (((x) >> 2) & 0x3) /*bit 2, bit 3*/ -+#define TT_MSG(x) transaction[TT(x)] -+#define LL(x) ((x) & 0x3) /*bit 0, bit 1*/ -+#define LL_MSG(x) cachelevel[LL(x)] -+ -+#define R4(x) (((x) >> 4) & 0xF) /*bit 4, bit 5, bit 6, bit 7 */ -+#define R4_MSG(x) ((R4(x) < 9) ? memtrans[R4(x)] : "Wrong R4!") -+ -+#define TO(x) (((x) >> 8) & 0x1) /*bit 8*/ -+#define TO_MSG(x) timeout[TO(x)] -+#define PP(x) (((x) >> 9) & 0x3) /*bit 9, bit 10*/ -+#define PP_MSG(x) partproc[PP(x)] -+ -+#define UU(x) (((x) >> 8) & 0x3) /*bit 8, bit 9*/ -+#define UU_MSG(x) internal[UU(x)] -+ -+void decode_amd_errcode(struct mce_event *e) -+{ -+ uint16_t ec = e->status & 0xffff; -+ uint16_t ecc = (e->status >> 45) & 0x3; -+ -+ if (e->status & MCI_STATUS_UC) { -+ if (e->status & MCI_STATUS_PCC) -+ strcpy(e->error_msg, "System Fatal error."); -+ if (e->mcgstatus & MCG_STATUS_RIPV) -+ strcpy(e->error_msg, -+ "Uncorrected, software restartable error."); -+ strcpy(e->error_msg, -+ "Uncorrected, software containable error."); -+ } else if (e->status & MCI_STATUS_DEFERRED) -+ strcpy(e->error_msg, "Deferred error, no action required."); -+ else -+ strcpy(e->error_msg, "Corrected error, no action required."); -+ -+ if (!(e->status & MCI_STATUS_VAL)) -+ mce_snprintf(e->mcistatus_msg, "MCE_INVALID"); -+ -+ if (e->status & MCI_STATUS_OVER) -+ mce_snprintf(e->mcistatus_msg, "Error_overflow"); -+ -+ if (e->status & MCI_STATUS_PCC) -+ mce_snprintf(e->mcistatus_msg, "Processor_context_corrupt"); -+ -+ if (ecc) -+ mce_snprintf(e->mcistatus_msg, -+ "%sECC", ((ecc == 2) ? 
"C" : "U")); -+ -+ if (INT_ERROR(ec)) { -+ mce_snprintf(e->mcastatus_msg, "Internal '%s'", UU_MSG(ec)); -+ return; -+ } -+ -+ if (TLB_ERROR(ec)) -+ mce_snprintf(e->mcastatus_msg, -+ "TLB Error 'tx: %s, level: %s'", -+ TT_MSG(ec), LL_MSG(ec)); -+ else if (MEM_ERROR(ec)) -+ mce_snprintf(e->mcastatus_msg, -+ "Memory Error 'mem-tx: %s, tx: %s, level: %s'", -+ R4_MSG(ec), TT_MSG(ec), LL_MSG(ec)); -+ else if (BUS_ERROR(ec)) -+ mce_snprintf(e->mcastatus_msg, -+ "Bus Error '%s, %s, mem-tx: %s, level: %s'", -+ PP_MSG(ec), TO_MSG(ec), -+ R4_MSG(ec), LL_MSG(ec)); -+ return; -+ -+} ---- rasdaemon-0.6.1.orig/ras-mce-handler.c 2019-07-12 11:35:01.585502811 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.c 2019-07-12 11:35:04.836470461 -0400 -@@ -55,6 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series - [CPU_KNIGHTS_LANDING] = "Knights Landing", - [CPU_KNIGHTS_MILL] = "Knights Mill", - [CPU_SKYLAKE_XEON] = "Skylake server", -+ [CPU_NAPLES] = "AMD Family 17h Zen1" - }; - - static enum cputype select_intel_cputype(struct ras_events *ras) -@@ -190,9 +191,12 @@ ret = 0; - if (!strcmp(mce->vendor, "AuthenticAMD")) { - if (mce->family == 15) - mce->cputype = CPU_K8; -- if (mce->family > 15) { -+ if (mce->family == 23) -+ mce->cputype = CPU_NAPLES; -+ if (mce->family > 23) { - log(ALL, LOG_INFO, -- "Can't parse MCE for this AMD CPU yet\n"); -+ "Can't parse MCE for this AMD CPU yet %d\n", -+ mce->family); - ret = EINVAL; - } - goto ret; -@@ -331,6 +335,12 @@ #if 0 - if (e->status & MCI_STATUS_ADDRV) - trace_seq_printf(s, ", addr= %llx", (long long)e->addr); - -+ if (e->status & MCI_STATUS_SYNDV) -+ trace_seq_printf(s, ", synd= %llx", (long long)e->synd); -+ -+ if (e->ipid) -+ trace_seq_printf(s, ", ipid= %llx", (long long)e->ipid); -+ - if (e->mcgstatus_msg) - trace_seq_printf(s, ", %s", e->mcgstatus_msg); - else -@@ -411,6 +421,13 @@ if (pevent_get_field_val(s, event, "bank - if (pevent_get_field_val(s, event, "cpuvendor", record, &val, 1) < 0) - return -1; - e.cpuvendor = val; -+ /* Get New 
entries */ -+ if (pevent_get_field_val(s, event, "synd", record, &val, 1) < 0) -+ return -1; -+ e.synd = val; -+ if (pevent_get_field_val(s, event, "ipid", record, &val, 1) < 0) -+ return -1; -+ e.ipid = val; - - switch (mce->cputype) { - case CPU_GENERIC: -@@ -418,6 +435,9 @@ if (pevent_get_field_val(s, event, "cpuv - case CPU_K8: - rc = parse_amd_k8_event(ras, &e); - break; -+ case CPU_NAPLES: -+ rc = parse_amd_smca_event(ras, &e); -+ break; - default: /* All other CPU types are Intel */ - rc = parse_intel_event(ras, &e); - } ---- rasdaemon-0.6.1.orig/ras-mce-handler.h 2019-07-12 11:35:01.585502811 -0400 -+++ rasdaemon-0.6.1/ras-mce-handler.h 2019-07-12 11:35:04.836470461 -0400 -@@ -50,6 +50,7 @@ enum cputype { - CPU_KNIGHTS_LANDING, - CPU_KNIGHTS_MILL, - CPU_SKYLAKE_XEON, -+ CPU_NAPLES, - }; - - struct mce_event { -@@ -69,6 +70,8 @@ struct mce_event { - uint8_t cs; - uint8_t bank; - uint8_t cpuvendor; -+ uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */ -+ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */ - - /* Parsed data */ - char timestamp[64]; -@@ -129,6 +132,9 @@ void broadwell_de_decode_model(struct ra - void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e); - void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e); - -+/* AMD error code decode function */ -+void decode_amd_errcode(struct mce_event *e); -+ - /* Software defined banks */ - #define MCE_EXTENDED_BANK 128 - -@@ -144,6 +150,13 @@ #define MCI_STATUS_EN (1ULL<<60) /* - #define MCI_STATUS_S (1ULL<<56) /* signalled */ - #define MCI_STATUS_AR (1ULL<<55) /* action-required */ - -+/* AMD-specific bits */ -+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ -+#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. 
valid */ -+/* uncorrected error,deferred exception */ -+#define MCI_STATUS_DEFERRED (1ULL<<44) -+#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ -+ - #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ - #define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */ - #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ -@@ -154,4 +167,6 @@ int parse_intel_event(struct ras_events - - int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e); - -+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e); -+ - #endif ---- rasdaemon-0.6.1.orig/Makefile.in 2018-04-25 06:29:05.000000000 -0400 -+++ rasdaemon-0.6.1/Makefile.in 2019-07-15 14:41:22.308278851 -0400 -@@ -100,7 +100,7 @@ sbin_PROGRAMS = rasdaemon$(EXEEXT) - @WITH_MCE_TRUE@ mce-intel-dunnington.c mce-intel-tulsa.c \ - @WITH_MCE_TRUE@ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \ - @WITH_MCE_TRUE@ mce-intel-knl.c mce-intel-broadwell-de.c \ --@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c -+@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c mce-amd.c mce-amd-smca.c - - @WITH_EXTLOG_TRUE@am__append_6 = ras-extlog-handler.c - @WITH_ABRT_REPORT_TRUE@am__append_7 = ras-report.c -@@ -132,7 +132,7 @@ am__rasdaemon_SOURCES_DIST = rasdaemon.c - mce-intel-ivb.c mce-intel-haswell.c mce-intel-knl.c \ - mce-intel-broadwell-de.c mce-intel-broadwell-epex.c \ - mce-intel-skylake-xeon.c ras-extlog-handler.c ras-report.c \ -- non-standard-hisi_hip07.c -+ non-standard-hisi_hip07.c mce-amd-smca.c mce-amd.c - @WITH_SQLITE3_TRUE@am__objects_1 = ras-record.$(OBJEXT) - @WITH_AER_TRUE@am__objects_2 = ras-aer-handler.$(OBJEXT) - @WITH_NON_STANDARD_TRUE@am__objects_3 = \ -@@ -149,7 +149,9 @@ non-standard-hisi_hip07.c - @WITH_MCE_TRUE@ mce-intel-knl.$(OBJEXT) \ - @WITH_MCE_TRUE@ mce-intel-broadwell-de.$(OBJEXT) \ - @WITH_MCE_TRUE@ mce-intel-broadwell-epex.$(OBJEXT) \ --@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) 
-+@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) \ -+@WITH_MCE_TRUE@ mce-amd-smca.$(OBJEXT) \ -+@WITH_MCE_TRUE@ mce-amd.$(OBJEXT) - @WITH_EXTLOG_TRUE@am__objects_6 = ras-extlog-handler.$(OBJEXT) - @WITH_ABRT_REPORT_TRUE@am__objects_7 = ras-report.$(OBJEXT) - @WITH_HISI_NS_DECODE_TRUE@am__objects_8 = \ -@@ -595,6 +597,8 @@ distclean-compile: - - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bitfield.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-k8.Po@am__quote@ -+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd.Po@am__quote@ -+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-scma.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-de.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-epex.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-dunnington.Po@am__quote@ diff --git a/SOURCES/a8c776ed94f68ae31d7b5f74e19545698898c13c.patch b/SOURCES/a8c776ed94f68ae31d7b5f74e19545698898c13c.patch deleted file mode 100644 index 38657d4..0000000 --- a/SOURCES/a8c776ed94f68ae31d7b5f74e19545698898c13c.patch +++ /dev/null @@ -1,138 +0,0 @@ -commit a8c776ed94f68ae31d7b5f74e19545698898c13c -Author: Mauro Carvalho Chehab -Date: Tue Aug 14 13:06:27 2018 -0300 - - mce-intel-*: fix a warning when using FIELD(, NULL) - - Internally, FIELD() macro checks the size of an array, by - using ARRAY_SIZE. 
Well, this macro causes a division by zero - if NULL is used, as its type is void, as warned: - - mce-intel-dunnington.c:30:2: note: in expansion of macro ‘FIELD’ - FIELD(17, NULL), - ^~~~~ - ras-mce-handler.h:28:33: warning: division ‘sizeof (void *) / sizeof (void)’ does not compute the number of array elements [-Wsizeof-pointer-div] - #define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x))) - ^ - bitfield.h:37:51: note: in expansion of macro ‘ARRAY_SIZE’ - #define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) } - ^~~~~~~~~~ - - While this warning is harmless, it may prevent seeing more serios - warnings. So, add a FIELD_NULL() macro to avoid that. - - Signed-off-by: Mauro Carvalho Chehab - -diff --git a/bitfield.h b/bitfield.h -index c7dfeb1..fccbb36 100644 ---- a/bitfield.h -+++ b/bitfield.h -@@ -35,6 +35,7 @@ struct numfield { - }; - - #define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) } -+#define FIELD_NULL(start_bit) { start_bit, NULL, 0 } - #define SBITFIELD(start_bit, string) { start_bit, ((char * [2]) { NULL, string }), 2 } - - #define NUMBER(start, end, name) { start, end, name, "%Lu", 0 } -diff --git a/mce-intel-dunnington.c b/mce-intel-dunnington.c -index 4b1c7e3..c695c62 100644 ---- a/mce-intel-dunnington.c -+++ b/mce-intel-dunnington.c -@@ -27,14 +27,14 @@ - - static struct field dunnington_bus_status[] = { - SBITFIELD(16, "Parity error detected during FSB request phase"), -- FIELD(17, NULL), -+ FIELD_NULL(17), - SBITFIELD(20, "Hard Failure response received for a local transaction"), - SBITFIELD(21, "Parity error on FSB response field detected"), - SBITFIELD(22, "Parity data error on inbound data detected"), -- FIELD(23, NULL), -- FIELD(25, NULL), -- FIELD(28, NULL), -- FIELD(31, NULL), -+ FIELD_NULL(23), -+ FIELD_NULL(25), -+ FIELD_NULL(28), -+ FIELD_NULL(31), - {} - }; - -diff --git a/mce-intel-p4-p6.c b/mce-intel-p4-p6.c -index 4615e1a..5c6c3ff 100644 ---- a/mce-intel-p4-p6.c -+++ b/mce-intel-p4-p6.c -@@ -60,7 +60,7 @@ static 
char *bus_queue_error_type[] = { - }; - - static struct field p6_shared_status[] = { -- FIELD(16, NULL), -+ FIELD_NULL(16), - FIELD(19, bus_queue_req_type), - FIELD(25, bus_queue_error_type), - FIELD(25, bus_queue_error_type), -@@ -68,7 +68,7 @@ static struct field p6_shared_status[] = { - SBITFIELD(36, "received parity error on response transaction"), - SBITFIELD(38, "timeout BINIT (ROB timeout)." - " No micro-instruction retired for some time"), -- FIELD(39, NULL), -+ FIELD_NULL(39), - SBITFIELD(42, "bus transaction received hard error response"), - SBITFIELD(43, "failure that caused IERR"), - /* The following are reserved for Core in the SDM. Let's keep them here anyways*/ -@@ -76,15 +76,15 @@ static struct field p6_shared_status[] = { - SBITFIELD(45, "uncorrectable ECC error"), - SBITFIELD(46, "correctable ECC error"), - /* [47..54]: ECC syndrome */ -- FIELD(55, NULL), -+ FIELD_NULL(55), - {}, - }; - - static struct field p6old_status[] = { - SBITFIELD(28, "FRC error"), - SBITFIELD(29, "BERR on this CPU"), -- FIELD(31, NULL), -- FIELD(32, NULL), -+ FIELD_NULL(31), -+ FIELD_NULL(32), - SBITFIELD(35, "BINIT received from external bus"), - SBITFIELD(37, "Received hard error reponse on split transaction (Bus BINIT)"), - {} -@@ -94,9 +94,9 @@ static struct field core2_status[] = { - SBITFIELD(28, "MCE driven"), - SBITFIELD(29, "MCE is observed"), - SBITFIELD(31, "BINIT observed"), -- FIELD(32, NULL), -+ FIELD_NULL(32), - SBITFIELD(34, "PIC or FSB data parity error"), -- FIELD(35, NULL), -+ FIELD_NULL(35), - SBITFIELD(37, "FSB address parity error detected"), - {} - }; -diff --git a/mce-intel-tulsa.c b/mce-intel-tulsa.c -index 6cea421..e59bf06 100644 ---- a/mce-intel-tulsa.c -+++ b/mce-intel-tulsa.c -@@ -39,7 +39,7 @@ static struct field tls_bus_status[] = { - SBITFIELD(16, "Parity error detected during FSB request phase"), - SBITFIELD(17, "Partity error detected on Core 0 request's address field"), - SBITFIELD(18, "Partity error detected on Core 1 request's address 
field"), -- FIELD(19, NULL), -+ FIELD_NULL(19), - SBITFIELD(20, "Parity error on FSB response field detected"), - SBITFIELD(21, "FSB data parity error on inbound date detected"), - SBITFIELD(22, "Data parity error on data received from Core 0 detected"), -@@ -48,8 +48,8 @@ static struct field tls_bus_status[] = { - SBITFIELD(25, "Data ECC event to error on inbound data correctable or uncorrectable"), - SBITFIELD(26, "Pad logic detected a data strobe glitch or sequencing error"), - SBITFIELD(27, "Pad logic detected a request strobe glitch or sequencing error"), -- FIELD(28, NULL), -- FIELD(31, NULL), -+ FIELD_NULL(28), -+ FIELD_NULL(31), - {} - }; - diff --git a/SOURCES/add_upstream_labels.patch b/SOURCES/add_upstream_labels.patch deleted file mode 100644 index 70a04df..0000000 --- a/SOURCES/add_upstream_labels.patch +++ /dev/null @@ -1,159 +0,0 @@ ---- - labels/dell | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 152 insertions(+) - ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ rasdaemon-0.6.1/labels/dell 2020-02-20 11:53:39.574579258 -0500 -@@ -0,0 +1,152 @@ -+# RASDAEMON Motherboard DIMM labels Database file. -+# -+# Vendor-name and model-name are found from the program 'dmidecode' -+# labels are found from the silk screen on the motherboard. -+# -+#Vendor: -+# Product: -+# Model: -+#