Auto sync2gitlab import of rasdaemon-0.6.1-12.el8.src.rpm

This commit is contained in:
James Antill 2022-05-26 14:05:25 -04:00
parent 0596340e84
commit 5fc60fd728
24 changed files with 4009 additions and 1 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/rasdaemon-0.6.1.tar.bz2

View File

@ -0,0 +1,85 @@
commit 0862a096c3a1d0f993703ab3299f1ddfadf53d7f
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Tue Aug 11 13:31:46 2020 +0100
rasdaemon: ras-mc-ctl: Add ARM processor error information
Add support for ARM processor errors to the ras-mc-ctl tool.
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
util/ras-mc-ctl.in | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
--- rasdaemon-0.6.1.orig/util/ras-mc-ctl.in 2021-10-06 14:14:25.000440090 -0400
+++ rasdaemon-0.6.1/util/ras-mc-ctl.in 2021-10-06 14:15:59.995598590 -0400
@@ -1124,6 +1124,7 @@ sub summary
my ($query, $query_handle, $out);
my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg);
my ($etype, $severity, $etype_string, $severity_string);
+ my ($affinity, $mpidr);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1159,6 +1160,22 @@ sub summary
}
$query_handle->finish;
+ # ARM processor arm_event errors
+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($affinity, $mpidr, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count errors\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events summary:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
+
# extlog errors
$query = "select etype, severity, count(*) from extlog_event group by etype, severity";
$query_handle = $dbh->prepare($query);
@@ -1202,6 +1219,7 @@ sub errors
my ($query, $query_handle, $id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
+ my ($error_count, $affinity, $mpidr, $r_state, $psci_state);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1241,6 +1259,28 @@ sub errors
}
$query_handle->finish;
+ # ARM processor arm_event errors
+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "error_count=$error_count, " if ($error_count);
+ $out .= "affinity_level=$affinity, ";
+ $out .= sprintf "mpidr=0x%x, ", $mpidr;
+ $out .= sprintf "running_state=0x%x, ", $r_state;
+ $out .= sprintf "psci_state=0x%x", $psci_state;
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
+
# Extlog errors
$query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
$query_handle = $dbh->prepare($query);

View File

@ -0,0 +1,32 @@
commit 16d929b024c31d54a7f8a72eab094376c7be27f5
Author: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Wed May 26 10:20:39 2021 +0200
Makefile.am: fix build header rules
non-standard-hisilicon.h was added twice;
ras-memory-failure-handler.h is missing.
Due to that, the tarball becomes incomplete, causing build
errors.
While here, also adjust .travis.yml to use --enable-all.
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
Makefile.am | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/Makefile.am 2021-10-13 13:27:53.402685179 -0400
+++ b/Makefile.am 2021-10-13 13:28:11.664525173 -0400
@@ -54,7 +54,8 @@ rasdaemon_LDADD = -lpthread $(SQLITE3_LI
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
- ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h
+ ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
+ ras-memory-failure-handler.h
# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that

View File

@ -0,0 +1,538 @@
commit 2290d65b97311dd5736838f1e285355f7f357046
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Mar 8 16:57:26 2021 +0000
rasdaemon: add support for memory_failure events
Add support to log the memory_failure kernel trace
events.
Example rasdaemon log and SQLite DB output for the
memory_failure event,
=================================================
rasdaemon: memory_failure_event store: 0x126ce8f8
rasdaemon: register inserted at db
<...>-785 [000] 0.000024: memory_failure_event: 2020-10-02 13:27:13 -0400 pfn=0x204000000 page_type=free buddy page action_result=Delayed
CREATE TABLE memory_failure_event (id INTEGER PRIMARY KEY, timestamp TEXT, pfn TEXT, page_type TEXT, action_result TEXT);
INSERT INTO memory_failure_event VALUES(1,'2020-10-02 13:27:13 -0400','0x204000000','free buddy page','Delayed');
==================================================
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
Makefile.am | 4
ras-events.c | 15 +++
ras-memory-failure-handler.c | 179 +++++++++++++++++++++++++++++++++++++++++++
ras-memory-failure-handler.h | 25 ++++++
ras-record.c | 56 +++++++++++++
ras-record.h | 13 +++
ras-report.c | 68 ++++++++++++++++
ras-report.h | 5 -
8 files changed, 364 insertions(+), 1 deletion(-)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/ras-memory-failure-handler.c 2021-10-14 16:31:36.840657728 -0400
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libtrace/kbuffer.h"
+#include "ras-memory-failure-handler.h"
+#include "ras-record.h"
+#include "ras-logger.h"
+#include "ras-report.h"
+
+/* Memory failure - various types of pages */
+enum mf_action_page_type {
+ MF_MSG_KERNEL,
+ MF_MSG_KERNEL_HIGH_ORDER,
+ MF_MSG_SLAB,
+ MF_MSG_DIFFERENT_COMPOUND,
+ MF_MSG_POISONED_HUGE,
+ MF_MSG_HUGE,
+ MF_MSG_FREE_HUGE,
+ MF_MSG_NON_PMD_HUGE,
+ MF_MSG_UNMAP_FAILED,
+ MF_MSG_DIRTY_SWAPCACHE,
+ MF_MSG_CLEAN_SWAPCACHE,
+ MF_MSG_DIRTY_MLOCKED_LRU,
+ MF_MSG_CLEAN_MLOCKED_LRU,
+ MF_MSG_DIRTY_UNEVICTABLE_LRU,
+ MF_MSG_CLEAN_UNEVICTABLE_LRU,
+ MF_MSG_DIRTY_LRU,
+ MF_MSG_CLEAN_LRU,
+ MF_MSG_TRUNCATED_LRU,
+ MF_MSG_BUDDY,
+ MF_MSG_BUDDY_2ND,
+ MF_MSG_DAX,
+ MF_MSG_UNSPLIT_THP,
+ MF_MSG_UNKNOWN,
+};
+
+/* Action results for various types of pages */
+enum mf_action_result {
+ MF_IGNORED, /* Error: cannot be handled */
+ MF_FAILED, /* Error: handling failed */
+ MF_DELAYED, /* Will be handled later */
+ MF_RECOVERED, /* Successfully recovered */
+};
+
+/* memory failure page types */
+static const struct {
+ int type;
+ const char *page_type;
+} mf_page_type[] = {
+ { MF_MSG_KERNEL, "reserved kernel page" },
+ { MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page"},
+ { MF_MSG_SLAB, "kernel slab page"},
+ { MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking"},
+ { MF_MSG_POISONED_HUGE, "huge page already hardware poisoned"},
+ { MF_MSG_HUGE, "huge page"},
+ { MF_MSG_FREE_HUGE, "free huge page"},
+ { MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page"},
+ { MF_MSG_UNMAP_FAILED, "unmapping failed page"},
+ { MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page"},
+ { MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page"},
+ { MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page"},
+ { MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page"},
+ { MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page"},
+ { MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page"},
+ { MF_MSG_DIRTY_LRU, "dirty LRU page"},
+ { MF_MSG_CLEAN_LRU, "clean LRU page"},
+ { MF_MSG_TRUNCATED_LRU, "already truncated LRU page"},
+ { MF_MSG_BUDDY, "free buddy page"},
+ { MF_MSG_BUDDY_2ND, "free buddy page (2nd try)"},
+ { MF_MSG_DAX, "dax page"},
+ { MF_MSG_UNSPLIT_THP, "unsplit thp"},
+ { MF_MSG_UNKNOWN, "unknown page"},
+};
+
+/* memory failure action results */
+static const struct {
+ int result;
+ const char *action_result;
+} mf_action_result[] = {
+ { MF_IGNORED, "Ignored" },
+ { MF_FAILED, "Failed" },
+ { MF_DELAYED, "Delayed" },
+ { MF_RECOVERED, "Recovered" },
+};
+
+static const char *get_page_type(int page_type)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mf_page_type); i++)
+ if (mf_page_type[i].type == page_type)
+ return mf_page_type[i].page_type;
+
+ return "unknown page";
+}
+
+static const char *get_action_result(int result)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mf_action_result); i++)
+ if (mf_action_result[i].result == result)
+ return mf_action_result[i].action_result;
+
+ return "unknown";
+}
+
+
+int ras_memory_failure_event_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ unsigned long long val;
+ struct ras_events *ras = context;
+ time_t now;
+ struct tm *tm;
+ struct ras_mf_event ev;
+
+ /*
+ * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
+ * On previous kernels, the way to properly generate an event would
+ * be to inject a fake one, measure its timestamp and diff it against
+ * gettimeofday. We won't do it here. Instead, let's use uptime,
+ * falling-back to the event report's time, if "uptime" clock is
+ * not available (legacy kernels).
+ */
+
+ if (ras->use_uptime)
+ now = record->ts/user_hz + ras->uptime_diff;
+ else
+ now = time(NULL);
+
+ tm = localtime(&now);
+ if (tm)
+ strftime(ev.timestamp, sizeof(ev.timestamp),
+ "%Y-%m-%d %H:%M:%S %z", tm);
+ trace_seq_printf(s, "%s ", ev.timestamp);
+
+ if (pevent_get_field_val(s, event, "pfn", record, &val, 1) < 0)
+ return -1;
+ sprintf(ev.pfn, "0x%llx", val);
+ trace_seq_printf(s, "pfn=0x%llx ", val);
+
+ if (pevent_get_field_val(s, event, "type", record, &val, 1) < 0)
+ return -1;
+ ev.page_type = get_page_type(val);
+ trace_seq_printf(s, "page_type=%s ", ev.page_type);
+
+ if (pevent_get_field_val(s, event, "result", record, &val, 1) < 0)
+ return -1;
+ ev.action_result = get_action_result(val);
+ trace_seq_printf(s, "action_result=%s ", ev.action_result);
+
+ /* Store data into the SQLite DB */
+#ifdef HAVE_SQLITE3
+ ras_store_mf_event(ras, &ev);
+#endif
+
+#ifdef HAVE_ABRT_REPORT
+ /* Report event to ABRT */
+ ras_report_mf_event(ras, &ev);
+#endif
+
+ return 0;
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/ras-memory-failure-handler.h 2021-10-14 16:31:36.840657728 -0400
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+*/
+
+#ifndef __RAS_MEMORY_FAILURE_HANDLER_H
+#define __RAS_MEMORY_FAILURE_HANDLER_H
+
+#include "ras-events.h"
+#include "libtrace/event-parse.h"
+
+int ras_memory_failure_event_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context);
+
+#endif
--- a/ras-record.c 2018-04-25 06:19:03.000000000 -0400
+++ b/ras-record.c 2021-10-14 16:31:36.840657728 -0400
@@ -404,6 +404,55 @@ sqlite3_bind_text(priv->stmt_mce_record,
}
#endif
+/*
+ * Table and functions to handle ras:memory_failure
+ */
+
+#ifdef HAVE_MEMORY_FAILURE
+static const struct db_fields mf_event_fields[] = {
+ { .name="id", .type="INTEGER PRIMARY KEY" },
+ { .name="timestamp", .type="TEXT" },
+ { .name="pfn", .type="TEXT" },
+ { .name="page_type", .type="TEXT" },
+ { .name="action_result", .type="TEXT" },
+};
+
+static const struct db_table_descriptor mf_event_tab = {
+ .name = "memory_failure_event",
+ .fields = mf_event_fields,
+ .num_fields = ARRAY_SIZE(mf_event_fields),
+};
+
+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
+{
+ int rc;
+ struct sqlite3_priv *priv = ras->db_priv;
+
+ if (!priv || !priv->stmt_mf_event)
+ return 0;
+ log(TERM, LOG_INFO, "memory_failure_event store: %p\n", priv->stmt_mf_event);
+
+ sqlite3_bind_text(priv->stmt_mf_event, 1, ev->timestamp, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mf_event, 2, ev->pfn, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mf_event, 3, ev->page_type, -1, NULL);
+ sqlite3_bind_text(priv->stmt_mf_event, 4, ev->action_result, -1, NULL);
+
+ rc = sqlite3_step(priv->stmt_mf_event);
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
+ log(TERM, LOG_ERR,
+ "Failed to do memory_failure_event step on sqlite: error = %d\n", rc);
+
+ rc = sqlite3_reset(priv->stmt_mf_event);
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
+ log(TERM, LOG_ERR,
+ "Failed reset memory_failure_event on sqlite: error = %d\n",
+ rc);
+
+ log(TERM, LOG_INFO, "register inserted at db\n");
+
+ return rc;
+}
+#endif
/*
* Generic code
@@ -567,6 +616,13 @@ usleep(10000);
rc = ras_mc_prepare_stmt(priv, &priv->stmt_arm_record,
&arm_event_tab);
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc = ras_mc_create_table(priv, &mf_event_tab);
+ if (rc == SQLITE_OK) {
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mf_event,
+ &mf_event_tab);
+ }
+#endif
ras->db_priv = priv;
return 0;
--- a/ras-record.h 2018-04-25 06:19:03.000000000 -0400
+++ b/ras-record.h 2021-10-14 16:31:36.840657728 -0400
@@ -75,12 +75,20 @@ struct ras_arm_event {
int32_t psci_state;
};
+struct ras_mf_event {
+ char timestamp[64];
+ char pfn[30];
+ const char *page_type;
+ const char *action_result;
+};
+
struct ras_mc_event;
struct ras_aer_event;
struct ras_extlog_event;
struct ras_non_standard_event;
struct ras_arm_event;
struct mce_event;
+struct ras_mf_event;
#ifdef HAVE_SQLITE3
@@ -104,6 +112,9 @@ struct sqlite3_priv {
#ifdef HAVE_ARM
sqlite3_stmt *stmt_arm_record;
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ sqlite3_stmt *stmt_mf_event;
+#endif
};
int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras);
@@ -113,6 +124,7 @@ int ras_store_mce_record(struct ras_even
int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev);
int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev);
int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev);
+int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
#else
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -122,6 +134,7 @@ static inline int ras_store_mce_record(s
static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; };
static inline int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
+static inline int ras_store_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
#endif
--- a/ras-report.c 2017-10-14 05:11:34.000000000 -0400
+++ b/ras-report.c 2021-10-14 16:31:36.840657728 -0400
@@ -255,6 +255,28 @@ "midr=0x%lx\n" \
return 0;
}
+static int set_mf_event_backtrace(char *buf, struct ras_mf_event *ev)
+{
+ char bt_buf[MAX_BACKTRACE_SIZE];
+
+ if (!buf || !ev)
+ return -1;
+
+ sprintf(bt_buf, "BACKTRACE=" \
+ "timestamp=%s\n" \
+ "pfn=%s\n" \
+ "page_type=%s\n" \
+ "action_result=%s\n", \
+ ev->timestamp, \
+ ev->pfn, \
+ ev->page_type, \
+ ev->action_result);
+
+ strcat(buf, bt_buf);
+
+ return 0;
+}
+
static int commit_report_backtrace(int sockfd, int type, void *ev){
char buf[MAX_BACKTRACE_SIZE];
char *pbuf = buf;
@@ -283,6 +305,9 @@ memset(buf, 0, MAX_BACKTRACE_SIZE);
case ARM_EVENT:
rc = set_arm_event_backtrace(buf, (struct ras_arm_event *)ev);
break;
+ case MF_EVENT:
+ rc = set_mf_event_backtrace(buf, (struct ras_mf_event *)ev);
+ break;
default:
return -1;
}
@@ -549,3 +574,46 @@ return 0;
return -1;
}
}
+
+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev)
+{
+ char buf[MAX_MESSAGE_SIZE];
+ int sockfd = 0;
+ int done = 0;
+ int rc = -1;
+
+ memset(buf, 0, sizeof(buf));
+
+ sockfd = setup_report_socket();
+ if (sockfd < 0)
+ return -1;
+
+ rc = commit_report_basic(sockfd);
+ if (rc < 0)
+ goto mf_fail;
+
+ rc = commit_report_backtrace(sockfd, MF_EVENT, ev);
+ if (rc < 0)
+ goto mf_fail;
+
+ sprintf(buf, "ANALYZER=%s", "rasdaemon-memory_failure");
+ rc = write(sockfd, buf, strlen(buf) + 1);
+ if (rc < strlen(buf) + 1)
+ goto mf_fail;
+
+ sprintf(buf, "REASON=%s", "memory failure problem");
+ rc = write(sockfd, buf, strlen(buf) + 1);
+ if (rc < strlen(buf) + 1)
+ goto mf_fail;
+
+ done = 1;
+
+mf_fail:
+ if (sockfd > 0)
+ close(sockfd);
+
+ if (done)
+ return 0;
+ else
+ return -1;
+}
--- a/ras-report.h 2017-10-14 05:11:34.000000000 -0400
+++ b/ras-report.h 2021-10-14 16:31:36.840657728 -0400
@@ -34,7 +34,8 @@ enum {
MCE_EVENT,
AER_EVENT,
NON_STANDARD_EVENT,
- ARM_EVENT
+ ARM_EVENT,
+ MF_EVENT,
};
#ifdef HAVE_ABRT_REPORT
@@ -44,6 +45,7 @@ int ras_report_aer_event(struct ras_even
int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev);
int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev);
int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev);
+int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev);
#else
@@ -52,6 +54,7 @@ static inline int ras_report_aer_event(s
static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; };
static inline int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
+static inline int ras_report_mf_event(struct ras_events *ras, struct ras_mf_event *ev) { return 0; };
#endif
--- a/Makefile.am 2018-04-25 06:21:56.000000000 -0400
+++ b/Makefile.am 2021-10-14 16:37:42.423639762 -0400
@@ -41,12 +41,16 @@ endif
if WITH_EXTLOG
rasdaemon_SOURCES += ras-extlog-handler.c
endif
+if WITH_MEMORY_FAILURE
+ rasdaemon_SOURCES += ras-memory-failure-handler.c
+endif
if WITH_ABRT_REPORT
rasdaemon_SOURCES += ras-report.c
endif
if WITH_HISI_NS_DECODE
rasdaemon_SOURCES += non-standard-hisi_hip07.c
endif
+
rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
--- a/ras-events.c 2021-10-14 16:31:36.730658636 -0400
+++ b/ras-events.c 2021-10-14 16:37:11.043898809 -0400
@@ -33,6 +33,7 @@ * Foundation, Inc., 51 Franklin Street,
#include "ras-arm-handler.h"
#include "ras-mce-handler.h"
#include "ras-extlog-handler.h"
+#include "ras-memory-failure-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
@@ -218,6 +219,10 @@ if (rc < 0) {
rc |= __toggle_ras_mc_event(ras, "ras", "arm_event", enable);
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc |= __toggle_ras_mc_event(ras, "ras", "memory_failure_event", enable);
+#endif
+
free_ras:
free(ras);
return rc;
@@ -736,6 +741,16 @@ (void)open("/sys/kernel/debug/ras/daemon
"ras", "aer_event");
#endif
+#ifdef HAVE_MEMORY_FAILURE
+ rc = add_event_handler(ras, pevent, page_size, "ras", "memory_failure_event",
+ ras_memory_failure_event_handler);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "ras", "memory_failure_event");
+#endif
+
if (!num_events) {
log(ALL, LOG_INFO,
"Failed to trace all supported RAS events. Aborting.\n");

View File

@ -0,0 +1,28 @@
commit 28ea956acc2dab7c18b4701f9657afb9ab3ddc79
Author: Muralidhara M K <muralimk@amd.com>
Date: Mon Jul 12 05:18:43 2021 -0500
rasdaemon: set SMCA maximum number of banks to 64
Newer AMD systems with SMCA banks support up to 64 MCA banks per CPU.
This patch is based on the commit below upstreamed into the kernel:
a0bc32b3cacf ("x86/mce: Increase maximum number of banks to 64")
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index e0cf512..3c346f4 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -75,6 +75,9 @@ enum smca_bank_types {
N_SMCA_BANK_TYPES
};
+/* Maximum number of MCA banks per CPU. */
+#define MAX_NR_BANKS 64
+
/* SMCA Extended error strings */
/* Load Store */
static const char * const smca_ls_mce_desc[] = {

View File

@ -0,0 +1,66 @@
commit 2a1d217660351c08eb2f8bccebf939abba2f7e69
Author: Brian Woods <brian.woods@amd.com>, Yazen Ghannam <Yazen.Ghannam@amd.com>
Date: Fri Nov 1 15:48:13 2019 +0100
rasdaemon: rename CPU_NAPLES cputype
Change CPU_NAPLES to CPU_AMD_SMCA to reflect that it isn't just NAPLES
that is supported, but AMD's Scalable Machine Check Architecture (SMCA).
[ Yazen: change family check to feature check, and change CPU name. ]
CC: "mchehab+samsung@kernel.org" <mchehab+samsung@kernel.org>, "Namburu, Chandu-babu" <chandu@amd.com> # Thread-Topic: [PATCH 1/2] rasdaemon: rename CPU_NAPLES cputype
Signed-off-by: Brian Woods <brian.woods@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Cc: Chandu-babu Namburu <chandu@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
ras-mce-handler.c | 10 ++++++----
ras-mce-handler.h | 2 +-
2 files changed, 7 insertions(+), 5 deletions(-)
--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-05-26 15:16:24.699096556 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-05-26 15:18:06.543162745 -0400
@@ -55,7 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_KNIGHTS_LANDING] = "Knights Landing",
[CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_SKYLAKE_XEON] = "Skylake server",
- [CPU_NAPLES] = "AMD Family 17h Zen1"
+ [CPU_AMD_SMCA] = "AMD Scalable MCA",
};
static enum cputype select_intel_cputype(struct ras_events *ras)
@@ -191,8 +191,10 @@ ret = 0;
if (!strcmp(mce->vendor, "AuthenticAMD")) {
if (mce->family == 15)
mce->cputype = CPU_K8;
- if (mce->family == 23)
- mce->cputype = CPU_NAPLES;
+ if (strstr(mce->processor_flags, "smca")) {
+ mce->cputype = CPU_AMD_SMCA;
+ goto ret;
+ }
if (mce->family > 23) {
log(ALL, LOG_INFO,
"Can't parse MCE for this AMD CPU yet %d\n",
@@ -435,7 +437,7 @@ if (pevent_get_field_val(s, event, "ipid
case CPU_K8:
rc = parse_amd_k8_event(ras, &e);
break;
- case CPU_NAPLES:
+ case CPU_AMD_SMCA:
rc = parse_amd_smca_event(ras, &e);
break;
default: /* All other CPU types are Intel */
--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-05-26 15:17:15.409631590 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-05-26 15:18:20.102038424 -0400
@@ -50,7 +50,7 @@ enum cputype {
CPU_KNIGHTS_LANDING,
CPU_KNIGHTS_MILL,
CPU_SKYLAKE_XEON,
- CPU_NAPLES,
+ CPU_AMD_SMCA,
};
struct mce_event {

View File

@ -0,0 +1,372 @@
commit 546cf713f667437fb6e283cc3dc090679eb47d08
Author: Subhendu Saha <subhends@akamai.com>
Date: Tue Jan 12 03:29:55 2021 -0500
Fix ras-mc-ctl script.
When rasdaemon is compiled without enabling aer, mce, devlink,
etc., those tables are not created in the database file. Then
the ras-mc-ctl script breaks when trying to query data from
non-existent tables.
Signed-off-by: Subhendu Saha <subhends@akamai.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
util/ras-mc-ctl.in | 310 ++++++++++++++++++++++++++++-------------------------
1 file changed, 168 insertions(+), 142 deletions(-)
--- a/util/ras-mc-ctl.in 2021-10-12 13:45:43.260646935 -0400
+++ b/util/ras-mc-ctl.in 2021-10-12 13:46:38.610158949 -0400
@@ -41,6 +41,16 @@ my $sysconfdir = "@sysconfdir@";
my $dmidecode = find_prog ("dmidecode");
my $modprobe = find_prog ("modprobe") or exit (1);
+my $has_aer = 0;
+my $has_arm = 0;
+my $has_extlog = 0;
+my $has_mce = 0;
+
+@WITH_AER_TRUE@$has_aer = 1;
+@WITH_ARM_TRUE@$has_arm = 1;
+@WITH_EXTLOG_TRUE@$has_extlog = 1;
+@WITH_MCE_TRUE@$has_mce = 1;
+
my %conf = ();
my %bus = ();
my %dimm_size = ();
@@ -1145,70 +1155,78 @@ sub summary
$query_handle->finish;
# PCIe AER aer_event errors
- $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($err_type, $msg, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$count $err_type errors: $msg\n";
- }
- if ($out ne "") {
- print "PCIe AER events summary:\n$out\n";
- } else {
- print "No PCIe AER errors.\n\n";
+ if ($has_aer == 1) {
+ $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($err_type, $msg, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count $err_type errors: $msg\n";
+ }
+ if ($out ne "") {
+ print "PCIe AER events summary:\n$out\n";
+ } else {
+ print "No PCIe AER errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# ARM processor arm_event errors
- $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($affinity, $mpidr, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$count errors\n";
- }
- if ($out ne "") {
- print "ARM processor events summary:\n$out\n";
- } else {
- print "No ARM processor errors.\n\n";
+ if ($has_arm == 1) {
+ $query = "select affinity, mpidr, count(*) from arm_event group by affinity, mpidr";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($affinity, $mpidr, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count errors\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events summary:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# extlog errors
- $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($etype, $severity, $count));
- $out = "";
- while($query_handle->fetch()) {
- $etype_string = get_extlog_type($etype);
- $severity_string = get_extlog_severity($severity);
- $out .= "\t$count $etype_string $severity_string errors\n";
- }
- if ($out ne "") {
- print "Extlog records summary:\n$out";
- } else {
- print "No Extlog errors.\n";
+ if ($has_extlog == 1) {
+ $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($etype, $severity, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $etype_string = get_extlog_type($etype);
+ $severity_string = get_extlog_severity($severity);
+ $out .= "\t$count $etype_string $severity_string errors\n";
+ }
+ if ($out ne "") {
+ print "Extlog records summary:\n$out";
+ } else {
+ print "No Extlog errors.\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# MCE mce_record errors
- $query = "select error_msg, count(*) from mce_record group by error_msg";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($msg, $count));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "\t$count $msg errors\n";
- }
- if ($out ne "") {
- print "MCE records summary:\n$out";
- } else {
- print "No MCE errors.\n";
+ if ($has_mce == 1) {
+ $query = "select error_msg, count(*) from mce_record group by error_msg";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($msg, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$count $msg errors\n";
+ }
+ if ($out ne "") {
+ print "MCE records summary:\n$out";
+ } else {
+ print "No MCE errors.\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
undef($dbh);
}
@@ -1244,105 +1262,113 @@ sub errors
$query_handle->finish;
# PCIe AER aer_event errors
- $query = "select id, timestamp, err_type, err_msg from aer_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $type, $msg));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $time $type error: $msg\n";
- }
- if ($out ne "") {
- print "PCIe AER events:\n$out\n";
- } else {
- print "No PCIe AER errors.\n\n";
+ if ($has_aer == 1) {
+ $query = "select id, timestamp, err_type, err_msg from aer_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $time, $type, $msg));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $time $type error: $msg\n";
+ }
+ if ($out ne "") {
+ print "PCIe AER events:\n$out\n";
+ } else {
+ print "No PCIe AER errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# ARM processor arm_event errors
- $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $timestamp error: ";
- $out .= "error_count=$error_count, " if ($error_count);
- $out .= "affinity_level=$affinity, ";
- $out .= sprintf "mpidr=0x%x, ", $mpidr;
- $out .= sprintf "running_state=0x%x, ", $r_state;
- $out .= sprintf "psci_state=0x%x", $psci_state;
- $out .= "\n";
- }
- if ($out ne "") {
- print "ARM processor events:\n$out\n";
- } else {
- print "No ARM processor errors.\n\n";
+ if ($has_arm == 1) {
+ $query = "select id, timestamp, error_count, affinity, mpidr, running_state, psci_state from arm_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $error_count, $affinity, $mpidr, $r_state, $psci_state));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "error_count=$error_count, " if ($error_count);
+ $out .= "affinity_level=$affinity, ";
+ $out .= sprintf "mpidr=0x%x, ", $mpidr;
+ $out .= sprintf "running_state=0x%x, ", $r_state;
+ $out .= sprintf "psci_state=0x%x", $psci_state;
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "ARM processor events:\n$out\n";
+ } else {
+ print "No ARM processor errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# Extlog errors
- $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
- $out = "";
- while($query_handle->fetch()) {
- $etype_string = get_extlog_type($etype);
- $severity_string = get_extlog_severity($severity);
- $out .= "$id $timestamp error: ";
- $out .= "type=$etype_string, ";
- $out .= "severity=$severity_string, ";
- $out .= sprintf "address=0x%08x, ", $addr;
- $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id);
- $out .= "fru_text='$fru_text', ";
- $out .= get_cper_data_text($cper_data) if ($cper_data);
- $out .= "\n";
- }
- if ($out ne "") {
- print "Extlog events:\n$out\n";
- } else {
- print "No Extlog errors.\n\n";
+ if ($has_extlog) {
+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
+ $out = "";
+ while($query_handle->fetch()) {
+ $etype_string = get_extlog_type($etype);
+ $severity_string = get_extlog_severity($severity);
+ $out .= "$id $timestamp error: ";
+ $out .= "type=$etype_string, ";
+ $out .= "severity=$severity_string, ";
+ $out .= sprintf "address=0x%08x, ", $addr;
+ $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id);
+ $out .= "fru_text='$fru_text', ";
+ $out .= get_cper_data_text($cper_data) if ($cper_data);
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "Extlog events:\n$out\n";
+ } else {
+ print "No Extlog errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
# MCE mce_record errors
- $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
- $query_handle = $dbh->prepare($query);
- $query_handle->execute();
- $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
- $out = "";
- while($query_handle->fetch()) {
- $out .= "$id $time error: $msg";
- $out .= ", CPU $cpuvendor" if ($cpuvendor);
- $out .= ", bank $bank_name" if ($bank_name);
- $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
- $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
- $out .= ", $mc_location" if ($mc_location);
- $out .= ", $user_action" if ($user_action);
- $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
- $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus);
- $out .= sprintf ", status=0x%08x", $status if ($status);
- $out .= sprintf ", addr=0x%08x", $addr if ($addr);
- $out .= sprintf ", misc=0x%08x", $misc if ($misc);
- $out .= sprintf ", ip=0x%08x", $ip if ($ip);
- $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
- $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
- $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
- $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
- $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
- $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
- $out .= sprintf ", cs=0x%08x", $cs if ($cs);
- $out .= sprintf ", bank=0x%08x", $bank if ($bank);
+ if ($has_mce == 1) {
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $time error: $msg";
+ $out .= ", CPU $cpuvendor" if ($cpuvendor);
+ $out .= ", bank $bank_name" if ($bank_name);
+ $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
+ $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
+ $out .= ", $mc_location" if ($mc_location);
+ $out .= ", $user_action" if ($user_action);
+ $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
+ $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus);
+ $out .= sprintf ", status=0x%08x", $status if ($status);
+ $out .= sprintf ", addr=0x%08x", $addr if ($addr);
+ $out .= sprintf ", misc=0x%08x", $misc if ($misc);
+ $out .= sprintf ", ip=0x%08x", $ip if ($ip);
+ $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
+ $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
+ $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
+ $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
+ $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
+ $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
+ $out .= sprintf ", cs=0x%08x", $cs if ($cs);
+ $out .= sprintf ", bank=0x%08x", $bank if ($bank);
- $out .= "\n";
- }
- if ($out ne "") {
- print "MCE events:\n$out\n";
- } else {
- print "No MCE errors.\n\n";
+ $out .= "\n";
+ }
+ if ($out ne "") {
+ print "MCE events:\n$out\n";
+ } else {
+ print "No MCE errors.\n\n";
+ }
+ $query_handle->finish;
}
- $query_handle->finish;
undef($dbh);
}

View File

@@ -0,0 +1,149 @@
commit 60a91e4da4f2daf2b10143fc148a8043312b61e5
Author: Aristeu Rozanski <aris@redhat.com>
Date: Wed Aug 1 16:29:58 2018 -0400
rasdaemon: ras-mc-ctl: add option to show error counts
In some scenarios it might not be desirable to have a daemon running
to parse and store the errors provided by EDAC; having only the
number of CEs and UEs is enough. This patch implements this feature
as an ras-mc-ctl option.
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 38b7824..aee431a 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -50,6 +50,8 @@ my %dimm_location = ();
my %csrow_size = ();
my %rank_size = ();
my %csrow_ranks = ();
+my %dimm_ce_count = ();
+my %dimm_ue_count = ();
my @layers;
my @max_pos;
@@ -76,6 +78,7 @@ Usage: $prog [OPTIONS...]
--layout Display the memory layout.
--summary Presents a summary of the logged errors.
--errors Shows the errors stored at the error database.
+ --error-count Shows the corrected and uncorrected error counts using sysfs.
--help This help message.
EOF
@@ -83,7 +86,7 @@ parse_cmdline();
if ( $conf{opt}{mainboard} || $conf{opt}{print_labels}
|| $conf{opt}{register_labels} || $conf{opt}{display_memory_layout}
- || $conf{opt}{guess_dimm_label}) {
+ || $conf{opt}{guess_dimm_label} || $conf{opt}{error_count}) {
get_mainboard_info();
@@ -105,6 +108,9 @@ if ( $conf{opt}{mainboard} || $conf{opt}{print_labels}
if ($conf{opt}{guess_dimm_label}) {
guess_dimm_label ();
}
+ if ($conf{opt}{error_count}) {
+ display_error_count ();
+ }
}
if ($conf{opt}{status}) {
@@ -134,6 +140,7 @@ sub parse_cmdline
$conf{opt}{guess_dimm_label} = 0;
$conf{opt}{summary} = 0;
$conf{opt}{errors} = 0;
+ $conf{opt}{error_count} = 0;
my $rref = \$conf{opt}{report};
my $mref = \$conf{opt}{mainboard};
@@ -150,7 +157,8 @@ sub parse_cmdline
"status" => \$conf{opt}{status},
"layout" => \$conf{opt}{display_memory_layout},
"summary" => \$conf{opt}{summary},
- "errors" => \$conf{opt}{errors}
+ "errors" => \$conf{opt}{errors},
+ "error-count" => \$conf{opt}{error_count}
);
usage(1) if !$rc;
@@ -284,6 +292,30 @@ sub parse_dimm_nodes
$dimm_label_file{$str_loc} = $file;
$dimm_location{$str_loc} = $location;
+ my $count;
+
+ $file =~s/dimm_label/dimm_ce_count/;
+ if (-e $file) {
+ open IN, $file;
+ chomp($count = <IN>);
+ close IN;
+ } else {
+ log_error ("dimm_ce_count not found in sysfs. Old kernel?\n");
+ exit -1;
+ }
+ $dimm_ce_count{$str_loc} = $count;
+
+ $file =~s/dimm_ce_count/dimm_ue_count/;
+ if (-e $file) {
+ open IN, $file;
+ chomp($count = <IN>);
+ close IN;
+ } else {
+ log_error ("dimm_ue_count not found in sysfs. Old kernel?\n");
+ exit -1;
+ }
+ $dimm_ue_count{$str_loc} = $count;
+
return;
}
}
@@ -906,6 +938,45 @@ sub display_memory_layout
dimm_display_mem();
}
+sub display_error_count
+{
+ my $sysfs_dir = "/sys/devices/system/edac/mc";
+ my $key;
+ my $max_width = 0;
+ my %dimm_labels = ();
+
+ find ({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir);
+
+ if (!scalar(keys %dimm_node)) {
+ log_error ("No DIMMs found in /sys or new sysfs EDAC interface not found.\n");
+ exit -1;
+ }
+
+ foreach $key (keys %dimm_node) {
+ my $label_width;
+
+ open IN, $dimm_label_file{$key};
+ chomp(my $label = <IN>);
+ close IN;
+ $label_width = length $label;
+
+ if ($label_width > $max_width) {
+ $max_width = $label_width;
+ }
+ $dimm_labels{$key} = $label;
+ }
+ my $string = "Label";
+ $string .= " " x ($max_width - length $string);
+ print($string . "\tCE\tUE\n");
+
+ foreach $key (keys %dimm_node) {
+ my $ce_count = $dimm_ce_count{$key};
+ my $ue_count = $dimm_ue_count{$key};
+
+ print("$dimm_labels{$key}\t$ce_count\t$ue_count\n");
+ }
+}
+
sub find_prog
{
my ($file) = @_;

View File

@@ -0,0 +1,24 @@
commit 7937f0d6c2aaaed096f3a3d306416743c0dcb7a4
Author: Muralidhara M K <muralimk@amd.com>
Date: Wed Jul 28 01:52:12 2021 -0500
rasdaemon: Support MCE for AMD CPU family 19h
Add support for family 19h x86 CPUs from AMD.
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
index 805004a..f2b53d4 100644
--- a/ras-mce-handler.c
+++ b/ras-mce-handler.c
@@ -208,7 +208,7 @@ static int detect_cpu(struct ras_events *ras)
mce->cputype = CPU_AMD_SMCA;
goto ret;
}
- if (mce->family > 23) {
+ if (mce->family > 25) {
log(ALL, LOG_INFO,
"Can't parse MCE for this AMD CPU yet %d\n",
mce->family);

View File

@@ -0,0 +1,38 @@
commit 854364ba44aee9bc5646f6537fc744b0b54aff37
Author: Muralidhara M K <muralimk@amd.com>
Date: Thu Aug 20 21:00:57 2020 +0530
rasdaemon: Add 8 channel decoding for SMCA systems
Current Scalable Machine Check Architecture (SMCA) systems support up
to 8 UMC channels.
To find the UMC channel represented by a bank, look at the 6th nibble
in the MCA_IPID[InstanceId] field.
Signed-off-by: Muralidhara M K <muralimk@amd.com>
[ Adjust commit message. ]
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index d0b6cb6..7c619fd 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -438,15 +438,7 @@ static void amd_decode_errcode(struct mce_event *e)
*/
static int find_umc_channel(struct mce_event *e)
{
- uint32_t umc_instance_id[] = {0x50f00, 0x150f00};
- uint32_t instance_id = EXTRACT(e->ipid, 0, 31);
- int i, channel = -1;
-
- for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++)
- if (umc_instance_id[i] == instance_id)
- channel = i;
-
- return channel;
+ return EXTRACT(e->ipid, 0, 31) >> 20;
}
/* Decode extended errors according to Scalable MCA specification */
static void decode_smca_error(struct mce_event *e)

View File

@@ -0,0 +1,207 @@
commit 8704a85d8dc3483423ec2934fee8132f85f8fdb6
Author: Brian WoodsGhannam, Yazen <brian.woods@amd.comYazen.Ghannam@amd.com>
Date: Fri Nov 1 15:48:14 2019 +0100
rasdaemon: add support for new AMD SMCA bank types
Going forward, the Scalable Machine Check Architecture (SMCA) has some
updated and additional bank types which show up in Zen2. The differing
bank types include: CS_V2, PSP_V2, SMU_V2, MP5, NBIO, and PCIE. The V2
bank types replace the original bank types but have unique HWID/MCAtype
IDs from the originals so there's no conflicts between different
versions or other bank types. All of the differing bank types have new
MCE descriptions which have been added as well.
CC: "mchehab+samsung@kernel.org" <mchehab+samsung@kernel.org>, "Namburu, Chandu-babu" <chandu@amd.com> # Thread-Topic: [PATCH 2/2] rasdaemon: add support for new AMD SMCA bank types
Signed-off-by: Brian Woods <brian.woods@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Cc: Chandu-babu Namburu <chandu@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 6c3e8a5..114e786 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -49,11 +49,17 @@ enum smca_bank_types {
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
+ SMCA_CS_V2, /* Coherent Slave V2 */
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
+ SMCA_PSP_V2, /* Platform Security Processor V2 */
SMCA_SMU, /* System Management Unit */
+ SMCA_SMU_V2, /* System Management Unit V2 */
+ SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_NBIO, /* Northbridge IO Unit */
+ SMCA_PCIE, /* PCI Express Unit */
N_SMCA_BANK_TYPES
};
@@ -165,6 +171,23 @@ static const char * const smca_cs_mce_desc[] = {
"Atomic request parity",
"ECC error on probe filter access",
};
+/* Coherent Slave Unit V2 */
+static const char * const smca_cs2_mce_desc[] = {
+ "Illegal Request",
+ "Address Violation",
+ "Security Violation",
+ "Illegal Response",
+ "Unexpected Response",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic Request Parity Error",
+ "SDP read response had no match in the CS queue",
+ "Probe Filter Protocol Error",
+ "Probe Filter ECC Error",
+ "SDP read response had an unexpected RETRY error",
+ "Counter overflow error",
+ "Counter underflow error",
+};
/* Power, Interrupt, etc.. */
static const char * const smca_pie_mce_desc[] = {
"HW assert",
@@ -189,10 +212,75 @@ static const char * const smca_pb_mce_desc[] = {
static const char * const smca_psp_mce_desc[] = {
"PSP RAM ECC or parity error",
};
+/* Platform Security Processor V2 */
+static const char * const smca_psp2_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Instruction Cache Bank 0 ECC or parity error",
+ "Instruction Cache Bank 1 ECC or parity error",
+ "Instruction Tag Ram 0 parity error",
+ "Instruction Tag Ram 1 parity error",
+ "Data Cache Bank 0 ECC or parity error",
+ "Data Cache Bank 1 ECC or parity error",
+ "Data Cache Bank 2 ECC or parity error",
+ "Data Cache Bank 3 ECC or parity error",
+ "Data Tag Bank 0 parity error",
+ "Data Tag Bank 1 parity error",
+ "Data Tag Bank 2 parity error",
+ "Data Tag Bank 3 parity error",
+ "Dirty Data Ram parity error",
+ "TLB Bank 0 parity error",
+ "TLB Bank 1 parity error",
+ "System Hub Read Buffer ECC or parity error",
+};
/* System Management Unit */
static const char * const smca_smu_mce_desc[] = {
"SMU RAM ECC or parity error",
};
+/* System Management Unit V2 */
+static const char * const smca_smu2_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "System Hub Read Buffer ECC or parity error",
+};
+/* Microprocessor 5 Unit */
+static const char * const smca_mp5_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+};
+/* Northbridge IO Unit */
+static const char * const smca_nbio_mce_desc[] = {
+ "ECC or Parity error",
+ "PCIE error",
+ "SDP ErrEvent error",
+ "SDP Egress Poison Error",
+ "IOHC Internal Poison Error",
+};
+/* PCI Express Unit */
+static const char * const smca_pcie_mce_desc[] = {
+ "CCIX PER Message logging",
+ "CCIX Read Response with Status: Non-Data Error",
+ "CCIX Write Response with Status: Non-Data Error",
+ "CCIX Read Response with Status: Data Error",
+ "CCIX Non-okay write response with data error",
+};
+
struct smca_mce_desc {
const char * const *descs;
@@ -208,11 +296,17 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
+ [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
+ [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
+ [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)},
+ [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
+ [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)},
+ [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)},
};
struct smca_hwid {
@@ -235,6 +329,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Data Fabric MCA types */
{ SMCA_CS, 0x0000002E },
+ { SMCA_CS_V2, 0x0002002E },
{ SMCA_PIE, 0x0001002E },
/* Unified Memory Controller MCA type */
@@ -245,9 +340,20 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Platform Security Processor MCA type */
{ SMCA_PSP, 0x000000FF },
+ { SMCA_PSP_V2, 0x000100FF },
/* System Management Unit MCA type */
{ SMCA_SMU, 0x00000001 },
+ { SMCA_SMU_V2, 0x00010001 },
+
+ /* Microprocessor 5 Unit MCA type */
+ { SMCA_MP5, 0x00020001 },
+
+ /* Northbridge IO Unit MCA type */
+ { SMCA_NBIO, 0x00000018 },
+
+ /* PCI Express Unit MCA type */
+ { SMCA_PCIE, 0x00000046 },
};
struct smca_bank_name {
@@ -264,11 +370,17 @@ static struct smca_bank_name smca_names[] = {
[SMCA_FP] = { "Floating Point Unit" },
[SMCA_L3_CACHE] = { "L3 Cache" },
[SMCA_CS] = { "Coherent Slave" },
+ [SMCA_CS_V2] = { "Coherent Slave" },
[SMCA_PIE] = { "Power, Interrupts, etc." },
[SMCA_UMC] = { "Unified Memory Controller" },
[SMCA_PB] = { "Parameter Block" },
[SMCA_PSP] = { "Platform Security Processor" },
+ [SMCA_PSP_V2] = { "Platform Security Processor" },
[SMCA_SMU] = { "System Management Unit" },
+ [SMCA_SMU_V2] = { "System Management Unit" },
+ [SMCA_MP5] = { "Microprocessor 5 Unit" },
+ [SMCA_NBIO] = { "Northbridge IO Unit" },
+ [SMCA_PCIE] = { "PCI Express Unit" },
};
static void amd_decode_errcode(struct mce_event *e)

View File

@@ -0,0 +1,230 @@
commit 9acef39f13833f7d53ef96abc5a72e79384260f4
Author: Naveen Krishna Chatradhi <nchatrad@amd.com>
Date: Tue Jun 1 11:01:17 2021 +0530
rasdaemon: Add new SMCA bank types with error decoding
Upcoming systems with Scalable Machine Check Architecture (SMCA) have
new MCA banks added.
This patch adds the (HWID, MCATYPE) tuple, name and error decoding for
those new SMCA banks.
While at it, optimize the string names in smca_bank_name[].
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7c619fd..e0cf512 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -47,7 +47,7 @@
/* These may be used by multiple smca_hwid_mcatypes */
enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
- SMCA_LS_V2, /* Load Store */
+ SMCA_LS_V2,
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
@@ -56,17 +56,22 @@ enum smca_bank_types {
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
- SMCA_CS_V2, /* Coherent Slave V2 */
+ SMCA_CS_V2,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
+ SMCA_UMC_V2,
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
- SMCA_PSP_V2, /* Platform Security Processor V2 */
+ SMCA_PSP_V2,
SMCA_SMU, /* System Management Unit */
- SMCA_SMU_V2, /* System Management Unit V2 */
+ SMCA_SMU_V2,
SMCA_MP5, /* Microprocessor 5 Unit */
SMCA_NBIO, /* Northbridge IO Unit */
SMCA_PCIE, /* PCI Express Unit */
+ SMCA_PCIE_V2,
+ SMCA_XGMI_PCS, /* xGMI PCS Unit */
+ SMCA_XGMI_PHY, /* xGMI PHY Unit */
+ SMCA_WAFL_PHY, /* WAFL PHY Unit */
N_SMCA_BANK_TYPES
};
@@ -237,6 +242,22 @@ static const char * const smca_umc_mce_desc[] = {
"Command/address parity error",
"Write data CRC error",
};
+
+static const char * const smca_umc2_mce_desc[] = {
+ "DRAM ECC error",
+ "Data poison error",
+ "SDP parity error",
+ "Reserved",
+ "Address/Command parity error",
+ "Write data parity error",
+ "DCQ SRAM ECC error",
+ "Reserved",
+ "Read data parity error",
+ "Rdb SRAM ECC error",
+ "RdRsp SRAM ECC error",
+ "LM32 MP errors",
+};
+
/* Parameter Block */
static const char * const smca_pb_mce_desc[] = {
"Parameter Block RAM ECC error",
@@ -314,6 +335,55 @@ static const char * const smca_pcie_mce_desc[] = {
"CCIX Non-okay write response with data error",
};
+static const char * const smca_pcie2_mce_desc[] = {
+ "SDP Parity Error logging",
+};
+
+static const char * const smca_xgmipcs_mce_desc[] = {
+ "Data Loss Error",
+ "Training Error",
+ "Flow Control Acknowledge Error",
+ "Rx Fifo Underflow Error",
+ "Rx Fifo Overflow Error",
+ "CRC Error",
+ "BER Exceeded Error",
+ "Tx Vcid Data Error",
+ "Replay Buffer Parity Error",
+ "Data Parity Error",
+ "Replay Fifo Overflow Error",
+ "Replay Fifo Underflow Error",
+ "Elastic Fifo Overflow Error",
+ "Deskew Error",
+ "Flow Control CRC Error",
+ "Data Startup Limit Error",
+ "FC Init Timeout Error",
+ "Recovery Timeout Error",
+ "Ready Serial Timeout Error",
+ "Ready Serial Attempt Error",
+ "Recovery Attempt Error",
+ "Recovery Relock Attempt Error",
+ "Replay Attempt Error",
+ "Sync Header Error",
+ "Tx Replay Timeout Error",
+ "Rx Replay Timeout Error",
+ "LinkSub Tx Timeout Error",
+ "LinkSub Rx Timeout Error",
+ "Rx CMD Pocket Error",
+};
+
+static const char * const smca_xgmiphy_mce_desc[] = {
+ "RAM ECC Error",
+ "ARC instruction buffer parity error",
+ "ARC data buffer parity error",
+ "PHY APB error",
+};
+
+static const char * const smca_waflphy_mce_desc[] = {
+ "RAM ECC Error",
+ "ARC instruction buffer parity error",
+ "ARC data buffer parity error",
+ "PHY APB error",
+};
struct smca_mce_desc {
const char * const *descs;
@@ -333,6 +403,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
+ [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
[SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
@@ -341,6 +412,10 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
[SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)},
[SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)},
+ [SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) },
+ [SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) },
+ [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
+ [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) },
};
struct smca_hwid {
@@ -369,6 +444,8 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Unified Memory Controller MCA type */
{ SMCA_UMC, 0x00000096 },
+ /* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
+ { SMCA_UMC_V2, 0x00010096 },
/* Parameter Block MCA type */
{ SMCA_PB, 0x00000005 },
@@ -389,6 +466,16 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* PCI Express Unit MCA type */
{ SMCA_PCIE, 0x00000046 },
+ { SMCA_PCIE_V2, 0x00010046 },
+
+ /* Ext Global Memory Interconnect PCS MCA type */
+ { SMCA_XGMI_PCS, 0x00000050 },
+
+ /* Ext Global Memory Interconnect PHY MCA type */
+ { SMCA_XGMI_PHY, 0x00000259 },
+
+ /* WAFL PHY MCA type */
+ { SMCA_WAFL_PHY, 0x00000267 },
};
struct smca_bank_name {
@@ -396,27 +483,28 @@ struct smca_bank_name {
};
static struct smca_bank_name smca_names[] = {
- [SMCA_LS] = { "Load Store Unit" },
- [SMCA_LS_V2] = { "Load Store Unit" },
- [SMCA_IF] = { "Instruction Fetch Unit" },
- [SMCA_L2_CACHE] = { "L2 Cache" },
- [SMCA_DE] = { "Decode Unit" },
- [SMCA_RESERVED] = { "Reserved" },
- [SMCA_EX] = { "Execution Unit" },
- [SMCA_FP] = { "Floating Point Unit" },
- [SMCA_L3_CACHE] = { "L3 Cache" },
- [SMCA_CS] = { "Coherent Slave" },
- [SMCA_CS_V2] = { "Coherent Slave" },
- [SMCA_PIE] = { "Power, Interrupts, etc." },
- [SMCA_UMC] = { "Unified Memory Controller" },
- [SMCA_PB] = { "Parameter Block" },
- [SMCA_PSP] = { "Platform Security Processor" },
- [SMCA_PSP_V2] = { "Platform Security Processor" },
- [SMCA_SMU] = { "System Management Unit" },
- [SMCA_SMU_V2] = { "System Management Unit" },
- [SMCA_MP5] = { "Microprocessor 5 Unit" },
- [SMCA_NBIO] = { "Northbridge IO Unit" },
- [SMCA_PCIE] = { "PCI Express Unit" },
+ [SMCA_LS ... SMCA_LS_V2] = { "Load Store Unit" },
+ [SMCA_IF] = { "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "L2 Cache" },
+ [SMCA_DE] = { "Decode Unit" },
+ [SMCA_RESERVED] = { "Reserved" },
+ [SMCA_EX] = { "Execution Unit" },
+ [SMCA_FP] = { "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "L3 Cache" },
+ [SMCA_CS ... SMCA_CS_V2] = { "Coherent Slave" },
+ [SMCA_PIE] = { "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "Unified Memory Controller" },
+ [SMCA_UMC_V2] = { "Unified Memory Controller V2" },
+ [SMCA_PB] = { "Parameter Block" },
+ [SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
+ [SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
+ [SMCA_MP5] = { "Microprocessor 5 Unit" },
+ [SMCA_NBIO] = { "Northbridge IO Unit" },
+ [SMCA_PCIE ... SMCA_PCIE_V2] = { "PCI Express Unit" },
+ [SMCA_XGMI_PCS] = { "Ext Global Memory Interconnect PCS Unit" },
+ [SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
+ [SMCA_WAFL_PHY] = { "WAFL PHY Unit" },
+
};
static void amd_decode_errcode(struct mce_event *e)

1
EMPTY
View File

@@ -1 +0,0 @@

View File

@@ -0,0 +1,670 @@
commit a16ca0711001957ee98f2c124abce0fa1f801529
Author: Chandu-babu Namburu <chandu@amd.com>
Date: Wed Jan 30 20:36:45 2019 +0530
rasdaemon: add support for AMD Scalable MCA
Add logic here to decode errors from all known IP blocks for
AMD Scalable MCA supported processors.
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Chandu-babu Namburu <chandu@amd.com>
---
mce-amd-smca.c | 371 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
mce-amd.c | 122 +++++++++++++++++
ras-mce-handler.c | 24 +++
ras-mce-handler.h | 15 ++
4 files changed, 530 insertions(+), 2 deletions(-)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/mce-amd-smca.c 2019-07-12 11:35:04.836470461 -0400
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2018, AMD, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "ras-mce-handler.h"
+#include "bitfield.h"
+
+/* MCA_STATUS REGISTER FOR FAMILY 17H
+ *********************** Higher 32-bits *****************************
+ * 63: VALIDERROR, 62: OVERFLOW, 61: UC, 60: Err ENABLE,
+ * 59: Misc Valid, 58: Addr Valid, 57: PCC, 56: ErrCoreID Valid,
+ * 55: TCC, 54: RES, 53: Syndrome Valid, 52: Transparent,
+ * 51: RES, 50: RES, 49: RES, 48: RES,
+ * 47: RES, 46: CECC, 45: UECC, 44: Deferred,
+ * 43: Poison, 42: RES, 41: RES, 40: RES,
+ * 39: RES, 38: RES, 37: ErrCoreID[5], 36: ErrCoreID[4],
+ * 35: ErrCoreID[3], 34: ErrCoreID[2] 33: ErrCoreID[1] 32: ErrCoreID[0]
+ *********************** Lower 32-bits ******************************
+ * 31: RES, 30: RES, 29: RES, 28: RES,
+ * 27: RES, 26: RES, 25: RES, 24: RES
+ * 23: RES, 22: RES, 21: XEC[5], 20: XEC[4],
+ * 19: XEC[3], 18: XEC[2], 17: XEC[1], 16: XEC[0]
+ * 15: EC[15], 14: EC[14], 13: EC[13], 12: EC[12],
+ * 11: EC[11], 10: EC[10], 09: EC[9], 08: EC[8],
+ * 07: EC[7], 06: EC[6], 05: EC[5], 04: EC[4],
+ * 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0]
+ */
+
+/* These may be used by multiple smca_hwid_mcatypes */
+enum smca_bank_types {
+ SMCA_LS = 0, /* Load Store */
+ SMCA_IF, /* Instruction Fetch */
+ SMCA_L2_CACHE, /* L2 Cache */
+ SMCA_DE, /* Decoder Unit */
+ SMCA_RESERVED, /* Reserved */
+ SMCA_EX, /* Execution Unit */
+ SMCA_FP, /* Floating Point */
+ SMCA_L3_CACHE, /* L3 Cache */
+ SMCA_CS, /* Coherent Slave */
+ SMCA_PIE, /* Power, Interrupts, etc. */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_SMCA_BANK_TYPES
+};
+
+/* SMCA Extended error strings */
+/* Load Store */
+static const char * const smca_ls_mce_desc[] = {
+ "Load queue parity",
+ "Store queue parity",
+ "Miss address buffer payload parity",
+ "L1 TLB parity",
+ "Reserved",
+ "DC tag error type 6",
+ "DC tag error type 1",
+ "Internal error type 1",
+ "Internal error type 2",
+ "Sys Read data error thread 0",
+ "Sys read data error thread 1",
+ "DC tag error type 2",
+ "DC data error type 1 (poison consumption)",
+ "DC data error type 2",
+ "DC data error type 3",
+ "DC tag error type 4",
+ "L2 TLB parity",
+ "PDC parity error",
+ "DC tag error type 3",
+ "DC tag error type 5",
+ "L2 fill data error",
+};
+/* Instruction Fetch */
+static const char * const smca_if_mce_desc[] = {
+ "microtag probe port parity error",
+ "IC microtag or full tag multi-hit error",
+ "IC full tag parity",
+ "IC data array parity",
+ "Decoupling queue phys addr parity error",
+ "L0 ITLB parity error",
+ "L1 ITLB parity error",
+ "L2 ITLB parity error",
+ "BPQ snoop parity on Thread 0",
+ "BPQ snoop parity on Thread 1",
+ "L1 BTB multi-match error",
+ "L2 BTB multi-match error",
+ "L2 Cache Response Poison error",
+ "System Read Data error",
+};
+/* L2 Cache */
+static const char * const smca_l2_mce_desc[] = {
+ "L2M tag multi-way-hit error",
+ "L2M tag ECC error",
+ "L2M data ECC error",
+ "HW assert",
+};
+/* Decoder Unit */
+static const char * const smca_de_mce_desc[] = {
+ "uop cache tag parity error",
+ "uop cache data parity error",
+ "Insn buffer parity error",
+ "uop queue parity error",
+ "Insn dispatch queue parity error",
+ "Fetch address FIFO parity",
+ "Patch RAM data parity",
+ "Patch RAM sequencer parity",
+ "uop buffer parity"
+};
+/* Execution Unit */
+static const char * const smca_ex_mce_desc[] = {
+ "Watchdog timeout error",
+ "Phy register file parity",
+ "Flag register file parity",
+ "Immediate displacement register file parity",
+ "Address generator payload parity",
+ "EX payload parity",
+ "Checkpoint queue parity",
+ "Retire dispatch queue parity",
+ "Retire status queue parity error",
+ "Scheduling queue parity error",
+ "Branch buffer queue parity error",
+};
+/* Floating Point Unit */
+static const char * const smca_fp_mce_desc[] = {
+ "Physical register file parity",
+ "Freelist parity error",
+ "Schedule queue parity",
+ "NSQ parity error",
+ "Retire queue parity",
+ "Status register file parity",
+ "Hardware assertion",
+};
+/* L3 Cache */
+static const char * const smca_l3_mce_desc[] = {
+ "Shadow tag macro ECC error",
+ "Shadow tag macro multi-way-hit error",
+ "L3M tag ECC error",
+ "L3M tag multi-way-hit error",
+ "L3M data ECC error",
+ "XI parity, L3 fill done channel error",
+ "L3 victim queue parity",
+ "L3 HW assert",
+};
+/* Coherent Slave Unit */
+static const char * const smca_cs_mce_desc[] = {
+ "Illegal request from transport layer",
+ "Address violation",
+ "Security violation",
+ "Illegal response from transport layer",
+ "Unexpected response",
+ "Parity error on incoming request or probe response data",
+ "Parity error on incoming read response data",
+ "Atomic request parity",
+ "ECC error on probe filter access",
+};
+/* Power, Interrupt, etc.. */
+static const char * const smca_pie_mce_desc[] = {
+ "HW assert",
+ "Internal PIE register security violation",
+ "Error on GMI link",
+ "Poison data written to internal PIE register",
+};
+/* Unified Memory Controller */
+static const char * const smca_umc_mce_desc[] = {
+ "DRAM ECC error",
+ "Data poison error on DRAM",
+ "SDP parity error",
+ "Advanced peripheral bus error",
+ "Command/address parity error",
+ "Write data CRC error",
+};
+/* Parameter Block */
+static const char * const smca_pb_mce_desc[] = {
+ "Parameter Block RAM ECC error",
+};
+/* Platform Security Processor */
+static const char * const smca_psp_mce_desc[] = {
+ "PSP RAM ECC or parity error",
+};
+/* System Management Unit */
+static const char * const smca_smu_mce_desc[] = {
+ "SMU RAM ECC or parity error",
+};
+
+struct smca_mce_desc {
+ const char * const *descs;
+ unsigned int num_descs;
+};
+
+static struct smca_mce_desc smca_mce_descs[] = {
+ [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
+ [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
+ [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
+ [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
+ [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) },
+ [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
+ [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
+ [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
+ [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
+ [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
+ [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
+ [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
+ [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
+};
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/
+ uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/
+};
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
+ /* { bank_type, mcatype_hwid } */
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, 0x000000B0 },
+ { SMCA_IF, 0x000100B0 },
+ { SMCA_L2_CACHE, 0x000200B0 },
+ { SMCA_DE, 0x000300B0 },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, 0x000500B0 },
+ { SMCA_FP, 0x000600B0 },
+ { SMCA_L3_CACHE, 0x000700B0 },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, 0x0000002E },
+ { SMCA_PIE, 0x0001002E },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, 0x00000096 },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, 0x00000005 },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, 0x000000FF },
+
+ /* System Management Unit MCA type */
+ { SMCA_SMU, 0x00000001 },
+};
+
+struct smca_bank_name {
+ const char *name;
+};
+
+static struct smca_bank_name smca_names[] = {
+ [SMCA_LS] = { "Load Store Unit" },
+ [SMCA_IF] = { "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "L2 Cache" },
+ [SMCA_DE] = { "Decode Unit" },
+ [SMCA_RESERVED] = { "Reserved" },
+ [SMCA_EX] = { "Execution Unit" },
+ [SMCA_FP] = { "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "L3 Cache" },
+ [SMCA_CS] = { "Coherent Slave" },
+ [SMCA_PIE] = { "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "Unified Memory Controller" },
+ [SMCA_PB] = { "Parameter Block" },
+ [SMCA_PSP] = { "Platform Security Processor" },
+ [SMCA_SMU] = { "System Management Unit" },
+};
+
+static void amd_decode_errcode(struct mce_event *e)
+{
+
+ decode_amd_errcode(e);
+
+ if (e->status & MCI_STATUS_POISON)
+ mce_snprintf(e->mcistatus_msg, "Poison consumed");
+
+ if (e->status & MCI_STATUS_TCC)
+ mce_snprintf(e->mcistatus_msg, "Task_context_corrupt");
+
+}
+/*
+ * To find the UMC channel represented by this bank we need to match on its
+ * instance_id. The instance_id of a bank is held in the lower 32 bits of its
+ * IPID.
+ */
+static int find_umc_channel(struct mce_event *e)
+{
+ uint32_t umc_instance_id[] = {0x50f00, 0x150f00};
+ uint32_t instance_id = EXTRACT(e->ipid, 0, 31);
+ int i, channel = -1;
+
+ for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++)
+ if (umc_instance_id[i] == instance_id)
+ channel = i;
+
+ return channel;
+}
+/* Decode extended errors according to Scalable MCA specification */
+static void decode_smca_error(struct mce_event *e)
+{
+ enum smca_bank_types bank_type;
+ const char *ip_name;
+ unsigned short xec = (e->status >> 16) & 0x3f;
+ const struct smca_hwid *s_hwid;
+ uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
+ unsigned int csrow = -1, channel = -1;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ s_hwid = &smca_hwid_mcatypes[i];
+ if (mcatype_hwid == s_hwid->mcatype_hwid) {
+ bank_type = s_hwid->bank_type;
+ break;
+ }
+ }
+
+ if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
+ strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
+ return;
+ }
+
+ if (bank_type >= N_SMCA_BANK_TYPES) {
+ strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
+ return;
+ }
+
+ if (bank_type == SMCA_RESERVED) {
+ strcpy(e->mcastatus_msg, "Bank 4 is reserved.\n");
+ return;
+ }
+
+ ip_name = smca_names[bank_type].name;
+
+ mce_snprintf(e->bank_name, "%s (bank=%d)", ip_name, e->bank);
+
+ /* Only print the descriptor of valid extended error code */
+ if (xec < smca_mce_descs[bank_type].num_descs)
+ mce_snprintf(e->mcastatus_msg,
+ " %s.\n", smca_mce_descs[bank_type].descs[xec]);
+
+ if (bank_type == SMCA_UMC && xec == 0) {
+ channel = find_umc_channel(e);
+ csrow = e->synd & 0x7; /* Bit 0, 1 ,2 */
+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
+ channel, csrow);
+ }
+}
+
+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)
+{
+ uint64_t mcgstatus = e->mcgstatus;
+
+ mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld",
+ (long long)e->mcgstatus);
+
+ if (mcgstatus & MCG_STATUS_RIPV)
+ mce_snprintf(e->mcgstatus_msg, "RIPV");
+ if (mcgstatus & MCG_STATUS_EIPV)
+ mce_snprintf(e->mcgstatus_msg, "EIPV");
+ if (mcgstatus & MCG_STATUS_MCIP)
+ mce_snprintf(e->mcgstatus_msg, "MCIP");
+
+ decode_smca_error(e);
+ amd_decode_errcode(e);
+ return 0;
+}
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/mce-amd.c 2019-07-12 11:35:04.836470461 -0400
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, The AMD, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "ras-mce-handler.h"
+
+/* Error Code Types */
+#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010)
+#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100)
+#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800)
+#define INT_ERROR(x) (((x) & 0xF4FF) == 0x0400)
+
+/* Error code: transaction type (TT) */
+static char *transaction[] = {
+ "instruction", "data", "generic", "reserved"
+};
+/* Error codes: cache level (LL) */
+static char *cachelevel[] = {
+ "reserved", "L1", "L2", "L3/generic"
+};
+/* Error codes: memory transaction type (RRRR) */
+static char *memtrans[] = {
+ "generic", "generic read", "generic write", "data read",
+ "data write", "instruction fetch", "prefetch", "evict", "snoop",
+ "?", "?", "?", "?", "?", "?", "?"
+};
+/* Participation Processor */
+static char *partproc[] = {
+ "local node origin", "local node response",
+ "local node observed", "generic participation"
+};
+/* Timeout */
+static char *timeout[] = {
+ "request didn't time out",
+ "request timed out"
+};
+/* internal unclassified error code */
+static char *internal[] = { "reserved",
+ "reserved",
+ "hardware assert",
+ "reserved" };
+
+#define TT(x) (((x) >> 2) & 0x3) /*bit 2, bit 3*/
+#define TT_MSG(x) transaction[TT(x)]
+#define LL(x) ((x) & 0x3) /*bit 0, bit 1*/
+#define LL_MSG(x) cachelevel[LL(x)]
+
+#define R4(x) (((x) >> 4) & 0xF) /*bit 4, bit 5, bit 6, bit 7 */
+#define R4_MSG(x) ((R4(x) < 9) ? memtrans[R4(x)] : "Wrong R4!")
+
+#define TO(x) (((x) >> 8) & 0x1) /*bit 8*/
+#define TO_MSG(x) timeout[TO(x)]
+#define PP(x) (((x) >> 9) & 0x3) /*bit 9, bit 10*/
+#define PP_MSG(x) partproc[PP(x)]
+
+#define UU(x) (((x) >> 8) & 0x3) /*bit 8, bit 9*/
+#define UU_MSG(x) internal[UU(x)]
+
+/*
+ * Fill in the parsed-message fields of @e from the AMD MCA status word:
+ * severity in error_msg, status flags in mcistatus_msg and the decoded
+ * error code (status bits 15:0) in mcastatus_msg.
+ */
+void decode_amd_errcode(struct mce_event *e)
+{
+ uint16_t ec = e->status & 0xffff;
+ uint16_t ecc = (e->status >> 45) & 0x3; /* bit 46: CECC, bit 45: UECC */
+
+ /* Pick one severity; a later strcpy() must not clobber an earlier one. */
+ if (e->status & MCI_STATUS_UC) {
+ if (e->status & MCI_STATUS_PCC)
+ strcpy(e->error_msg, "System Fatal error.");
+ else if (e->mcgstatus & MCG_STATUS_RIPV)
+ strcpy(e->error_msg,
+ "Uncorrected, software restartable error.");
+ else
+ strcpy(e->error_msg,
+ "Uncorrected, software containable error.");
+ } else if (e->status & MCI_STATUS_DEFERRED)
+ strcpy(e->error_msg, "Deferred error, no action required.");
+ else
+ strcpy(e->error_msg, "Corrected error, no action required.");
+
+ if (!(e->status & MCI_STATUS_VAL))
+ mce_snprintf(e->mcistatus_msg, "MCE_INVALID");
+ if (e->status & MCI_STATUS_OVER)
+ mce_snprintf(e->mcistatus_msg, "Error_overflow");
+ if (e->status & MCI_STATUS_PCC)
+ mce_snprintf(e->mcistatus_msg, "Processor_context_corrupt");
+ if (ecc)
+ mce_snprintf(e->mcistatus_msg, "%sECC", (ecc == 2) ? "C" : "U");
+
+ if (INT_ERROR(ec)) {
+ mce_snprintf(e->mcastatus_msg, "Internal '%s'", UU_MSG(ec));
+ return;
+ }
+
+ if (TLB_ERROR(ec))
+ mce_snprintf(e->mcastatus_msg, "TLB Error 'tx: %s, level: %s'",
+ TT_MSG(ec), LL_MSG(ec));
+ else if (MEM_ERROR(ec))
+ mce_snprintf(e->mcastatus_msg,
+ "Memory Error 'mem-tx: %s, tx: %s, level: %s'",
+ R4_MSG(ec), TT_MSG(ec), LL_MSG(ec));
+ else if (BUS_ERROR(ec))
+ mce_snprintf(e->mcastatus_msg,
+ "Bus Error '%s, %s, mem-tx: %s, level: %s'",
+ PP_MSG(ec), TO_MSG(ec),
+ R4_MSG(ec), LL_MSG(ec));
+}
--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2019-07-12 11:35:01.585502811 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.c 2019-07-12 11:35:04.836470461 -0400
@@ -55,6 +55,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_KNIGHTS_LANDING] = "Knights Landing",
[CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_SKYLAKE_XEON] = "Skylake server",
+ [CPU_NAPLES] = "AMD Family 17h Zen1"
};
static enum cputype select_intel_cputype(struct ras_events *ras)
@@ -190,9 +191,12 @@ ret = 0;
if (!strcmp(mce->vendor, "AuthenticAMD")) {
if (mce->family == 15)
mce->cputype = CPU_K8;
- if (mce->family > 15) {
+ if (mce->family == 23)
+ mce->cputype = CPU_NAPLES;
+ if (mce->family > 23) {
log(ALL, LOG_INFO,
- "Can't parse MCE for this AMD CPU yet\n");
+ "Can't parse MCE for this AMD CPU yet %d\n",
+ mce->family);
ret = EINVAL;
}
goto ret;
@@ -331,6 +335,12 @@ #if 0
if (e->status & MCI_STATUS_ADDRV)
trace_seq_printf(s, ", addr= %llx", (long long)e->addr);
+ if (e->status & MCI_STATUS_SYNDV)
+ trace_seq_printf(s, ", synd= %llx", (long long)e->synd);
+
+ if (e->ipid)
+ trace_seq_printf(s, ", ipid= %llx", (long long)e->ipid);
+
if (e->mcgstatus_msg)
trace_seq_printf(s, ", %s", e->mcgstatus_msg);
else
@@ -411,6 +421,13 @@ if (pevent_get_field_val(s, event, "bank
if (pevent_get_field_val(s, event, "cpuvendor", record, &val, 1) < 0)
return -1;
e.cpuvendor = val;
+ /* Get New entries */
+ if (pevent_get_field_val(s, event, "synd", record, &val, 1) < 0)
+ return -1;
+ e.synd = val;
+ if (pevent_get_field_val(s, event, "ipid", record, &val, 1) < 0)
+ return -1;
+ e.ipid = val;
switch (mce->cputype) {
case CPU_GENERIC:
@@ -418,6 +435,9 @@ if (pevent_get_field_val(s, event, "cpuv
case CPU_K8:
rc = parse_amd_k8_event(ras, &e);
break;
+ case CPU_NAPLES:
+ rc = parse_amd_smca_event(ras, &e);
+ break;
default: /* All other CPU types are Intel */
rc = parse_intel_event(ras, &e);
}
--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2019-07-12 11:35:01.585502811 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.h 2019-07-12 11:35:04.836470461 -0400
@@ -50,6 +50,7 @@ enum cputype {
CPU_KNIGHTS_LANDING,
CPU_KNIGHTS_MILL,
CPU_SKYLAKE_XEON,
+ CPU_NAPLES,
};
struct mce_event {
@@ -69,6 +70,8 @@ struct mce_event {
uint8_t cs;
uint8_t bank;
uint8_t cpuvendor;
+ uint64_t synd; /* MCA_SYND MSR: only valid on SMCA systems */
+ uint64_t ipid; /* MCA_IPID MSR: only valid on SMCA systems */
/* Parsed data */
char timestamp[64];
@@ -129,6 +132,9 @@ void broadwell_de_decode_model(struct ra
void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e);
void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e);
+/* AMD error code decode function */
+void decode_amd_errcode(struct mce_event *e);
+
/* Software defined banks */
#define MCE_EXTENDED_BANK 128
@@ -144,6 +150,13 @@ #define MCI_STATUS_EN (1ULL<<60) /*
#define MCI_STATUS_S (1ULL<<56) /* signalled */
#define MCI_STATUS_AR (1ULL<<55) /* action-required */
+/* AMD-specific bits */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */
+/* uncorrected error,deferred exception */
+#define MCI_STATUS_DEFERRED (1ULL<<44)
+#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
+
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
@@ -154,4 +167,6 @@ int parse_intel_event(struct ras_events
int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e);
+int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e);
+
#endif
--- rasdaemon-0.6.1.orig/Makefile.in 2018-04-25 06:29:05.000000000 -0400
+++ rasdaemon-0.6.1/Makefile.in 2019-07-15 14:41:22.308278851 -0400
@@ -100,7 +100,7 @@ sbin_PROGRAMS = rasdaemon$(EXEEXT)
@WITH_MCE_TRUE@ mce-intel-dunnington.c mce-intel-tulsa.c \
@WITH_MCE_TRUE@ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
@WITH_MCE_TRUE@ mce-intel-knl.c mce-intel-broadwell-de.c \
-@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c
+@WITH_MCE_TRUE@ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c mce-amd.c mce-amd-smca.c
@WITH_EXTLOG_TRUE@am__append_6 = ras-extlog-handler.c
@WITH_ABRT_REPORT_TRUE@am__append_7 = ras-report.c
@@ -132,7 +132,7 @@ am__rasdaemon_SOURCES_DIST = rasdaemon.c
mce-intel-ivb.c mce-intel-haswell.c mce-intel-knl.c \
mce-intel-broadwell-de.c mce-intel-broadwell-epex.c \
mce-intel-skylake-xeon.c ras-extlog-handler.c ras-report.c \
- non-standard-hisi_hip07.c
+ non-standard-hisi_hip07.c mce-amd-smca.c mce-amd.c
@WITH_SQLITE3_TRUE@am__objects_1 = ras-record.$(OBJEXT)
@WITH_AER_TRUE@am__objects_2 = ras-aer-handler.$(OBJEXT)
@WITH_NON_STANDARD_TRUE@am__objects_3 = \
@@ -149,7 +149,9 @@ non-standard-hisi_hip07.c
@WITH_MCE_TRUE@ mce-intel-knl.$(OBJEXT) \
@WITH_MCE_TRUE@ mce-intel-broadwell-de.$(OBJEXT) \
@WITH_MCE_TRUE@ mce-intel-broadwell-epex.$(OBJEXT) \
-@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT)
+@WITH_MCE_TRUE@ mce-intel-skylake-xeon.$(OBJEXT) \
+@WITH_MCE_TRUE@ mce-amd-smca.$(OBJEXT) \
+@WITH_MCE_TRUE@ mce-amd.$(OBJEXT)
@WITH_EXTLOG_TRUE@am__objects_6 = ras-extlog-handler.$(OBJEXT)
@WITH_ABRT_REPORT_TRUE@am__objects_7 = ras-report.$(OBJEXT)
@WITH_HISI_NS_DECODE_TRUE@am__objects_8 = \
@@ -595,6 +597,8 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bitfield.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-k8.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-amd-smca.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-de.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-broadwell-epex.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mce-intel-dunnington.Po@am__quote@

View File

@ -0,0 +1,138 @@
commit a8c776ed94f68ae31d7b5f74e19545698898c13c
Author: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Tue Aug 14 13:06:27 2018 -0300
mce-intel-*: fix a warning when using FIELD(<num>, NULL)
Internally, FIELD() macro checks the size of an array, by
using ARRAY_SIZE. Well, this macro causes a division by zero
if NULL is used, as its type is void, as warned:
mce-intel-dunnington.c:30:2: note: in expansion of macro FIELD
FIELD(17, NULL),
^~~~~
ras-mce-handler.h:28:33: warning: division sizeof (void *) / sizeof (void) does not compute the number of array elements [-Wsizeof-pointer-div]
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
^
bitfield.h:37:51: note: in expansion of macro ARRAY_SIZE
#define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) }
^~~~~~~~~~
While this warning is harmless, it may prevent seeing more serious
warnings. So, add a FIELD_NULL(<num>) macro to avoid that.
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/bitfield.h b/bitfield.h
index c7dfeb1..fccbb36 100644
--- a/bitfield.h
+++ b/bitfield.h
@@ -35,6 +35,7 @@ struct numfield {
};
#define FIELD(start_bit, name) { start_bit, name, ARRAY_SIZE(name) }
+#define FIELD_NULL(start_bit) { start_bit, NULL, 0 }
#define SBITFIELD(start_bit, string) { start_bit, ((char * [2]) { NULL, string }), 2 }
#define NUMBER(start, end, name) { start, end, name, "%Lu", 0 }
diff --git a/mce-intel-dunnington.c b/mce-intel-dunnington.c
index 4b1c7e3..c695c62 100644
--- a/mce-intel-dunnington.c
+++ b/mce-intel-dunnington.c
@@ -27,14 +27,14 @@
static struct field dunnington_bus_status[] = {
SBITFIELD(16, "Parity error detected during FSB request phase"),
- FIELD(17, NULL),
+ FIELD_NULL(17),
SBITFIELD(20, "Hard Failure response received for a local transaction"),
SBITFIELD(21, "Parity error on FSB response field detected"),
SBITFIELD(22, "Parity data error on inbound data detected"),
- FIELD(23, NULL),
- FIELD(25, NULL),
- FIELD(28, NULL),
- FIELD(31, NULL),
+ FIELD_NULL(23),
+ FIELD_NULL(25),
+ FIELD_NULL(28),
+ FIELD_NULL(31),
{}
};
diff --git a/mce-intel-p4-p6.c b/mce-intel-p4-p6.c
index 4615e1a..5c6c3ff 100644
--- a/mce-intel-p4-p6.c
+++ b/mce-intel-p4-p6.c
@@ -60,7 +60,7 @@ static char *bus_queue_error_type[] = {
};
static struct field p6_shared_status[] = {
- FIELD(16, NULL),
+ FIELD_NULL(16),
FIELD(19, bus_queue_req_type),
FIELD(25, bus_queue_error_type),
FIELD(25, bus_queue_error_type),
@@ -68,7 +68,7 @@ static struct field p6_shared_status[] = {
SBITFIELD(36, "received parity error on response transaction"),
SBITFIELD(38, "timeout BINIT (ROB timeout)."
" No micro-instruction retired for some time"),
- FIELD(39, NULL),
+ FIELD_NULL(39),
SBITFIELD(42, "bus transaction received hard error response"),
SBITFIELD(43, "failure that caused IERR"),
/* The following are reserved for Core in the SDM. Let's keep them here anyways*/
@@ -76,15 +76,15 @@ static struct field p6_shared_status[] = {
SBITFIELD(45, "uncorrectable ECC error"),
SBITFIELD(46, "correctable ECC error"),
/* [47..54]: ECC syndrome */
- FIELD(55, NULL),
+ FIELD_NULL(55),
{},
};
static struct field p6old_status[] = {
SBITFIELD(28, "FRC error"),
SBITFIELD(29, "BERR on this CPU"),
- FIELD(31, NULL),
- FIELD(32, NULL),
+ FIELD_NULL(31),
+ FIELD_NULL(32),
SBITFIELD(35, "BINIT received from external bus"),
SBITFIELD(37, "Received hard error reponse on split transaction (Bus BINIT)"),
{}
@@ -94,9 +94,9 @@ static struct field core2_status[] = {
SBITFIELD(28, "MCE driven"),
SBITFIELD(29, "MCE is observed"),
SBITFIELD(31, "BINIT observed"),
- FIELD(32, NULL),
+ FIELD_NULL(32),
SBITFIELD(34, "PIC or FSB data parity error"),
- FIELD(35, NULL),
+ FIELD_NULL(35),
SBITFIELD(37, "FSB address parity error detected"),
{}
};
diff --git a/mce-intel-tulsa.c b/mce-intel-tulsa.c
index 6cea421..e59bf06 100644
--- a/mce-intel-tulsa.c
+++ b/mce-intel-tulsa.c
@@ -39,7 +39,7 @@ static struct field tls_bus_status[] = {
SBITFIELD(16, "Parity error detected during FSB request phase"),
SBITFIELD(17, "Partity error detected on Core 0 request's address field"),
SBITFIELD(18, "Partity error detected on Core 1 request's address field"),
- FIELD(19, NULL),
+ FIELD_NULL(19),
SBITFIELD(20, "Parity error on FSB response field detected"),
SBITFIELD(21, "FSB data parity error on inbound date detected"),
SBITFIELD(22, "Data parity error on data received from Core 0 detected"),
@@ -48,8 +48,8 @@ static struct field tls_bus_status[] = {
SBITFIELD(25, "Data ECC event to error on inbound data correctable or uncorrectable"),
SBITFIELD(26, "Pad logic detected a data strobe glitch or sequencing error"),
SBITFIELD(27, "Pad logic detected a request strobe glitch or sequencing error"),
- FIELD(28, NULL),
- FIELD(31, NULL),
+ FIELD_NULL(28),
+ FIELD_NULL(31),
{}
};

159
add_upstream_labels.patch Normal file
View File

@ -0,0 +1,159 @@
---
labels/dell | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 152 insertions(+)
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/labels/dell 2020-02-20 11:53:39.574579258 -0500
@@ -0,0 +1,152 @@
+# RASDAEMON Motherboard DIMM labels Database file.
+#
+# Vendor-name and model-name are found from the program 'dmidecode'
+# labels are found from the silk screen on the motherboard.
+#
+#Vendor: <vendor-name>
+# Product: <product-name>
+# Model: <model-name>
+# <label>: <mc>.<top>.<mid>.<low>
+#
+
+Vendor: Dell Inc.
+# 1-socket
+ Product: PowerEdge R220, PowerEdge R330, PowerEdge T330, PowerEdge R230, PowerEdge T130, PowerEdge T30
+ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1;
+ DIMM_A3: 0.1.0; DIMM_A4: 0.1.1;
+
+ Product: PowerEdge T110 II, PowerEdge T20
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0;
+
+ DIMM_B1: 0.0.1; DIMM_B2: 0.1.1;
+
+ Product: PowerEdge R320, PowerEdge T320
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
+ DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1;
+
+# 2-socket
+ Product: PowerEdge R610
+ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2;
+ DIMM_A4: 0.1.0; DIMM_A5: 0.1.1; DIMM_A6: 0.1.2;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2;
+ DIMM_B4: 1.1.0; DIMM_B5: 1.1.1; DIMM_B6: 1.1.2;
+
+ Product: PowerEdge T710, PowerEdge R710
+ DIMM_A3: 0.0.0; DIMM_A2: 0.1.0; DIMM_A1: 0.2.0;
+ DIMM_A6: 0.0.1; DIMM_A5: 0.1.1; DIMM_A4: 0.2.1;
+ DIMM_A9: 0.0.2; DIMM_A8: 0.1.2; DIMM_A7: 0.2.2;
+
+ DIMM_B3: 1.0.0; DIMM_B2: 1.1.0; DIMM_B1: 1.2.0;
+ DIMM_B6: 1.0.1; DIMM_B5: 1.1.1; DIMM_B4: 1.2.1;
+ DIMM_B9: 1.0.2; DIMM_B8: 1.1.2; DIMM_B7: 1.2.2;
+
+ Product: PowerEdge R620, PowerEdge T620, PowerEdge R720xd, PowerEdge R730xd, PowerEdge T630, PowerEdge R730, PowerEdge R630, PowerEdge T620, PowerEdge M620, PowerEdge FC620, PowerEdge M630, PowerEdge FC630
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+ DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
+ DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2;
+
+ Product: PowerEdge R640, PowerEdge R740, PowerEdge R740xd, PowerEdge T640
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 0.1.1; A9: 0.2.1; A10: 1.0.1; A11: 1.1.1; A12: 1.2.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 2.1.1; B9: 2.2.1; B10: 3.0.1; B11: 3.1.1; B12: 3.2.1;
+
+ Product: PowerEdge M520, PowerEdge R420, PowerEdge T420
+ DIMM_A1: 0.1.0; DIMM_A2: 0.2.0; DIMM_A3: 0.3.0;
+ DIMM_A4: 0.1.1; DIMM_A5: 0.2.1; DIMM_A6: 0.3.1;
+
+ DIMM_B1: 1.1.0; DIMM_B2: 1.2.0; DIMM_B3: 1.3.0;
+ DIMM_B4: 1.1.1; DIMM_B5: 1.2.1; DIMM_B6: 1.3.1;
+
+ Product: PowerEdge FC420, PowerEdge M420
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0;
+
+ Product: PowerEdge C6320, PowerEdge C4130
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
+
+ Product: PowerEdge C6320p
+ A1: 0.0.0; B1: 0.1.0; C1: 0.2.0;
+ D1: 1.0.0; E1: 1.1.0; F1: 1.2.0;
+
+ Product: PowerEdge C6420
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 1.0.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 3.0.1;
+
+ Product: PowerEdge R430, PowerEdge T430, PowerEdge R530
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+
+ Product: PowerEdge FC430
+ DIMM_A1: 0.1.0; DIMM_A2: 0.0.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+
+ DIMM_B1: 1.1.0; DIMM_B2: 1.0.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+
+# 4-socket
+ Product: PowerEdge M820, PowerEdge R830, PowerEdge M830, PowerEdge R930, PowerEdge FC830
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
+ DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2;
+
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
+ DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2;
+
+ DIMM_C1: 2.0.0; DIMM_C2: 2.1.0; DIMM_C3: 2.2.0; DIMM_C4: 2.3.0;
+ DIMM_C5: 2.0.1; DIMM_C6: 2.1.1; DIMM_C7: 2.2.1; DIMM_C8: 2.3.1;
+ DIMM_C9: 2.0.2; DIMM_C10: 2.1.2; DIMM_C11: 2.2.2; DIMM_C12: 2.3.2;
+
+ DIMM_D1: 3.0.0; DIMM_D2: 3.1.0; DIMM_D3: 3.2.0; DIMM_D4: 3.3.0;
+ DIMM_D5: 3.0.1; DIMM_D6: 3.1.1; DIMM_D7: 3.2.1; DIMM_D8: 3.3.1;
+ DIMM_D9: 3.0.2; DIMM_D10: 3.1.2; DIMM_D11: 3.2.2; DIMM_D12: 3.3.2;
+
+ Product: PowerEdge FM120x4
+ DIMM_A_A1: 0.1.0; DIMM_A_A2: 0.2.0;
+
+ DIMM_B_A1: 1.1.0; DIMM_B_A2: 1.2.0;
+
+ DIMM_C_A1: 2.1.0; DIMM_C_A2: 2.2.0;
+
+ DIMM_D_A1: 3.1.0; DIMM_D_A2: 3.2.0;
+
+ Product: PowerEdge R940
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 0.1.1; A9: 0.2.1; A10: 1.0.1; A11: 1.1.1; A12: 1.2.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 2.1.1; B9: 2.2.1; B10: 3.0.1; B11: 3.1.1; B12: 3.2.1;
+
+ C1: 4.0.0; C2: 4.1.0; C3: 4.2.0; C4: 5.0.0; C5: 5.1.0; C6: 5.2.0;
+ C7: 4.0.1; C8: 4.1.1; C9: 4.2.1; C10: 5.0.1; C11: 5.1.1; C12: 5.2.1;
+
+ D1: 6.0.0; D2: 6.1.0; D3: 6.2.0; D4: 7.0.0; D5: 7.1.0; D6: 7.2.0;
+ D7: 6.0.1; D8: 6.1.1; D9: 6.2.1; D10: 7.0.1; D11: 7.1.1; D12: 7.2.1;
+
+ Product: PowerEdge R440, PowerEdge R540
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 0.1.1; A9: 1.0.1; A10: 1.1.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+
+ Product: PowerEdge M640, PowerEdge FC640
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
+ A7: 0.0.1; A8: 1.0.1;
+
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
+ B7: 2.0.1; B8: 3.0.1;

View File

@ -0,0 +1,107 @@
commit aecf33aa70331670c06db6b652712b476e24051c
Author: Muralidhara M K <muralimk@amd.com>
Date: Mon Jul 12 05:40:46 2021 -0500
rasdaemon: Enumerate memory on noncpu nodes
Newer heterogeneous systems from AMD have GPU nodes (with HBM2 memory
banks) connected via xGMI links to the CPUs.
The node id information is available in the InstanceHI[47:44] of
the IPID register.
The UMC Phys on Aldebaran nodes are enumerated as csrows.
The UMC channels connected to HBMs are enumerated as ranks.
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 3c346f4..f3379fc 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -78,6 +78,12 @@ enum smca_bank_types {
/* Maximum number of MCA banks per CPU. */
#define MAX_NR_BANKS 64
+/*
+ * On Newer heterogeneous systems from AMD with CPU and GPU nodes connected
+ * via xGMI links, the NON CPU Nodes are enumerated from index 8
+ */
+#define NONCPU_NODE_INDEX 8
+
/* SMCA Extended error strings */
/* Load Store */
static const char * const smca_ls_mce_desc[] = {
@@ -531,6 +537,26 @@ static int find_umc_channel(struct mce_event *e)
{
return EXTRACT(e->ipid, 0, 31) >> 20;
}
+
+/*
+ * Return the HBM channel of an error from a noncpu node: bits [15:12] of
+ * IPID, plus 4 when the value find_umc_channel() computes is odd.
+ */
+static int find_hbm_channel(struct mce_event *e)
+{
+ int umc, tmp;
+
+ umc = EXTRACT(e->ipid, 0, 31) >> 20;
+
+ /*
+ * IPID[15:12] yields a value in 0..15; it is used unchanged for an
+ * even umc and offset by 4 for an odd one.
+ */
+ tmp = ((e->ipid >> 12) & 0xf);
+
+ return (umc % 2) ? tmp + 4 : tmp;
+}
+
/* Decode extended errors according to Scalable MCA specification */
static void decode_smca_error(struct mce_event *e)
{
@@ -539,6 +565,7 @@ static void decode_smca_error(struct mce_event *e)
unsigned short xec = (e->status >> 16) & 0x3f;
const struct smca_hwid *s_hwid;
uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
+ uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
unsigned int csrow = -1, channel = -1;
unsigned int i;
@@ -548,14 +575,16 @@ static void decode_smca_error(struct mce_event *e)
bank_type = s_hwid->bank_type;
break;
}
+ if (mcatype_instancehi >= NONCPU_NODE_INDEX)
+ bank_type = SMCA_UMC_V2;
}
- if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
+ if (i >= MAX_NR_BANKS) {
strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
return;
}
- if (bank_type >= N_SMCA_BANK_TYPES) {
+ if (bank_type >= MAX_NR_BANKS) {
strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
return;
}
@@ -580,6 +609,16 @@ static void decode_smca_error(struct mce_event *e)
mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
channel, csrow);
}
+
+ if (bank_type == SMCA_UMC_V2 && xec == 0) {
+ /* The UMCPHY is reported as csrow in case of noncpu nodes */
+ csrow = find_umc_channel(e) / 2;
+ /* UMCCH is managing the HBM memory */
+ channel = find_hbm_channel(e);
+ mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
+ channel, csrow);
+ }
+
}
int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)

View File

@ -0,0 +1,37 @@
commit b22be68453b2497e86cbd273b9cd56fadc5859e3
Author: Ying Lv <lvying6@huawei.com>
Date: Wed May 15 11:15:42 2019 +0800
fix rasdaemon high CPU usage when part of CPUs offline
When some CPU cores are offline, for example because the kernel cmdline sets
maxcpus=N (with N less than the total number of system CPU cores),
the CPU usage of some rasdaemon threads
is very close to 100%.
This is because when some CPUs are offline, poll in the read_ras_event_all_cpus
function falls back to the pthread way.
The thread of an offlined CPU gets a negative value when reading trace_pipe_raw;
that negative return value is converted to a positive one because of 'unsigned size'.
So the code always goes into the 'size > 0' branch, and the CPU usage is too high.
Declaring size as int makes the code take the right branch.
Fixes: eff7c9e0 ("ras-events: Only use pthreads for collect if poll() not available")
Reported-by: Zhipeng Xie <xiezhipeng1@huawei.com>
Signed-off-by: Ying Lv <lvying6@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/ras-events.c b/ras-events.c
index 4e7b815..38ebe1e 100644
--- a/ras-events.c
+++ b/ras-events.c
@@ -426,7 +426,7 @@ static int read_ras_event(int fd,
struct kbuffer *kbuf,
void *page)
{
- unsigned size;
+ int size;
unsigned long long time_stamp;
void *data;

View File

@ -0,0 +1,148 @@
commit b497a3d6a39d402c41065e9284d49114b97e3bfe
Author: Shiju Jose <shiju.jose@huawei.com>
Date: Mon Mar 8 16:57:28 2021 +0000
rasdaemon: ras-mc-ctl: Add memory failure events
Add support for memory failure errors (memory_failure_event)
to the ras-mc-ctl tool.
Sample Log,
ras-mc-ctl --summary
...
Memory failure events summary:
Delayed errors: 4
Failed errors: 1
...
ras-mc-ctl --errors
...
Memory failure events:
1 2020-10-28 23:20:41 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
2 2020-10-28 23:31:38 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
3 2020-10-28 23:54:54 -0800 error: pfn=0x205000000, page_type=free buddy page, action_result=Delayed
4 2020-10-29 00:12:25 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Delayed
5 2020-10-29 00:26:36 -0800 error: pfn=0x204000000, page_type=free buddy page, action_result=Failed
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
configure.ac | 11 +++++++++++
util/ras-mc-ctl.in | 46 +++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 54 insertions(+), 3 deletions(-)
--- a/util/ras-mc-ctl.in 2021-10-13 13:51:00.887292563 -0400
+++ b/util/ras-mc-ctl.in 2021-10-13 13:51:27.536061894 -0400
@@ -44,11 +44,13 @@ my $modprobe = find_prog ("modprobe")
my $has_aer = 0;
my $has_arm = 0;
my $has_extlog = 0;
+my $has_mem_failure = 0;
my $has_mce = 0;
@WITH_AER_TRUE@$has_aer = 1;
@WITH_ARM_TRUE@$has_arm = 1;
@WITH_EXTLOG_TRUE@$has_extlog = 1;
+@WITH_MEMORY_FAILURE_TRUE@$has_mem_failure = 1;
@WITH_MCE_TRUE@$has_mce = 1;
my %conf = ();
@@ -1132,7 +1134,7 @@ sub summary
{
require DBI;
my ($query, $query_handle, $out);
- my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg);
+ my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg, $action_result);
my ($etype, $severity, $etype_string, $severity_string);
my ($affinity, $mpidr);
@@ -1203,9 +1205,27 @@ sub summary
$out .= "\t$count $etype_string $severity_string errors\n";
}
if ($out ne "") {
- print "Extlog records summary:\n$out";
+ print "Extlog records summary:\n$out\n";
} else {
- print "No Extlog errors.\n";
+ print "No Extlog errors.\n\n";
+ }
+ $query_handle->finish;
+ }
+
+ # Memory failure errors
+ if ($has_mem_failure == 1) {
+ $query = "select action_result, count(*) from memory_failure_event group by action_result";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($action_result, $count));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "\t$action_result errors: $count\n";
+ }
+ if ($out ne "") {
+ print "Memory failure events summary:\n$out\n";
+ } else {
+ print "No Memory failure errors.\n\n";
}
$query_handle->finish;
}
@@ -1238,6 +1258,7 @@ sub errors
my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
my ($error_count, $affinity, $mpidr, $r_state, $psci_state);
+ my ($pfn, $page_type, $action_result);
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
@@ -1329,6 +1350,25 @@ $out .= sprintf "address=0x%08x, ", $add
}
$query_handle->finish;
}
+
+ # Memory failure errors
+ if ($has_mem_failure == 1) {
+ $query = "select id, timestamp, pfn, page_type, action_result from memory_failure_event order by id";
+ $query_handle = $dbh->prepare($query);
+ $query_handle->execute();
+ $query_handle->bind_columns(\($id, $timestamp, $pfn, $page_type, $action_result));
+ $out = "";
+ while($query_handle->fetch()) {
+ $out .= "$id $timestamp error: ";
+ $out .= "pfn=$pfn, page_type=$page_type, action_result=$action_result\n";
+ }
+ if ($out ne "") {
+ print "Memory failure events:\n$out\n";
+ } else {
+ print "No Memory failure errors.\n\n";
+ }
+ $query_handle->finish;
+ }
# MCE mce_record errors
if ($has_mce == 1) {
--- a/configure.ac 2018-04-25 06:28:51.000000000 -0400
+++ b/configure.ac 2021-10-13 13:51:00.916292312 -0400
@@ -80,6 +80,16 @@ AS_IF([test "x$enable_extlog" = "xyes"],
])
AM_CONDITIONAL([WITH_EXTLOG], [test x$enable_extlog = xyes])
+AC_ARG_ENABLE([memory_failure],
+ AS_HELP_STRING([--enable-memory-failure], [enable memory failure events (currently experimental)]))
+dnl "==" in test is a bashism; use the POSIX "=" so that any /bin/sh works.
+AS_IF([test "x$enable_memory_failure" = "xyes" || test "x$enable_all" = "xyes"], [
+ AC_DEFINE(HAVE_MEMORY_FAILURE,1,"have memory failure events collect")
+ AC_SUBST([WITH_MEMORY_FAILURE])
+])
+AM_CONDITIONAL([WITH_MEMORY_FAILURE], [test x$enable_memory_failure = xyes || test x$enable_all = xyes])
+AM_COND_IF([WITH_MEMORY_FAILURE], [USE_MEMORY_FAILURE="yes"], [USE_MEMORY_FAILURE="no"])
+
AC_ARG_ENABLE([abrt_report],
AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)]))
@@ -127,4 +137,5 @@ compile time options summary
ABRT report : $enable_abrt_report
HIP07 SAS HW errors : $enable_hisi_ns_decode
ARM events : $enable_arm
+ Memory Failure : $USE_MEMORY_FAILURE
EOF

View File

@ -0,0 +1,94 @@
commit cc2ce5c65ed5a42eaa97aa3659854add6d808da5
Author: Muralidhara M K <muralidhara.mk@amd.com>
Date: Mon Jan 13 19:12:06 2020 +0530
rasdaemon: Add error decoding for new SMCA Load Store bank type
Future Scalable Machine Check Architecture (SMCA) systems will have a
new Load Store bank type.
Add the new type's (HWID, McaType) ID and error decoding.
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
[ Adjust commit message. ]
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 114e786..d0b6cb6 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -38,9 +38,16 @@
* 03: EC[3], 02: EC[2], 01: EC[1], 00: EC[0]
*/
+/* MCA_STATUS REGISTER FOR FAMILY 19H
+ * The bits 24 ~ 29 contains AddressLsb
+ * 29: ADDRLS[5], 28: ADDRLS[4], 27: ADDRLS[3],
+ * 26: ADDRLS[2], 25: ADDRLS[1], 24: ADDRLS[0]
+ */
+
/* These may be used by multiple smca_hwid_mcatypes */
enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
+ SMCA_LS_V2, /* Load Store */
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
@@ -88,6 +95,32 @@ static const char * const smca_ls_mce_desc[] = {
"DC tag error type 5",
"L2 fill data error",
};
+static const char * const smca_ls2_mce_desc[] = {
+ "An ECC error was detected on a data cache read by a probe or victimization",
+ "An ECC error or L2 poison was detected on a data cache read by a load",
+ "An ECC error was detected on a data cache read-modify-write by a store",
+ "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
+ "An ECC error or poison bit mismatch was detected on a tag read by a load",
+ "An ECC error or poison bit mismatch was detected on a tag read by a store",
+ "An ECC error was detected on an EMEM read by a load",
+ "An ECC error was detected on an EMEM read-modify-write by a store",
+ "A parity error was detected in an L1 TLB entry by any access",
+ "A parity error was detected in an L2 TLB entry by any access",
+ "A parity error was detected in a PWC entry by any access",
+ "A parity error was detected in an STQ entry by any access",
+ "A parity error was detected in an LDQ entry by any access",
+ "A parity error was detected in a MAB entry by any access",
+ "A parity error was detected in an SCB entry state field by any access",
+ "A parity error was detected in an SCB entry address field by any access",
+ "A parity error was detected in an SCB entry data field by any access",
+ "A parity error was detected in a WCB entry by any access",
+ "A poisoned line was detected in an SCB entry by any access",
+ "A SystemReadDataError error was reported on read data returned from L2 for a load",
+ "A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
+ "A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
+ "A hardware assertion error was reported",
+ "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
+};
/* Instruction Fetch */
static const char * const smca_if_mce_desc[] = {
"microtag probe port parity error",
@@ -289,6 +322,7 @@ struct smca_mce_desc {
static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
+ [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) },
[SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
[SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
[SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
@@ -319,6 +353,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* ZN Core (HWID=0xB0) MCA types */
{ SMCA_LS, 0x000000B0 },
+ { SMCA_LS_V2, 0x001000B0 },
{ SMCA_IF, 0x000100B0 },
{ SMCA_L2_CACHE, 0x000200B0 },
{ SMCA_DE, 0x000300B0 },
@@ -362,6 +397,7 @@ struct smca_bank_name {
static struct smca_bank_name smca_names[] = {
[SMCA_LS] = { "Load Store Unit" },
+ [SMCA_LS_V2] = { "Load Store Unit" },
[SMCA_IF] = { "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "L2 Cache" },
[SMCA_DE] = { "Decode Unit" },

View File

@ -0,0 +1,611 @@
commit ce6e7864f11f709c4f803828fbc8e507d115d03b
Author: Greg Edwards <gedwards@ddn.com>
Date: Thu Apr 8 15:03:30 2021 -0600
rasdaemon: Add Ice Lake and Sapphire Rapids MSCOD values
Based on mcelog commits:
ee90ff20ce6a ("mcelog: Add support for Icelake server, Icelake-D, and Snow Ridge")
391abaac9bdf ("mcelog: Add decode for MCi_MISC from 10nm memory controller")
59cb7ad4bc72 ("mcelog: i10nm: Fix mapping from bank number to functional unit")
c0acd0e6a639 ("mcelog: Add support for Sapphirerapids server.")
Signed-off-by: Greg Edwards <gedwards@ddn.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
Makefile.am | 3
mce-intel-i10nm.c | 509 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
mce-intel.c | 5
ras-mce-handler.c | 12 +
ras-mce-handler.h | 5
5 files changed, 533 insertions(+), 1 deletion(-)
--- rasdaemon-0.6.1.orig/Makefile.am 2021-09-17 15:29:45.977790658 -0400
+++ rasdaemon-0.6.1/Makefile.am 2021-09-17 15:29:57.439698580 -0400
@@ -36,7 +36,8 @@ if WITH_MCE
mce-intel-dunnington.c mce-intel-tulsa.c \
mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
mce-intel-knl.c mce-intel-broadwell-de.c \
- mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c
+ mce-intel-broadwell-epex.c mce-intel-skylake-xeon.c \
+ mce-amd.c mce-amd-smca.c mce-intel-i10nm.c
endif
if WITH_EXTLOG
rasdaemon_SOURCES += ras-extlog-handler.c
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ rasdaemon-0.6.1/mce-intel-i10nm.c 2021-09-17 15:29:45.977790658 -0400
@@ -0,0 +1,509 @@
+/*
+ * The code below came from Tony Luck's mcelog code,
+ * released under GNU Public General License, v.2
+ *
+ * Copyright (C) 2019 Intel Corporation
+ * Decode Intel 10nm specific machine check errors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "ras-mce-handler.h"
+#include "bitfield.h"
+
+static char *pcu_1[] = {
+ [0x0D] = "MCA_LLC_BIST_ACTIVE_TIMEOUT",
+ [0x0E] = "MCA_DMI_TRAINING_TIMEOUT",
+ [0x0F] = "MCA_DMI_STRAP_SET_ARRIVAL_TIMEOUT",
+ [0x10] = "MCA_DMI_CPU_RESET_ACK_TIMEOUT",
+ [0x11] = "MCA_MORE_THAN_ONE_LT_AGENT",
+ [0x14] = "MCA_INCOMPATIBLE_PCH_TYPE",
+ [0x1E] = "MCA_BIOS_RST_CPL_INVALID_SEQ",
+ [0x1F] = "MCA_BIOS_INVALID_PKG_STATE_CONFIG",
+ [0x2D] = "MCA_PCU_PMAX_CALIB_ERROR",
+ [0x2E] = "MCA_TSC100_SYNC_TIMEOUT",
+ [0x3A] = "MCA_GPSB_TIMEOUT",
+ [0x3B] = "MCA_PMSB_TIMEOUT",
+ [0x3E] = "MCA_IOSFSB_PMREQ_CMP_TIMEOUT",
+ [0x40] = "MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE",
+ [0x42] = "MCA_SVID_VCCIN_VR_VOUT_FAILURE",
+ [0x43] = "MCA_SVID_CPU_VR_CAPABILITY_ERROR",
+ [0x44] = "MCA_SVID_CRITICAL_VR_FAILED",
+ [0x45] = "MCA_SVID_SA_ITD_ERROR",
+ [0x46] = "MCA_SVID_READ_REG_FAILED",
+ [0x47] = "MCA_SVID_WRITE_REG_FAILED",
+ [0x4A] = "MCA_SVID_PKGC_REQUEST_FAILED",
+ [0x4B] = "MCA_SVID_IMON_REQUEST_FAILED",
+ [0x4C] = "MCA_SVID_ALERT_REQUEST_FAILED",
+ [0x4D] = "MCA_SVID_MCP_VR_RAMP_ERROR",
+ [0x56] = "MCA_FIVR_PD_HARDERR",
+ [0x58] = "MCA_WATCHDOG_TIMEOUT_PKGC_SLAVE",
+ [0x59] = "MCA_WATCHDOG_TIMEOUT_PKGC_MASTER",
+ [0x5A] = "MCA_WATCHDOG_TIMEOUT_PKGS_MASTER",
+ [0x5B] = "MCA_WATCHDOG_TIMEOUT_MSG_CH_FSM",
+ [0x5C] = "MCA_WATCHDOG_TIMEOUT_BULK_CR_FSM",
+ [0x5D] = "MCA_WATCHDOG_TIMEOUT_IOSFSB_FSM",
+ [0x60] = "MCA_PKGS_SAFE_WP_TIMEOUT",
+ [0x61] = "MCA_PKGS_CPD_UNCPD_TIMEOUT",
+ [0x62] = "MCA_PKGS_INVALID_REQ_PCH",
+ [0x63] = "MCA_PKGS_INVALID_REQ_INTERNAL",
+ [0x64] = "MCA_PKGS_INVALID_RSP_INTERNAL",
+ [0x65 ... 0x7A] = "MCA_PKGS_RESET_PREP_TIMEOUT",
+ [0x7B] = "MCA_PKGS_SMBUS_VPP_PAUSE_TIMEOUT",
+ [0x7C] = "MCA_PKGS_SMBUS_MCP_PAUSE_TIMEOUT",
+ [0x7D] = "MCA_PKGS_SMBUS_SPD_PAUSE_TIMEOUT",
+ [0x80] = "MCA_PKGC_DISP_BUSY_TIMEOUT",
+ [0x81] = "MCA_PKGC_INVALID_RSP_PCH",
+ [0x83] = "MCA_PKGC_WATCHDOG_HANG_CBZ_DOWN",
+ [0x84] = "MCA_PKGC_WATCHDOG_HANG_CBZ_UP",
+ [0x87] = "MCA_PKGC_WATCHDOG_HANG_C2_BLKMASTER",
+ [0x88] = "MCA_PKGC_WATCHDOG_HANG_C2_PSLIMIT",
+ [0x89] = "MCA_PKGC_WATCHDOG_HANG_SETDISP",
+ [0x8B] = "MCA_PKGC_ALLOW_L1_ERROR",
+ [0x90] = "MCA_RECOVERABLE_DIE_THERMAL_TOO_HOT",
+ [0xA0] = "MCA_ADR_SIGNAL_TIMEOUT",
+ [0xA1] = "MCA_BCLK_FREQ_OC_ABOVE_THRESHOLD",
+ [0xB0] = "MCA_DISPATCHER_RUN_BUSY_TIMEOUT",
+};
+
+static char *pcu_2[] = {
+ [0x04] = "Clock/power IP response timeout",
+ [0x05] = "SMBus controller raised SMI",
+ [0x09] = "PM controller received invalid transaction",
+};
+
+static char *pcu_3[] = {
+ [0x01] = "Instruction address out of valid space",
+ [0x02] = "Double bit RAM error on Instruction Fetch",
+ [0x03] = "Invalid OpCode seen",
+ [0x04] = "Stack Underflow",
+ [0x05] = "Stack Overflow",
+ [0x06] = "Data address out of valid space",
+ [0x07] = "Double bit RAM error on Data Fetch",
+};
+
+static struct field pcu1[] = {
+ FIELD(0, pcu_1),
+ {}
+};
+
+static struct field pcu2[] = {
+ FIELD(0, pcu_2),
+ {}
+};
+
+static struct field pcu3[] = {
+ FIELD(0, pcu_3),
+ {}
+};
+
+static struct field upi1[] = {
+ SBITFIELD(22, "Phy Control Error"),
+ SBITFIELD(23, "Unexpected Retry.Ack flit"),
+ SBITFIELD(24, "Unexpected Retry.Req flit"),
+ SBITFIELD(25, "RF parity error"),
+ SBITFIELD(26, "Routeback Table error"),
+ SBITFIELD(27, "Unexpected Tx Protocol flit (EOP, Header or Data)"),
+ SBITFIELD(28, "Rx Header-or-Credit BGF credit overflow/underflow"),
+ SBITFIELD(29, "Link Layer Reset still in progress when Phy enters L0"),
+ SBITFIELD(30, "Link Layer reset initiated while protocol traffic not idle"),
+ SBITFIELD(31, "Link Layer Tx Parity Error"),
+ {}
+};
+
+static char *upi_2[] = {
+ [0x00] = "Phy Initialization Failure (NumInit)",
+ [0x01] = "Phy Detected Drift Buffer Alarm",
+ [0x02] = "Phy Detected Latency Buffer Rollover",
+ [0x10] = "LL Rx detected CRC error: unsuccessful LLR (entered Abort state)",
+ [0x11] = "LL Rx Unsupported/Undefined packet",
+ [0x12] = "LL or Phy Control Error",
+ [0x13] = "LL Rx Parameter Exception",
+ [0x1F] = "LL Detected Control Error",
+ [0x20] = "Phy Initialization Abort",
+ [0x21] = "Phy Inband Reset",
+ [0x22] = "Phy Lane failure, recovery in x8 width",
+ [0x23] = "Phy L0c error corrected without Phy reset",
+ [0x24] = "Phy L0c error triggering Phy reset",
+ [0x25] = "Phy L0p exit error corrected with reset",
+ [0x30] = "LL Rx detected CRC error: successful LLR without Phy Reinit",
+ [0x31] = "LL Rx detected CRC error: successful LLR with Phy Reinit",
+ [0x32] = "Tx received LLR",
+};
+
+static struct field upi2[] = {
+ FIELD(0, upi_2),
+ {}
+};
+
+static struct field m2m[] = {
+ SBITFIELD(16, "MC read data error"),
+ SBITFIELD(17, "Reserved"),
+ SBITFIELD(18, "MC partial write data error"),
+ SBITFIELD(19, "Full write data error"),
+ SBITFIELD(20, "M2M clock-domain-crossing buffer (BGF) error"),
+ SBITFIELD(21, "M2M time out"),
+ SBITFIELD(22, "M2M tracker parity error"),
+ SBITFIELD(23, "fatal Bucket1 error"),
+ {}
+};
+
+static char *imc_0[] = {
+ [0x01] = "Address parity error",
+ [0x02] = "Data parity error",
+ [0x03] = "Data ECC error",
+ [0x04] = "Data byte enable parity error",
+ [0x07] = "Transaction ID parity error",
+ [0x08] = "Corrected patrol scrub error",
+ [0x10] = "Uncorrected patrol scrub error",
+ [0x20] = "Corrected spare error",
+ [0x40] = "Uncorrected spare error",
+ [0x80] = "Corrected read error",
+ [0xA0] = "Uncorrected read error",
+ [0xC0] = "Uncorrected metadata",
+};
+
+static char *imc_1[] = {
+ [0x00] = "WDB read parity error",
+ [0x03] = "RPA parity error",
+ [0x06] = "DDR_T_DPPP data BE error",
+ [0x07] = "DDR_T_DPPP data error",
+ [0x08] = "DDR link failure",
+ [0x11] = "PCLS CAM error",
+ [0x12] = "PCLS data error",
+};
+
+static char *imc_2[] = {
+ [0x00] = "DDR4 command / address parity error",
+ [0x20] = "HBM command / address parity error",
+ [0x21] = "HBM data parity error",
+};
+
+static char *imc_4[] = {
+ [0x00] = "RPQ parity (primary) error",
+};
+
+static char *imc_8[] = { /* status bits 16..23, used when bits 24..31 == 8 */
+ [0x00] = "DDR-T bad request",
+ [0x01] = "DDR Data response to an invalid entry",
+ [0x02] = "DDR data response to an entry not expecting data",
+ [0x03] = "DDR4 completion to an invalid entry",
+ [0x04] = "DDR-T completion to an invalid entry",
+ [0x05] = "DDR data/completion FIFO overflow",
+ [0x06] = "DDR-T ERID correctable parity error",
+ [0x07] = "DDR-T ERID uncorrectable error",
+ [0x08] = "DDR-T interrupt received while outstanding interrupt was not ACKed",
+ [0x09] = "ERID FIFO overflow",
+ [0x0A] = "DDR-T error on FNV write credits",
+ [0x0B] = "DDR-T error on FNV read credits",
+ [0x0C] = "DDR-T scheduler error",
+ [0x0D] = "DDR-T FNV error event",
+ [0x0E] = "DDR-T FNV thermal event",
+ [0x0F] = "CMI packet while idle",
+ [0x10] = "DDR_T_RPQ_REQ_PARITY_ERR",
+ [0x11] = "DDR_T_WPQ_REQ_PARITY_ERR",
+ [0x12] = "2LM_NMFILLWR_CAM_ERR",
+ [0x13] = "CMI_CREDIT_OVERSUB_ERR",
+ [0x14] = "CMI_CREDIT_TOTAL_ERR",
+ [0x15] = "CMI_CREDIT_RSVD_POOL_ERR",
+ [0x16] = "DDR_T_RD_ERROR",
+ [0x17] = "WDB_FIFO_ERR",
+ [0x18] = "CMI_REQ_FIFO_OVERFLOW",
+ [0x19] = "CMI_REQ_FIFO_UNDERFLOW",
+ [0x1A] = "CMI_RSP_FIFO_OVERFLOW",
+ [0x1B] = "CMI_RSP_FIFO_UNDERFLOW",
+ [0x1C] = "CMI_MISC_MC_CRDT_ERRORS",
+ [0x1D] = "CMI_MISC_MC_ARB_ERRORS",
+ [0x1E] = "DDR_T_WR_CMPL_FIFO_OVERFLOW",
+ [0x1F] = "DDR_T_WR_CMPL_FIFO_UNDERFLOW",
+ [0x20] = "CMI_RD_CPL_FIFO_OVERFLOW",
+ [0x21] = "CMI_RD_CPL_FIFO_UNDERFLOW",
+ [0x22] = "TME_KEY_PAR_ERR",
+ [0x23] = "TME_CMI_MISC_ERR",
+ [0x24] = "TME_CMI_OVFL_ERR",
+ [0x25] = "TME_CMI_UFL_ERR",
+ [0x26] = "TME_TEM_SECURE_ERR",
+ [0x27] = "TME_UFILL_PAR_ERR",
+ [0x29] = "INTERNAL_ERR",
+ [0x2A] = "TME_INTEGRITY_ERR",
+ [0x2B] = "TME_TDX_ERR",
+ [0x2C] = "TME_UFILL_TEM_SECURE_ERR",
+ [0x2D] = "TME_KEY_POISON_ERR",
+ [0x2E] = "TME_SECURITY_ENGINE_ERR",
+};
+
+static char *imc_10[] = {
+ [0x08] = "CORR_PATSCRUB_MIRR2ND_ERR",
+ [0x10] = "UC_PATSCRUB_MIRR2ND_ERR",
+ [0x20] = "COR_SPARE_MIRR2ND_ERR",
+ [0x40] = "UC_SPARE_MIRR2ND_ERR",
+ [0x80] = "HA_RD_MIRR2ND_ERR",
+ [0xA0] = "HA_UNCORR_RD_MIRR2ND_ERR",
+};
+
+static struct field imc0[] = {
+ FIELD(0, imc_0),
+ {}
+};
+
+static struct field imc1[] = {
+ FIELD(0, imc_1),
+ {}
+};
+
+static struct field imc2[] = {
+ FIELD(0, imc_2),
+ {}
+};
+
+static struct field imc4[] = {
+ FIELD(0, imc_4),
+ {}
+};
+
+static struct field imc8[] = {
+ FIELD(0, imc_8),
+ {}
+};
+
+static struct field imc10[] = {
+ FIELD(0, imc_10),
+ {}
+};
+/* Decode e->misc of an iMC bank (DRAM address, rank, ECC mode) into e->error_msg. */
+static void i10nm_imc_misc(struct mce_event *e)
+{
+ uint32_t column = EXTRACT(e->misc, 9, 18) << 2;
+ uint32_t row = EXTRACT(e->misc, 19, 39);
+ uint32_t bank = EXTRACT(e->misc, 42, 43);
+ uint32_t bankgroup = EXTRACT(e->misc, 40, 41) | (EXTRACT(e->misc, 44, 44) << 2);
+ uint32_t fdevice = EXTRACT(e->misc, 46, 51);
+ uint32_t subrank = EXTRACT(e->misc, 52, 55);
+ uint32_t rank = EXTRACT(e->misc, 56, 58);
+ uint32_t eccmode = EXTRACT(e->misc, 59, 62);
+ uint32_t transient = EXTRACT(e->misc, 63, 63);
+
+ mce_snprintf(e->error_msg, "bank: 0x%x bankgroup: 0x%x row: 0x%x column: 0x%x", bank, bankgroup, row, column);
+ if (!transient && !EXTRACT(e->status, 61, 61))
+ mce_snprintf(e->error_msg, "failed device: 0x%x", fdevice);
+ mce_snprintf(e->error_msg, "rank: 0x%x subrank: 0x%x", rank, subrank);
+ mce_snprintf(e->error_msg, "ecc mode: ");
+ switch (eccmode) {
+ case 0: mce_snprintf(e->error_msg, "SDDC memory mode"); break;
+ case 1: mce_snprintf(e->error_msg, "SDDC"); break;
+ case 4: mce_snprintf(e->error_msg, "ADDDC memory mode"); break;
+ case 5: mce_snprintf(e->error_msg, "ADDDC"); break;
+ case 8: mce_snprintf(e->error_msg, "DDRT read"); break;
+ default: mce_snprintf(e->error_msg, "unknown"); break;
+ }
+ if (transient)
+ mce_snprintf(e->error_msg, "transient");
+}
+
+enum banktype {
+ BT_UNKNOWN,
+ BT_PCU,
+ BT_UPI,
+ BT_M2M,
+ BT_IMC,
+};
+
+static enum banktype icelake[32] = {
+ [4] = BT_PCU,
+ [5] = BT_UPI,
+ [7 ... 8] = BT_UPI,
+ [12] = BT_M2M,
+ [16] = BT_M2M,
+ [20] = BT_M2M,
+ [24] = BT_M2M,
+ [13 ... 15] = BT_IMC,
+ [17 ... 19] = BT_IMC,
+ [21 ... 23] = BT_IMC,
+ [25 ... 27] = BT_IMC,
+};
+
+static enum banktype icelake_de[32] = {
+ [4] = BT_PCU,
+ [12] = BT_M2M,
+ [16] = BT_M2M,
+ [13 ... 15] = BT_IMC,
+ [17 ... 19] = BT_IMC,
+};
+
+static enum banktype tremont[32] = {
+ [4] = BT_PCU,
+ [12] = BT_M2M,
+ [13 ... 15] = BT_IMC,
+};
+
+static enum banktype sapphire[32] = {
+ [4] = BT_PCU,
+ [5] = BT_UPI,
+ [12] = BT_M2M,
+ [13 ... 20] = BT_IMC,
+};
+
+void i10nm_memerr_misc(struct mce_event *e, int *channel);
+
+/*
+ * Report the model specific decode of e->status in e->error_msg and, for
+ * corrected iMC memory errors, store the channel in e->mc_location.
+ */
+void i10nm_decode_model(enum cputype cputype, struct ras_events *ras, struct mce_event *e)
+{
+ enum banktype banktype;
+ uint64_t f, status = e->status;
+ uint32_t mca = status & 0xffff;
+ int channel = -1;
+
+ /* The bank tables have 32 entries each; do not index past their end */
+ if (e->bank >= sizeof(icelake) / sizeof(icelake[0]))
+ return;
+
+ switch (cputype) {
+ case CPU_ICELAKE_XEON:
+ banktype = icelake[e->bank];
+ break;
+ case CPU_ICELAKE_DE:
+ banktype = icelake_de[e->bank];
+ break;
+ case CPU_TREMONT_D:
+ banktype = tremont[e->bank];
+ break;
+ case CPU_SAPPHIRERAPIDS:
+ banktype = sapphire[e->bank];
+ break;
+ default:
+ return;
+ }
+
+ switch (banktype) {
+ case BT_UNKNOWN:
+ break;
+
+ case BT_PCU:
+ mce_snprintf(e->error_msg, "PCU: ");
+ f = EXTRACT(status, 24, 31);
+ if (f)
+ decode_bitfield(e, f, pcu1);
+ f = EXTRACT(status, 20, 23);
+ if (f)
+ decode_bitfield(e, f, pcu2);
+ f = EXTRACT(status, 16, 19);
+ if (f)
+ decode_bitfield(e, f, pcu3);
+ break;
+
+ case BT_UPI:
+ mce_snprintf(e->error_msg, "UPI: ");
+ f = EXTRACT(status, 22, 31);
+ if (f)
+ decode_bitfield(e, status, upi1);
+ f = EXTRACT(status, 16, 21);
+ decode_bitfield(e, f, upi2);
+ break;
+
+ case BT_M2M:
+ mce_snprintf(e->error_msg, "M2M: ");
+ f = EXTRACT(status, 24, 25);
+ mce_snprintf(e->error_msg, "MscodDDRType=0x%" PRIx64, f);
+ f = EXTRACT(status, 26, 31);
+ mce_snprintf(e->error_msg, "MscodMiscErrs=0x%" PRIx64, f);
+ decode_bitfield(e, status, m2m);
+ break;
+
+ case BT_IMC:
+ mce_snprintf(e->error_msg, "MemCtrl: ");
+ f = EXTRACT(status, 16, 23);
+ switch (EXTRACT(status, 24, 31)) {
+ case 0: decode_bitfield(e, f, imc0); break;
+ case 1: decode_bitfield(e, f, imc1); break;
+ case 2: decode_bitfield(e, f, imc2); break;
+ case 4: decode_bitfield(e, f, imc4); break;
+ case 8: decode_bitfield(e, f, imc8); break;
+ case 0x10: decode_bitfield(e, f, imc10); break;
+ }
+ i10nm_imc_misc(e);
+ break;
+ }
+
+ /* Memory error specific code: stop unless the error is at the memory controller */
+ if ((mca >> 7) != 1)
+ return;
+
+ /* Ignore unless this is a corrected extended error from an iMC bank */
+ if (banktype != BT_IMC || (status & MCI_STATUS_UC))
+ return;
+
+ /* Parse the reported channel */
+ i10nm_memerr_misc(e, &channel);
+ if (channel == -1)
+ return;
+ mce_snprintf(e->mc_location, "memory_channel=%d", channel);
+}
+
+/*
+ * The DIMM cannot be identified, but the channel is derived from the bank:
+ * banks 12..27 cover four memory controllers with three channels each.
+ * NOTE(review): sapphire[] marks banks 13..20 as IMC - confirm it fits.
+ */
+void i10nm_memerr_misc(struct mce_event *e, int *channel)
+{
+ uint64_t status = e->status;
+ unsigned int chan, imc;
+
+ /* Not a memory error: leave *channel untouched */
+ if (!test_prefix(7, status & 0xefff))
+ return;
+
+ chan = EXTRACT(status, 0, 3);
+ if (chan == 0xf)
+ return;
+
+ switch (e->bank) {
+ case 12: /* M2M 0 */
+ case 13: /* IMC 0, Channel 0 */
+ case 14: /* IMC 0, Channel 1 */
+ case 15: /* IMC 0, Channel 2 */
+ imc = 0;
+ break;
+ case 16: /* M2M 1 */
+ case 17: /* IMC 1, Channel 0 */
+ case 18: /* IMC 1, Channel 1 */
+ case 19: /* IMC 1, Channel 2 */
+ imc = 1;
+ break;
+ case 20: /* M2M 2 */
+ case 21: /* IMC 2, Channel 0 */
+ case 22: /* IMC 2, Channel 1 */
+ case 23: /* IMC 2, Channel 2 */
+ imc = 2;
+ break;
+ case 24: /* M2M 3 */
+ case 25: /* IMC 3, Channel 0 */
+ case 26: /* IMC 3, Channel 1 */
+ case 27: /* IMC 3, Channel 2 */
+ imc = 3;
+ break;
+ default:
+ return;
+ }
+
+ channel[0] = imc * 3 + chan;
+}
--- rasdaemon-0.6.1.orig/mce-intel.c 2021-09-17 15:29:39.189845188 -0400
+++ rasdaemon-0.6.1/mce-intel.c 2021-09-17 15:29:45.977790658 -0400
@@ -411,6 +411,13 @@ if (test_prefix(11, (e->status & 0xffffL
case CPU_SKYLAKE_XEON:
skylake_s_decode_model(ras, e);
break;
+ case CPU_ICELAKE_XEON:
+ case CPU_ICELAKE_DE:
+ case CPU_TREMONT_D:
+ case CPU_SAPPHIRERAPIDS:
+ /* All 10nm models share one decoder, which switches on cputype */
+ i10nm_decode_model(mce->cputype, ras, e);
+ break;
default:
break;
}
--- rasdaemon-0.6.1.orig/ras-mce-handler.c 2021-09-17 15:29:39.189845188 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.c 2021-09-17 15:29:45.977790658 -0400
@@ -56,6 +56,10 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
[CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_SKYLAKE_XEON] = "Skylake server",
[CPU_AMD_SMCA] = "AMD Scalable MCA",
+ [CPU_ICELAKE_XEON] = "Icelake server",
+ [CPU_ICELAKE_DE] = "Icelake server D Family",
+ [CPU_TREMONT_D] = "Tremont microserver",
+ [CPU_SAPPHIRERAPIDS] = "Sapphirerapids server",
};
static enum cputype select_intel_cputype(struct ras_events *ras)
@@ -107,6 +111,14 @@ else if (mce->model == 0x85)
return CPU_KNIGHTS_MILL;
else if (mce->model == 0x55)
return CPU_SKYLAKE_XEON;
+ else if (mce->model == 0x6a)
+ return CPU_ICELAKE_XEON;
+ else if (mce->model == 0x6c)
+ return CPU_ICELAKE_DE;
+ else if (mce->model == 0x86)
+ return CPU_TREMONT_D;
+ else if (mce->model == 0x8f)
+ return CPU_SAPPHIRERAPIDS;
if (mce->model > 0x1a) {
log(ALL, LOG_INFO,
--- rasdaemon-0.6.1.orig/ras-mce-handler.h 2021-09-17 15:29:39.189845188 -0400
+++ rasdaemon-0.6.1/ras-mce-handler.h 2021-09-17 15:29:45.977790658 -0400
@@ -51,6 +51,10 @@ enum cputype {
CPU_KNIGHTS_MILL,
CPU_SKYLAKE_XEON,
CPU_AMD_SMCA,
+ CPU_ICELAKE_XEON,
+ CPU_ICELAKE_DE,
+ CPU_TREMONT_D,
+ CPU_SAPPHIRERAPIDS,
};
struct mce_event {
@@ -131,6 +135,7 @@ void tulsa_decode_model(struct mce_event
void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e);
void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e);
void skylake_s_decode_model(struct ras_events *ras, struct mce_event *e);
+void i10nm_decode_model(enum cputype cputype, struct ras_events *ras, struct mce_event *e);
/* AMD error code decode function */
void decode_amd_errcode(struct mce_event *e);

View File

@ -0,0 +1,47 @@
From: Muralidhara M K <muralimk@amd.com>
This patch removes trailing spaces at the end of a line from
file location and fixes --layout option to parse dimm nodes
to get the size from ras-mc-ctl.
Issue is reported https://github.com/mchehab/rasdaemon/issues/43
Where '> ras-mc-ctl --layout' reports all 0s
With this change the layout prints the correct dimm sizes
> sudo ras-mc-ctl --layout
+-----------------------------------------------+
| mc0 |
| csrow0 | csrow1 | csrow2 | csrow3 |
----------+-----------------------------------------------+
...
channel7: | 16384 MB | 0 MB | 0 MB | 0 MB |
channel6: | 16384 MB | 0 MB | 0 MB | 0 MB |
...
----------+-----------------------------------------------+
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
util/ras-mc-ctl.in | 2 ++
1 file changed, 2 insertions(+)
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index 1e3aeb7..b22dd60 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -246,6 +246,7 @@ sub parse_dimm_nodes
if (($file =~ /max_location$/)) {
open IN, $file;
my $location = <IN>;
+ $location =~ s/\s+$//;
close IN;
my @temp = split(/ /, $location);
@@ -288,6 +289,7 @@ sub parse_dimm_nodes
open IN, $file;
my $location = <IN>;
+ $location =~ s/\s+$//;
close IN;
my @pos;

227
rasdaemon.spec Normal file
View File

@ -0,0 +1,227 @@
# Package metadata for rasdaemon 0.6.1 plus the downstream patch series.
Name: rasdaemon
Version: 0.6.1
Release: 12%{?dist}
Summary: Utility to receive RAS error tracings
Group: Applications/System
License: GPLv2
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2
# Not built for s390/s390x.
ExcludeArch: s390 s390x
BuildRequires: gettext-devel
BuildRequires: perl-generators
BuildRequires: sqlite-devel
BuildRequires: systemd
# libtool is needed because the prep stage re-runs autoreconf.
BuildRequires: libtool
# Virtual provide for bundled code (presumably the kernel trace-event parsing
# library shipped inside the tarball -- confirm against the source tree).
Provides: bundled(kernel-event-lib)
Requires: hwdata
# The ras-mc-ctl Perl script talks to the event database through DBI/SQLite.
Requires: perl-DBD-SQLite
# dmidecode is only pulled in on x86 builds.
%ifarch %{ix86} x86_64
Requires: dmidecode
%endif
Requires(post): systemd
Requires(preun): systemd
Requires(postun): systemd
# Downstream patch series, applied in numeric order during prep. Most files
# appear to be named after the upstream commit hash they carry.
Patch1: 60a91e4da4f2daf2b10143fc148a8043312b61e5.patch
Patch2: a16ca0711001957ee98f2c124abce0fa1f801529.patch
Patch3: add_upstream_labels.patch
Patch4: b22be68453b2497e86cbd273b9cd56fadc5859e3.patch
Patch5: 2a1d217660351c08eb2f8bccebf939abba2f7e69.patch
Patch6: 8704a85d8dc3483423ec2934fee8132f85f8fdb6.patch
Patch7: cc2ce5c65ed5a42eaa97aa3659854add6d808da5.patch
Patch8: 854364ba44aee9bc5646f6537fc744b0b54aff37.patch
Patch9: 9acef39f13833f7d53ef96abc5a72e79384260f4.patch
Patch10: 28ea956acc2dab7c18b4701f9657afb9ab3ddc79.patch
Patch11: aecf33aa70331670c06db6b652712b476e24051c.patch
Patch12: 7937f0d6c2aaaed096f3a3d306416743c0dcb7a4.patch
Patch13: rasdaemon-ras-mc-ctl-Fix-script-to-parse-dimm-sizes.patch
Patch14: 0862a096c3a1d0f993703ab3299f1ddfadf53d7f.patch
Patch15: 546cf713f667437fb6e283cc3dc090679eb47d08.patch
Patch16: 2290d65b97311dd5736838f1e285355f7f357046.patch
Patch17: 16d929b024c31d54a7f8a72eab094376c7be27f5.patch
Patch18: b497a3d6a39d402c41065e9284d49114b97e3bfe.patch
Patch19: ce6e7864f11f709c4f803828fbc8e507d115d03b.patch
Patch20: a8c776ed94f68ae31d7b5f74e19545698898c13c.patch
%description
%{name} is a RAS (Reliability, Availability and Serviceability) logging tool.
It currently records memory errors, using the EDAC tracing events.
EDAC is a set of drivers in the Linux kernel that handle detection of ECC errors
from memory controllers for most chipsets on i386 and x86_64 architectures.
EDAC drivers for other architectures like arm also exist.
This userspace component consists of an init script which makes sure
EDAC drivers and DIMM labels are loaded at system startup, as well as
a utility for reporting current error counts from the EDAC sysfs files.
%prep
%setup -q
# Apply the downstream patches in numeric order, each with one leading path
# component stripped.
%patch1 -p1
%patch2 -p1
%patch3 -p1
%patch4 -p1
%patch5 -p1
%patch6 -p1
%patch7 -p1
%patch8 -p1
%patch9 -p1
%patch10 -p1
%patch11 -p1
%patch12 -p1
%patch13 -p1
%patch14 -p1
%patch15 -p1
%patch16 -p1
%patch17 -p1
%patch18 -p1
%patch19 -p1
%patch20 -p1
# The tarball's generated autotools files are frozen at the aclocal version
# that was in use when it was first run, and would keep requiring that older
# automake release; regenerate them with the tools installed on the builder.
autoreconf -vfi
%build
# Two configure profiles: ARM builds enable the ARM, non-standard and HiSilicon
# decoders and leave out MCE, extlog and memory-failure; every other
# architecture gets the MCE/extlog/memory-failure build instead. AER, sqlite3
# and ABRT reporting are enabled in both.
%ifarch %{arm} aarch64
%configure --enable-aer --enable-sqlite3 --enable-abrt-report --enable-non-standard --enable-hisi-ns-decode --enable-arm
%else
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-abrt-report --enable-memory-failure
%endif
make %{?_smp_mflags}
%install
make install DESTDIR=%{buildroot}
# Ship both systemd units from misc/ as regular, non-executable files.
install -D -p -m 0644 misc/rasdaemon.service %{buildroot}%{_unitdir}/rasdaemon.service
install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service
# The DIMM label databases are plain data files, so install them 0644. The
# previous 0655 set the execute bit for group/other while leaving it off for
# the owner, which is never a meaningful mode for a data file.
install -D -p -m 0644 labels/* %{buildroot}%{_sysconfdir}/ras/dimm_labels.d
# Remove the generic INSTALL notes from the build tree and the headers that
# make install placed under /usr/include; neither is part of the package.
rm INSTALL %{buildroot}/usr/include/*.h
%files
# The licence text is tagged as a licence file instead of being mixed in with
# the general documentation.
%license COPYING
%doc AUTHORS ChangeLog README TODO
# Daemon and the ras-mc-ctl control script.
%{_sbindir}/rasdaemon
%{_sbindir}/ras-mc-ctl
%{_mandir}/*/*
# rasdaemon.service and ras-mc-ctl.service.
%{_unitdir}/*.service
# State directory and the DIMM label databases.
%{_sharedstatedir}/rasdaemon
%{_sysconfdir}/ras/dimm_labels.d
%changelog
* Tue Oct 12 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-12
- Adding missing bits from b497a3d6a39d402c41065e9284d49114b97e3bfe [1923254]
* Tue Oct 12 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-11
- Removed bits from devlink and diskerrors that aren't used yet [1923254]
* Tue Oct 12 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-10
- Add miscellaneous patches required by customer [1923254]
* Wed Oct 06 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-9
- Prevent ras-mc-ctl trying to access extlog and mce tables if rasdaemon was built without support for them [2011404]
* Thu Aug 26 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-8
- Disable MCE and extlog in arm packages [2009499]
* Thu Aug 26 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-7
- Add support for AMD SMCA banks for family 19 [1991955]
* Wed May 26 2021 Aristeu Rozanski <aris@redhat.com> 0.6.1-6
- Add support for AMD SMCA [1965011]
* Wed Apr 08 2020 Aristeu Rozanski <aris@redhat.com> 0.6.1-5
- Fix high CPU usage when CPUs are offline [1683420]
* Wed Apr 08 2020 Aristeu Rozanski <aris@redhat.com> 0.6.1-4
- Include upstream labels [1665418]
* Thu Jul 11 2019 Aristeu Rozanski <aris@redhat.com> 0.6.1-3
- Add support for AMD scalable MCA [1725488]
* Mon Aug 20 2018 Aristeu Rozanski <aris@redhat.com> 0.6.1-2
- Add support for error count display [1573685]
* Wed Apr 25 2018 Mauro Carvalho Chehab <mchehab+samsung@kernel.org> 0.6.1-1
- Bump to version 0.6.1 adding support for Skylake Xeon MSCOD, a bug fix and some new DELL labels
* Fri Feb 09 2018 Fedora Release Engineering <releng@fedoraproject.org> - 0.6.0-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild
* Sat Oct 14 2017 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.6.0-1
- Bump to version 0.6.0 adding support for Arm and Hisilicon events and update Dell Skylate labels
* Thu Aug 03 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-6
- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild
* Thu Jul 27 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-5
- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Mass_Rebuild
* Sat Feb 11 2017 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.8-4
- Rebuilt for https://fedoraproject.org/wiki/Fedora_26_Mass_Rebuild
* Fri Apr 15 2016 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.8-3
- Add a virtual provide, per BZ#104132
* Fri Apr 15 2016 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.8-2
- Bump to version 0.5.8 with support for Broadwell EP/EX MSCOD/DE MSCOD
* Thu Feb 04 2016 Fedora Release Engineering <releng@fedoraproject.org> - 0.5.6-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_24_Mass_Rebuild
* Fri Jul 03 2015 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.6-1
- Bump to version 0.5.6 with support for LMCE and some fixes
* Thu Jun 18 2015 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.5-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_23_Mass_Rebuild
* Wed Jun 03 2015 Mauro Carvalho Chehab <mchehab@osg.samsung.com> 0.5.5-1
- Bump to version 0.5.5 with support for newer Intel platforms & some fixes
* Tue Sep 16 2014 Peter Robinson <pbrobinson@fedoraproject.org> 0.5.4-3
- aarch64/ppc64 have edac capabilities
- spec cleanups
- No need to run autoreconf
* Sun Aug 17 2014 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.4-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_22_Mass_Rebuild
* Fri Aug 15 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.4-1
- Bump to version 0.5.4 with some fixes, mainly for amd64
* Sun Aug 10 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.3-1
- Bump to version 0.5.3 and enable ABRT and ExtLog
* Sun Jun 08 2014 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.5.2-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_21_Mass_Rebuild
* Thu Apr 03 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.2-1
- fix and enable ABRT report support
* Fri Mar 28 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.1-1
- Do some fixes at the service files and add some documentation for --record
* Sun Feb 16 2014 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.5.0-1
- Add experimental ABRT support
* Tue Sep 10 2013 Mauro Carvalho Chehab <m.chehab@samsung.com> 0.4.2-1
- Fix ras-mc-ctl layout filling
* Sun Aug 04 2013 Fedora Release Engineering <rel-eng@lists.fedoraproject.org> - 0.4.1-5
- Rebuilt for https://fedoraproject.org/wiki/Fedora_20_Mass_Rebuild
* Wed Jul 17 2013 Petr Pisar <ppisar@redhat.com> - 0.4.1-4
- Perl 5.18 rebuild
* Sun Jun 2 2013 Peter Robinson <pbrobinson@fedoraproject.org> 0.4.1-3
- ARM has EDMA drivers (currently supported in Calxeda highbank)
* Wed May 29 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-2
- Fix the name of perl-DBD-SQLite package
* Wed May 29 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-1
- Updated to version 0.4.1 with contains some bug fixes
* Tue May 28 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.0-1
- Updated to version 0.4.0 and added support for mce, aer and sqlite3 storage
* Mon May 20 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.3.0-1
- Package created

1
sources Normal file
View File

@ -0,0 +1 @@
SHA512 (rasdaemon-0.6.1.tar.bz2) = a221a7ea0e4555befbf8acb9d4ce22d5cc7861a0da458111a24905fa06be0e8f3b18d8de5334310a423657febc0532d2705236de89cb1788302bf487ede29f60