Import from CS git

This commit is contained in:
eabdullin 2024-10-04 11:32:15 +00:00
parent 2468c69d7d
commit c68ac7d2a3
7 changed files with 413 additions and 2 deletions

View File

@ -0,0 +1,93 @@
commit 73d8177ce0d2fcb7693cacee4778d0845ebd3788
Author: sathya priya kumar <SathyaPriya.K@amd.com>
Date: Thu Jun 13 05:29:09 2024 +0000
rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits
Improve on the smca_smu2_mce_desc handling introduced in commit ced615c.
Update the existing array with the extended error descriptions instead
of creating a new array, simplifying the code.
Signed-off-by: Sathya Priya Kumar <sathyapriya.k@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
mce-amd-smca.c | 29 +++--------------------------
ras-mce-handler.h | 1 -
2 files changed, 3 insertions(+), 27 deletions(-)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-07-18 11:14:26.008582740 -0400
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-07-18 11:15:05.510270132 -0400
@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d
"An ECC or parity error in an SMU RAM instance",
};
-static const char * smca_smu2_mce_desc[64] = {
+static const char * const smca_smu2_mce_desc[] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
"Data Cache Bank A ECC or parity error",
@@ -410,14 +410,13 @@ static const char * smca_smu2_mce_desc[6
"Instruction Tag Cache Bank B ECC or parity error",
"System Hub Read Buffer ECC or parity error",
"PHY RAS ECC Error",
-};
-
-static const char * smca_smu2_ext_mce_desc[] = {
+ [12 ... 57] = "Reserved",
"A correctable error from a GFX Sub-IP",
"A fatal error from a GFX Sub-IP",
"Reserved",
"Reserved",
"A poison error from a GFX Sub-IP",
+ "Reserved",
};
static const char * const smca_mp5_mce_desc[] = {
@@ -824,27 +823,6 @@ static struct smca_bank_name smca_names[
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
-void smca_smu2_ext_err_desc(void)
-{
- int i, j;
- int smu2_bits = 62;
-
- /*
- * MCA_CTL_SMU error stings are defined for b'58:59 and b'62
- * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU
- *
- * b'0:11 can be decoded from existing array smca_smu2_mce_desc.
- * b'12:57 are Reserved and b'58:62 are appended to the
- * smca_smu2_mce_desc.
- */
- for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) {
- for ( ; i < 58; i++)
- smca_smu2_mce_desc[i] = "Reserved";
-
- smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j];
- }
-}
-
void amd_decode_errcode(struct mce_event *e)
{
@@ -936,7 +914,6 @@ unsigned short xec = (e->status >> 16) &
mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
(ipid_high & MCI_IPID_MCATYPE) >> 16);
- smca_smu2_ext_err_desc();
fixup_hwid(m, &mcatype_hwid);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-07-18 11:14:26.008582740 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-07-18 11:14:28.987559165 -0400
@@ -121,7 +121,6 @@ int set_intel_imc_log(enum cputype cputy
/* Undertake AMD SMCA Error Decoding */
void decode_smca_error(struct mce_event *e, struct mce_priv *m);
void amd_decode_errcode(struct mce_event *e);
-void smca_smu2_ext_err_desc(void);
/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);

View File

@ -0,0 +1,34 @@
commit 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e
Author: Aristeu Rozanski <arozansk@redhat.com>
Date: Tue Apr 9 10:06:30 2024 -0400
mce-amd-smca: update smca_hwid to use smca_bank_types
bank_type is used as smca_bank_types everywhere, there's no point in
declaring it as unsigned int. It also upsets covscan:
3. rasdaemon-0.6.7/mce-amd-smca.c:914: assignment: Assigning: "bank_type" = "s_hwid->bank_type".
7. rasdaemon-0.6.7/mce-amd-smca.c:926: cond_at_most: Checking "bank_type >= 64U" implies that "bank_type" and "s_hwid->bank_type" may be up to 63 on the false branch.
14. rasdaemon-0.6.7/mce-amd-smca.c:942: overrun-local: Overrunning array "smca_mce_descs" of 38 16-byte elements at element index 63 (byte offset 1023) using index "bank_type" (which evaluates to 63).
# 940| /* Only print the descriptor of valid extended error code */
# 941| if (xec < smca_mce_descs[bank_type].num_descs)
# 942|-> mce_snprintf(e->mcastatus_msg,
# 943| "%s. Ext Err Code: %d",
# 944| smca_mce_descs[bank_type].descs[xec],
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 7521ff7..6632663 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -706,7 +706,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
};
struct smca_hwid {
- unsigned int bank_type; /* Use with smca_bank_types for easy indexing.*/
+ enum smca_bank_types bank_type;
uint32_t mcatype_hwid; /* mcatype,hwid bit 63-32 in MCx_IPID Register*/
};

View File

@ -0,0 +1,22 @@
commit 885e546add918457c453bd3f753ac7df90b39e36
Author: weidongkl <weidongkl@sina.com>
Date: Tue Sep 19 16:29:21 2023 +0800
Add a space between "diskerror_event" and "store"
Signed-off-by: weidongkl <weidongkl@sina.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/ras-record.c b/ras-record.c
index a5f99ae..6b050bb 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -484,7 +484,7 @@ int ras_store_diskerror_event(struct ras_events *ras, struct diskerror_event *ev
if (!priv || !priv->stmt_diskerror_event)
return 0;
- log(TERM, LOG_INFO, "diskerror_eventstore: %p\n", priv->stmt_diskerror_event);
+ log(TERM, LOG_INFO, "diskerror_event store: %p\n", priv->stmt_diskerror_event);
sqlite3_bind_text(priv->stmt_diskerror_event, 1, ev->timestamp, -1, NULL);
sqlite3_bind_text(priv->stmt_diskerror_event, 2, ev->dev, -1, NULL);

View File

@ -0,0 +1,24 @@
commit 9bd84aef87978b806178a73ed33c39d6c442fc1f
Author: weidong <weidongkl@sina.com>
Date: Tue Aug 8 08:59:12 2023 +0000
add ':' before error output
All prints except disk are preceded by a colon
Signed-off-by: weidong <weidongkl@sina.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
index dc326d3..13078c2 100755
--- a/util/ras-mc-ctl.in
+++ b/util/ras-mc-ctl.in
@@ -1469,7 +1469,7 @@ sub errors
$out .= "\n";
}
if ($out ne "") {
- print "Disk errors\n$out\n";
+ print "Disk errors:\n$out\n";
} else {
print "No disk errors.\n\n";
}

View File

@ -0,0 +1,117 @@
commit 9c86f6255f67a8bae28cd46c54500fc16bfc7a30
Author: Yang Shi <shy828301@gmail.com>
Date: Mon Apr 4 16:34:05 2022 -0700
rasdaemon: use the new block_rq_error tracepoint
Since Linux 5.18-rc1 a new block tracepoint called block_rq_error is
available, dedicated to tracing disk error events. Currently
rasdaemon is using block_rq_complete, which also traces successful cases.
This produces excessive tracing logs and some overhead, since the event is
triggered quite often.
Use the new tracepoint for disk error reporting, and the new trace point
has the same format as block_rq_complete.
Signed-off-by: Yang Shi <shy828301@gmail.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
ras-events.c | 53 ++++++++++-------------------------------------------
ras-record.c | 2 +-
2 files changed, 11 insertions(+), 44 deletions(-)
--- rasdaemon-0.6.7.orig/ras-events.c 2024-05-14 11:05:40.020599541 -0400
+++ rasdaemon-0.6.7/ras-events.c 2024-05-14 11:06:38.831067957 -0400
@@ -27,6 +27,7 @@ * Foundation, Inc., 51 Franklin Street,
#include <sys/poll.h>
#include <signal.h>
#include <sys/signalfd.h>
+#include <linux/version.h>
#include "libtrace/kbuffer.h"
#include "libtrace/event-parse.h"
#include "ras-mc-handler.h"
@@ -229,7 +230,7 @@ if (rc < 0) {
#endif
#ifdef HAVE_DISKERROR
- rc |= __toggle_ras_mc_event(ras, "block", "block_rq_complete", enable);
+ rc |= __toggle_ras_mc_event(ras, "block", "block_rq_error", enable);
#endif
#ifdef HAVE_MEMORY_FAILURE
@@ -241,37 +242,6 @@ free_ras:
return rc;
}
-/*
- * Set kernel filter. libtrace doesn't provide an API for setting filters
- * in kernel, we have to implement it here.
- */
-static int filter_ras_mc_event(struct ras_events *ras, char *group, char *event,
- const char *filter_str)
-{
- int fd, rc;
- char fname[MAX_PATH + 1];
-
- snprintf(fname, sizeof(fname), "events/%s/%s/filter", group, event);
- fd = open_trace(ras, fname, O_RDWR | O_APPEND);
- if (fd < 0) {
- log(ALL, LOG_WARNING, "Can't open filter file\n");
- return errno;
- }
-
- rc = write(fd, filter_str ,strlen(filter_str));
- if (rc < 0) {
- log(ALL, LOG_WARNING, "Can't write to filter file\n");
- close(fd);
- return rc;
- }
- close(fd);
- if (!rc) {
- log(ALL, LOG_WARNING, "Nothing was written on filter file\n");
- return EIO;
- }
-
- return 0;
-}
/*
* Tracing read code
@@ -901,17 +871,14 @@ (void)open("/sys/kernel/debug/ras/daemon
#endif
#ifdef HAVE_DISKERROR
- rc = filter_ras_mc_event(ras, "block", "block_rq_complete", "error != 0");
- if (!rc) {
- rc = add_event_handler(ras, pevent, page_size, "block",
- "block_rq_complete", ras_diskerror_event_handler,
- NULL, DISKERROR_EVENT);
- if (!rc)
- num_events++;
- else
- log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
- "block", "block_rq_complete");
- }
+ rc = add_event_handler(ras, pevent, page_size, "block",
+ "block_rq_error", ras_diskerror_event_handler,
+ NULL, DISKERROR_EVENT);
+ if (!rc)
+ num_events++;
+ else
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
+ "block", "block_rq_error");
#endif
#ifdef HAVE_MEMORY_FAILURE
--- rasdaemon-0.6.7.orig/ras-record.c 2024-05-14 11:07:24.573654494 -0400
+++ rasdaemon-0.6.7/ras-record.c 2024-05-14 11:07:07.626807674 -0400
@@ -456,7 +456,7 @@ return 0;
#endif
/*
- * Table and functions to handle block:block_rq_complete
+ * Table and functions to handle block:block_rq_error
*/
#ifdef HAVE_DISKERROR

View File

@ -0,0 +1,94 @@
commit ced615cf8146f51b5d6fe7a29107a2adc77407ca
Author: Sathya Priya Kumar <sathyapriya.k@amd.com>
Date: Thu Jan 11 01:20:07 2024 -0600
rasdaemon: Add error decoding for MCA_CTL_SMU extended bits
Enable error decoding support for the newly added extended
error bit descriptions from MCA_CTL_SMU.
b'0:11 can be decoded from the existing array smca_smu2_mce_desc.
Define a function to append the newly defined b'58:62 to
smca_smu2_mce_desc. This avoids having to maintain the Reserved
bits b'12:57 in the code.
Signed-off-by: Sathya Priya Kumar <sathyapriya.k@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
mce-amd-smca.c | 33 ++++++++++++++++++++++++++++++++-
ras-mce-handler.h | 1 +
2 files changed, 33 insertions(+), 1 deletion(-)
--- rasdaemon-0.6.7.orig/mce-amd-smca.c 2024-06-28 10:34:16.453522865 -0400
+++ rasdaemon-0.6.7/mce-amd-smca.c 2024-06-28 10:34:46.049124270 -0400
@@ -397,7 +397,7 @@ static const char * const smca_smu_mce_d
"An ECC or parity error in an SMU RAM instance",
};
-static const char * const smca_smu2_mce_desc[] = {
+static const char * smca_smu2_mce_desc[64] = {
"High SRAM ECC or parity error",
"Low SRAM ECC or parity error",
"Data Cache Bank A ECC or parity error",
@@ -409,6 +409,15 @@ static const char * const smca_smu2_mce_
"Instruction Tag Cache Bank A ECC or parity error",
"Instruction Tag Cache Bank B ECC or parity error",
"System Hub Read Buffer ECC or parity error",
+ "PHY RAS ECC Error",
+};
+
+static const char * smca_smu2_ext_mce_desc[] = {
+ "A correctable error from a GFX Sub-IP",
+ "A fatal error from a GFX Sub-IP",
+ "Reserved",
+ "Reserved",
+ "A poison error from a GFX Sub-IP",
};
static const char * const smca_mp5_mce_desc[] = {
@@ -815,6 +824,27 @@ static struct smca_bank_name smca_names[
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};
+void smca_smu2_ext_err_desc(void)
+{
+ int i, j;
+ int smu2_bits = 62;
+
+ /*
+ * MCA_CTL_SMU error stings are defined for b'58:59 and b'62
+ * in MI300A AMD systems. See AMD PPR MCA::SMU::MCA_CTL_SMU
+ *
+ * b'0:11 can be decoded from existing array smca_smu2_mce_desc.
+ * b'12:57 are Reserved and b'58:62 are appended to the
+ * smca_smu2_mce_desc.
+ */
+ for (i = 12, j = 0; i < smu2_bits || j < 5; i++, j++) {
+ for ( ; i < 58; i++)
+ smca_smu2_mce_desc[i] = "Reserved";
+
+ smca_smu2_mce_desc[i] = smca_smu2_ext_mce_desc[j];
+ }
+}
+
void amd_decode_errcode(struct mce_event *e)
{
@@ -906,6 +936,7 @@ unsigned short xec = (e->status >> 16) &
mcatype_hwid = HWID_MCATYPE(ipid_high & MCI_IPID_HWID,
(ipid_high & MCI_IPID_MCATYPE) >> 16);
+ smca_smu2_ext_err_desc();
fixup_hwid(m, &mcatype_hwid);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
--- rasdaemon-0.6.7.orig/ras-mce-handler.h 2024-06-28 10:34:16.453522865 -0400
+++ rasdaemon-0.6.7/ras-mce-handler.h 2024-06-28 10:34:17.795508302 -0400
@@ -121,6 +121,7 @@ int set_intel_imc_log(enum cputype cputy
/* Undertake AMD SMCA Error Decoding */
void decode_smca_error(struct mce_event *e, struct mce_priv *m);
void amd_decode_errcode(struct mce_event *e);
+void smca_smu2_ext_err_desc(void);
/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);

View File

@ -1,8 +1,8 @@
Name: rasdaemon
Version: 0.6.7
Release: 9%{?dist}
Release: 15%{?dist}
Summary: Utility to receive RAS error tracings
License: GPLv2
License: GPL-2.0-only
URL: http://git.infradead.org/users/mchehab/rasdaemon.git
Source0: http://www.infradead.org/~mchehab/rasdaemon/%{name}-%{version}.tar.bz2
Patch0: labels.patch
@ -33,6 +33,12 @@ Patch24: 1f74a59ee33b7448b00d7ba13d5ecd4918b9853c.patch
Patch25: 2d15882a0cbfce0b905039bebc811ac8311cd739.patch
Patch26: c785d309dcbdeb7ecd219975244f3944a8d047e9.patch
Patch27: b6a64416ab31b66ce92cabcc7fa1f3c5e9db2e87.patch
Patch28: 9c86f6255f67a8bae28cd46c54500fc16bfc7a30.patch
Patch29: 9bd84aef87978b806178a73ed33c39d6c442fc1f.patch
Patch30: 885e546add918457c453bd3f753ac7df90b39e36.patch
Patch31: 7ed2da7aedf8bc8ad4c4efe7acbda60ba061be6e.patch
Patch32: ced615cf8146f51b5d6fe7a29107a2adc77407ca.patch
Patch33: 73d8177ce0d2fcb7693cacee4778d0845ebd3788.patch
ExcludeArch: s390 s390x
BuildRequires: make
@ -95,6 +101,12 @@ an utility for reporting current error counts from the EDAC sysfs files.
%patch25 -p1
%patch26 -p1
%patch27 -p1
%patch28 -p1
%patch29 -p1
%patch30 -p1
%patch31 -p1
%patch32 -p1
%patch33 -p1
# The tarball is locked in time the first time aclocal was ran and will keep
# requiring an older version of automake
@ -130,6 +142,21 @@ sed -i "s/^PAGE_CE_ACTION=.*/PAGE_CE_ACTION=account/" %{buildroot}/%{_sysconfdir
%{_sysconfdir}/sysconfig/rasdaemon
%changelog
* Thu Jul 18 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-14
- rasdaemon: mce-amd-smca: Optimizing decoding of MCA_CTL_SMU bits [RHEL-48819]
* Fri Jun 28 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-13
- rasdaemon: Add error decoding for MCA_CTL_SMU extended bits [RHEL-35718]
* Thu Jun 20 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-12
- mce-amd-smca: update smca_hwid to use smca_bank_types [RHEL-24170]
* Wed May 08 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-11
- Fix excessive block messages [RHEL-8708]
* Wed Jan 10 2024 Aristeu Rozanski <aris@redhat.com> 0.6.7-10
- Update License string to use SPDX [RHELMISC-1262]
* Thu Oct 26 2023 Aristeu Rozanski <aris@redhat.com> 0.6.7-9
- Update SMCA support for AMD processors [RHEL-11092]