457386b234
Resolves: #RHEL-6982
160 lines
6.4 KiB
Diff
160 lines
6.4 KiB
Diff
diff -U0 smartmontools-7.1/ChangeLog.r5472 smartmontools-7.1/ChangeLog
|
|
diff -up smartmontools-7.1/smartd.conf.5.in.r5472 smartmontools-7.1/smartd.conf.5.in
|
|
--- smartmontools-7.1/smartd.conf.5.in.r5472 2019-12-13 21:20:45.000000000 +0100
|
|
+++ smartmontools-7.1/smartd.conf.5.in 2023-11-22 12:32:37.341051288 +0100
|
|
@@ -696,6 +696,20 @@ error log has increased since the last c
|
|
.I error
|
|
\- [NVMe] report if the "Number of Error Information Log Entries" from the
|
|
SMART/Health Information log has increased since the last check.
|
|
+.br
|
|
+[NEW EXPERIMENTAL SMARTD FEATURE]
|
|
+This will only be logged as LOG_CRIT if at least one of the new errors is
|
|
+still present in the Error Information log and its status indicates a
|
|
+device related error.
|
|
+Up to eight of the most recent of these errors are logged as LOG_INFO then.
|
|
+This is useful because the NVMe Error Information log is not persistent
|
|
+across power cycles or device resets.
|
|
+.br
|
|
+If all new errors are either no longer present in the log or are not device
|
|
+related (e.g. invalid command, invalid field in command, ...), a LOG_INFO
|
|
+message is generated instead.
|
|
+This avoids misleading warnings if the operating system issues unsupported
|
|
+commands and the device firmware also logs these kinds of errors.
|
|
.Sp
|
|
.\" %ENDIF OS Darwin FreeBSD Linux NetBSD Windows Cygwin
|
|
.I xerror
|
|
diff -up smartmontools-7.1/smartd.cpp.r5472 smartmontools-7.1/smartd.cpp
|
|
--- smartmontools-7.1/smartd.cpp.r5472 2019-12-29 14:10:18.000000000 +0100
|
|
+++ smartmontools-7.1/smartd.cpp 2023-11-22 12:35:19.254046678 +0100
|
|
@@ -2,7 +2,7 @@
|
|
* Home page of code is: https://www.smartmontools.org
|
|
*
|
|
* Copyright (C) 2002-11 Bruce Allen
|
|
- * Copyright (C) 2008-19 Christian Franke
|
|
+ * Copyright (C) 2008-23 Christian Franke
|
|
* Copyright (C) 2000 Michael Cornwell <cornwell@acm.org>
|
|
* Copyright (C) 2008 Oliver Bock <brevilo@users.sourceforge.net>
|
|
*
|
|
@@ -410,6 +410,9 @@ struct dev_config
|
|
|
|
ata_vendor_attr_defs attribute_defs; // -v options
|
|
|
|
+ // NVMe only
|
|
+ unsigned nvme_err_log_max_entries{}; // size of error log
|
|
+
|
|
dev_config();
|
|
};
|
|
|
|
@@ -2628,6 +2631,74 @@ static int nvme_get_max_temp_kelvin(cons
|
|
return k;
|
|
}
|
|
|
|
+// Check the NVMe Error Information log for device related errors.
|
|
+static bool check_nvme_error_log(const dev_config & cfg, dev_state & state, nvme_device * nvmedev,
|
|
+ uint64_t newcnt = 0)
|
|
+{
|
|
+ // Limit transfer size to one page (64 entries) to avoid problems with
|
|
+ // limits of NVMe pass-through layer or too low MDTS values.
|
|
+ unsigned want_entries = 64;
|
|
+ if (want_entries > cfg.nvme_err_log_max_entries)
|
|
+ want_entries = cfg.nvme_err_log_max_entries;
|
|
+ raw_buffer error_log_buf(want_entries * sizeof(nvme_error_log_page));
|
|
+ nvme_error_log_page * error_log =
|
|
+ reinterpret_cast<nvme_error_log_page *>(error_log_buf.data());
|
|
+ unsigned read_entries = nvme_read_error_log(nvmedev, error_log, want_entries, false /*!lpo_sup*/);
|
|
+ if (!read_entries) {
|
|
+ PrintOut(LOG_INFO, "Device: %s, Read %u entries from Error Information Log failed\n",
|
|
+ cfg.name.c_str(), want_entries);
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (!newcnt)
|
|
+ return true; // Support check only
|
|
+
|
|
+ // Scan log, find device related errors
|
|
+ uint64_t oldcnt = state.nvme_err_log_entries, mincnt = newcnt;
|
|
+ int err = 0, ign = 0;
|
|
+ for (unsigned i = 0; i < read_entries; i++) {
|
|
+ const nvme_error_log_page & e = error_log[i];
|
|
+ if (!e.error_count)
|
|
+ continue; // unused
|
|
+ if (e.error_count <= oldcnt)
|
|
+ break; // stop on first old entry
|
|
+ if (e.error_count < mincnt)
|
|
+ mincnt = e.error_count; // min known error
|
|
+ if (e.error_count > newcnt)
|
|
+ newcnt = e.error_count; // adjust maximum
|
|
+ uint16_t status = e.status_field >> 1;
|
|
+ if (!nvme_status_is_error(status) || nvme_status_to_errno(status) == EINVAL) {
|
|
+ ign++; // Not a device related error
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ // Log the most recent 8 errors
|
|
+ if (++err > 8)
|
|
+ continue;
|
|
+ char buf[64];
|
|
+ PrintOut(LOG_INFO, "Device: %s, NVMe error [%u], count %" PRIu64 ", status 0x%04x: %s\n",
|
|
+ cfg.name.c_str(), i, e.error_count, e.status_field,
|
|
+ nvme_status_to_info_str(buf, e.status_field >> 1));
|
|
+ }
|
|
+
|
|
+ std::string msg = strprintf("Device: %s, NVMe error count increased from %" PRIu64 " to %" PRIu64
|
|
+ " (%d new, %d ignored, %" PRIu64 " unknown)",
|
|
+ cfg.name.c_str(), oldcnt, newcnt, err, ign,
|
|
+ (mincnt > oldcnt + 1 ? mincnt - oldcnt - 1 : 0));
|
|
+ // LOG_CRIT only if device related errors are found
|
|
+ if (!err) {
|
|
+ PrintOut(LOG_INFO, "%s\n", msg.c_str());
|
|
+ }
|
|
+ else {
|
|
+ PrintOut(LOG_CRIT, "%s\n", msg.c_str());
|
|
+ MailWarning(cfg, state, 4, "%s", msg.c_str());
|
|
+ }
|
|
+
|
|
+ state.nvme_err_log_entries = newcnt;
|
|
+ state.must_write = true;
|
|
+ return true;
|
|
+}
|
|
+
|
|
static int NVMeDeviceScan(dev_config & cfg, dev_state & state, nvme_device * nvmedev,
|
|
const dev_config_vector * prev_cfgs)
|
|
{
|
|
@@ -2687,8 +2758,14 @@ static int NVMeDeviceScan(dev_config & c
|
|
}
|
|
|
|
// Init total error count
|
|
+ cfg.nvme_err_log_max_entries = id_ctrl.elpe + 1; // 0's based value
|
|
if (cfg.errorlog || cfg.xerrorlog) {
|
|
- state.nvme_err_log_entries = le128_to_uint64(smart_log.num_err_log_entries);
|
|
+ if (!check_nvme_error_log(cfg, state, nvmedev)) {
|
|
+ PrintOut(LOG_INFO, "Device: %s, Error Information unavailable, ignoring -l [x]error\n", name);
|
|
+ cfg.errorlog = cfg.xerrorlog = false;
|
|
+ }
|
|
+ else
|
|
+ state.nvme_err_log_entries = le128_to_uint64(smart_log.num_err_log_entries);
|
|
}
|
|
|
|
// If no supported tests selected, return
|
|
@@ -3760,16 +3837,12 @@ static int NVMeCheckDevice(const dev_con
|
|
|
|
// Check if number of errors has increased
|
|
if (cfg.errorlog || cfg.xerrorlog) {
|
|
- uint64_t oldcnt = state.nvme_err_log_entries;
|
|
uint64_t newcnt = le128_to_uint64(smart_log.num_err_log_entries);
|
|
- if (newcnt > oldcnt) {
|
|
- PrintOut(LOG_CRIT, "Device: %s, number of Error Log entries increased from %" PRIu64 " to %" PRIu64 "\n",
|
|
- name, oldcnt, newcnt);
|
|
- MailWarning(cfg, state, 4, "Device: %s, number of Error Log entries increased from %" PRIu64 " to %" PRIu64,
|
|
- name, oldcnt, newcnt);
|
|
- state.must_write = true;
|
|
+ if (newcnt > state.nvme_err_log_entries) {
|
|
+ // Warn only if device related errors are found
|
|
+ check_nvme_error_log(cfg, state, nvmedev, newcnt);
|
|
}
|
|
- state.nvme_err_log_entries = newcnt;
|
|
+ // else // TODO: Handle decrease of count?
|
|
}
|
|
|
|
CloseDevice(nvmedev, name);
|