smartmontools/smartmontools-7.4-r5472.patch
2023-11-22 16:19:43 +01:00

160 lines
6.4 KiB
Diff

diff -U0 smartmontools-7.1/ChangeLog.r5472 smartmontools-7.1/ChangeLog
diff -up smartmontools-7.1/smartd.conf.5.in.r5472 smartmontools-7.1/smartd.conf.5.in
--- smartmontools-7.1/smartd.conf.5.in.r5472 2019-12-13 21:20:45.000000000 +0100
+++ smartmontools-7.1/smartd.conf.5.in 2023-11-22 12:32:37.341051288 +0100
@@ -696,6 +696,20 @@ error log has increased since the last c
.I error
\- [NVMe] report if the "Number of Error Information Log Entries" from the
SMART/Health Information log has increased since the last check.
+.br
+[NEW EXPERIMENTAL SMARTD FEATURE]
+This will only be logged as LOG_CRIT if at least one of the new errors is
+still present in the Error Information log and its status indicates a
+device related error.
+In that case, up to eight of the most recent of these errors are also logged as LOG_INFO.
+This is useful because the NVMe Error Information log is not persistent
+across power cycles or device resets.
+.br
+If all new errors are either no longer present in the log or are not device
+related (e.g. invalid command, invalid field in command, ...), a LOG_INFO
+message is generated instead.
+This avoids misleading warnings if the operating system issues unsupported
+commands and the device firmware also logs these kinds of errors.
.Sp
.\" %ENDIF OS Darwin FreeBSD Linux NetBSD Windows Cygwin
.I xerror
diff -up smartmontools-7.1/smartd.cpp.r5472 smartmontools-7.1/smartd.cpp
--- smartmontools-7.1/smartd.cpp.r5472 2019-12-29 14:10:18.000000000 +0100
+++ smartmontools-7.1/smartd.cpp 2023-11-22 12:35:19.254046678 +0100
@@ -2,7 +2,7 @@
* Home page of code is: https://www.smartmontools.org
*
* Copyright (C) 2002-11 Bruce Allen
- * Copyright (C) 2008-19 Christian Franke
+ * Copyright (C) 2008-23 Christian Franke
* Copyright (C) 2000 Michael Cornwell <cornwell@acm.org>
* Copyright (C) 2008 Oliver Bock <brevilo@users.sourceforge.net>
*
@@ -410,6 +410,9 @@ struct dev_config
ata_vendor_attr_defs attribute_defs; // -v options
+ // NVMe only
+ unsigned nvme_err_log_max_entries{}; // size of error log
+
dev_config();
};
@@ -2628,6 +2631,74 @@ static int nvme_get_max_temp_kelvin(cons
return k;
}
+// Read the NVMe Error Information log and report entries newer than the stored count; returns false if the read fails.
+static bool check_nvme_error_log(const dev_config & cfg, dev_state & state, nvme_device * nvmedev,
+ uint64_t newcnt = 0) // newcnt == 0: only test that the log can be read
+{
+ // Limit transfer size to one page (64 entries) to avoid problems with
+ // limits of NVMe pass-through layer or too low MDTS values.
+ unsigned want_entries = 64;
+ if (want_entries > cfg.nvme_err_log_max_entries)
+ want_entries = cfg.nvme_err_log_max_entries; // clamp to the log size recorded in cfg
+ raw_buffer error_log_buf(want_entries * sizeof(nvme_error_log_page));
+ nvme_error_log_page * error_log =
+ reinterpret_cast<nvme_error_log_page *>(error_log_buf.data());
+ unsigned read_entries = nvme_read_error_log(nvmedev, error_log, want_entries, false /*!lpo_sup*/);
+ if (!read_entries) { // zero entries read is treated as a failed read
+ PrintOut(LOG_INFO, "Device: %s, Read %u entries from Error Information Log failed\n",
+ cfg.name.c_str(), want_entries);
+ return false;
+ }
+
+ if (!newcnt)
+ return true; // Support check only
+
+ // Scan log, find device related errors
+ uint64_t oldcnt = state.nvme_err_log_entries, mincnt = newcnt; // oldcnt: count currently stored in state
+ int err = 0, ign = 0; // err: device related entries, ign: entries not counted as device related
+ for (unsigned i = 0; i < read_entries; i++) {
+ const nvme_error_log_page & e = error_log[i];
+ if (!e.error_count)
+ continue; // unused
+ if (e.error_count <= oldcnt)
+ break; // stop on first old entry (assumes newest entries come first -- TODO confirm)
+ if (e.error_count < mincnt)
+ mincnt = e.error_count; // min known error
+ if (e.error_count > newcnt)
+ newcnt = e.error_count; // adjust maximum
+ uint16_t status = e.status_field >> 1; // drop bit 0 before passing the value to the status helpers
+ if (!nvme_status_is_error(status) || nvme_status_to_errno(status) == EINVAL) {
+ ign++; // Not a device related error (no error status, or status maps to EINVAL)
+ continue;
+ }
+
+ // Log only the first 8 device related errors found; err keeps counting beyond that
+ if (++err > 8)
+ continue;
+ char buf[64];
+ PrintOut(LOG_INFO, "Device: %s, NVMe error [%u], count %" PRIu64 ", status 0x%04x: %s\n",
+ cfg.name.c_str(), i, e.error_count, e.status_field,
+ nvme_status_to_info_str(buf, e.status_field >> 1));
+ }
+
+ std::string msg = strprintf("Device: %s, NVMe error count increased from %" PRIu64 " to %" PRIu64
+ " (%d new, %d ignored, %" PRIu64 " unknown)",
+ cfg.name.c_str(), oldcnt, newcnt, err, ign,
+ (mincnt > oldcnt + 1 ? mincnt - oldcnt - 1 : 0)); // unknown: counts between oldcnt and the lowest entry seen
+ // LOG_CRIT only if device related errors are found
+ if (!err) {
+ PrintOut(LOG_INFO, "%s\n", msg.c_str());
+ }
+ else {
+ PrintOut(LOG_CRIT, "%s\n", msg.c_str());
+ MailWarning(cfg, state, 4, "%s", msg.c_str());
+ }
+
+ state.nvme_err_log_entries = newcnt; // remember the (possibly adjusted) count for the next check
+ state.must_write = true;
+ return true;
+}
+
static int NVMeDeviceScan(dev_config & cfg, dev_state & state, nvme_device * nvmedev,
const dev_config_vector * prev_cfgs)
{
@@ -2687,8 +2758,14 @@ static int NVMeDeviceScan(dev_config & c
}
// Init total error count
+ cfg.nvme_err_log_max_entries = id_ctrl.elpe + 1; // 0's based value
if (cfg.errorlog || cfg.xerrorlog) {
- state.nvme_err_log_entries = le128_to_uint64(smart_log.num_err_log_entries);
+ if (!check_nvme_error_log(cfg, state, nvmedev)) {
+ PrintOut(LOG_INFO, "Device: %s, Error Information unavailable, ignoring -l [x]error\n", name);
+ cfg.errorlog = cfg.xerrorlog = false;
+ }
+ else
+ state.nvme_err_log_entries = le128_to_uint64(smart_log.num_err_log_entries);
}
// If no supported tests selected, return
@@ -3760,16 +3837,12 @@ static int NVMeCheckDevice(const dev_con
// Check if number of errors has increased
if (cfg.errorlog || cfg.xerrorlog) {
- uint64_t oldcnt = state.nvme_err_log_entries;
uint64_t newcnt = le128_to_uint64(smart_log.num_err_log_entries);
- if (newcnt > oldcnt) {
- PrintOut(LOG_CRIT, "Device: %s, number of Error Log entries increased from %" PRIu64 " to %" PRIu64 "\n",
- name, oldcnt, newcnt);
- MailWarning(cfg, state, 4, "Device: %s, number of Error Log entries increased from %" PRIu64 " to %" PRIu64,
- name, oldcnt, newcnt);
- state.must_write = true;
+ if (newcnt > state.nvme_err_log_entries) {
+ // Warn only if device related errors are found
+ check_nvme_error_log(cfg, state, nvmedev, newcnt);
}
- state.nvme_err_log_entries = newcnt;
+ // else // TODO: Handle decrease of count?
}
CloseDevice(nvmedev, name);