From 5ac15c7dc49476e7cd7cc3a4b507282c9f78d528 Mon Sep 17 00:00:00 2001 From: Eric DeVolder Date: Mon, 21 Nov 2022 11:27:27 -0500 Subject: [PATCH] pstore: fixes for dmesg.txt reconstruction This patch fixes problems with the re-assembly of the dmesg from the records stored in pstore. The current code simply ignores the last 6 characters of the file name to form a base record id, which then groups any pstore files with this base id into the reconstructed dmesg.txt. This approach fails when the following oops generated the following in pstore: -rw-------. 1 root root 1808 Oct 27 22:07 dmesg-efi-166692286101001 -rw-------. 1 root root 1341 Oct 27 22:07 dmesg-efi-166692286101002 -rw-------. 1 root root 1812 Oct 27 22:07 dmesg-efi-166692286102001 -rw-------. 1 root root 1820 Oct 27 22:07 dmesg-efi-166692286102002 -rw-------. 1 root root 1807 Oct 27 22:07 dmesg-efi-166692286103001 -rw-------. 1 root root 1791 Oct 27 22:07 dmesg-efi-166692286103002 -rw-------. 1 root root 1773 Oct 27 22:07 dmesg-efi-166692286104001 -rw-------. 1 root root 1801 Oct 27 22:07 dmesg-efi-166692286104002 -rw-------. 1 root root 1821 Oct 27 22:07 dmesg-efi-166692286105001 -rw-------. 1 root root 1809 Oct 27 22:07 dmesg-efi-166692286105002 -rw-------. 1 root root 1804 Oct 27 22:07 dmesg-efi-166692286106001 -rw-------. 1 root root 1817 Oct 27 22:07 dmesg-efi-166692286106002 -rw-------. 1 root root 1792 Oct 27 22:07 dmesg-efi-166692286107001 -rw-------. 1 root root 1810 Oct 27 22:07 dmesg-efi-166692286107002 -rw-------. 1 root root 1717 Oct 27 22:07 dmesg-efi-166692286108001 -rw-------. 1 root root 1808 Oct 27 22:07 dmesg-efi-166692286108002 -rw-------. 1 root root 1764 Oct 27 22:07 dmesg-efi-166692286109001 -rw-------. 1 root root 1765 Oct 27 22:07 dmesg-efi-166692286109002 -rw-------. 1 root root 1796 Oct 27 22:07 dmesg-efi-166692286110001 -rw-------. 1 root root 1816 Oct 27 22:07 dmesg-efi-166692286110002 -rw-------. 1 root root 1793 Oct 27 22:07 dmesg-efi-166692286111001 -rw-------. 1 root root 1751 Oct 27 22:07 dmesg-efi-166692286111002 -rw-------. 1 root root 1813 Oct 27 22:07 dmesg-efi-166692286112001 -rw-------. 1 root root 1786 Oct 27 22:07 dmesg-efi-166692286112002 -rw-------. 1 root root 1754 Oct 27 22:07 dmesg-efi-166692286113001 -rw-------. 1 root root 1752 Oct 27 22:07 dmesg-efi-166692286113002 -rw-------. 1 root root 1803 Oct 27 22:07 dmesg-efi-166692286114001 -rw-------. 1 root root 1759 Oct 27 22:07 dmesg-efi-166692286114002 -rw-------. 1 root root 1805 Oct 27 22:07 dmesg-efi-166692286115001 -rw-------. 1 root root 1787 Oct 27 22:07 dmesg-efi-166692286115002 -rw-------. 1 root root 1815 Oct 27 22:07 dmesg-efi-166692286116001 -rw-------. 1 root root 1771 Oct 27 22:07 dmesg-efi-166692286116002 -rw-------. 1 root root 1816 Oct 27 22:07 dmesg-efi-166692286117002 -rw-------. 1 root root 1388 Oct 27 22:07 dmesg-efi-166692286701003 -rw-------. 1 root root 1824 Oct 27 22:07 dmesg-efi-166692286702003 -rw-------. 1 root root 1795 Oct 27 22:07 dmesg-efi-166692286703003 -rw-------. 1 root root 1805 Oct 27 22:07 dmesg-efi-166692286704003 -rw-------. 1 root root 1813 Oct 27 22:07 dmesg-efi-166692286705003 -rw-------. 1 root root 1821 Oct 27 22:07 dmesg-efi-166692286706003 -rw-------. 1 root root 1814 Oct 27 22:07 dmesg-efi-166692286707003 -rw-------. 1 root root 1812 Oct 27 22:07 dmesg-efi-166692286708003 -rw-------. 1 root root 1769 Oct 27 22:07 dmesg-efi-166692286709003 -rw-------. 1 root root 1820 Oct 27 22:07 dmesg-efi-166692286710003 -rw-------. 1 root root 1755 Oct 27 22:07 dmesg-efi-166692286711003 -rw-------. 1 root root 1790 Oct 27 22:07 dmesg-efi-166692286712003 -rw-------. 1 root root 1756 Oct 27 22:07 dmesg-efi-166692286713003 -rw-------. 1 root root 1763 Oct 27 22:07 dmesg-efi-166692286714003 -rw-------. 1 root root 1791 Oct 27 22:07 dmesg-efi-166692286715003 -rw-------. 1 root root 1775 Oct 27 22:07 dmesg-efi-166692286716003 -rw-------. 1 root root 1820 Oct 27 22:07 dmesg-efi-166692286717003 The "reconstructed" dmesg.txt that resulted from the above contained the following (ignoring actual contents, just providing the Part info): Emergency#3 Part17 Emergency#3 Part16 Emergency#3 Part15 Emergency#3 Part14 Emergency#3 Part13 Emergency#3 Part12 Emergency#3 Part11 Emergency#3 Part10 Emergency#3 Part9 Emergency#3 Part8 Emergency#3 Part7 Emergency#3 Part6 Emergency#3 Part5 Emergency#3 Part4 Emergency#3 Part3 Emergency#3 Part2 Emergency#3 Part1 Panic#2 Part17 Panic#2 Part16 Oops#1 Part16 Panic#2 Part15 Oops#1 Part15 Panic#2 Part14 Oops#1 Part14 Panic#2 Part13 Oops#1 Part13 Panic#2 Part12 Oops#1 Part12 Panic#2 Part11 Oops#1 Part11 Panic#2 Part10 Oops#1 Part10 Panic#2 Part9 Oops#1 Part9 Panic#2 Part8 Oops#1 Part8 Panic#2 Part7 Oops#1 Part7 Panic#2 Part6 Oops#1 Part6 Panic#2 Part5 Oops#1 Part5 Panic#2 Part4 Oops#1 Part4 Panic#2 Part3 Oops#1 Part3 Panic#2 Part2 Oops#1 Part2 Panic#2 Part1 Oops#1 Part1 The above is a interleaved mess of three dmesg dumps. This patch fixes the above problems, and simplifies the dmesg reconstruction process. The code now distinguishes between records on EFI vs ERST, which have differently formatted record identifiers. Using knowledge of the format of the record ids allows vastly improved reconstruction process. With this change in place, the above pstore records now result in the following: # ls -alR /var/lib/systemd/pstore 1666922861: total 8 drwxr-xr-x. 4 root root 28 Nov 18 14:58 . drwxr-xr-x. 7 root root 144 Nov 18 14:58 .. drwxr-xr-x. 2 root root 4096 Nov 18 14:58 001 drwxr-xr-x. 2 root root 4096 Nov 18 14:58 002 1666922861/001: total 100 drwxr-xr-x. 2 root root 4096 Nov 18 14:58 . drwxr-xr-x. 4 root root 28 Nov 18 14:58 .. -rw-------. 1 root root 1808 Oct 27 22:07 dmesg-efi-166692286101001 -rw-------. 1 root root 1812 Oct 27 22:07 dmesg-efi-166692286102001 -rw-------. 1 root root 1807 Oct 27 22:07 dmesg-efi-166692286103001 -rw-------. 1 root root 1773 Oct 27 22:07 dmesg-efi-166692286104001 -rw-------. 1 root root 1821 Oct 27 22:07 dmesg-efi-166692286105001 -rw-------. 1 root root 1804 Oct 27 22:07 dmesg-efi-166692286106001 -rw-------. 1 root root 1792 Oct 27 22:07 dmesg-efi-166692286107001 -rw-------. 1 root root 1717 Oct 27 22:07 dmesg-efi-166692286108001 -rw-------. 1 root root 1764 Oct 27 22:07 dmesg-efi-166692286109001 -rw-------. 1 root root 1796 Oct 27 22:07 dmesg-efi-166692286110001 -rw-------. 1 root root 1793 Oct 27 22:07 dmesg-efi-166692286111001 -rw-------. 1 root root 1813 Oct 27 22:07 dmesg-efi-166692286112001 -rw-------. 1 root root 1754 Oct 27 22:07 dmesg-efi-166692286113001 -rw-------. 1 root root 1803 Oct 27 22:07 dmesg-efi-166692286114001 -rw-------. 1 root root 1805 Oct 27 22:07 dmesg-efi-166692286115001 -rw-------. 1 root root 1815 Oct 27 22:07 dmesg-efi-166692286116001 -rw-r-----. 1 root root 28677 Nov 18 14:58 dmesg.txt 1666922861/002: total 104 drwxr-xr-x. 2 root root 4096 Nov 18 14:58 . drwxr-xr-x. 4 root root 28 Nov 18 14:58 .. -rw-------. 1 root root 1341 Oct 27 22:07 dmesg-efi-166692286101002 -rw-------. 1 root root 1820 Oct 27 22:07 dmesg-efi-166692286102002 -rw-------. 1 root root 1791 Oct 27 22:07 dmesg-efi-166692286103002 -rw-------. 1 root root 1801 Oct 27 22:07 dmesg-efi-166692286104002 -rw-------. 1 root root 1809 Oct 27 22:07 dmesg-efi-166692286105002 -rw-------. 1 root root 1817 Oct 27 22:07 dmesg-efi-166692286106002 -rw-------. 1 root root 1810 Oct 27 22:07 dmesg-efi-166692286107002 -rw-------. 1 root root 1808 Oct 27 22:07 dmesg-efi-166692286108002 -rw-------. 1 root root 1765 Oct 27 22:07 dmesg-efi-166692286109002 -rw-------. 1 root root 1816 Oct 27 22:07 dmesg-efi-166692286110002 -rw-------. 1 root root 1751 Oct 27 22:07 dmesg-efi-166692286111002 -rw-------. 1 root root 1786 Oct 27 22:07 dmesg-efi-166692286112002 -rw-------. 1 root root 1752 Oct 27 22:07 dmesg-efi-166692286113002 -rw-------. 1 root root 1759 Oct 27 22:07 dmesg-efi-166692286114002 -rw-------. 1 root root 1787 Oct 27 22:07 dmesg-efi-166692286115002 -rw-------. 1 root root 1771 Oct 27 22:07 dmesg-efi-166692286116002 -rw-------. 1 root root 1816 Oct 27 22:07 dmesg-efi-166692286117002 -rw-r-----. 1 root root 30000 Nov 18 14:58 dmesg.txt 1666922867: total 4 drwxr-xr-x. 3 root root 17 Nov 18 14:58 . drwxr-xr-x. 7 root root 144 Nov 18 14:58 .. drwxr-xr-x. 2 root root 4096 Nov 18 14:58 003 1666922867/003: total 104 drwxr-xr-x. 2 root root 4096 Nov 18 14:58 . drwxr-xr-x. 3 root root 17 Nov 18 14:58 .. -rw-------. 1 root root 1388 Oct 27 22:07 dmesg-efi-166692286701003 -rw-------. 1 root root 1824 Oct 27 22:07 dmesg-efi-166692286702003 -rw-------. 1 root root 1795 Oct 27 22:07 dmesg-efi-166692286703003 -rw-------. 1 root root 1805 Oct 27 22:07 dmesg-efi-166692286704003 -rw-------. 1 root root 1813 Oct 27 22:07 dmesg-efi-166692286705003 -rw-------. 1 root root 1821 Oct 27 22:07 dmesg-efi-166692286706003 -rw-------. 1 root root 1814 Oct 27 22:07 dmesg-efi-166692286707003 -rw-------. 1 root root 1812 Oct 27 22:07 dmesg-efi-166692286708003 -rw-------. 1 root root 1769 Oct 27 22:07 dmesg-efi-166692286709003 -rw-------. 1 root root 1820 Oct 27 22:07 dmesg-efi-166692286710003 -rw-------. 1 root root 1755 Oct 27 22:07 dmesg-efi-166692286711003 -rw-------. 1 root root 1790 Oct 27 22:07 dmesg-efi-166692286712003 -rw-------. 1 root root 1756 Oct 27 22:07 dmesg-efi-166692286713003 -rw-------. 1 root root 1763 Oct 27 22:07 dmesg-efi-166692286714003 -rw-------. 1 root root 1791 Oct 27 22:07 dmesg-efi-166692286715003 -rw-------. 1 root root 1775 Oct 27 22:07 dmesg-efi-166692286716003 -rw-------. 1 root root 1820 Oct 27 22:07 dmesg-efi-166692286717003 -rw-r-----. 1 root root 30111 Nov 18 14:58 dmesg.txt Furthemore, pstore records on ERST are now able to accurately identify the change in timestamp sequence in order to start a new dmesg.txt, as needed. (cherry picked from commit 5fbaa757077bde2db8d33b1c358518c41b990339) Related: #2158832 --- src/pstore/pstore.c | 216 +++++++++++++++++++------------------------- 1 file changed, 92 insertions(+), 124 deletions(-) diff --git a/src/pstore/pstore.c b/src/pstore/pstore.c index d70e142b4d..9f61e8f7f8 100644 --- a/src/pstore/pstore.c +++ b/src/pstore/pstore.c @@ -112,8 +112,8 @@ static int compare_pstore_entries(const void *_a, const void *_b) { return strcmp(a->dirent.d_name, b->dirent.d_name); } -static int move_file(PStoreEntry *pe, const char *subdir) { - _cleanup_free_ char *ifd_path = NULL, *ofd_path = NULL; +static int move_file(PStoreEntry *pe, const char *subdir1, const char *subdir2) { + _cleanup_free_ char *ifd_path = NULL, *ofd_path = NULL, *ofd_path_base = NULL; _cleanup_free_ void *field = NULL; const char *suffix, *message; struct iovec iovec[2]; @@ -126,7 +126,11 @@ static int move_file(PStoreEntry *pe, const char *subdir) { if (!ifd_path) return log_oom(); - ofd_path = path_join(arg_archivedir, subdir, pe->dirent.d_name); + ofd_path_base = path_join(arg_archivedir, subdir1, subdir2); + if (!ofd_path_base) + return log_oom(); + + ofd_path = path_join(NULL, ofd_path_base, pe->dirent.d_name); if (!ofd_path) return log_oom(); @@ -169,155 +173,119 @@ static int move_file(PStoreEntry *pe, const char *subdir) { return 0; } -static int write_dmesg(const char *dmesg, size_t size, const char *id) { - _cleanup_(unlink_and_freep) char *tmp_path = NULL; - _cleanup_free_ char *ofd_path = NULL; +static int append_dmesg(PStoreEntry *pe, const char *subdir1, const char *subdir2) { + /* Append dmesg chunk to end, create if needed */ + _cleanup_free_ char *ofd_path = NULL, *ofd_path_base = NULL; _cleanup_close_ int ofd = -1; ssize_t wr; - int r; - if (size == 0) - return 0; + assert(pe); - assert(dmesg); + if (pe->content_size == 0) + return 0; - /* log_info("Record ID %s", id); */ + ofd_path_base = path_join(arg_archivedir, subdir1, subdir2); + if (!ofd_path_base) + return log_oom(); - ofd_path = path_join(arg_archivedir, id, "dmesg.txt"); + ofd_path = path_join(NULL, ofd_path_base, "dmesg.txt"); if (!ofd_path) return log_oom(); - ofd = open_tmpfile_linkable(ofd_path, O_CLOEXEC|O_CREAT|O_TRUNC|O_WRONLY, &tmp_path); + ofd = open(ofd_path, O_CREAT|O_NOFOLLOW|O_NOCTTY|O_CLOEXEC|O_APPEND|O_WRONLY, 0640); if (ofd < 0) - return log_error_errno(ofd, "Failed to open temporary file %s: %m", ofd_path); - wr = write(ofd, dmesg, size); + return log_error_errno(ofd, "Failed to open file %s: %m", ofd_path); + wr = write(ofd, pe->content, pe->content_size); if (wr < 0) return log_error_errno(errno, "Failed to store dmesg to %s: %m", ofd_path); - if (wr != (ssize_t)size) - return log_error_errno(-EIO, "Failed to store dmesg to %s. %zu bytes are lost.", ofd_path, size - wr); - r = link_tmpfile(ofd, tmp_path, ofd_path); - if (r < 0) - return log_error_errno(r, "Failed to write temporary file %s: %m", ofd_path); - tmp_path = mfree(tmp_path); + if ((size_t)wr != pe->content_size) + return log_error_errno(-EIO, "Failed to store dmesg to %s. %zu bytes are lost.", ofd_path, pe->content_size - wr); return 0; } -static void process_dmesg_files(PStoreList *list) { +static int process_dmesg_files(PStoreList *list) { /* Move files, reconstruct dmesg.txt */ - _cleanup_free_ char *dmesg = NULL, *dmesg_id = NULL; - size_t dmesg_size = 0, dmesg_allocated = 0; - bool dmesg_bad = false; - PStoreEntry *pe; + _cleanup_free_ char *erst_subdir = NULL; + uint64_t last_record_id = 0; + + /* When dmesg is written into pstore, it is done so in small chunks, whatever the exchange buffer + * size is with the underlying pstore backend (ie. EFI may be ~2KiB), which means an example + * pstore with approximately 64KB of storage may have up to roughly 32 dmesg files, some likely + * related. + * + * Here we look at the dmesg filename and try to discern if files are part of a related group, + * meaning the same original dmesg. + * + * The dmesg- filename contains the backend-type and the Common Platform Error Record, CPER, + * record id, a 64-bit number. + * + * Files are processed in reverse lexigraphical order so as to properly reconstruct original dmesg.*/ - /* Handle each dmesg file: files processed in reverse - * order so as to properly reconstruct original dmesg */ for (size_t n = list->n_entries; n > 0; n--) { - bool move_file_and_continue = false; - _cleanup_free_ char *pe_id = NULL; + PStoreEntry *pe; char *p; - size_t plen; pe = &list->entries[n-1]; if (pe->handled) continue; - if (!startswith(pe->dirent.d_name, "dmesg-")) - continue; - if (endswith(pe->dirent.d_name, ".enc.z")) /* indicates a problem */ - move_file_and_continue = true; - p = strrchr(pe->dirent.d_name, '-'); - if (!p) - move_file_and_continue = true; - - if (move_file_and_continue) { - /* A dmesg file on which we do NO additional processing */ - (void) move_file(pe, NULL); continue; - } - - /* See if this file is one of a related group of files - * in order to reconstruct dmesg */ - - /* When dmesg is written into pstore, it is done so in - * small chunks, whatever the exchange buffer size is - * with the underlying pstore backend (ie. EFI may be - * ~2KiB), which means an example pstore with approximately - * 64KB of storage may have up to roughly 32 dmesg files - * that could be related, depending upon the size of the - * original dmesg. - * - * Here we look at the dmesg filename and try to discern - * if files are part of a related group, meaning the same - * original dmesg. - * - * The two known pstore backends are EFI and ERST. These - * backends store data in the Common Platform Error - * Record, CPER, format. The dmesg- filename contains the - * CPER record id, a 64bit number (in decimal notation). - * In Linux, the record id is encoded with two digits for - * the dmesg part (chunk) number and 3 digits for the - * count number. So allowing an additional digit to - * compensate for advancing time, this code ignores the - * last six digits of the filename in determining the - * record id. - * - * For the EFI backend, the record id encodes an id in the - * upper 32 bits, and a timestamp in the lower 32-bits. - * So ignoring the least significant 6 digits has proven - * to generally identify related dmesg entries. */ -#define PSTORE_FILENAME_IGNORE 6 - - /* determine common portion of record id */ - ++p; /* move beyond dmesg- */ - plen = strlen(p); - if (plen > PSTORE_FILENAME_IGNORE) { - pe_id = memdup_suffix0(p, plen - PSTORE_FILENAME_IGNORE); - if (!pe_id) { - log_oom(); - return; - } - } else - pe_id = mfree(pe_id); - - /* Now move file from pstore to archive storage */ - move_file(pe, pe_id); - - if (dmesg_bad) + if (!startswith(pe->dirent.d_name, "dmesg-")) continue; - /* If the current record id is NOT the same as the - * previous record id, then start a new dmesg.txt file */ - if (!streq_ptr(pe_id, dmesg_id)) { - /* Encountered a new dmesg group, close out old one, open new one */ - (void) write_dmesg(dmesg, dmesg_size, dmesg_id); - dmesg_size = 0; - - /* now point dmesg_id to storage of pe_id */ - free_and_replace(dmesg_id, pe_id); - } - - /* Reconstruction of dmesg is done as a useful courtesy: do not fail, but don't write garbled - * output either. */ - size_t needed = strlen(pe->dirent.d_name) + strlen(":\n") + pe->content_size + 1; - if (!GREEDY_REALLOC(dmesg, dmesg_allocated, dmesg_size + needed)) { - log_warning_errno(ENOMEM, "Failed to write dmesg file: %m"); - dmesg_bad = true; - continue; - } - - dmesg_size += sprintf(dmesg + dmesg_size, "%s:\n", pe->dirent.d_name); - if (pe->content) { - memcpy(dmesg + dmesg_size, pe->content, pe->content_size); - dmesg_size += pe->content_size; - } - - pe_id = mfree(pe_id); + if ((p = startswith(pe->dirent.d_name, "dmesg-efi-"))) { + /* For the EFI backend, the 3 least significant digits of record id encodes a + * "count" number, the next 2 least significant digits for the dmesg part + * (chunk) number, and the remaining digits as the timestamp. See + * linux/drivers/firmware/efi/efi-pstore.c in efi_pstore_write(). */ + _cleanup_free_ char *subdir1 = NULL, *subdir2 = NULL; + size_t plen = strlen(p); + + if (plen < 6) + continue; + + /* Extract base record id */ + subdir1 = strndup(p, plen - 5); + if (!subdir1) + return log_oom(); + /* Extract "count" field */ + subdir2 = strndup(p + plen - 3, 3); + if (!subdir2) + return log_oom(); + + /* Now move file from pstore to archive storage */ + (void) move_file(pe, subdir1, subdir2); + + /* Append to the dmesg */ + (void) append_dmesg(pe, subdir1, subdir2); + } else if ((p = startswith(pe->dirent.d_name, "dmesg-erst-"))) { + /* For the ERST backend, the record is a monotonically increasing number, seeded as + * a timestamp. See linux/drivers/acpi/apei/erst.c in erst_writer(). */ + uint64_t record_id; + + if (safe_atou64(p, &record_id) < 0) + continue; + if (last_record_id - 1 != record_id) + /* A discontinuity in the number has been detected, this current record id + * will become the directory name for all pieces of the dmesg in this + * series. */ + if (free_and_strdup(&erst_subdir, p) < 0) + return log_oom(); + + /* Now move file from pstore to archive storage */ + (void) move_file(pe, erst_subdir, NULL); + + /* Append to the dmesg */ + (void) append_dmesg(pe, erst_subdir, NULL); + + /* Update, but keep erst_subdir for next file */ + last_record_id = record_id; + } else + log_debug("Unknown backend, ignoring \"%s\".", pe->dirent.d_name); } - - if (!dmesg_bad) - (void) write_dmesg(dmesg, dmesg_size, dmesg_id); + return 0; } static int list_files(PStoreList *list, const char *sourcepath) { @@ -394,11 +362,11 @@ static int run(int argc, char *argv[]) { qsort_safe(list.entries, list.n_entries, sizeof(PStoreEntry), compare_pstore_entries); /* Process known file types */ - process_dmesg_files(&list); + (void) process_dmesg_files(&list); /* Move left over files out of pstore */ for (size_t n = 0; n < list.n_entries; n++) - move_file(&list.entries[n], NULL); + (void) move_file(&list.entries[n], NULL, NULL); return 0; }