From 27faf1af778849841d7c3140bd3d92aceaea2ee3 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Sun, 13 Apr 2025 22:10:36 +0100 Subject: [PATCH] coredump: add support for new %F PIDFD specifier A new core_pattern specifier was added, %F, to provide a PIDFD to the usermode helper process referring to the crashed process. This removes all possible race conditions, ensuring only the crashed process gets inspected by systemd-coredump. (cherry picked from commit 868d95577ec9f862580ad365726515459be582fc) Resolves: RHEL-104138 --- man/systemd-coredump.xml | 9 ++++ src/coredump/coredump.c | 89 ++++++++++++++++++++++++++++++++---- sysctl.d/50-coredump.conf.in | 2 +- 3 files changed, 89 insertions(+), 11 deletions(-) diff --git a/man/systemd-coredump.xml b/man/systemd-coredump.xml index 6cfa04f466..b3d81d838a 100644 --- a/man/systemd-coredump.xml +++ b/man/systemd-coredump.xml @@ -186,6 +186,15 @@ COREDUMP_FILENAME=/var/lib/systemd/coredump/core.Web….552351.….zst + + COREDUMP_BY_PIDFD= + If the crashed process was analyzed using a PIDFD provided by the kernel (requires + kernel v6.16) then this field will be present and set to 1. If this field is + not set, then the crashed process was analyzed via a PID, which is known to be subject to race + conditions. + + + COREDUMP_TIMESTAMP= The time of the crash as reported by the kernel (in µs since the epoch). diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c index cd10678c43..e0aac3c8d0 100644 --- a/src/coredump/coredump.c +++ b/src/coredump/coredump.c @@ -39,6 +39,7 @@ #include "mkdir-label.h" #include "namespace-util.h" #include "parse-util.h" +#include "pidref.h" #include "process-util.h" #include "signal-util.h" #include "socket-util.h" @@ -98,8 +99,8 @@ enum { /* The fields below were added to kernel/core_pattern at later points, so they might be missing. */ META_ARGV_HOSTNAME = _META_ARGV_REQUIRED, /* %h: hostname */ META_ARGV_DUMPABLE, /* %d: as set by the kernel */ + META_ARGV_PIDFD, /* %F: pidfd of the process, since v6.16 */ _META_ARGV_MAX, - /* If new fields are added, they should be added here, to maintain compatibility * with callers which don't know about the new fields. */ @@ -129,6 +130,7 @@ static const char * const meta_field_names[_META_MAX] = { [META_ARGV_RLIMIT] = "COREDUMP_RLIMIT=", [META_ARGV_HOSTNAME] = "COREDUMP_HOSTNAME=", [META_ARGV_DUMPABLE] = "COREDUMP_DUMPABLE=", + [META_ARGV_PIDFD] = "COREDUMP_BY_PIDFD=", [META_COMM] = "COREDUMP_COMM=", [META_EXE] = "COREDUMP_EXE=", [META_UNIT] = "COREDUMP_UNIT=", @@ -136,6 +138,7 @@ static const char * const meta_field_names[_META_MAX] = { }; typedef struct Context { + PidRef pidref; const char *meta[_META_MAX]; size_t meta_size[_META_MAX]; pid_t pid; @@ -146,6 +149,14 @@ typedef struct Context { bool is_journald; } Context; +#define CONTEXT_NULL \ + (Context) { \ + .pidref = PIDREF_NULL, \ + .uid = UID_INVALID, \ + .gid = GID_INVALID, \ + } + + typedef enum CoredumpStorage { COREDUMP_STORAGE_NONE, COREDUMP_STORAGE_EXTERNAL, @@ -171,6 +182,12 @@ static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX; static uint64_t arg_keep_free = UINT64_MAX; static uint64_t arg_max_use = UINT64_MAX; +static void context_done(Context *c) { + assert(c); + + pidref_done(&c->pidref); +} + static int parse_config(void) { static const ConfigTableItem items[] = { { "Coredump", "Storage", config_parse_coredump_storage, 0, &arg_storage }, @@ -1114,7 +1131,7 @@ static int save_context(Context *context, const struct iovec_wrapper *iovw) { static int process_socket(int fd) { _cleanup_close_ int input_fd = -EBADF, mntns_fd = -EBADF; - Context context = {}; + _cleanup_(context_done) Context context = CONTEXT_NULL; struct iovec_wrapper iovw = {}; struct iovec iovec; int iterations = 0, r; @@ -1215,7 +1232,7 @@ static int process_socket(int fd) { goto finish; /* Make sure we received at least all fields we need. */ - for (int i = 0; i < _META_MANDATORY_MAX; i++) + for (int i = 0; i < _META_ARGV_REQUIRED; i++) if (!context.meta[i]) { r = log_error_errno(SYNTHETIC_ERRNO(EINVAL), "A mandatory argument (%i) has not been sent, aborting.", @@ -1301,9 +1318,9 @@ static int gather_pid_metadata_from_argv( Context *context, int argc, char **argv) { + _cleanup_(pidref_done) PidRef local_pidref = PIDREF_NULL; _cleanup_free_ char *free_timestamp = NULL; - int r, signo; - char *t; + int r, signo, kernel_fd = -EBADF; /* We gather all metadata that were passed via argv[] into an array of iovecs that * we'll forward to the socket unit. @@ -1317,8 +1334,7 @@ static int gather_pid_metadata_from_argv( argc, _META_ARGV_REQUIRED, _META_ARGV_MAX); for (int i = 0; i < MIN(argc, _META_ARGV_MAX); i++) { - - t = argv[i]; + const char *t = argv[i]; switch (i) { @@ -1343,6 +1359,47 @@ static int gather_pid_metadata_from_argv( break; } + if (i == META_ARGV_PID) { + /* Store this so that we can check whether the core will be forwarded to a container + * even when the kernel doesn't provide a pidfd. Can be dropped once baseline is + * >= v6.16. */ + r = pidref_set_pidstr(&local_pidref, t); + if (r < 0) + return log_error_errno(r, "Failed to initialize pidref from pid %s: %m", t); + } + + if (i == META_ARGV_PIDFD) { + /* If the current kernel doesn't support the %F specifier (which resolves to a + * pidfd), but we included it in the core_pattern expression, we'll receive an empty + * string here. Deal with that gracefully. */ + if (isempty(t)) + continue; + + assert(!pidref_is_set(&context->pidref)); + assert(kernel_fd < 0); + + kernel_fd = parse_fd(t); + if (kernel_fd < 0) + return log_error_errno(kernel_fd, "Failed to parse pidfd \"%s\": %m", t); + + r = pidref_set_pidfd(&context->pidref, kernel_fd); + if (r < 0) + return log_error_errno(r, "Failed to initialize pidref from pidfd %d: %m", kernel_fd); + + /* If there are containers involved with different versions of the code they might + * not be using pidfds, so it would be wrong to set the metadata, skip it. */ + r = in_same_namespace(getpid_cached(), context->pidref.pid, NAMESPACE_PID); + if (r < 0) + log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m"); + if (r <= 0) + continue; + + /* We don't print the fd number in the journal as it's meaningless, but we still + * record that the parsing was done with a kernel-provided fd as it means it's safe + * from races, which is valuable information to provide in the journal record. */ + t = "1"; + } + r = iovw_put_string_field(iovw, meta_field_names[i], t); if (r < 0) return r; @@ -1350,7 +1407,19 @@ static int gather_pid_metadata_from_argv( /* Cache some of the process metadata we collected so far and that we'll need to * access soon */ - return save_context(context, iovw); + r = save_context(context, iovw); + if (r < 0) + return r; + + /* If the kernel didn't give us a PIDFD, then use the one derived from the + * PID immediately, given we have it. */ + if (!pidref_is_set(&context->pidref)) + context->pidref = TAKE_PIDREF(local_pidref); + + /* Close the kernel-provided FD as the last thing after everything else succeeded. */ + kernel_fd = safe_close(kernel_fd); + + return 0; } static int gather_pid_metadata(struct iovec_wrapper *iovw, Context *context) { @@ -1466,7 +1535,7 @@ static int gather_pid_metadata(struct iovec_wrapper *iovw, Context *context) { } static int process_kernel(int argc, char* argv[]) { - Context context = {}; + _cleanup_(context_done) Context context = CONTEXT_NULL; struct iovec_wrapper *iovw; int r, mntns_fd = -EBADF; @@ -1543,7 +1612,7 @@ static int process_kernel(int argc, char* argv[]) { } static int process_backtrace(int argc, char *argv[]) { - Context context = {}; + _cleanup_(context_done) Context context = CONTEXT_NULL; struct iovec_wrapper *iovw; char *message; int r; diff --git a/sysctl.d/50-coredump.conf.in b/sysctl.d/50-coredump.conf.in index 9c10a89828..1c6230ad93 100644 --- a/sysctl.d/50-coredump.conf.in +++ b/sysctl.d/50-coredump.conf.in @@ -13,7 +13,7 @@ # the core dump. # # See systemd-coredump(8) and core(5). -kernel.core_pattern=|{{ROOTLIBEXECDIR}}/systemd-coredump %P %u %g %s %t %c %h %d +kernel.core_pattern=|{{ROOTLIBEXECDIR}}/systemd-coredump %P %u %g %s %t %c %h %d %F # Allow 16 coredumps to be dispatched in parallel by the kernel. # We collect metadata from /proc/%P/, and thus need to make sure the crashed