systemd/1283-coredump-add-support-for-new-F-PIDFD-specifier.patch
Jan Macku a673ceed38 systemd-252-60
Resolves: RHEL-115182,RHEL-97175,RHEL-107268
2025-11-05 09:44:11 +01:00

246 lines
10 KiB
Diff

From 27faf1af778849841d7c3140bd3d92aceaea2ee3 Mon Sep 17 00:00:00 2001
From: Luca Boccassi <luca.boccassi@gmail.com>
Date: Sun, 13 Apr 2025 22:10:36 +0100
Subject: [PATCH] coredump: add support for new %F PIDFD specifier
A new core_pattern specifier was added, %F, to provide a PIDFD
to the usermode helper process referring to the crashed process.
This removes all possible race conditions, ensuring only the
crashed process gets inspected by systemd-coredump.
(cherry picked from commit 868d95577ec9f862580ad365726515459be582fc)
Resolves: RHEL-104138
---
man/systemd-coredump.xml | 9 ++++
src/coredump/coredump.c | 89 ++++++++++++++++++++++++++++++++----
sysctl.d/50-coredump.conf.in | 2 +-
3 files changed, 89 insertions(+), 11 deletions(-)
diff --git a/man/systemd-coredump.xml b/man/systemd-coredump.xml
index 6cfa04f466..b3d81d838a 100644
--- a/man/systemd-coredump.xml
+++ b/man/systemd-coredump.xml
@@ -186,6 +186,15 @@ COREDUMP_FILENAME=/var/lib/systemd/coredump/core.Web….552351.….zst
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>COREDUMP_BY_PIDFD=</varname></term>
+ <listitem><para>If the crashed process was analyzed using a PIDFD provided by the kernel (requires
+ kernel v6.16 or newer), then this field will be present and set to <literal>1</literal>. If this field
+ is not set, then the crashed process was analyzed via a PID, which is known to be subject to race
+ conditions.</para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><varname>COREDUMP_TIMESTAMP=</varname></term>
<listitem><para>The time of the crash as reported by the kernel (in µs since the epoch).</para>
diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c
index cd10678c43..e0aac3c8d0 100644
--- a/src/coredump/coredump.c
+++ b/src/coredump/coredump.c
@@ -39,6 +39,7 @@
#include "mkdir-label.h"
#include "namespace-util.h"
#include "parse-util.h"
+#include "pidref.h"
#include "process-util.h"
#include "signal-util.h"
#include "socket-util.h"
@@ -98,8 +99,8 @@ enum {
/* The fields below were added to kernel/core_pattern at later points, so they might be missing. */
META_ARGV_HOSTNAME = _META_ARGV_REQUIRED, /* %h: hostname */
META_ARGV_DUMPABLE, /* %d: as set by the kernel */
+ META_ARGV_PIDFD, /* %F: pidfd of the process, since v6.16 */
_META_ARGV_MAX,
-
/* If new fields are added, they should be added here, to maintain compatibility
* with callers which don't know about the new fields. */
@@ -129,6 +130,7 @@ static const char * const meta_field_names[_META_MAX] = {
[META_ARGV_RLIMIT] = "COREDUMP_RLIMIT=",
[META_ARGV_HOSTNAME] = "COREDUMP_HOSTNAME=",
[META_ARGV_DUMPABLE] = "COREDUMP_DUMPABLE=",
+ [META_ARGV_PIDFD] = "COREDUMP_BY_PIDFD=",
[META_COMM] = "COREDUMP_COMM=",
[META_EXE] = "COREDUMP_EXE=",
[META_UNIT] = "COREDUMP_UNIT=",
@@ -136,6 +138,7 @@ static const char * const meta_field_names[_META_MAX] = {
};
typedef struct Context {
+ PidRef pidref;
const char *meta[_META_MAX];
size_t meta_size[_META_MAX];
pid_t pid;
@@ -146,6 +149,14 @@ typedef struct Context {
bool is_journald;
} Context;
+#define CONTEXT_NULL \
+ (Context) { \
+ .pidref = PIDREF_NULL, \
+ .uid = UID_INVALID, \
+ .gid = GID_INVALID, \
+ }
+
+
typedef enum CoredumpStorage {
COREDUMP_STORAGE_NONE,
COREDUMP_STORAGE_EXTERNAL,
@@ -171,6 +182,12 @@ static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX;
static uint64_t arg_keep_free = UINT64_MAX;
static uint64_t arg_max_use = UINT64_MAX;
+static void context_done(Context *c) {
+ assert(c);
+
+ pidref_done(&c->pidref);
+}
+
static int parse_config(void) {
static const ConfigTableItem items[] = {
{ "Coredump", "Storage", config_parse_coredump_storage, 0, &arg_storage },
@@ -1114,7 +1131,7 @@ static int save_context(Context *context, const struct iovec_wrapper *iovw) {
static int process_socket(int fd) {
_cleanup_close_ int input_fd = -EBADF, mntns_fd = -EBADF;
- Context context = {};
+ _cleanup_(context_done) Context context = CONTEXT_NULL;
struct iovec_wrapper iovw = {};
struct iovec iovec;
int iterations = 0, r;
@@ -1215,7 +1232,7 @@ static int process_socket(int fd) {
goto finish;
/* Make sure we received at least all fields we need. */
- for (int i = 0; i < _META_MANDATORY_MAX; i++)
+ for (int i = 0; i < _META_ARGV_REQUIRED; i++)
if (!context.meta[i]) {
r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
"A mandatory argument (%i) has not been sent, aborting.",
@@ -1301,9 +1318,9 @@ static int gather_pid_metadata_from_argv(
Context *context,
int argc, char **argv) {
+ _cleanup_(pidref_done) PidRef local_pidref = PIDREF_NULL;
_cleanup_free_ char *free_timestamp = NULL;
- int r, signo;
- char *t;
+ int r, signo, kernel_fd = -EBADF;
/* We gather all metadata that were passed via argv[] into an array of iovecs that
* we'll forward to the socket unit.
@@ -1317,8 +1334,7 @@ static int gather_pid_metadata_from_argv(
argc, _META_ARGV_REQUIRED, _META_ARGV_MAX);
for (int i = 0; i < MIN(argc, _META_ARGV_MAX); i++) {
-
- t = argv[i];
+ const char *t = argv[i];
switch (i) {
@@ -1343,6 +1359,47 @@ static int gather_pid_metadata_from_argv(
break;
}
+ if (i == META_ARGV_PID) {
+ /* Store this so that we can check whether the core will be forwarded to a container
+ * even when the kernel doesn't provide a pidfd. Can be dropped once baseline is
+ * >= v6.16. */
+ r = pidref_set_pidstr(&local_pidref, t);
+ if (r < 0)
+ return log_error_errno(r, "Failed to initialize pidref from pid %s: %m", t);
+ }
+
+ if (i == META_ARGV_PIDFD) {
+ /* If the current kernel doesn't support the %F specifier (which resolves to a
+ * pidfd), but we included it in the core_pattern expression, we'll receive an empty
+ * string here. Deal with that gracefully. */
+ if (isempty(t))
+ continue;
+
+ assert(!pidref_is_set(&context->pidref));
+ assert(kernel_fd < 0);
+
+ kernel_fd = parse_fd(t);
+ if (kernel_fd < 0)
+ return log_error_errno(kernel_fd, "Failed to parse pidfd \"%s\": %m", t);
+
+ r = pidref_set_pidfd(&context->pidref, kernel_fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to initialize pidref from pidfd %d: %m", kernel_fd);
+
+ /* If containers running different versions of this code are involved, they might not
+ * be using pidfds, so it would be wrong to set the metadata in that case; skip it. */
+ r = in_same_namespace(getpid_cached(), context->pidref.pid, NAMESPACE_PID);
+ if (r < 0)
+ log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m");
+ if (r <= 0)
+ continue;
+
+ /* We don't print the fd number in the journal as it's meaningless, but we still
+ * record that the parsing was done with a kernel-provided fd as it means it's safe
+ * from races, which is valuable information to provide in the journal record. */
+ t = "1";
+ }
+
r = iovw_put_string_field(iovw, meta_field_names[i], t);
if (r < 0)
return r;
@@ -1350,7 +1407,19 @@ static int gather_pid_metadata_from_argv(
/* Cache some of the process metadata we collected so far and that we'll need to
* access soon */
- return save_context(context, iovw);
+ r = save_context(context, iovw);
+ if (r < 0)
+ return r;
+
+ /* If the kernel didn't give us a PIDFD, then use the one derived from the
+ * PID immediately, given we have it. */
+ if (!pidref_is_set(&context->pidref))
+ context->pidref = TAKE_PIDREF(local_pidref);
+
+ /* Close the kernel-provided FD as the last thing after everything else succeeded. */
+ kernel_fd = safe_close(kernel_fd);
+
+ return 0;
}
static int gather_pid_metadata(struct iovec_wrapper *iovw, Context *context) {
@@ -1466,7 +1535,7 @@ static int gather_pid_metadata(struct iovec_wrapper *iovw, Context *context) {
}
static int process_kernel(int argc, char* argv[]) {
- Context context = {};
+ _cleanup_(context_done) Context context = CONTEXT_NULL;
struct iovec_wrapper *iovw;
int r, mntns_fd = -EBADF;
@@ -1543,7 +1612,7 @@ static int process_kernel(int argc, char* argv[]) {
}
static int process_backtrace(int argc, char *argv[]) {
- Context context = {};
+ _cleanup_(context_done) Context context = CONTEXT_NULL;
struct iovec_wrapper *iovw;
char *message;
int r;
diff --git a/sysctl.d/50-coredump.conf.in b/sysctl.d/50-coredump.conf.in
index 9c10a89828..1c6230ad93 100644
--- a/sysctl.d/50-coredump.conf.in
+++ b/sysctl.d/50-coredump.conf.in
@@ -13,7 +13,7 @@
# the core dump.
#
# See systemd-coredump(8) and core(5).
-kernel.core_pattern=|{{ROOTLIBEXECDIR}}/systemd-coredump %P %u %g %s %t %c %h %d
+kernel.core_pattern=|{{ROOTLIBEXECDIR}}/systemd-coredump %P %u %g %s %t %c %h %d %F
# Allow 16 coredumps to be dispatched in parallel by the kernel.
# We collect metadata from /proc/%P/, and thus need to make sure the crashed