systemd/SOURCES/0723-coredump-generate-stacktraces-also-for-processes-run.patch

531 lines
22 KiB
Diff

From d0427b44ecb56cdbc40ca156af8e013b64cce74d Mon Sep 17 00:00:00 2001
From: Michal Sekletar <msekleta@redhat.com>
Date: Mon, 18 Mar 2024 13:01:40 +0100
Subject: [PATCH] coredump: generate stacktraces also for processes running in
containers w/o coredump forwarding
Note that entering container namespace has to be explicitly enabled by
setting SYSTEMD_COREDUMP_ALLOW_NAMESPACE_CHANGE environment variable.
RHEL-only
Resolves: RHEL-29430
---
src/analyze/analyze-inspect-elf.c | 2 +-
src/basic/socket-util.c | 52 +++++++++++
src/basic/socket-util.h | 15 ++++
src/coredump/coredump.c | 143 +++++++++++++++++++-----------
src/shared/elf-util.c | 46 +++++++++-
src/shared/elf-util.h | 4 +-
6 files changed, 202 insertions(+), 60 deletions(-)
diff --git a/src/analyze/analyze-inspect-elf.c b/src/analyze/analyze-inspect-elf.c
index 155c611c71..b8074100a5 100644
--- a/src/analyze/analyze-inspect-elf.c
+++ b/src/analyze/analyze-inspect-elf.c
@@ -30,7 +30,7 @@ static int analyze_elf(char **filenames, JsonFormatFlags json_flags) {
if (fd < 0)
return log_error_errno(fd, "Could not open \"%s\": %m", abspath);
- r = parse_elf_object(fd, abspath, /* fork_disable_dump= */false, NULL, &package_metadata);
+ r = parse_elf_object(fd, -EBADF, UID_NOBODY, GID_NOBODY, abspath, /* fork_disable_dump= */false, NULL, &package_metadata);
if (r < 0)
return log_error_errno(r, "Parsing \"%s\" as ELF object failed: %m", abspath);
diff --git a/src/basic/socket-util.c b/src/basic/socket-util.c
index f39be19a59..1d86ca3f55 100644
--- a/src/basic/socket-util.c
+++ b/src/basic/socket-util.c
@@ -41,6 +41,11 @@
# define IDN_FLAGS 0
#endif
+/* From the kernel's include/net/scm.h */
+#ifndef SCM_MAX_FD
+# define SCM_MAX_FD 253
+#endif
+
static const char* const socket_address_type_table[] = {
[SOCK_STREAM] = "Stream",
[SOCK_DGRAM] = "Datagram",
@@ -951,6 +956,53 @@ int getpeergroups(int fd, gid_t **ret) {
return (int) n;
}
+ssize_t send_many_fds_iov_sa(
+ int transport_fd,
+ int *fds_array, size_t n_fds_array,
+ const struct iovec *iov, size_t iovlen,
+ const struct sockaddr *sa, socklen_t len,
+ int flags) {
+
+ _cleanup_free_ struct cmsghdr *cmsg = NULL;
+ struct msghdr mh = {
+ .msg_name = (struct sockaddr*) sa,
+ .msg_namelen = len,
+ .msg_iov = (struct iovec *)iov,
+ .msg_iovlen = iovlen,
+ };
+ ssize_t k;
+
+ assert(transport_fd >= 0);
+ assert(fds_array || n_fds_array == 0);
+
+ /* The kernel will reject sending more than SCM_MAX_FD FDs at once */
+ if (n_fds_array > SCM_MAX_FD)
+ return -E2BIG;
+
+ /* We need either an FD array or data to send. If there's nothing, return an error. */
+ if (n_fds_array == 0 && !iov)
+ return -EINVAL;
+
+ if (n_fds_array > 0) {
+ mh.msg_controllen = CMSG_SPACE(sizeof(int) * n_fds_array);
+ mh.msg_control = cmsg = malloc(mh.msg_controllen);
+ if (!cmsg)
+ return -ENOMEM;
+
+ *cmsg = (struct cmsghdr) {
+ .cmsg_len = CMSG_LEN(sizeof(int) * n_fds_array),
+ .cmsg_level = SOL_SOCKET,
+ .cmsg_type = SCM_RIGHTS,
+ };
+ memcpy(CMSG_DATA(cmsg), fds_array, sizeof(int) * n_fds_array);
+ }
+ k = sendmsg(transport_fd, &mh, MSG_NOSIGNAL | flags);
+ if (k < 0)
+ return (ssize_t) -errno;
+
+ return k;
+}
+
ssize_t send_one_fd_iov_sa(
int transport_fd,
int fd,
diff --git a/src/basic/socket-util.h b/src/basic/socket-util.h
index 2e36e1a56b..61bf8ff32b 100644
--- a/src/basic/socket-util.h
+++ b/src/basic/socket-util.h
@@ -153,6 +153,13 @@ int getpeercred(int fd, struct ucred *ucred);
int getpeersec(int fd, char **ret);
int getpeergroups(int fd, gid_t **ret);
+ssize_t send_many_fds_iov_sa(
+ int transport_fd,
+ int *fds_array, size_t n_fds_array,
+ const struct iovec *iov, size_t iovlen,
+ const struct sockaddr *sa, socklen_t len,
+ int flags);
+
ssize_t send_one_fd_iov_sa(
int transport_fd,
int fd,
@@ -163,6 +170,14 @@ int send_one_fd_sa(int transport_fd,
int fd,
const struct sockaddr *sa, socklen_t len,
int flags);
+static inline int send_many_fds(
+ int transport_fd,
+ int *fds_array,
+ size_t n_fds_array,
+ int flags) {
+
+ return send_many_fds_iov_sa(transport_fd, fds_array, n_fds_array, NULL, 0, NULL, 0, flags);
+}
#define send_one_fd_iov(transport_fd, fd, iov, iovlen, flags) send_one_fd_iov_sa(transport_fd, fd, iov, iovlen, NULL, 0, flags)
#define send_one_fd(transport_fd, fd, flags) send_one_fd_iov_sa(transport_fd, fd, NULL, 0, NULL, 0, flags)
ssize_t receive_one_fd_iov(int transport_fd, struct iovec *iov, size_t iovlen, int flags, int *ret_fd);
diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c
index b9c5f3ad04..dca78fa72c 100644
--- a/src/coredump/coredump.c
+++ b/src/coredump/coredump.c
@@ -24,6 +24,7 @@
#include "coredump-vacuum.h"
#include "dirent-util.h"
#include "elf-util.h"
+#include "env-util.h"
#include "escape.h"
#include "fd-util.h"
#include "fileio.h"
@@ -36,6 +37,7 @@
#include "main-func.h"
#include "memory-util.h"
#include "mkdir-label.h"
+#include "namespace-util.h"
#include "parse-util.h"
#include "process-util.h"
#include "signal-util.h"
@@ -130,6 +132,8 @@ typedef struct Context {
const char *meta[_META_MAX];
size_t meta_size[_META_MAX];
pid_t pid;
+ uid_t uid;
+ gid_t gid;
bool is_pid1;
bool is_journald;
} Context;
@@ -866,36 +870,11 @@ static int get_process_container_parent_cmdline(pid_t pid, char** cmdline) {
return 1;
}
-static int change_uid_gid(const Context *context) {
- uid_t uid;
- gid_t gid;
- int r;
-
- r = parse_uid(context->meta[META_ARGV_UID], &uid);
- if (r < 0)
- return r;
-
- if (uid_is_system(uid)) {
- const char *user = "systemd-coredump";
-
- r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
- if (r < 0) {
- log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user);
- uid = gid = 0;
- }
- } else {
- r = parse_gid(context->meta[META_ARGV_GID], &gid);
- if (r < 0)
- return r;
- }
-
- return drop_privileges(uid, gid, 0);
-}
-
static int submit_coredump(
const Context *context,
struct iovec_wrapper *iovw,
- int input_fd) {
+ int input_fd,
+ int mntns_fd) {
_cleanup_(json_variant_unrefp) JsonVariant *json_metadata = NULL;
_cleanup_close_ int coredump_fd = -1, coredump_node_fd = -1;
@@ -938,15 +917,6 @@ static int submit_coredump(
/* Vacuum again, but exclude the coredump we just created */
(void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
- /* Now, let's drop privileges to become the user who owns the segfaulted process
- * and allocate the coredump memory under the user's uid. This also ensures that
- * the credentials journald will see are the ones of the coredumping user, thus
- * making sure the user gets access to the core dump. Let's also get rid of all
- * capabilities, if we run as root, we won't need them anymore. */
- r = change_uid_gid(context);
- if (r < 0)
- return log_error_errno(r, "Failed to drop privileges: %m");
-
/* Try to get a stack trace if we can */
if (coredump_size > arg_process_size_max)
log_debug("Not generating stack trace: core size %"PRIu64" is greater "
@@ -956,12 +926,23 @@ static int submit_coredump(
bool skip = startswith(context->meta[META_COMM], "systemd-coredum"); /* COMM is 16 bytes usually */
(void) parse_elf_object(coredump_fd,
+ mntns_fd,
+ context->uid,
+ context->gid,
context->meta[META_EXE],
/* fork_disable_dump= */ skip, /* avoid loops */
&stacktrace,
&json_metadata);
}
+ /* Now, let's drop privileges to become the user who owns the segfaulted process. This also ensures
+ * that the credentials journald will see are the ones of the coredumping user, thus making sure
+ * the user gets access to the core dump. Let's also get rid of all capabilities, if we run as root,
+ * we won't need them anymore. */
+ r = drop_privileges(context->uid, context->gid, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to drop privileges: %m");
+
log:
core_message = strjoina("Process ", context->meta[META_ARGV_PID],
" (", context->meta[META_COMM], ") of user ",
@@ -1094,6 +1075,15 @@ static int save_context(Context *context, const struct iovec_wrapper *iovw) {
if (r < 0)
return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]);
+ r = parse_uid(context->meta[META_ARGV_UID], &context->uid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse UID \"%s\": %m", context->meta[META_ARGV_UID]);
+
+ r = parse_gid(context->meta[META_ARGV_GID], &context->gid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse GID \"%s\": %m", context->meta[META_ARGV_GID]);
+
+
unit = context->meta[META_UNIT];
context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE);
context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE);
@@ -1102,11 +1092,11 @@ static int save_context(Context *context, const struct iovec_wrapper *iovw) {
}
static int process_socket(int fd) {
- _cleanup_close_ int input_fd = -1;
+ _cleanup_close_ int input_fd = -EBADF, mntns_fd = -EBADF;
Context context = {};
struct iovec_wrapper iovw = {};
struct iovec iovec;
- int r;
+ int iterations = 0, r;
assert(fd >= 0);
@@ -1146,23 +1136,39 @@ static int process_socket(int fd) {
goto finish;
}
- /* The final zero-length datagram carries the file descriptor and tells us
+ /* The final zero-length datagram carries the file descriptors and tells us
* that we're done. */
if (n == 0) {
struct cmsghdr *found;
free(iovec.iov_base);
- found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
- if (!found) {
- cmsg_close_all(&mh);
- r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
- "Coredump file descriptor missing.");
- goto finish;
+ found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int) * 2));
+ if (found) {
+ int fds[2] = { -EBADF, -EBADF };
+
+ memcpy(fds, CMSG_DATA(found), sizeof(int) * 2);
+
+ assert(mntns_fd < 0);
+
+ /* Maybe we already got coredump FD in previous iteration? */
+ safe_close(input_fd);
+
+ input_fd = fds[0];
+ mntns_fd = fds[1];
+
+ /* We have all FDs we need let's take a shortcut here. */
+ break;
+ } else {
+ found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
+ if (found)
+ input_fd = *CMSG_DATA(found);
}
- assert(input_fd < 0);
- input_fd = *(int*) CMSG_DATA(found);
+ /* This is the first message that carries file descriptors, maybe there will be one more that actually contains array of descriptors. */
+ if (iterations++ == 0)
+ continue;
+
break;
} else
cmsg_close_all(&mh);
@@ -1177,7 +1183,11 @@ static int process_socket(int fd) {
}
/* Make sure we got all data we really need */
- assert(input_fd >= 0);
+ if (input_fd < 0) {
+ r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+ "Coredump file descriptor missing.");
+ goto finish;
+ }
r = save_context(&context, &iovw);
if (r < 0)
@@ -1192,15 +1202,15 @@ static int process_socket(int fd) {
goto finish;
}
- r = submit_coredump(&context, &iovw, input_fd);
+ r = submit_coredump(&context, &iovw, input_fd, mntns_fd);
finish:
iovw_free_contents(&iovw, true);
return r;
}
-static int send_iovec(const struct iovec_wrapper *iovw, int input_fd) {
- _cleanup_close_ int fd = -1;
+static int send_iovec(const struct iovec_wrapper *iovw, int input_fd, int mntns_fd) {
+ _cleanup_close_ int fd = -EBADF;
int r;
assert(iovw);
@@ -1256,6 +1266,12 @@ static int send_iovec(const struct iovec_wrapper *iovw, int input_fd) {
if (r < 0)
return log_error_errno(r, "Failed to send coredump fd: %m");
+ if (mntns_fd >= 0) {
+ r = send_many_fds(fd, (int[]) { input_fd, mntns_fd }, 2, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to send coredump fds: %m");
+ }
+
return 0;
}
@@ -1428,7 +1444,7 @@ static int gather_pid_metadata(struct iovec_wrapper *iovw, Context *context) {
static int process_kernel(int argc, char* argv[]) {
Context context = {};
struct iovec_wrapper *iovw;
- int r;
+ int r, mntns_fd = -EBADF;
/* When we're invoked by the kernel, stdout/stderr are closed which is dangerous because the fds
* could get reallocated. To avoid hard to debug issues, let's instead bind stdout/stderr to
@@ -1462,6 +1478,25 @@ static int process_kernel(int argc, char* argv[]) {
log_open();
}
+ r = in_same_namespace(getpid_cached(), context.pid, NAMESPACE_PID);
+ if (r < 0)
+ log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m");
+
+ if (r == 0 && getenv_bool("SYSTEMD_COREDUMP_ALLOW_NAMESPACE_CHANGE") > 0) {
+ r = namespace_open(context.pid, NULL, &mntns_fd, NULL, NULL, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to open mntns of crashing process: %m");
+ } else {
+ /* Crashing process is not running in the container or changing namespace is disabled, but we
+ still need to send mount namespace fd along side coredump fd so let's just open our own
+ mount namespace. Entering it will be NOP but that is OK. */
+ r = namespace_open(getpid_cached(), NULL, &mntns_fd, NULL, NULL, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to open our mntns: %m");
+ }
+
+ assert(mntns_fd >= 0 && fd_is_ns(mntns_fd, CLONE_NEWNS) > 0);
+
/* If this is PID 1 disable coredump collection, we'll unlikely be able to process
* it later on.
*
@@ -1474,9 +1509,9 @@ static int process_kernel(int argc, char* argv[]) {
}
if (context.is_journald || context.is_pid1)
- r = submit_coredump(&context, iovw, STDIN_FILENO);
+ r = submit_coredump(&context, iovw, STDIN_FILENO, mntns_fd);
else
- r = send_iovec(iovw, STDIN_FILENO);
+ r = send_iovec(iovw, STDIN_FILENO, mntns_fd);
finish:
iovw = iovw_free_free(iovw);
diff --git a/src/shared/elf-util.c b/src/shared/elf-util.c
index bde5013b92..6a2969732d 100644
--- a/src/shared/elf-util.c
+++ b/src/shared/elf-util.c
@@ -12,6 +12,7 @@
#include <unistd.h>
#include "alloc-util.h"
+#include "capability-util.h"
#include "dlfcn-util.h"
#include "elf-util.h"
#include "errno-util.h"
@@ -21,9 +22,12 @@
#include "hexdecoct.h"
#include "io-util.h"
#include "macro.h"
+#include "namespace-util.h"
#include "process-util.h"
#include "rlimit-util.h"
#include "string-util.h"
+#include "uid-alloc-range.h"
+#include "user-util.h"
#include "util.h"
#define FRAMES_MAX 64
@@ -752,11 +756,27 @@ static int parse_elf(int fd, const char *executable, char **ret, JsonVariant **r
return 0;
}
-int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) {
+static int core_change_uid_gid(uid_t uid, gid_t gid) {
+ uid_t u = uid;
+ gid_t g = gid;
+ int r;
+
+ if (uid_is_system(u)) {
+ const char *user = "systemd-coredump";
+
+ r = get_user_creds(&user, &u, &g, NULL, NULL, 0);
+ if (r < 0)
+ log_warning_errno(r, "Cannot resolve %s user, ignoring: %m", user);
+ }
+
+ return drop_privileges(u, g, 0);
+}
+
+int parse_elf_object(int fd, int mntns_fd, uid_t uid, gid_t gid, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) {
_cleanup_close_pair_ int error_pipe[2] = { -1, -1 }, return_pipe[2] = { -1, -1 }, json_pipe[2] = { -1, -1 };
_cleanup_(json_variant_unrefp) JsonVariant *package_metadata = NULL;
_cleanup_free_ char *buf = NULL;
- int r;
+ int flags, r;
assert(fd >= 0);
@@ -784,6 +804,10 @@ int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, cha
return r;
}
+ flags = FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_USERNS|FORK_WAIT|FORK_REOPEN_LOG;
+ if (mntns_fd >= 0)
+ flags &= ~(FORK_CLOSE_ALL_FDS|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_USERNS);
+
/* Parsing possibly malformed data is crash-happy, so fork. In case we crash,
* the core file will not be lost, and the messages will still be attached to
* the journal. Reading the elf object might be slow, but it still has an upper
@@ -793,7 +817,7 @@ int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, cha
r = safe_fork_full("(sd-parse-elf)",
(int[]){ fd, error_pipe[1], return_pipe[1], json_pipe[1] },
4,
- FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_USERNS|FORK_WAIT|FORK_REOPEN_LOG,
+ flags,
NULL);
if (r < 0) {
if (r == -EPROTO) { /* We should have the errno from the child, but don't clobber original error */
@@ -811,6 +835,22 @@ int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, cha
return r;
}
if (r == 0) {
+ if (mntns_fd >= 0) {
+ r = namespace_enter(/* pidns_fd = */ -EBADF,
+ mntns_fd,
+ /* netns_fd = */ -EBADF,
+ /* userns_fd = */ -EBADF,
+ /* root_fd = */ -EBADF);
+ if (r < 0)
+ log_notice_errno(r, "Failed to enter mount namespace of crashing process, ignoring: %m");
+ }
+
+ if (uid != UID_NOBODY && gid != GID_NOBODY) {
+ r = core_change_uid_gid(uid, gid);
+ if (r < 0)
+ log_notice_errno(r, "Failed to drop privileges, ignoring: %m");
+ }
+
/* We want to avoid loops, given this can be called from systemd-coredump */
if (fork_disable_dump) {
r = RET_NERRNO(prctl(PR_SET_DUMPABLE, 0));
diff --git a/src/shared/elf-util.h b/src/shared/elf-util.h
index b28e64cea6..350464941f 100644
--- a/src/shared/elf-util.h
+++ b/src/shared/elf-util.h
@@ -10,9 +10,9 @@ int dlopen_elf(void);
/* Parse an ELF object in a forked process, so that errors while iterating over
* untrusted and potentially malicious data do not propagate to the main caller's process.
* If fork_disable_dump, the child process will not dump core if it crashes. */
-int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata);
+int parse_elf_object(int fd, int mntns_fd, uid_t uid, gid_t gid, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata);
#else
-static inline int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) {
+static inline int parse_elf_object(int fd, int mntns_fd, uid_t uid, gid_t gid, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) {
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "elfutils disabled, parsing ELF objects not supported");
}
#endif