From 1556c13f89f5db22911a4e771af9253a9b79e02c Mon Sep 17 00:00:00 2001 From: Sohan Kunkerkar Date: Thu, 28 Aug 2025 08:53:20 -0400 Subject: [PATCH 1/2] src/libcrun: limit tmpfs memory usage for masked paths Replace "size=0k" (unlimited growth) with explicit block and inode limits for tmpfs mounts used in masked directory paths. This prevents excessive kernel memory consumption under high container density. Signed-off-by: Sohan Kunkerkar --- src/libcrun/linux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 75120cea37..36ed40bb5b 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -1114,7 +1114,7 @@ do_masked_or_readonly_path (libcrun_container_t *container, const char *rel_path return crun_make_error (err, errno, "cannot stat `%s`", rel_path); if ((mode & S_IFMT) == S_IFDIR) - ret = do_mount (container, "tmpfs", pathfd, rel_path, "tmpfs", MS_RDONLY, "size=0k", LABEL_MOUNT, err); + ret = do_mount (container, "tmpfs", pathfd, rel_path, "tmpfs", MS_RDONLY, "nr_blocks=1,nr_inodes=1", LABEL_MOUNT, err); else ret = do_mount (container, "/dev/null", pathfd, rel_path, NULL, MS_BIND | MS_RDONLY, NULL, LABEL_MOUNT, err); if (UNLIKELY (ret < 0)) From 4004e5bed9ff52029a829131fbc16f9a877154b9 Mon Sep 17 00:00:00 2001 From: Sohan Kunkerkar Date: Tue, 26 Aug 2025 23:22:56 -0400 Subject: [PATCH 2/2] linux: optimize masked paths with shared empty directory Optimize masked path handling by bind-mounting a shared empty directory (via cached /proc/self/fd) instead of creating per-path tmpfs mounts. This reduces kernel memory and mount syscall overhead under high container density. Signed-off-by: Sohan Kunkerkar --- src/libcrun/linux.c | 110 ++++++++++++++++++++++++++++++++++++++++++- src/libcrun/status.c | 2 +- src/libcrun/status.h | 1 + 3 files changed, 111 insertions(+), 2 deletions(-) diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 36ed40bb5b..552715f729 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -21,6 +21,7 @@ #include #include "linux.h" #include "utils.h" +#include "status.h" #include #include #include @@ -148,6 +149,12 @@ struct private_data_s /* Used to save stdin, stdout, stderr during checkpointing to descriptors.json * and needed during restore. */ char *external_descriptors; + + /* Cached shared empty directory for masked paths optimization */ + int maskdir_fd; + char *maskdir_proc_path; + bool maskdir_bind_failed; + bool maskdir_warned; }; struct linux_namespace_s @@ -164,6 +171,8 @@ cleanup_private_data (void *private_data) if (p->rootfsfd >= 0) TEMP_FAILURE_RETRY (close (p->rootfsfd)); + if (p->maskdir_fd >= 0) + TEMP_FAILURE_RETRY (close (p->maskdir_fd)); if (p->mount_fds) cleanup_close_mapp (&(p->mount_fds)); if (p->dev_fds) @@ -173,6 +182,7 @@ cleanup_private_data (void *private_data) free (p->host_notify_socket_path); free (p->container_notify_socket_path); free (p->external_descriptors); + free (p->maskdir_proc_path); free (p); } @@ -185,6 +195,7 @@ get_private_data (struct libcrun_container_s *container) container->private_data = p; p->rootfsfd = -1; p->notify_socket_tree_fd = -1; + p->maskdir_fd = -1; container->cleanup_private_data = cleanup_private_data; } return container->private_data; @@ -1058,6 +1069,103 @@ has_mount_for (libcrun_container_t *container, const char *destination) return false; } +static void +warn_tmpfs_fallback_once (struct private_data_s *private_data, const char *reason) +{ + if (! private_data->maskdir_warned) + { + libcrun_warning ("Falling back to tmpfs for masked dirs (reason: %s)", reason); + private_data->maskdir_warned = true; + } +} + +/* Get or create the cached shared empty directory for masked paths optimization. + * Creates directory and FD once per container, caches /proc/self/fd path for fast mounting. + */ +static int +get_shared_empty_dir_cached (libcrun_container_t *container, char **proc_fd_path, libcrun_error_t *err) +{ + struct private_data_s *private_data = get_private_data (container); + cleanup_close int fd = -1; + cleanup_free char *run_dir = NULL; + cleanup_free char *empty_dir_path = NULL; + int ret; + + /* Fast path: return cached proc fd path if already set up */ + if (private_data->maskdir_proc_path != NULL) + { + *proc_fd_path = private_data->maskdir_proc_path; + return 0; + } + + /* Slow path: create directory and cache everything once */ + ret = get_run_directory (&run_dir, container->context->state_root, err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = append_paths (&empty_dir_path, err, run_dir, ".empty-directory", NULL); + if (UNLIKELY (ret < 0)) + return ret; + + /* Ensure the empty directory exists (once per container) */ + ret = crun_ensure_directory (empty_dir_path, 0555, false, err); + if (UNLIKELY (ret < 0)) + return ret; + + /* Open directory and cache FD (once per container) */ + fd = open (empty_dir_path, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (fd < 0) + return crun_make_error (err, errno, "open directory `%s`", empty_dir_path); + + /* Cache the /proc/self/fd path for fast mounting */ + ret = xasprintf (&private_data->maskdir_proc_path, "/proc/self/fd/%d", fd); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "xasprintf failed"); + + private_data->maskdir_fd = fd; + fd = -1; /* Don't auto-close */ + + *proc_fd_path = private_data->maskdir_proc_path; + return 0; +} + +static int +mount_masked_dir (libcrun_container_t *container, int pathfd, const char *rel_path, libcrun_error_t *err) +{ + struct private_data_s *private_data = get_private_data (container); + char *proc_fd_path = NULL; + libcrun_error_t tmp_err = NULL; + int ret; + + if (private_data->maskdir_bind_failed) + goto fallback_to_tmpfs; + + /* Get cached /proc/self/fd path (fast after first call) */ + ret = get_shared_empty_dir_cached (container, &proc_fd_path, &tmp_err); + if (ret < 0) + { + private_data->maskdir_bind_failed = true; + warn_tmpfs_fallback_once (private_data, tmp_err->msg); + crun_error_release (&tmp_err); + goto fallback_to_tmpfs; + } + + ret = do_mount (container, proc_fd_path, pathfd, rel_path, NULL, MS_BIND | MS_RDONLY, NULL, LABEL_MOUNT, &tmp_err); + if (LIKELY (ret >= 0)) + return ret; + + /* Bind mount failed - mark as failed and fall back for all future mounts */ + private_data->maskdir_bind_failed = true; + libcrun_warning ("bind mount failed for %s to %s: %s, falling back to tmpfs", + proc_fd_path, rel_path, tmp_err->msg); + warn_tmpfs_fallback_once (private_data, tmp_err->msg); + crun_error_release (&tmp_err); + +fallback_to_tmpfs: + libcrun_debug ("using tmpfs fallback for %s", rel_path); + return ret = do_mount (container, "tmpfs", pathfd, rel_path, "tmpfs", MS_RDONLY, "nr_blocks=1,nr_inodes=1", LABEL_MOUNT, err); +} + static int do_masked_or_readonly_path (libcrun_container_t *container, const char *rel_path, bool readonly, bool keep_flags, libcrun_error_t *err) @@ -1114,7 +1222,7 @@ do_masked_or_readonly_path (libcrun_container_t *container, const char *rel_path return crun_make_error (err, errno, "cannot stat `%s`", rel_path); if ((mode & S_IFMT) == S_IFDIR) - ret = do_mount (container, "tmpfs", pathfd, rel_path, "tmpfs", MS_RDONLY, "nr_blocks=1,nr_inodes=1", LABEL_MOUNT, err); + ret = mount_masked_dir (container, pathfd, rel_path, err); else ret = do_mount (container, "/dev/null", pathfd, rel_path, NULL, MS_BIND | MS_RDONLY, NULL, LABEL_MOUNT, err); if (UNLIKELY (ret < 0)) diff --git a/src/libcrun/status.c b/src/libcrun/status.c index 714a31adc7..c786ef6ea9 100644 --- a/src/libcrun/status.c +++ b/src/libcrun/status.c @@ -55,7 +55,7 @@ validate_id (const char *id, libcrun_error_t *err) return 0; } -static int +int get_run_directory (char **out, const char *state_root, libcrun_error_t *err) { int ret; diff --git a/src/libcrun/status.h b/src/libcrun/status.h index cd6c0ced16..72a94348a5 100644 --- a/src/libcrun/status.h +++ b/src/libcrun/status.h @@ -65,6 +65,7 @@ int libcrun_status_create_exec_fifo (const char *state_root, const char *id, lib int libcrun_status_write_exec_fifo (const char *state_root, const char *id, libcrun_error_t *err); int libcrun_status_has_read_exec_fifo (const char *state_root, const char *id, libcrun_error_t *err); int libcrun_check_pid_valid (libcrun_container_status_t *status, libcrun_error_t *err); +int get_run_directory (char **out, const char *state_root, libcrun_error_t *err); static inline void libcrun_free_container_listp (void *p)