Compare commits
No commits in common. "c8-stream-1.0" and "c9-beta" have entirely different histories.
c8-stream-
...
c9-beta
2
.gitignore
vendored
2
.gitignore
vendored
@ -1 +1 @@
|
|||||||
SOURCES/runc-2abd837.tar.gz
|
SOURCES/v1.2.4.tar.gz
|
||||||
|
@ -1 +1 @@
|
|||||||
cf7119a838db2963e7af6ecdba90a2cc95ec0d56 SOURCES/runc-2abd837.tar.gz
|
14c6119053012f4072aed8dbb9fd47b57c47279b SOURCES/v1.2.4.tar.gz
|
||||||
|
103
SOURCES/0001-Bump-runtime-spec-to-latest-git-HEAD.patch
Normal file
103
SOURCES/0001-Bump-runtime-spec-to-latest-git-HEAD.patch
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
From c6dad73d617864f3a281ac1fdaacd5ed971fa317 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||||
|
Date: Thu, 27 Jun 2024 09:00:51 -0700
|
||||||
|
Subject: [PATCH 1/2] Bump runtime-spec to latest git HEAD
|
||||||
|
|
||||||
|
This is to include
|
||||||
|
- https://github.com/opencontainers/runtime-spec/pull/1261
|
||||||
|
- https://github.com/opencontainers/runtime-spec/pull/1253
|
||||||
|
|
||||||
|
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||||
|
(cherry picked from commit 2cac22b1e29e6be4c004f35ce582aa2b7e1c2fda)
|
||||||
|
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||||
|
---
|
||||||
|
go.mod | 2 +-
|
||||||
|
go.sum | 4 ++--
|
||||||
|
.../opencontainers/runtime-spec/specs-go/config.go | 8 ++++++++
|
||||||
|
.../opencontainers/runtime-spec/specs-go/version.go | 2 +-
|
||||||
|
vendor/modules.txt | 2 +-
|
||||||
|
5 files changed, 13 insertions(+), 5 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/go.mod b/go.mod
|
||||||
|
index 348bc9c6..db2d7ef1 100644
|
||||||
|
--- a/go.mod
|
||||||
|
+++ b/go.mod
|
||||||
|
@@ -19,7 +19,7 @@ require (
|
||||||
|
github.com/moby/sys/user v0.3.0
|
||||||
|
github.com/moby/sys/userns v0.1.0
|
||||||
|
github.com/mrunalp/fileutils v0.5.1
|
||||||
|
- github.com/opencontainers/runtime-spec v1.2.0
|
||||||
|
+ github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
|
||||||
|
github.com/opencontainers/selinux v1.11.0
|
||||||
|
github.com/seccomp/libseccomp-golang v0.10.0
|
||||||
|
github.com/sirupsen/logrus v1.9.3
|
||||||
|
diff --git a/go.sum b/go.sum
|
||||||
|
index 225d5860..4c863cc9 100644
|
||||||
|
--- a/go.sum
|
||||||
|
+++ b/go.sum
|
||||||
|
@@ -46,8 +46,8 @@ github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g
|
||||||
|
github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28=
|
||||||
|
github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
|
||||||
|
github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
|
||||||
|
-github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
|
||||||
|
-github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
|
||||||
|
+github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95 h1:Ghl8Z3l+yPQUDSxAp7Kg7fJLRNNXjOsR6ooDcca7PjU=
|
||||||
|
+github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
|
||||||
|
github.com/opencontainers/selinux v1.11.0 h1:+5Zbo97w3Lbmb3PeqQtpmTkMwsW5nRI3YaLpt7tQ7oU=
|
||||||
|
github.com/opencontainers/selinux v1.11.0/go.mod h1:E5dMC3VPuVvVHDYmi78qvhJp8+M586T4DlDRYpFkyec=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
|
diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
|
||||||
|
index d1236ba7..671f0d01 100644
|
||||||
|
--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
|
||||||
|
+++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
|
||||||
|
@@ -94,6 +94,8 @@ type Process struct {
|
||||||
|
SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
|
||||||
|
// IOPriority contains the I/O priority settings for the cgroup.
|
||||||
|
IOPriority *LinuxIOPriority `json:"ioPriority,omitempty" platform:"linux"`
|
||||||
|
+ // ExecCPUAffinity specifies CPU affinity for exec processes.
|
||||||
|
+ ExecCPUAffinity *CPUAffinity `json:"execCPUAffinity,omitempty" platform:"linux"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// LinuxCapabilities specifies the list of allowed capabilities that are kept for a process.
|
||||||
|
@@ -127,6 +129,12 @@ const (
|
||||||
|
IOPRIO_CLASS_IDLE IOPriorityClass = "IOPRIO_CLASS_IDLE"
|
||||||
|
)
|
||||||
|
|
||||||
|
+// CPUAffinity specifies process' CPU affinity.
|
||||||
|
+type CPUAffinity struct {
|
||||||
|
+ Initial string `json:"initial,omitempty"`
|
||||||
|
+ Final string `json:"final,omitempty"`
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
// Box specifies dimensions of a rectangle. Used for specifying the size of a console.
|
||||||
|
type Box struct {
|
||||||
|
// Height is the vertical dimension of a box.
|
||||||
|
diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
|
||||||
|
index 503971e0..f6c15f6c 100644
|
||||||
|
--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
|
||||||
|
+++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
|
||||||
|
@@ -11,7 +11,7 @@ const (
|
||||||
|
VersionPatch = 0
|
||||||
|
|
||||||
|
// VersionDev indicates development branch. Releases will be empty string.
|
||||||
|
- VersionDev = ""
|
||||||
|
+ VersionDev = "+dev"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Version is the specification version that the package types support.
|
||||||
|
diff --git a/vendor/modules.txt b/vendor/modules.txt
|
||||||
|
index 3b245e0d..df520923 100644
|
||||||
|
--- a/vendor/modules.txt
|
||||||
|
+++ b/vendor/modules.txt
|
||||||
|
@@ -46,7 +46,7 @@ github.com/moby/sys/userns
|
||||||
|
# github.com/mrunalp/fileutils v0.5.1
|
||||||
|
## explicit; go 1.13
|
||||||
|
github.com/mrunalp/fileutils
|
||||||
|
-# github.com/opencontainers/runtime-spec v1.2.0
|
||||||
|
+# github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
|
||||||
|
## explicit
|
||||||
|
github.com/opencontainers/runtime-spec/specs-go
|
||||||
|
github.com/opencontainers/runtime-spec/specs-go/features
|
||||||
|
--
|
||||||
|
2.47.1
|
||||||
|
|
@ -1,62 +0,0 @@
|
|||||||
From dfb3496c174377b860b62872ce6af951364cc3ac Mon Sep 17 00:00:00 2001
|
|
||||||
From: Lokesh Mandvekar <lsm5@fedoraproject.org>
|
|
||||||
Date: Tue, 12 Dec 2017 13:22:42 +0530
|
|
||||||
Subject: [PATCH] Revert "Apply cgroups earlier"
|
|
||||||
|
|
||||||
This reverts commit 7062c7556b71188abc18d7516441ff4b03fbc1fc.
|
|
||||||
---
|
|
||||||
libcontainer/process_linux.go | 31 ++++++++++++++-----------------
|
|
||||||
1 file changed, 14 insertions(+), 17 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
|
|
||||||
index 149b1126..b8a395af 100644
|
|
||||||
--- a/libcontainer/process_linux.go
|
|
||||||
+++ b/libcontainer/process_linux.go
|
|
||||||
@@ -272,6 +272,20 @@ func (p *initProcess) start() error {
|
|
||||||
p.process.ops = nil
|
|
||||||
return newSystemErrorWithCause(err, "starting init process command")
|
|
||||||
}
|
|
||||||
+ if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
|
|
||||||
+ return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
|
|
||||||
+ }
|
|
||||||
+ if err := p.execSetns(); err != nil {
|
|
||||||
+ return newSystemErrorWithCause(err, "running exec setns process for init")
|
|
||||||
+ }
|
|
||||||
+ // Save the standard descriptor names before the container process
|
|
||||||
+ // can potentially move them (e.g., via dup2()). If we don't do this now,
|
|
||||||
+ // we won't know at checkpoint time which file descriptor to look up.
|
|
||||||
+ fds, err := getPipeFds(p.pid())
|
|
||||||
+ if err != nil {
|
|
||||||
+ return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
|
|
||||||
+ }
|
|
||||||
+ p.setExternalDescriptors(fds)
|
|
||||||
// Do this before syncing with child so that no children can escape the
|
|
||||||
// cgroup. We don't need to worry about not doing this and not being root
|
|
||||||
// because we'd be using the rootless cgroup manager in that case.
|
|
||||||
@@ -292,23 +306,6 @@ func (p *initProcess) start() error {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
-
|
|
||||||
- if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
|
|
||||||
- return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- if err := p.execSetns(); err != nil {
|
|
||||||
- return newSystemErrorWithCause(err, "running exec setns process for init")
|
|
||||||
- }
|
|
||||||
-
|
|
||||||
- // Save the standard descriptor names before the container process
|
|
||||||
- // can potentially move them (e.g., via dup2()). If we don't do this now,
|
|
||||||
- // we won't know at checkpoint time which file descriptor to look up.
|
|
||||||
- fds, err := getPipeFds(p.pid())
|
|
||||||
- if err != nil {
|
|
||||||
- return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
|
|
||||||
- }
|
|
||||||
- p.setExternalDescriptors(fds)
|
|
||||||
if err := p.createNetworkInterfaces(); err != nil {
|
|
||||||
return newSystemErrorWithCause(err, "creating network interfaces")
|
|
||||||
}
|
|
||||||
--
|
|
||||||
2.14.3
|
|
||||||
|
|
@ -1,290 +0,0 @@
|
|||||||
From bf6405284aa3870a39b402309003633a1c230ed9 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Aleksa Sarai <asarai@suse.de>
|
|
||||||
Date: Wed, 9 Jan 2019 13:40:01 +1100
|
|
||||||
Subject: [PATCH 1/1] nsenter: clone /proc/self/exe to avoid exposing host
|
|
||||||
binary to container
|
|
||||||
|
|
||||||
There are quite a few circumstances where /proc/self/exe pointing to a
|
|
||||||
pretty important container binary is a _bad_ thing, so to avoid this we
|
|
||||||
have to make a copy (preferably doing self-clean-up and not being
|
|
||||||
writeable).
|
|
||||||
|
|
||||||
As a hotfix we require memfd_create(2), but we can always extend this to
|
|
||||||
use a scratch MNT_DETACH overlayfs or tmpfs. The main downside to this
|
|
||||||
approach is no page-cache sharing for the runc binary (which overlayfs
|
|
||||||
would give us) but this is far less complicated.
|
|
||||||
|
|
||||||
This is only done during nsenter so that it happens transparently to the
|
|
||||||
Go code, and any libcontainer users benefit from it. This also makes
|
|
||||||
ExtraFiles and --preserve-fds handling trivial (because we don't need to
|
|
||||||
worry about it).
|
|
||||||
|
|
||||||
Fixes: CVE-2019-5736
|
|
||||||
Co-developed-by: Christian Brauner <christian.brauner@ubuntu.com>
|
|
||||||
Signed-off-by: Aleksa Sarai <asarai@suse.de>
|
|
||||||
Signed-off-by: Mrunal Patel <mrunalp@gmail.com>
|
|
||||||
---
|
|
||||||
libcontainer/nsenter/cloned_binary.c | 221 +++++++++++++++++++++++++++
|
|
||||||
libcontainer/nsenter/nsexec.c | 11 ++
|
|
||||||
2 files changed, 232 insertions(+)
|
|
||||||
create mode 100644 libcontainer/nsenter/cloned_binary.c
|
|
||||||
|
|
||||||
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
|
|
||||||
new file mode 100644
|
|
||||||
index 00000000..d9f6093a
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/libcontainer/nsenter/cloned_binary.c
|
|
||||||
@@ -0,0 +1,221 @@
|
|
||||||
+#define _GNU_SOURCE
|
|
||||||
+#include <unistd.h>
|
|
||||||
+#include <stdio.h>
|
|
||||||
+#include <stdlib.h>
|
|
||||||
+#include <stdbool.h>
|
|
||||||
+#include <string.h>
|
|
||||||
+#include <limits.h>
|
|
||||||
+#include <fcntl.h>
|
|
||||||
+#include <errno.h>
|
|
||||||
+
|
|
||||||
+#include <sys/types.h>
|
|
||||||
+#include <sys/stat.h>
|
|
||||||
+#include <sys/vfs.h>
|
|
||||||
+#include <sys/mman.h>
|
|
||||||
+#include <sys/sendfile.h>
|
|
||||||
+#include <sys/syscall.h>
|
|
||||||
+
|
|
||||||
+#include <linux/magic.h>
|
|
||||||
+#include <linux/memfd.h>
|
|
||||||
+
|
|
||||||
+/* Use our own wrapper for memfd_create. */
|
|
||||||
+#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
|
|
||||||
+# define SYS_memfd_create __NR_memfd_create
|
|
||||||
+#endif
|
|
||||||
+#ifndef SYS_memfd_create
|
|
||||||
+# error "memfd_create(2) syscall not supported by this glibc version"
|
|
||||||
+#endif
|
|
||||||
+int memfd_create(const char *name, unsigned int flags)
|
|
||||||
+{
|
|
||||||
+ return syscall(SYS_memfd_create, name, flags);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+/* This comes directly from <linux/fcntl.h>. */
|
|
||||||
+#ifndef F_LINUX_SPECIFIC_BASE
|
|
||||||
+# define F_LINUX_SPECIFIC_BASE 1024
|
|
||||||
+#endif
|
|
||||||
+#ifndef F_ADD_SEALS
|
|
||||||
+# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
|
|
||||||
+# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
|
|
||||||
+#endif
|
|
||||||
+#ifndef F_SEAL_SEAL
|
|
||||||
+# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
|
|
||||||
+# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
|
|
||||||
+# define F_SEAL_GROW 0x0004 /* prevent file from growing */
|
|
||||||
+# define F_SEAL_WRITE 0x0008 /* prevent writes */
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+#define OUR_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
|
|
||||||
+#define OUR_MEMFD_SEALS \
|
|
||||||
+ (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
|
|
||||||
+
|
|
||||||
+static void *must_realloc(void *ptr, size_t size)
|
|
||||||
+{
|
|
||||||
+ void *old = ptr;
|
|
||||||
+ do {
|
|
||||||
+ ptr = realloc(old, size);
|
|
||||||
+ } while(!ptr);
|
|
||||||
+ return ptr;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+/*
|
|
||||||
+ * Verify whether we are currently in a self-cloned program (namely, is
|
|
||||||
+ * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
|
|
||||||
+ * for shmem files), and we want to be sure it's actually sealed.
|
|
||||||
+ */
|
|
||||||
+static int is_self_cloned(void)
|
|
||||||
+{
|
|
||||||
+ int fd, seals;
|
|
||||||
+
|
|
||||||
+ fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
|
|
||||||
+ if (fd < 0)
|
|
||||||
+ return -ENOTRECOVERABLE;
|
|
||||||
+
|
|
||||||
+ seals = fcntl(fd, F_GET_SEALS);
|
|
||||||
+ close(fd);
|
|
||||||
+ return seals == OUR_MEMFD_SEALS;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+/*
|
|
||||||
+ * Basic wrapper around mmap(2) that gives you the file length so you can
|
|
||||||
+ * safely treat it as an ordinary buffer. Only gives you read access.
|
|
||||||
+ */
|
|
||||||
+static char *read_file(char *path, size_t *length)
|
|
||||||
+{
|
|
||||||
+ int fd;
|
|
||||||
+ char buf[4096], *copy = NULL;
|
|
||||||
+
|
|
||||||
+ if (!length)
|
|
||||||
+ return NULL;
|
|
||||||
+
|
|
||||||
+ fd = open(path, O_RDONLY | O_CLOEXEC);
|
|
||||||
+ if (fd < 0)
|
|
||||||
+ return NULL;
|
|
||||||
+
|
|
||||||
+ *length = 0;
|
|
||||||
+ for (;;) {
|
|
||||||
+ int n;
|
|
||||||
+
|
|
||||||
+ n = read(fd, buf, sizeof(buf));
|
|
||||||
+ if (n < 0)
|
|
||||||
+ goto error;
|
|
||||||
+ if (!n)
|
|
||||||
+ break;
|
|
||||||
+
|
|
||||||
+ copy = must_realloc(copy, (*length + n) * sizeof(*copy));
|
|
||||||
+ memcpy(copy + *length, buf, n);
|
|
||||||
+ *length += n;
|
|
||||||
+ }
|
|
||||||
+ close(fd);
|
|
||||||
+ return copy;
|
|
||||||
+
|
|
||||||
+error:
|
|
||||||
+ close(fd);
|
|
||||||
+ free(copy);
|
|
||||||
+ return NULL;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+/*
|
|
||||||
+ * A poor-man's version of "xargs -0". Basically parses a given block of
|
|
||||||
+ * NUL-delimited data, within the given length and adds a pointer to each entry
|
|
||||||
+ * to the array of pointers.
|
|
||||||
+ */
|
|
||||||
+static int parse_xargs(char *data, int data_length, char ***output)
|
|
||||||
+{
|
|
||||||
+ int num = 0;
|
|
||||||
+ char *cur = data;
|
|
||||||
+
|
|
||||||
+ if (!data || *output != NULL)
|
|
||||||
+ return -1;
|
|
||||||
+
|
|
||||||
+ while (cur < data + data_length) {
|
|
||||||
+ num++;
|
|
||||||
+ *output = must_realloc(*output, (num + 1) * sizeof(**output));
|
|
||||||
+ (*output)[num - 1] = cur;
|
|
||||||
+ cur += strlen(cur) + 1;
|
|
||||||
+ }
|
|
||||||
+ (*output)[num] = NULL;
|
|
||||||
+ return num;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+/*
|
|
||||||
+ * "Parse" out argv and envp from /proc/self/cmdline and /proc/self/environ.
|
|
||||||
+ * This is necessary because we are running in a context where we don't have a
|
|
||||||
+ * main() that we can just get the arguments from.
|
|
||||||
+ */
|
|
||||||
+static int fetchve(char ***argv, char ***envp)
|
|
||||||
+{
|
|
||||||
+ char *cmdline = NULL, *environ = NULL;
|
|
||||||
+ size_t cmdline_size, environ_size;
|
|
||||||
+
|
|
||||||
+ cmdline = read_file("/proc/self/cmdline", &cmdline_size);
|
|
||||||
+ if (!cmdline)
|
|
||||||
+ goto error;
|
|
||||||
+ environ = read_file("/proc/self/environ", &environ_size);
|
|
||||||
+ if (!environ)
|
|
||||||
+ goto error;
|
|
||||||
+
|
|
||||||
+ if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
|
|
||||||
+ goto error;
|
|
||||||
+ if (parse_xargs(environ, environ_size, envp) <= 0)
|
|
||||||
+ goto error;
|
|
||||||
+
|
|
||||||
+ return 0;
|
|
||||||
+
|
|
||||||
+error:
|
|
||||||
+ free(environ);
|
|
||||||
+ free(cmdline);
|
|
||||||
+ return -EINVAL;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+#define SENDFILE_MAX 0x7FFFF000 /* sendfile(2) is limited to 2GB. */
|
|
||||||
+static int clone_binary(void)
|
|
||||||
+{
|
|
||||||
+ int binfd, memfd, err;
|
|
||||||
+ ssize_t sent = 0;
|
|
||||||
+
|
|
||||||
+ memfd = memfd_create(OUR_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
|
|
||||||
+ if (memfd < 0)
|
|
||||||
+ return -ENOTRECOVERABLE;
|
|
||||||
+
|
|
||||||
+ binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
|
|
||||||
+ if (binfd < 0)
|
|
||||||
+ goto error;
|
|
||||||
+
|
|
||||||
+ sent = sendfile(memfd, binfd, NULL, SENDFILE_MAX);
|
|
||||||
+ close(binfd);
|
|
||||||
+ if (sent < 0)
|
|
||||||
+ goto error;
|
|
||||||
+
|
|
||||||
+ err = fcntl(memfd, F_ADD_SEALS, OUR_MEMFD_SEALS);
|
|
||||||
+ if (err < 0)
|
|
||||||
+ goto error;
|
|
||||||
+
|
|
||||||
+ return memfd;
|
|
||||||
+
|
|
||||||
+error:
|
|
||||||
+ close(memfd);
|
|
||||||
+ return -EIO;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+int ensure_cloned_binary(void)
|
|
||||||
+{
|
|
||||||
+ int execfd;
|
|
||||||
+ char **argv = NULL, **envp = NULL;
|
|
||||||
+
|
|
||||||
+ /* Check that we're not self-cloned, and if we are then bail. */
|
|
||||||
+ int cloned = is_self_cloned();
|
|
||||||
+ if (cloned > 0 || cloned == -ENOTRECOVERABLE)
|
|
||||||
+ return cloned;
|
|
||||||
+
|
|
||||||
+ if (fetchve(&argv, &envp) < 0)
|
|
||||||
+ return -EINVAL;
|
|
||||||
+
|
|
||||||
+ execfd = clone_binary();
|
|
||||||
+ if (execfd < 0)
|
|
||||||
+ return -EIO;
|
|
||||||
+
|
|
||||||
+ fexecve(execfd, argv, envp);
|
|
||||||
+ return -ENOEXEC;
|
|
||||||
+}
|
|
||||||
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
|
|
||||||
index cb224314..784fd9b0 100644
|
|
||||||
--- a/libcontainer/nsenter/nsexec.c
|
|
||||||
+++ b/libcontainer/nsenter/nsexec.c
|
|
||||||
@@ -528,6 +528,9 @@ void join_namespaces(char *nslist)
|
|
||||||
free(namespaces);
|
|
||||||
}
|
|
||||||
|
|
||||||
+/* Defined in cloned_binary.c. */
|
|
||||||
+int ensure_cloned_binary(void);
|
|
||||||
+
|
|
||||||
void nsexec(void)
|
|
||||||
{
|
|
||||||
int pipenum;
|
|
||||||
@@ -543,6 +546,14 @@ void nsexec(void)
|
|
||||||
if (pipenum == -1)
|
|
||||||
return;
|
|
||||||
|
|
||||||
+ /*
|
|
||||||
+ * We need to re-exec if we are not in a cloned binary. This is necessary
|
|
||||||
+ * to ensure that containers won't be able to access the host binary
|
|
||||||
+ * through /proc/self/exe. See CVE-2019-5736.
|
|
||||||
+ */
|
|
||||||
+ if (ensure_cloned_binary() < 0)
|
|
||||||
+ bail("could not ensure we are a cloned binary");
|
|
||||||
+
|
|
||||||
/* Parse all of the netlink configuration. */
|
|
||||||
nl_parse(pipenum, &config);
|
|
||||||
|
|
||||||
--
|
|
||||||
2.20.1
|
|
||||||
|
|
521
SOURCES/0002-runc-exec-implement-CPU-affinity.patch
Normal file
521
SOURCES/0002-runc-exec-implement-CPU-affinity.patch
Normal file
@ -0,0 +1,521 @@
|
|||||||
|
From 73786942b7176eae1e676cf2f78af548f090e418 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||||
|
Date: Mon, 21 Oct 2024 15:50:38 -0700
|
||||||
|
Subject: [PATCH 2/2] runc exec: implement CPU affinity
|
||||||
|
|
||||||
|
As per
|
||||||
|
- https://github.com/opencontainers/runtime-spec/pull/1253
|
||||||
|
- https://github.com/opencontainers/runtime-spec/pull/1261
|
||||||
|
|
||||||
|
CPU affinity can be set in two ways:
|
||||||
|
1. When creating/starting a container, in config.json's
|
||||||
|
Process.ExecCPUAffinity, which is then applied to all execs.
|
||||||
|
2. When running an exec, in process.json's CPUAffinity, which
|
||||||
|
is applied to a given exec and overrides the value from (1).
|
||||||
|
|
||||||
|
Add some basic tests.
|
||||||
|
|
||||||
|
Note that older kernels (RHEL8, Ubuntu 20.04) change CPU affinity of a
|
||||||
|
process to that of a container's cgroup, as soon as it is moved to that
|
||||||
|
cgroup, while newer kernels (Ubuntu 24.04, Fedora 41) don't do that.
|
||||||
|
|
||||||
|
Because of the above,
|
||||||
|
- it's impossible to really test initial CPU affinity without adding
|
||||||
|
debug logging to libcontainer/nsenter;
|
||||||
|
- for older kernels, there can be a brief moment when exec's affinity
|
||||||
|
is different than either initial or final affinity being set;
|
||||||
|
- exec's final CPU affinity, if not specified, can be different
|
||||||
|
depending on the kernel, therefore we don't test it.
|
||||||
|
|
||||||
|
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||||
|
(cherry picked from commit 57237b31de367a722c5d49088912d57c28c6fb46)
|
||||||
|
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||||
|
---
|
||||||
|
libcontainer/configs/config.go | 72 ++++++++++++++++++++
|
||||||
|
libcontainer/container_linux.go | 4 ++
|
||||||
|
libcontainer/init_linux.go | 3 +-
|
||||||
|
libcontainer/nsenter/log.c | 9 ++-
|
||||||
|
libcontainer/nsenter/log.h | 3 +
|
||||||
|
libcontainer/nsenter/nsexec.c | 29 ++++++++
|
||||||
|
libcontainer/process.go | 2 +
|
||||||
|
libcontainer/process_linux.go | 49 +++++++++++++-
|
||||||
|
libcontainer/specconv/spec_linux.go | 5 ++
|
||||||
|
tests/integration/cpu_affinity.bats | 101 ++++++++++++++++++++++++++++
|
||||||
|
utils_linux.go | 6 ++
|
||||||
|
11 files changed, 277 insertions(+), 6 deletions(-)
|
||||||
|
create mode 100644 tests/integration/cpu_affinity.bats
|
||||||
|
|
||||||
|
diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go
|
||||||
|
index 22fe0f9b..daffd130 100644
|
||||||
|
--- a/libcontainer/configs/config.go
|
||||||
|
+++ b/libcontainer/configs/config.go
|
||||||
|
@@ -3,8 +3,11 @@ package configs
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/json"
|
||||||
|
+ "errors"
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
+ "strconv"
|
||||||
|
+ "strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/sirupsen/logrus"
|
||||||
|
@@ -225,6 +228,9 @@ type Config struct {
|
||||||
|
|
||||||
|
// IOPriority is the container's I/O priority.
|
||||||
|
IOPriority *IOPriority `json:"io_priority,omitempty"`
|
||||||
|
+
|
||||||
|
+ // ExecCPUAffinity is CPU affinity for a non-init process to be run in the container.
|
||||||
|
+ ExecCPUAffinity *CPUAffinity `json:"exec_cpu_affinity,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scheduler is based on the Linux sched_setattr(2) syscall.
|
||||||
|
@@ -294,6 +300,72 @@ var IOPrioClassMapping = map[specs.IOPriorityClass]int{
|
||||||
|
|
||||||
|
type IOPriority = specs.LinuxIOPriority
|
||||||
|
|
||||||
|
+type CPUAffinity struct {
|
||||||
|
+ Initial, Final *unix.CPUSet
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func toCPUSet(str string) (*unix.CPUSet, error) {
|
||||||
|
+ if str == "" {
|
||||||
|
+ return nil, nil
|
||||||
|
+ }
|
||||||
|
+ s := new(unix.CPUSet)
|
||||||
|
+ for _, r := range strings.Split(str, ",") {
|
||||||
|
+ // Allow extra spaces around.
|
||||||
|
+ r = strings.TrimSpace(r)
|
||||||
|
+ // Allow empty elements (extra commas).
|
||||||
|
+ if r == "" {
|
||||||
|
+ continue
|
||||||
|
+ }
|
||||||
|
+ if r0, r1, found := strings.Cut(r, "-"); found {
|
||||||
|
+ start, err := strconv.ParseUint(r0, 10, 32)
|
||||||
|
+ if err != nil {
|
||||||
|
+ return nil, err
|
||||||
|
+ }
|
||||||
|
+ end, err := strconv.ParseUint(r1, 10, 32)
|
||||||
|
+ if err != nil {
|
||||||
|
+ return nil, err
|
||||||
|
+ }
|
||||||
|
+ if start > end {
|
||||||
|
+ return nil, errors.New("invalid range: " + r)
|
||||||
|
+ }
|
||||||
|
+ for i := int(start); i <= int(end); i++ {
|
||||||
|
+ s.Set(i)
|
||||||
|
+ }
|
||||||
|
+ } else {
|
||||||
|
+ val, err := strconv.ParseUint(r, 10, 32)
|
||||||
|
+ if err != nil {
|
||||||
|
+ return nil, err
|
||||||
|
+ }
|
||||||
|
+ s.Set(int(val))
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return s, nil
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+// ConvertCPUAffinity converts [specs.CPUAffinity] to [CPUAffinity].
|
||||||
|
+func ConvertCPUAffinity(sa *specs.CPUAffinity) (*CPUAffinity, error) {
|
||||||
|
+ if sa == nil {
|
||||||
|
+ return nil, nil
|
||||||
|
+ }
|
||||||
|
+ initial, err := toCPUSet(sa.Initial)
|
||||||
|
+ if err != nil {
|
||||||
|
+ return nil, fmt.Errorf("bad CPUAffinity.Initial: %w", err)
|
||||||
|
+ }
|
||||||
|
+ final, err := toCPUSet(sa.Final)
|
||||||
|
+ if err != nil {
|
||||||
|
+ return nil, fmt.Errorf("bad CPUAffinity.Final: %w", err)
|
||||||
|
+ }
|
||||||
|
+ if initial == nil && final == nil {
|
||||||
|
+ return nil, nil
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return &CPUAffinity{
|
||||||
|
+ Initial: initial,
|
||||||
|
+ Final: final,
|
||||||
|
+ }, nil
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
type (
|
||||||
|
HookName string
|
||||||
|
HookList []Hook
|
||||||
|
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
|
||||||
|
index c0211617..1fc590a5 100644
|
||||||
|
--- a/libcontainer/container_linux.go
|
||||||
|
+++ b/libcontainer/container_linux.go
|
||||||
|
@@ -692,6 +692,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
|
||||||
|
AppArmorProfile: c.config.AppArmorProfile,
|
||||||
|
ProcessLabel: c.config.ProcessLabel,
|
||||||
|
Rlimits: c.config.Rlimits,
|
||||||
|
+ CPUAffinity: c.config.ExecCPUAffinity,
|
||||||
|
CreateConsole: process.ConsoleSocket != nil,
|
||||||
|
ConsoleWidth: process.ConsoleWidth,
|
||||||
|
ConsoleHeight: process.ConsoleHeight,
|
||||||
|
@@ -708,6 +709,9 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
|
||||||
|
if len(process.Rlimits) > 0 {
|
||||||
|
cfg.Rlimits = process.Rlimits
|
||||||
|
}
|
||||||
|
+ if process.CPUAffinity != nil {
|
||||||
|
+ cfg.CPUAffinity = process.CPUAffinity
|
||||||
|
+ }
|
||||||
|
if cgroups.IsCgroup2UnifiedMode() {
|
||||||
|
cfg.Cgroup2Path = c.cgroupManager.Path("")
|
||||||
|
}
|
||||||
|
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
|
||||||
|
index 1eb0279d..eddbfba6 100644
|
||||||
|
--- a/libcontainer/init_linux.go
|
||||||
|
+++ b/libcontainer/init_linux.go
|
||||||
|
@@ -72,6 +72,7 @@ type initConfig struct {
|
||||||
|
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
|
||||||
|
SpecState *specs.State `json:"spec_state,omitempty"`
|
||||||
|
Cgroup2Path string `json:"cgroup2_path,omitempty"`
|
||||||
|
+ CPUAffinity *configs.CPUAffinity `json:"cpu_affinity,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// Init is part of "runc init" implementation.
|
||||||
|
@@ -151,7 +152,7 @@ func startInitialization() (retErr error) {
|
||||||
|
|
||||||
|
logrus.SetOutput(logPipe)
|
||||||
|
logrus.SetFormatter(new(logrus.JSONFormatter))
|
||||||
|
- logrus.Debug("child process in init()")
|
||||||
|
+ logrus.Debugf("child process in init()")
|
||||||
|
|
||||||
|
// Only init processes have FIFOFD.
|
||||||
|
var fifoFile *os.File
|
||||||
|
diff --git a/libcontainer/nsenter/log.c b/libcontainer/nsenter/log.c
|
||||||
|
index 086b5398..72774cb0 100644
|
||||||
|
--- a/libcontainer/nsenter/log.c
|
||||||
|
+++ b/libcontainer/nsenter/log.c
|
||||||
|
@@ -31,6 +31,11 @@ void setup_logpipe(void)
|
||||||
|
loglevel = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
+bool log_enabled_for(int level)
|
||||||
|
+{
|
||||||
|
+ return (logfd >= 0 && level <= loglevel);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
/* Defined in nsexec.c */
|
||||||
|
extern int current_stage;
|
||||||
|
|
||||||
|
@@ -40,8 +45,8 @@ void write_log(int level, const char *format, ...)
|
||||||
|
va_list args;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
- if (logfd < 0 || level > loglevel)
|
||||||
|
- goto out;
|
||||||
|
+ if (!log_enabled_for(level))
|
||||||
|
+ return;
|
||||||
|
|
||||||
|
va_start(args, format);
|
||||||
|
ret = vasprintf(&message, format, args);
|
||||||
|
diff --git a/libcontainer/nsenter/log.h b/libcontainer/nsenter/log.h
|
||||||
|
index 1fe95a11..3e18de68 100644
|
||||||
|
--- a/libcontainer/nsenter/log.h
|
||||||
|
+++ b/libcontainer/nsenter/log.h
|
||||||
|
@@ -1,6 +1,7 @@
|
||||||
|
#ifndef NSENTER_LOG_H
|
||||||
|
#define NSENTER_LOG_H
|
||||||
|
|
||||||
|
+#include <stdbool.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
@@ -20,6 +21,8 @@
|
||||||
|
*/
|
||||||
|
void setup_logpipe(void);
|
||||||
|
|
||||||
|
+bool log_enabled_for(int level);
|
||||||
|
+
|
||||||
|
void write_log(int level, const char *format, ...) __attribute__((format(printf, 2, 3)));
|
||||||
|
|
||||||
|
extern int logfd;
|
||||||
|
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
|
||||||
|
index 565b2ca2..aa4976d6 100644
|
||||||
|
--- a/libcontainer/nsenter/nsexec.c
|
||||||
|
+++ b/libcontainer/nsenter/nsexec.c
|
||||||
|
@@ -558,6 +558,25 @@ static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
|
||||||
|
bail("failed to update /proc/%d/timens_offsets", pid);
|
||||||
|
}
|
||||||
|
|
||||||
|
+void print_cpu_affinity()
|
||||||
|
+{
|
||||||
|
+ cpu_set_t cpus = { };
|
||||||
|
+ size_t i, mask = 0;
|
||||||
|
+
|
||||||
|
+ if (sched_getaffinity(0, sizeof(cpus), &cpus) < 0) {
|
||||||
|
+ write_log(WARNING, "sched_getaffinity: %m");
|
||||||
|
+ return;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* Do not print the complete mask, we only need a few first CPUs. */
|
||||||
|
+ for (i = 0; i < sizeof(mask) * 8; i++) {
|
||||||
|
+ if (CPU_ISSET(i, &cpus))
|
||||||
|
+ mask |= 1 << i;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ write_log(DEBUG, "affinity: 0x%zx", mask);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
void nsexec(void)
|
||||||
|
{
|
||||||
|
int pipenum;
|
||||||
|
@@ -584,6 +603,16 @@ void nsexec(void)
|
||||||
|
|
||||||
|
write_log(DEBUG, "=> nsexec container setup");
|
||||||
|
|
||||||
|
+ /* This is for ../../tests/integration/cpu_affinity.bats test only.
|
||||||
|
+ *
|
||||||
|
+ * Printing this from Go code might be too late as some kernels
|
||||||
|
+ * change the process' CPU affinity to that of container's cpuset
|
||||||
|
+ * as soon as the process is moved into container's cgroup.
|
||||||
|
+ */
|
||||||
|
+ if (log_enabled_for(DEBUG)) {
|
||||||
|
+ print_cpu_affinity();
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
/* Parse all of the netlink configuration. */
|
||||||
|
nl_parse(pipenum, &config);
|
||||||
|
|
||||||
|
diff --git a/libcontainer/process.go b/libcontainer/process.go
|
||||||
|
index 114b3f2b..5339583f 100644
|
||||||
|
--- a/libcontainer/process.go
|
||||||
|
+++ b/libcontainer/process.go
|
||||||
|
@@ -102,6 +102,8 @@ type Process struct {
|
||||||
|
Scheduler *configs.Scheduler
|
||||||
|
|
||||||
|
IOPriority *configs.IOPriority
|
||||||
|
+
|
||||||
|
+ CPUAffinity *configs.CPUAffinity
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait waits for the process to exit.
|
||||||
|
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
|
||||||
|
index fcbb54a3..477c8a77 100644
|
||||||
|
--- a/libcontainer/process_linux.go
|
||||||
|
+++ b/libcontainer/process_linux.go
|
||||||
|
@@ -122,6 +122,46 @@ func (p *setnsProcess) signal(sig os.Signal) error {
|
||||||
|
return unix.Kill(p.pid(), s)
|
||||||
|
}
|
||||||
|
|
||||||
|
+// Starts setns process with specified initial CPU affinity.
|
||||||
|
+func (p *setnsProcess) startWithCPUAffinity() error {
|
||||||
|
+ aff := p.config.CPUAffinity
|
||||||
|
+ if aff == nil || aff.Initial == nil {
|
||||||
|
+ return p.cmd.Start()
|
||||||
|
+ }
|
||||||
|
+ errCh := make(chan error)
|
||||||
|
+ defer close(errCh)
|
||||||
|
+
|
||||||
|
+ // Use a goroutine to dedicate an OS thread.
|
||||||
|
+ go func() {
|
||||||
|
+ runtime.LockOSThread()
|
||||||
|
+ // Command inherits the CPU affinity.
|
||||||
|
+ if err := unix.SchedSetaffinity(unix.Gettid(), aff.Initial); err != nil {
|
||||||
|
+ runtime.UnlockOSThread()
|
||||||
|
+ errCh <- fmt.Errorf("error setting initial CPU affinity: %w", err)
|
||||||
|
+ return
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ errCh <- p.cmd.Start()
|
||||||
|
+ // Deliberately omit runtime.UnlockOSThread here.
|
||||||
|
+ // https://pkg.go.dev/runtime#LockOSThread says:
|
||||||
|
+ // "If the calling goroutine exits without unlocking the
|
||||||
|
+ // thread, the thread will be terminated".
|
||||||
|
+ }()
|
||||||
|
+
|
||||||
|
+ return <-errCh
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (p *setnsProcess) setFinalCPUAffinity() error {
|
||||||
|
+ aff := p.config.CPUAffinity
|
||||||
|
+ if aff == nil || aff.Final == nil {
|
||||||
|
+ return nil
|
||||||
|
+ }
|
||||||
|
+ if err := unix.SchedSetaffinity(p.pid(), aff.Final); err != nil {
|
||||||
|
+ return fmt.Errorf("error setting final CPU affinity: %w", err)
|
||||||
|
+ }
|
||||||
|
+ return nil
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
func (p *setnsProcess) start() (retErr error) {
|
||||||
|
defer p.comm.closeParent()
|
||||||
|
|
||||||
|
@@ -133,8 +173,8 @@ func (p *setnsProcess) start() (retErr error) {
|
||||||
|
|
||||||
|
// get the "before" value of oom kill count
|
||||||
|
oom, _ := p.manager.OOMKillCount()
|
||||||
|
- err := p.cmd.Start()
|
||||||
|
- // close the child-side of the pipes (controlled by child)
|
||||||
|
+ err := p.startWithCPUAffinity()
|
||||||
|
+ // Close the child-side of the pipes (controlled by child).
|
||||||
|
p.comm.closeChild()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("error starting setns process: %w", err)
|
||||||
|
@@ -184,6 +224,10 @@ func (p *setnsProcess) start() (retErr error) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
+ // Set final CPU affinity right after the process is moved into container's cgroup.
|
||||||
|
+ if err := p.setFinalCPUAffinity(); err != nil {
|
||||||
|
+ return err
|
||||||
|
+ }
|
||||||
|
if p.intelRdtPath != "" {
|
||||||
|
// if Intel RDT "resource control" filesystem path exists
|
||||||
|
_, err := os.Stat(p.intelRdtPath)
|
||||||
|
@@ -193,7 +237,6 @@ func (p *setnsProcess) start() (retErr error) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
-
|
||||||
|
if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
|
||||||
|
return fmt.Errorf("error writing config to pipe: %w", err)
|
||||||
|
}
|
||||||
|
diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
|
||||||
|
index 95ada499..2d0db342 100644
|
||||||
|
--- a/libcontainer/specconv/spec_linux.go
|
||||||
|
+++ b/libcontainer/specconv/spec_linux.go
|
||||||
|
@@ -556,6 +556,11 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
|
||||||
|
ioPriority := *spec.Process.IOPriority
|
||||||
|
config.IOPriority = &ioPriority
|
||||||
|
}
|
||||||
|
+ config.ExecCPUAffinity, err = configs.ConvertCPUAffinity(spec.Process.ExecCPUAffinity)
|
||||||
|
+ if err != nil {
|
||||||
|
+ return nil, err
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
}
|
||||||
|
createHooks(spec, config)
|
||||||
|
config.Version = specs.Version
|
||||||
|
diff --git a/tests/integration/cpu_affinity.bats b/tests/integration/cpu_affinity.bats
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..f6adfa2a
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/tests/integration/cpu_affinity.bats
|
||||||
|
@@ -0,0 +1,101 @@
|
||||||
|
+#!/usr/bin/env bats
|
||||||
|
+# Exec CPU affinity tests. For more details, see:
|
||||||
|
+# - https://github.com/opencontainers/runtime-spec/pull/1253
|
||||||
|
+
|
||||||
|
+load helpers
|
||||||
|
+
|
||||||
|
+function setup() {
|
||||||
|
+ requires smp cgroups_cpuset
|
||||||
|
+ setup_busybox
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+function teardown() {
|
||||||
|
+ teardown_bundle
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+function first_cpu() {
|
||||||
|
+ sed 's/[-,].*//g' </sys/devices/system/cpu/online
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+# Convert list of cpus ("0,1" or "0-1") to mask as printed by nsexec.
|
||||||
|
+# NOTE the range conversion is not proper, merely sufficient for tests here.
|
||||||
|
+function cpus_to_mask() {
|
||||||
|
+ local cpus=$* mask=0
|
||||||
|
+
|
||||||
|
+ cpus=${cpus//,/-} # 1. "," --> "-".
|
||||||
|
+ cpus=${cpus//-/ } # 2. "-" --> " ".
|
||||||
|
+
|
||||||
|
+ for c in $cpus; do
|
||||||
|
+ mask=$((mask | 1 << c))
|
||||||
|
+ done
|
||||||
|
+
|
||||||
|
+ printf "0x%x" $mask
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+@test "runc exec [CPU affinity, only initial set from process.json]" {
|
||||||
|
+ first="$(first_cpu)"
|
||||||
|
+ second=$((first + 1)) # Hacky; might not work in all environments.
|
||||||
|
+
|
||||||
|
+ runc run -d --console-socket "$CONSOLE_SOCKET" ct1
|
||||||
|
+ [ "$status" -eq 0 ]
|
||||||
|
+
|
||||||
|
+ for cpus in "$second" "$first-$second" "$first,$second" "$first"; do
|
||||||
|
+ proc='
|
||||||
|
+{
|
||||||
|
+ "terminal": false,
|
||||||
|
+ "execCPUAffinity": {
|
||||||
|
+ "initial": "'$cpus'"
|
||||||
|
+ },
|
||||||
|
+ "args": [ "/bin/true" ],
|
||||||
|
+ "cwd": "/"
|
||||||
|
+}'
|
||||||
|
+ mask=$(cpus_to_mask "$cpus")
|
||||||
|
+ echo "CPUS: $cpus, mask: $mask"
|
||||||
|
+ runc --debug exec --process <(echo "$proc") ct1
|
||||||
|
+ [[ "$output" == *"nsexec"*": affinity: $mask"* ]]
|
||||||
|
+ done
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+@test "runc exec [CPU affinity, initial and final set from process.json]" {
|
||||||
|
+ first="$(first_cpu)"
|
||||||
|
+ second=$((first + 1)) # Hacky; might not work in all environments.
|
||||||
|
+
|
||||||
|
+ runc run -d --console-socket "$CONSOLE_SOCKET" ct1
|
||||||
|
+ [ "$status" -eq 0 ]
|
||||||
|
+
|
||||||
|
+ for cpus in "$second" "$first-$second" "$first,$second" "$first"; do
|
||||||
|
+ proc='
|
||||||
|
+{
|
||||||
|
+ "terminal": false,
|
||||||
|
+ "execCPUAffinity": {
|
||||||
|
+ "initial": "'$cpus'",
|
||||||
|
+ "final": "'$cpus'"
|
||||||
|
+ },
|
||||||
|
+ "args": [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ],
|
||||||
|
+ "cwd": "/"
|
||||||
|
+}'
|
||||||
|
+ mask=$(cpus_to_mask "$cpus")
|
||||||
|
+ exp=${cpus//,/-} # "," --> "-".
|
||||||
|
+ echo "CPUS: $cpus, mask: $mask, final: $exp"
|
||||||
|
+ runc --debug exec --process <(echo "$proc") ct1
|
||||||
|
+ [[ "$output" == *"nsexec"*": affinity: $mask"* ]]
|
||||||
|
+ [[ "$output" == *"Cpus_allowed_list: $exp"* ]] # Mind the literal tab.
|
||||||
|
+ done
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+@test "runc exec [CPU affinity, initial and final set from config.json]" {
|
||||||
|
+ initial="$(first_cpu)"
|
||||||
|
+ final=$((initial + 1)) # Hacky; might not work in all environments.
|
||||||
|
+
|
||||||
|
+ update_config " .process.execCPUAffinity.initial = \"$initial\"
|
||||||
|
+ | .process.execCPUAffinity.final = \"$final\""
|
||||||
|
+
|
||||||
|
+ runc run -d --console-socket "$CONSOLE_SOCKET" ct1
|
||||||
|
+ [ "$status" -eq 0 ]
|
||||||
|
+
|
||||||
|
+ runc --debug exec ct1 grep "Cpus_allowed_list:" /proc/self/status
|
||||||
|
+ [ "$status" -eq 0 ]
|
||||||
|
+ mask=$(cpus_to_mask "$initial")
|
||||||
|
+ [[ "$output" == *"nsexec"*": affinity: $mask"* ]]
|
||||||
|
+ [[ "$output" == *"Cpus_allowed_list: $final"* ]] # Mind the literal tab.
|
||||||
|
+}
|
||||||
|
diff --git a/utils_linux.go b/utils_linux.go
|
||||||
|
index feb6ef80..013dbcf4 100644
|
||||||
|
--- a/utils_linux.go
|
||||||
|
+++ b/utils_linux.go
|
||||||
|
@@ -90,6 +90,12 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) {
|
||||||
|
}
|
||||||
|
lp.Rlimits = append(lp.Rlimits, rl)
|
||||||
|
}
|
||||||
|
+ aff, err := configs.ConvertCPUAffinity(p.ExecCPUAffinity)
|
||||||
|
+ if err != nil {
|
||||||
|
+ return nil, err
|
||||||
|
+ }
|
||||||
|
+ lp.CPUAffinity = aff
|
||||||
|
+
|
||||||
|
return lp, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
--
|
||||||
|
2.47.1
|
||||||
|
|
@ -1,200 +0,0 @@
|
|||||||
From ecf53c23545092019602578583031c28fde4d2a1 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Giuseppe Scrivano <gscrivan@redhat.com>
|
|
||||||
Date: Fri, 25 May 2018 18:04:06 +0200
|
|
||||||
Subject: [PATCH] sd-notify: do not hang when NOTIFY_SOCKET is used with create
|
|
||||||
|
|
||||||
if NOTIFY_SOCKET is used, do not block the main runc process waiting
|
|
||||||
for events on the notify socket. Change the logic to create a new
|
|
||||||
process that monitors exclusively the notify socket until an event is
|
|
||||||
received.
|
|
||||||
|
|
||||||
Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
|
|
||||||
---
|
|
||||||
init.go | 12 +++++++
|
|
||||||
notify_socket.go | 101 ++++++++++++++++++++++++++++++++++++++++++++++---------
|
|
||||||
signals.go | 5 +--
|
|
||||||
3 files changed, 99 insertions(+), 19 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/init.go b/init.go
|
|
||||||
index c8f453192..6a3d9e91c 100644
|
|
||||||
--- a/init.go
|
|
||||||
+++ b/init.go
|
|
||||||
@@ -20,6 +20,18 @@ var initCommand = cli.Command{
|
|
||||||
Name: "init",
|
|
||||||
Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
|
|
||||||
Action: func(context *cli.Context) error {
|
|
||||||
+ // If NOTIFY_SOCKET is used create a new process that stays around
|
|
||||||
+ // so as not to block "runc start". It will automatically exit when the
|
|
||||||
+ // container notifies that it is ready, or when the container is deleted
|
|
||||||
+ if os.Getenv("_NOTIFY_SOCKET_FD") != "" {
|
|
||||||
+ fd := os.Getenv("_NOTIFY_SOCKET_FD")
|
|
||||||
+ pid := os.Getenv("_NOTIFY_SOCKET_PID")
|
|
||||||
+ hostNotifySocket := os.Getenv("_NOTIFY_SOCKET_HOST")
|
|
||||||
+ notifySocketPath := os.Getenv("_NOTIFY_SOCKET_PATH")
|
|
||||||
+ notifySocketInit(fd, pid, hostNotifySocket, notifySocketPath)
|
|
||||||
+ os.Exit(0)
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
factory, _ := libcontainer.New("")
|
|
||||||
if err := factory.StartInitialization(); err != nil {
|
|
||||||
// as the error is sent back to the parent there is no need to log
|
|
||||||
diff --git a/notify_socket.go b/notify_socket.go
|
|
||||||
index cd6c0a989..e04e9d660 100644
|
|
||||||
--- a/notify_socket.go
|
|
||||||
+++ b/notify_socket.go
|
|
||||||
@@ -6,10 +6,13 @@ import (
|
|
||||||
"bytes"
|
|
||||||
"fmt"
|
|
||||||
"net"
|
|
||||||
+ "os"
|
|
||||||
+ "os/exec"
|
|
||||||
"path/filepath"
|
|
||||||
+ "strconv"
|
|
||||||
+ "time"
|
|
||||||
|
|
||||||
"github.com/opencontainers/runtime-spec/specs-go"
|
|
||||||
-
|
|
||||||
"github.com/sirupsen/logrus"
|
|
||||||
"github.com/urfave/cli"
|
|
||||||
)
|
|
||||||
@@ -64,24 +67,94 @@ func (s *notifySocket) setupSocket() error {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
+func (notifySocket *notifySocket) notifyNewPid(pid int) {
|
|
||||||
+ notifySocketHostAddr := net.UnixAddr{Name: notifySocket.host, Net: "unixgram"}
|
|
||||||
+ client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
|
|
||||||
+ if err != nil {
|
|
||||||
+ return
|
|
||||||
+ }
|
|
||||||
+ newPid := fmt.Sprintf("MAINPID=%d\n", pid)
|
|
||||||
+ client.Write([]byte(newPid))
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
// pid1 must be set only with -d, as it is used to set the new process as the main process
|
|
||||||
// for the service in systemd
|
|
||||||
func (notifySocket *notifySocket) run(pid1 int) {
|
|
||||||
- buf := make([]byte, 512)
|
|
||||||
- notifySocketHostAddr := net.UnixAddr{Name: notifySocket.host, Net: "unixgram"}
|
|
||||||
- client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
|
|
||||||
+ file, err := notifySocket.socket.File()
|
|
||||||
if err != nil {
|
|
||||||
logrus.Error(err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
- for {
|
|
||||||
- r, err := notifySocket.socket.Read(buf)
|
|
||||||
- if err != nil {
|
|
||||||
- break
|
|
||||||
+ defer file.Close()
|
|
||||||
+ defer notifySocket.socket.Close()
|
|
||||||
+
|
|
||||||
+ cmd := exec.Command("/proc/self/exe", "init")
|
|
||||||
+ cmd.ExtraFiles = []*os.File{file}
|
|
||||||
+ cmd.Env = append(cmd.Env, "_NOTIFY_SOCKET_FD=3",
|
|
||||||
+ fmt.Sprintf("_NOTIFY_SOCKET_PID=%d", pid1),
|
|
||||||
+ fmt.Sprintf("_NOTIFY_SOCKET_HOST=%s", notifySocket.host),
|
|
||||||
+ fmt.Sprintf("_NOTIFY_SOCKET_PATH=%s", notifySocket.socketPath))
|
|
||||||
+
|
|
||||||
+ if err := cmd.Start(); err != nil {
|
|
||||||
+ logrus.Fatal(err)
|
|
||||||
+ }
|
|
||||||
+ notifySocket.notifyNewPid(cmd.Process.Pid)
|
|
||||||
+ cmd.Process.Release()
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+func notifySocketInit(envFd string, envPid string, notifySocketHost string, notifySocketPath string) {
|
|
||||||
+ intFd, err := strconv.Atoi(envFd)
|
|
||||||
+ if err != nil {
|
|
||||||
+ return
|
|
||||||
+ }
|
|
||||||
+ pid1, err := strconv.Atoi(envPid)
|
|
||||||
+ if err != nil {
|
|
||||||
+ return
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ file := os.NewFile(uintptr(intFd), "unixgram")
|
|
||||||
+ defer file.Close()
|
|
||||||
+
|
|
||||||
+ fileChan := make(chan []byte)
|
|
||||||
+ exitChan := make(chan bool)
|
|
||||||
+
|
|
||||||
+ go func() {
|
|
||||||
+ for {
|
|
||||||
+ buf := make([]byte, 512)
|
|
||||||
+ r, err := file.Read(buf)
|
|
||||||
+ if err != nil {
|
|
||||||
+ return
|
|
||||||
+ }
|
|
||||||
+ fileChan <- buf[0:r]
|
|
||||||
}
|
|
||||||
- var out bytes.Buffer
|
|
||||||
- for _, line := range bytes.Split(buf[0:r], []byte{'\n'}) {
|
|
||||||
- if bytes.HasPrefix(line, []byte("READY=")) {
|
|
||||||
+ }()
|
|
||||||
+ go func() {
|
|
||||||
+ for {
|
|
||||||
+ if _, err := os.Stat(notifySocketPath); os.IsNotExist(err) {
|
|
||||||
+ exitChan <- true
|
|
||||||
+ return
|
|
||||||
+ }
|
|
||||||
+ time.Sleep(time.Second)
|
|
||||||
+ }
|
|
||||||
+ }()
|
|
||||||
+
|
|
||||||
+ notifySocketHostAddr := net.UnixAddr{Name: notifySocketHost, Net: "unixgram"}
|
|
||||||
+ client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
|
|
||||||
+ if err != nil {
|
|
||||||
+ return
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ for {
|
|
||||||
+ select {
|
|
||||||
+ case <-exitChan:
|
|
||||||
+ return
|
|
||||||
+ case b := <-fileChan:
|
|
||||||
+ for _, line := range bytes.Split(b, []byte{'\n'}) {
|
|
||||||
+ if !bytes.HasPrefix(line, []byte("READY=")) {
|
|
||||||
+ continue
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ var out bytes.Buffer
|
|
||||||
_, err = out.Write(line)
|
|
||||||
if err != nil {
|
|
||||||
return
|
|
||||||
@@ -98,10 +171,8 @@ func (notifySocket *notifySocket) run(pid1 int) {
|
|
||||||
}
|
|
||||||
|
|
||||||
// now we can inform systemd to use pid1 as the pid to monitor
|
|
||||||
- if pid1 > 0 {
|
|
||||||
- newPid := fmt.Sprintf("MAINPID=%d\n", pid1)
|
|
||||||
- client.Write([]byte(newPid))
|
|
||||||
- }
|
|
||||||
+ newPid := fmt.Sprintf("MAINPID=%d\n", pid1)
|
|
||||||
+ client.Write([]byte(newPid))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
diff --git a/signals.go b/signals.go
|
|
||||||
index 1811de837..d0988cb39 100644
|
|
||||||
--- a/signals.go
|
|
||||||
+++ b/signals.go
|
|
||||||
@@ -70,7 +70,7 @@ func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach
|
|
||||||
h.notifySocket.run(pid1)
|
|
||||||
return 0, nil
|
|
||||||
} else {
|
|
||||||
- go h.notifySocket.run(0)
|
|
||||||
+ h.notifySocket.run(os.Getpid())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -98,9 +98,6 @@ func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach
|
|
||||||
// status because we must ensure that any of the go specific process
|
|
||||||
// fun such as flushing pipes are complete before we return.
|
|
||||||
process.Wait()
|
|
||||||
- if h.notifySocket != nil {
|
|
||||||
- h.notifySocket.Close()
|
|
||||||
- }
|
|
||||||
return e.status, nil
|
|
||||||
}
|
|
||||||
}
|
|
@ -1 +0,0 @@
|
|||||||
fs.may_detach_mounts=1
|
|
@ -1,61 +0,0 @@
|
|||||||
diff --git a/list.go b/list.go
|
|
||||||
index 0313d8c..328798b 100644
|
|
||||||
--- a/list.go
|
|
||||||
+++ b/list.go
|
|
||||||
@@ -50,7 +50,7 @@ var listCommand = cli.Command{
|
|
||||||
ArgsUsage: `
|
|
||||||
|
|
||||||
Where the given root is specified via the global option "--root"
|
|
||||||
-(default: "/run/runc").
|
|
||||||
+(default: "/run/runc-ctrs").
|
|
||||||
|
|
||||||
EXAMPLE 1:
|
|
||||||
To list containers created via the default "--root":
|
|
||||||
diff --git a/main.go b/main.go
|
|
||||||
index 278399a..0f49fce 100644
|
|
||||||
--- a/main.go
|
|
||||||
+++ b/main.go
|
|
||||||
@@ -62,7 +62,7 @@ func main() {
|
|
||||||
v = append(v, fmt.Sprintf("spec: %s", specs.Version))
|
|
||||||
app.Version = strings.Join(v, "\n")
|
|
||||||
|
|
||||||
- root := "/run/runc"
|
|
||||||
+ root := "/run/runc-ctrs"
|
|
||||||
rootless, err := isRootless(nil)
|
|
||||||
if err != nil {
|
|
||||||
fatal(err)
|
|
||||||
@@ -70,7 +70,7 @@ func main() {
|
|
||||||
if rootless {
|
|
||||||
runtimeDir := os.Getenv("XDG_RUNTIME_DIR")
|
|
||||||
if runtimeDir != "" {
|
|
||||||
- root = runtimeDir + "/runc"
|
|
||||||
+ root = runtimeDir + "/runc-ctrs"
|
|
||||||
// According to the XDG specification, we need to set anything in
|
|
||||||
// XDG_RUNTIME_DIR to have a sticky bit if we don't want it to get
|
|
||||||
// auto-pruned.
|
|
||||||
diff --git a/man/runc-list.8.md b/man/runc-list.8.md
|
|
||||||
index f737424..107220e 100644
|
|
||||||
--- a/man/runc-list.8.md
|
|
||||||
+++ b/man/runc-list.8.md
|
|
||||||
@@ -6,7 +6,7 @@
|
|
||||||
|
|
||||||
# EXAMPLE
|
|
||||||
Where the given root is specified via the global option "--root"
|
|
||||||
-(default: "/run/runc").
|
|
||||||
+(default: "/run/runc-ctrs").
|
|
||||||
|
|
||||||
To list containers created via the default "--root":
|
|
||||||
# runc list
|
|
||||||
diff --git a/man/runc.8.md b/man/runc.8.md
|
|
||||||
index 6d0ddff..337bc73 100644
|
|
||||||
--- a/man/runc.8.md
|
|
||||||
+++ b/man/runc.8.md
|
|
||||||
@@ -51,7 +51,7 @@ value for "bundle" is the current directory.
|
|
||||||
--debug enable debug output for logging
|
|
||||||
--log value set the log file path where internal debug information is written (default: "/dev/null")
|
|
||||||
--log-format value set the format used by logs ('text' (default), or 'json') (default: "text")
|
|
||||||
- --root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc" or $XDG_RUNTIME_DIR/runc for rootless containers)
|
|
||||||
+ --root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc-ctrs" or $XDG_RUNTIME_DIR/runc-ctrs for rootless containers)
|
|
||||||
--criu value path to the criu binary used for checkpoint and restore (default: "criu")
|
|
||||||
--systemd-cgroup enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234"
|
|
||||||
--rootless value enable rootless mode ('true', 'false', or 'auto') (default: "auto")
|
|
@ -1,72 +0,0 @@
|
|||||||
From 28a697cce3e4f905dca700eda81d681a30eef9cd Mon Sep 17 00:00:00 2001
|
|
||||||
From: Giuseppe Scrivano <gscrivan@redhat.com>
|
|
||||||
Date: Fri, 11 Jan 2019 21:53:45 +0100
|
|
||||||
Subject: [PATCH] rootfs: umount all procfs and sysfs with --no-pivot
|
|
||||||
|
|
||||||
When creating a new user namespace, the kernel doesn't allow to mount
|
|
||||||
a new procfs or sysfs file system if there is not already one instance
|
|
||||||
fully visible in the current mount namespace.
|
|
||||||
|
|
||||||
When using --no-pivot we were effectively inhibiting this protection
|
|
||||||
from the kernel, as /proc and /sys from the host are still present in
|
|
||||||
the container mount namespace.
|
|
||||||
|
|
||||||
A container without full access to /proc could then create a new user
|
|
||||||
namespace, and from there be able to mount a fully visible /proc, bypassing
|
|
||||||
the limitations in the container.
|
|
||||||
|
|
||||||
A simple reproducer for this issue is:
|
|
||||||
|
|
||||||
unshare -mrfp sh -c "mount -t proc none /proc && echo c > /proc/sysrq-trigger"
|
|
||||||
|
|
||||||
Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
|
|
||||||
---
|
|
||||||
libcontainer/rootfs_linux.go | 35 +++++++++++++++++++++++++++++++++++
|
|
||||||
1 file changed, 35 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
|
|
||||||
index e7c2f8ada..6bd6da74a 100644
|
|
||||||
--- a/libcontainer/rootfs_linux.go
|
|
||||||
+++ b/libcontainer/rootfs_linux.go
|
|
||||||
@@ -748,6 +748,41 @@ func pivotRoot(rootfs string) error {
|
|
||||||
}
|
|
||||||
|
|
||||||
func msMoveRoot(rootfs string) error {
|
|
||||||
+ mountinfos, err := mount.GetMounts()
|
|
||||||
+ if err != nil {
|
|
||||||
+ return err
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ absRootfs, err := filepath.Abs(rootfs)
|
|
||||||
+ if err != nil {
|
|
||||||
+ return err
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ for _, info := range mountinfos {
|
|
||||||
+ p, err := filepath.Abs(info.Mountpoint)
|
|
||||||
+ if err != nil {
|
|
||||||
+ return err
|
|
||||||
+ }
|
|
||||||
+ // Umount every sysfs and proc file system, except those under the container rootfs
|
|
||||||
+ if (info.Fstype != "proc" && info.Fstype != "sysfs") || filepath.HasPrefix(p, absRootfs) {
|
|
||||||
+ continue
|
|
||||||
+ }
|
|
||||||
+ // Be sure umount events are not propagated to the host.
|
|
||||||
+ if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
|
|
||||||
+ return err
|
|
||||||
+ }
|
|
||||||
+ if err := unix.Unmount(p, unix.MNT_DETACH); err != nil {
|
|
||||||
+ if err != unix.EINVAL && err != unix.EPERM {
|
|
||||||
+ return err
|
|
||||||
+ } else {
|
|
||||||
+ // If we do not have privileges for umounting (e.g. rootless), then
|
|
||||||
+ // cover the path.
|
|
||||||
+ if err := unix.Mount("tmpfs", p, "tmpfs", 0, ""); err != nil {
|
|
||||||
+ return err
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
1389
SPECS/runc.spec
1389
SPECS/runc.spec
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user