Compare commits

...

No commits in common. "c8-stream-1.0" and "c9" have entirely different histories.

12 changed files with 15443 additions and 839 deletions

2
.gitignore vendored
View File

@ -1 +1 @@
SOURCES/runc-2abd837.tar.gz
SOURCES/v1.3.0.tar.gz

View File

@ -1 +1 @@
cf7119a838db2963e7af6ecdba90a2cc95ec0d56 SOURCES/runc-2abd837.tar.gz
0ea2488912e9ae562782f5980971f7fb0d73df38 SOURCES/v1.3.0.tar.gz

View File

@ -0,0 +1,416 @@
From 2df42d4db6bc57ee914fa9cc4455ad3b8daff1d9 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 1 Nov 2025 17:21:36 +1100
Subject: [PATCH 1/2] [1.3] openat2: improve resilience on busy systems
Previously, we would see a ~3% failure rate when starting containers
with mounts that contain ".." (which can trigger -EAGAIN). To counteract
this, filepath-securejoin v0.5.1 includes a bump of the internal retry
limit from 32 to 128, which lowers the failure rate to 0.12%.
However, there is still a risk of spurious failure on regular systems.
In order to try to provide more resilience (while avoiding DoS attacks),
this patch also includes an additional retry loop that terminates based
on a deadline rather than retry count. The deadline is 2ms, as my
testing found that ~800us for a single pathrs operation was the longest
latency due to -EAGAIN retries, and that was an outlier compared to the
more common ~400us latencies -- so 2ms should be more than enough for
any real system.
The failure rates above were based on more 50k runs of runc with an
attack script (from libpathrs) running a rename attack on all cores of a
16-core system, which is arguably a worst-case but heavily utilised
servers could likely approach similar results.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
go.mod | 2 +-
go.sum | 4 +-
internal/pathrs/mkdirall_pathrslite.go | 4 +-
internal/pathrs/procfs_pathrslite.go | 22 ++++---
internal/pathrs/retry.go | 66 +++++++++++++++++++
internal/pathrs/root_pathrslite.go | 7 +-
.../cyphar/filepath-securejoin/CHANGELOG.md | 34 +++++++++-
.../cyphar/filepath-securejoin/VERSION | 2 +-
.../internal/{errors.go => errors_linux.go} | 15 ++++-
.../pathrs-lite/internal/fd/openat2_linux.go | 12 ++--
vendor/modules.txt | 2 +-
11 files changed, 144 insertions(+), 26 deletions(-)
create mode 100644 internal/pathrs/retry.go
rename vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/{errors.go => errors_linux.go} (70%)
diff --git a/go.mod b/go.mod
index f2deafc3..a551a4ec 100644
--- a/go.mod
+++ b/go.mod
@@ -6,7 +6,7 @@ require (
github.com/checkpoint-restore/go-criu/v6 v6.3.0
github.com/containerd/console v1.0.5
github.com/coreos/go-systemd/v22 v22.5.0
- github.com/cyphar/filepath-securejoin v0.5.0
+ github.com/cyphar/filepath-securejoin v0.5.1
github.com/docker/go-units v0.5.0
github.com/godbus/dbus/v5 v5.1.0
github.com/moby/sys/capability v0.4.0
diff --git a/go.sum b/go.sum
index ba395bf0..fb357b43 100644
--- a/go.sum
+++ b/go.sum
@@ -10,8 +10,8 @@ github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSV
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc=
github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
-github.com/cyphar/filepath-securejoin v0.5.0 h1:hIAhkRBMQ8nIeuVwcAoymp7MY4oherZdAxD+m0u9zaw=
-github.com/cyphar/filepath-securejoin v0.5.0/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
+github.com/cyphar/filepath-securejoin v0.5.1 h1:eYgfMq5yryL4fbWfkLpFFy2ukSELzaJOTaUTuh+oF48=
+github.com/cyphar/filepath-securejoin v0.5.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
diff --git a/internal/pathrs/mkdirall_pathrslite.go b/internal/pathrs/mkdirall_pathrslite.go
index fb4f7842..a9a0157c 100644
--- a/internal/pathrs/mkdirall_pathrslite.go
+++ b/internal/pathrs/mkdirall_pathrslite.go
@@ -83,7 +83,9 @@ func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (*os.File, er
}
defer rootDir.Close()
- return pathrs.MkdirAllHandle(rootDir, unsafePath, mode)
+ return retryEAGAIN(func() (*os.File, error) {
+ return pathrs.MkdirAllHandle(rootDir, unsafePath, mode)
+ })
}
// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the
diff --git a/internal/pathrs/procfs_pathrslite.go b/internal/pathrs/procfs_pathrslite.go
index a02b0d39..37450a0e 100644
--- a/internal/pathrs/procfs_pathrslite.go
+++ b/internal/pathrs/procfs_pathrslite.go
@@ -27,13 +27,15 @@ import (
)
func procOpenReopen(openFn func(subpath string) (*os.File, error), subpath string, flags int) (*os.File, error) {
- handle, err := openFn(subpath)
+ handle, err := retryEAGAIN(func() (*os.File, error) {
+ return openFn(subpath)
+ })
if err != nil {
return nil, err
}
defer handle.Close()
- f, err := pathrs.Reopen(handle, flags)
+ f, err := Reopen(handle, flags)
if err != nil {
return nil, fmt.Errorf("reopen %s: %w", handle.Name(), err)
}
@@ -44,7 +46,7 @@ func procOpenReopen(openFn func(subpath string) (*os.File, error), subpath strin
// [pathrs.Reopen], to let you one-shot open a procfs file with the given
// flags.
func ProcSelfOpen(subpath string, flags int) (*os.File, error) {
- proc, err := procfs.OpenProcRoot()
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
if err != nil {
return nil, err
}
@@ -55,7 +57,7 @@ func ProcSelfOpen(subpath string, flags int) (*os.File, error) {
// ProcPidOpen is a wrapper around [procfs.Handle.OpenPid] and [pathrs.Reopen],
// to let you one-shot open a procfs file with the given flags.
func ProcPidOpen(pid int, subpath string, flags int) (*os.File, error) {
- proc, err := procfs.OpenProcRoot()
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
if err != nil {
return nil, err
}
@@ -70,13 +72,15 @@ func ProcPidOpen(pid int, subpath string, flags int) (*os.File, error) {
// flags. The returned [procfs.ProcThreadSelfCloser] needs the same handling as
// when using pathrs-lite.
func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThreadSelfCloser, Err error) {
- proc, err := procfs.OpenProcRoot()
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
if err != nil {
return nil, nil, err
}
defer proc.Close()
- handle, closer, err := proc.OpenThreadSelf(subpath)
+ handle, closer, err := retryEAGAIN2(func() (*os.File, procfs.ProcThreadSelfCloser, error) {
+ return proc.OpenThreadSelf(subpath)
+ })
if err != nil {
return nil, nil, err
}
@@ -89,7 +93,7 @@ func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThr
}
defer handle.Close()
- f, err := pathrs.Reopen(handle, flags)
+ f, err := Reopen(handle, flags)
if err != nil {
return nil, nil, fmt.Errorf("reopen %s: %w", handle.Name(), err)
}
@@ -98,5 +102,7 @@ func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThr
// Reopen is a wrapper around pathrs.Reopen.
func Reopen(file *os.File, flags int) (*os.File, error) {
- return pathrs.Reopen(file, flags)
+ return retryEAGAIN(func() (*os.File, error) {
+ return pathrs.Reopen(file, flags)
+ })
}
diff --git a/internal/pathrs/retry.go b/internal/pathrs/retry.go
new file mode 100644
index 00000000..a51d335c
--- /dev/null
+++ b/internal/pathrs/retry.go
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Copyright (C) 2024-2025 Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2024-2025 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package pathrs
+
+import (
+ "errors"
+ "fmt"
+ "time"
+
+ "golang.org/x/sys/unix"
+)
+
+// Based on >50k tests running "runc run" on a 16-core system with very heavy
+// rename(2) load, the single longest latency caused by -EAGAIN retries was
+// ~800us (with the vast majority being closer to 400us). So, a 2ms limit
+// should give more than enough headroom for any real system in practice.
+const retryDeadline = 2 * time.Millisecond
+
+// retryEAGAIN is a top-level retry loop for pathrs to try to returning
+// spurious errors in most normal user cases when using openat2 (libpathrs
+// itself does up to 128 retries already, but this method takes a
+// wallclock-deadline approach to simply retry until a timer elapses).
+func retryEAGAIN[T any](fn func() (T, error)) (T, error) {
+ deadline := time.After(retryDeadline)
+ for {
+ v, err := fn()
+ if !errors.Is(err, unix.EAGAIN) {
+ return v, err
+ }
+ select {
+ case <-deadline:
+ return *new(T), fmt.Errorf("%v retry deadline exceeded: %w", retryDeadline, err)
+ default:
+ // retry
+ }
+ }
+}
+
+// retryEAGAIN2 is like retryEAGAIN except it returns two values.
+func retryEAGAIN2[T1, T2 any](fn func() (T1, T2, error)) (T1, T2, error) {
+ type ret struct {
+ v1 T1
+ v2 T2
+ }
+ v, err := retryEAGAIN(func() (ret, error) {
+ v1, v2, err := fn()
+ return ret{v1: v1, v2: v2}, err
+ })
+ return v.v1, v.v2, err
+}
diff --git a/internal/pathrs/root_pathrslite.go b/internal/pathrs/root_pathrslite.go
index 0ef81fae..899af270 100644
--- a/internal/pathrs/root_pathrslite.go
+++ b/internal/pathrs/root_pathrslite.go
@@ -31,12 +31,15 @@ import (
// is effectively shorthand for [securejoin.OpenInRoot] followed by
// [securejoin.Reopen].
func OpenInRoot(root, subpath string, flags int) (*os.File, error) {
- handle, err := pathrs.OpenInRoot(root, subpath)
+ handle, err := retryEAGAIN(func() (*os.File, error) {
+ return pathrs.OpenInRoot(root, subpath)
+ })
if err != nil {
return nil, err
}
defer handle.Close()
- return pathrs.Reopen(handle, flags)
+
+ return Reopen(handle, flags)
}
// CreateInRoot creates a new file inside a root (as well as any missing parent
diff --git a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
index 6862467c..3faee0bc 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
+++ b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
@@ -4,7 +4,36 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).
-## [Unreleased] ##
+## [Unreleased 0.5.z] ##
+
+## [0.5.1] - 2025-10-31 ##
+
+> Spooky scary skeletons send shivers down your spine!
+
+### Changed ###
+- `openat2` can return `-EAGAIN` if it detects a possible attack in certain
+ scenarios (namely if there was a rename or mount while walking a path with a
+ `..` component). While this is necessary to avoid a denial-of-service in the
+ kernel, it does require retry loops in userspace.
+
+ In previous versions, `pathrs-lite` would retry `openat2` 32 times before
+ returning an error, but we've received user reports that this limit can be
+ hit on systems with very heavy load. In some synthetic benchmarks (testing
+ the worst-case of an attacker doing renames in a tight loop on every core of
+ a 16-core machine) we managed to get a ~3% failure rate in runc. We have
+ improved this situation in two ways:
+
+ * We have now increased this limit to 128, which should be good enough for
+ most use-cases without becoming a denial-of-service vector (the number of
+ syscalls called by the `O_PATH` resolver in a typical case is within the
+ same ballpark). The same benchmarks show a failure rate of ~0.12% which
+ (while not zero) is probably sufficient for most users.
+
+ * In addition, we now return a `unix.EAGAIN` error that is bubbled up and can
+ be detected by callers. This means that callers with stricter requirements
+ to avoid spurious errors can choose to do their own infinite `EAGAIN` retry
+ loop (though we would strongly recommend users use time-based deadlines in
+ such retry loops to avoid potentially unbounded denials-of-service).
## [0.5.0] - 2025-09-26 ##
@@ -354,7 +383,8 @@ This is our first release of `github.com/cyphar/filepath-securejoin`,
containing a full implementation with a coverage of 93.5% (the only missing
cases are the error cases, which are hard to mocktest at the moment).
-[Unreleased]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.0...HEAD
+[Unreleased 0.5.z]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.1...release-0.5
+[0.5.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.0...v0.5.1
[0.5.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.1...v0.5.0
[0.4.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.0...v0.4.1
[0.4.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.6...v0.4.0
diff --git a/vendor/github.com/cyphar/filepath-securejoin/VERSION b/vendor/github.com/cyphar/filepath-securejoin/VERSION
index 8f0916f7..4b9fcbec 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/VERSION
+++ b/vendor/github.com/cyphar/filepath-securejoin/VERSION
@@ -1 +1 @@
-0.5.0
+0.5.1
diff --git a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
similarity index 70%
rename from vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go
rename to vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
index c26e440e..d0b200f4 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go
+++ b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: MPL-2.0
+//go:build linux
+
// Copyright (C) 2024-2025 Aleksa Sarai <cyphar@cyphar.com>
// Copyright (C) 2024-2025 SUSE LLC
//
@@ -12,15 +14,24 @@ package internal
import (
"errors"
+
+ "golang.org/x/sys/unix"
)
+type xdevErrorish struct {
+ description string
+}
+
+func (err xdevErrorish) Error() string { return err.description }
+func (err xdevErrorish) Is(target error) bool { return target == unix.EXDEV }
+
var (
// ErrPossibleAttack indicates that some attack was detected.
- ErrPossibleAttack = errors.New("possible attack detected")
+ ErrPossibleAttack error = xdevErrorish{"possible attack detected"}
// ErrPossibleBreakout indicates that during an operation we ended up in a
// state that could be a breakout but we detected it.
- ErrPossibleBreakout = errors.New("possible breakout detected")
+ ErrPossibleBreakout error = xdevErrorish{"possible breakout detected"}
// ErrInvalidDirectory indicates an unlinked directory.
ErrInvalidDirectory = errors.New("wandered into deleted directory")
diff --git a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
index 23053083..3e937fe3 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
+++ b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
@@ -17,8 +17,6 @@ import (
"runtime"
"golang.org/x/sys/unix"
-
- "github.com/cyphar/filepath-securejoin/pathrs-lite/internal"
)
func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
@@ -34,7 +32,10 @@ func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
(errors.Is(err, unix.EAGAIN) || errors.Is(err, unix.EXDEV))
}
-const scopedLookupMaxRetries = 32
+// This is a fairly arbitrary limit we have just to avoid an attacker being
+// able to make us spin in an infinite retry loop -- callers can choose to
+// retry on EAGAIN if they prefer.
+const scopedLookupMaxRetries = 128
// Openat2 is an [Fd]-based wrapper around unix.Openat2, but with some retry
// logic in case of EAGAIN errors.
@@ -43,10 +44,10 @@ func Openat2(dir Fd, path string, how *unix.OpenHow) (*os.File, error) {
// Make sure we always set O_CLOEXEC.
how.Flags |= unix.O_CLOEXEC
var tries int
- for tries < scopedLookupMaxRetries {
+ for {
fd, err := unix.Openat2(dirFd, path, how)
if err != nil {
- if scopedLookupShouldRetry(how, err) {
+ if scopedLookupShouldRetry(how, err) && tries < scopedLookupMaxRetries {
// We retry a couple of times to avoid the spurious errors, and
// if we are being attacked then returning -EAGAIN is the best
// we can do.
@@ -58,5 +59,4 @@ func Openat2(dir Fd, path string, how *unix.OpenHow) (*os.File, error) {
runtime.KeepAlive(dir)
return os.NewFile(uintptr(fd), fullPath), nil
}
- return nil, &os.PathError{Op: "openat2", Path: fullPath, Err: internal.ErrPossibleAttack}
}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index f22001c8..18276b61 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -27,7 +27,7 @@ github.com/coreos/go-systemd/v22/dbus
# github.com/cpuguy83/go-md2man/v2 v2.0.5
## explicit; go 1.11
github.com/cpuguy83/go-md2man/v2/md2man
-# github.com/cyphar/filepath-securejoin v0.5.0
+# github.com/cyphar/filepath-securejoin v0.5.1
## explicit; go 1.18
github.com/cyphar/filepath-securejoin
github.com/cyphar/filepath-securejoin/internal/consts
--
2.51.1

File diff suppressed because it is too large Load Diff

View File

@ -1,62 +0,0 @@
From dfb3496c174377b860b62872ce6af951364cc3ac Mon Sep 17 00:00:00 2001
From: Lokesh Mandvekar <lsm5@fedoraproject.org>
Date: Tue, 12 Dec 2017 13:22:42 +0530
Subject: [PATCH] Revert "Apply cgroups earlier"
This reverts commit 7062c7556b71188abc18d7516441ff4b03fbc1fc.
---
libcontainer/process_linux.go | 31 ++++++++++++++-----------------
1 file changed, 14 insertions(+), 17 deletions(-)
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 149b1126..b8a395af 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -272,6 +272,20 @@ func (p *initProcess) start() error {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
}
+ if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
+ return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
+ }
+ if err := p.execSetns(); err != nil {
+ return newSystemErrorWithCause(err, "running exec setns process for init")
+ }
+ // Save the standard descriptor names before the container process
+ // can potentially move them (e.g., via dup2()). If we don't do this now,
+ // we won't know at checkpoint time which file descriptor to look up.
+ fds, err := getPipeFds(p.pid())
+ if err != nil {
+ return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
+ }
+ p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children can escape the
// cgroup. We don't need to worry about not doing this and not being root
// because we'd be using the rootless cgroup manager in that case.
@@ -292,23 +306,6 @@ func (p *initProcess) start() error {
}
}
}()
-
- if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
- return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
- }
-
- if err := p.execSetns(); err != nil {
- return newSystemErrorWithCause(err, "running exec setns process for init")
- }
-
- // Save the standard descriptor names before the container process
- // can potentially move them (e.g., via dup2()). If we don't do this now,
- // we won't know at checkpoint time which file descriptor to look up.
- fds, err := getPipeFds(p.pid())
- if err != nil {
- return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
- }
- p.setExternalDescriptors(fds)
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
}
--
2.14.3

View File

@ -1,290 +0,0 @@
From bf6405284aa3870a39b402309003633a1c230ed9 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <asarai@suse.de>
Date: Wed, 9 Jan 2019 13:40:01 +1100
Subject: [PATCH 1/1] nsenter: clone /proc/self/exe to avoid exposing host
binary to container
There are quite a few circumstances where /proc/self/exe pointing to a
pretty important container binary is a _bad_ thing, so to avoid this we
have to make a copy (preferably doing self-clean-up and not being
writeable).
As a hotfix we require memfd_create(2), but we can always extend this to
use a scratch MNT_DETACH overlayfs or tmpfs. The main downside to this
approach is no page-cache sharing for the runc binary (which overlayfs
would give us) but this is far less complicated.
This is only done during nsenter so that it happens transparently to the
Go code, and any libcontainer users benefit from it. This also makes
ExtraFiles and --preserve-fds handling trivial (because we don't need to
worry about it).
Fixes: CVE-2019-5736
Co-developed-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Aleksa Sarai <asarai@suse.de>
Signed-off-by: Mrunal Patel <mrunalp@gmail.com>
---
libcontainer/nsenter/cloned_binary.c | 221 +++++++++++++++++++++++++++
libcontainer/nsenter/nsexec.c | 11 ++
2 files changed, 232 insertions(+)
create mode 100644 libcontainer/nsenter/cloned_binary.c
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
new file mode 100644
index 00000000..d9f6093a
--- /dev/null
+++ b/libcontainer/nsenter/cloned_binary.c
@@ -0,0 +1,221 @@
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+#include <sys/mman.h>
+#include <sys/sendfile.h>
+#include <sys/syscall.h>
+
+#include <linux/magic.h>
+#include <linux/memfd.h>
+
+/* Use our own wrapper for memfd_create. */
+#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
+# define SYS_memfd_create __NR_memfd_create
+#endif
+#ifndef SYS_memfd_create
+# error "memfd_create(2) syscall not supported by this glibc version"
+#endif
+int memfd_create(const char *name, unsigned int flags)
+{
+ return syscall(SYS_memfd_create, name, flags);
+}
+
+/* This comes directly from <linux/fcntl.h>. */
+#ifndef F_LINUX_SPECIFIC_BASE
+# define F_LINUX_SPECIFIC_BASE 1024
+#endif
+#ifndef F_ADD_SEALS
+# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+#endif
+#ifndef F_SEAL_SEAL
+# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
+# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
+# define F_SEAL_GROW 0x0004 /* prevent file from growing */
+# define F_SEAL_WRITE 0x0008 /* prevent writes */
+#endif
+
+
+#define OUR_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
+#define OUR_MEMFD_SEALS \
+ (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
+
+static void *must_realloc(void *ptr, size_t size)
+{
+ void *old = ptr;
+ do {
+ ptr = realloc(old, size);
+ } while(!ptr);
+ return ptr;
+}
+
+/*
+ * Verify whether we are currently in a self-cloned program (namely, is
+ * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
+ * for shmem files), and we want to be sure it's actually sealed.
+ */
+static int is_self_cloned(void)
+{
+ int fd, seals;
+
+ fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return -ENOTRECOVERABLE;
+
+ seals = fcntl(fd, F_GET_SEALS);
+ close(fd);
+ return seals == OUR_MEMFD_SEALS;
+}
+
+/*
+ * Basic wrapper around mmap(2) that gives you the file length so you can
+ * safely treat it as an ordinary buffer. Only gives you read access.
+ */
+static char *read_file(char *path, size_t *length)
+{
+ int fd;
+ char buf[4096], *copy = NULL;
+
+ if (!length)
+ return NULL;
+
+ fd = open(path, O_RDONLY | O_CLOEXEC);
+ if (fd < 0)
+ return NULL;
+
+ *length = 0;
+ for (;;) {
+ int n;
+
+ n = read(fd, buf, sizeof(buf));
+ if (n < 0)
+ goto error;
+ if (!n)
+ break;
+
+ copy = must_realloc(copy, (*length + n) * sizeof(*copy));
+ memcpy(copy + *length, buf, n);
+ *length += n;
+ }
+ close(fd);
+ return copy;
+
+error:
+ close(fd);
+ free(copy);
+ return NULL;
+}
+
+/*
+ * A poor-man's version of "xargs -0". Basically parses a given block of
+ * NUL-delimited data, within the given length and adds a pointer to each entry
+ * to the array of pointers.
+ */
+static int parse_xargs(char *data, int data_length, char ***output)
+{
+ int num = 0;
+ char *cur = data;
+
+ if (!data || *output != NULL)
+ return -1;
+
+ while (cur < data + data_length) {
+ num++;
+ *output = must_realloc(*output, (num + 1) * sizeof(**output));
+ (*output)[num - 1] = cur;
+ cur += strlen(cur) + 1;
+ }
+ (*output)[num] = NULL;
+ return num;
+}
+
+/*
+ * "Parse" out argv and envp from /proc/self/cmdline and /proc/self/environ.
+ * This is necessary because we are running in a context where we don't have a
+ * main() that we can just get the arguments from.
+ */
+static int fetchve(char ***argv, char ***envp)
+{
+ char *cmdline = NULL, *environ = NULL;
+ size_t cmdline_size, environ_size;
+
+ cmdline = read_file("/proc/self/cmdline", &cmdline_size);
+ if (!cmdline)
+ goto error;
+ environ = read_file("/proc/self/environ", &environ_size);
+ if (!environ)
+ goto error;
+
+ if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
+ goto error;
+ if (parse_xargs(environ, environ_size, envp) <= 0)
+ goto error;
+
+ return 0;
+
+error:
+ free(environ);
+ free(cmdline);
+ return -EINVAL;
+}
+
+#define SENDFILE_MAX 0x7FFFF000 /* sendfile(2) is limited to 2GB. */
+static int clone_binary(void)
+{
+ int binfd, memfd, err;
+ ssize_t sent = 0;
+
+ memfd = memfd_create(OUR_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ if (memfd < 0)
+ return -ENOTRECOVERABLE;
+
+ binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
+ if (binfd < 0)
+ goto error;
+
+ sent = sendfile(memfd, binfd, NULL, SENDFILE_MAX);
+ close(binfd);
+ if (sent < 0)
+ goto error;
+
+ err = fcntl(memfd, F_ADD_SEALS, OUR_MEMFD_SEALS);
+ if (err < 0)
+ goto error;
+
+ return memfd;
+
+error:
+ close(memfd);
+ return -EIO;
+}
+
+int ensure_cloned_binary(void)
+{
+ int execfd;
+ char **argv = NULL, **envp = NULL;
+
+ /* Check that we're not self-cloned, and if we are then bail. */
+ int cloned = is_self_cloned();
+ if (cloned > 0 || cloned == -ENOTRECOVERABLE)
+ return cloned;
+
+ if (fetchve(&argv, &envp) < 0)
+ return -EINVAL;
+
+ execfd = clone_binary();
+ if (execfd < 0)
+ return -EIO;
+
+ fexecve(execfd, argv, envp);
+ return -ENOEXEC;
+}
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index cb224314..784fd9b0 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -528,6 +528,9 @@ void join_namespaces(char *nslist)
free(namespaces);
}
+/* Defined in cloned_binary.c. */
+int ensure_cloned_binary(void);
+
void nsexec(void)
{
int pipenum;
@@ -543,6 +546,14 @@ void nsexec(void)
if (pipenum == -1)
return;
+ /*
+ * We need to re-exec if we are not in a cloned binary. This is necessary
+ * to ensure that containers won't be able to access the host binary
+ * through /proc/self/exe. See CVE-2019-5736.
+ */
+ if (ensure_cloned_binary() < 0)
+ bail("could not ensure we are a cloned binary");
+
/* Parse all of the netlink configuration. */
nl_parse(pipenum, &config);
--
2.20.1

View File

@ -0,0 +1,49 @@
From 2a9b44aabfa52bb071ff2e3564427da0bb82312e Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 5 Nov 2025 02:04:02 +1100
Subject: [PATCH 2/2] [1.3] rootfs: re-allow dangling symlinks in mount targets
It seems there are a fair few images where dangling symlinks are used as
path components for mount targets, which pathrs-lite does not support
(and it would be difficult to fully support this in a race-free way).
This was actually meant to be blocked by commit 63c2908164f3 ("rootfs:
try to scope MkdirAll to stay inside the rootfs"), followed by commit
dd827f7b715a ("utils: switch to securejoin.MkdirAllHandle"). However, we
still used SecureJoin to construct mountpoint targets, which means that
dangling symlinks were "resolved" before reaching pathrs-lite.
This patch basically re-adds this hack in order to reduce the breakages
we've seen so far.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
libcontainer/rootfs_linux.go | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index d85e7321..2fda3c9d 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -519,6 +519,17 @@ func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) {
dstIsFile = !fi.IsDir()
}
+ // In previous runc versions, we would tolerate nonsense paths with
+ // dangling symlinks as path components. pathrs-lite does not support
+ // this, so instead we have to emulate this behaviour by doing
+ // SecureJoin *purely to get a semi-reasonable path to use* and then we
+ // use pathrs-lite to operate on the path safely.
+ newUnsafePath, err := securejoin.SecureJoin(rootfs, unsafePath)
+ if err != nil {
+ return err
+ }
+ unsafePath = utils.StripRoot(rootfs, newUnsafePath)
+
if dstIsFile {
dstFile, err = pathrs.CreateInRoot(rootfs, unsafePath, unix.O_CREAT|unix.O_EXCL|unix.O_NOFOLLOW, 0o644)
} else {
--
2.51.1

View File

@ -1,200 +0,0 @@
From ecf53c23545092019602578583031c28fde4d2a1 Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Fri, 25 May 2018 18:04:06 +0200
Subject: [PATCH] sd-notify: do not hang when NOTIFY_SOCKET is used with create
if NOTIFY_SOCKET is used, do not block the main runc process waiting
for events on the notify socket. Change the logic to create a new
process that monitors exclusively the notify socket until an event is
received.
Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
init.go | 12 +++++++
notify_socket.go | 101 ++++++++++++++++++++++++++++++++++++++++++++++---------
signals.go | 5 +--
3 files changed, 99 insertions(+), 19 deletions(-)
diff --git a/init.go b/init.go
index c8f453192..6a3d9e91c 100644
--- a/init.go
+++ b/init.go
@@ -20,6 +20,18 @@ var initCommand = cli.Command{
Name: "init",
Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
Action: func(context *cli.Context) error {
+ // If NOTIFY_SOCKET is used create a new process that stays around
+ // so to not block "runc start". It will automatically exits when the
+ // container notifies that it is ready, or when the container is deleted
+ if os.Getenv("_NOTIFY_SOCKET_FD") != "" {
+ fd := os.Getenv("_NOTIFY_SOCKET_FD")
+ pid := os.Getenv("_NOTIFY_SOCKET_PID")
+ hostNotifySocket := os.Getenv("_NOTIFY_SOCKET_HOST")
+ notifySocketPath := os.Getenv("_NOTIFY_SOCKET_PATH")
+ notifySocketInit(fd, pid, hostNotifySocket, notifySocketPath)
+ os.Exit(0)
+ }
+
factory, _ := libcontainer.New("")
if err := factory.StartInitialization(); err != nil {
// as the error is sent back to the parent there is no need to log
diff --git a/notify_socket.go b/notify_socket.go
index cd6c0a989..e04e9d660 100644
--- a/notify_socket.go
+++ b/notify_socket.go
@@ -6,10 +6,13 @@ import (
"bytes"
"fmt"
"net"
+ "os"
+ "os/exec"
"path/filepath"
+ "strconv"
+ "time"
"github.com/opencontainers/runtime-spec/specs-go"
-
"github.com/sirupsen/logrus"
"github.com/urfave/cli"
)
@@ -64,24 +67,94 @@ func (s *notifySocket) setupSocket() error {
return nil
}
+func (notifySocket *notifySocket) notifyNewPid(pid int) {
+ notifySocketHostAddr := net.UnixAddr{Name: notifySocket.host, Net: "unixgram"}
+ client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
+ if err != nil {
+ return
+ }
+ newPid := fmt.Sprintf("MAINPID=%d\n", pid)
+ client.Write([]byte(newPid))
+}
+
// pid1 must be set only with -d, as it is used to set the new process as the main process
// for the service in systemd
func (notifySocket *notifySocket) run(pid1 int) {
- buf := make([]byte, 512)
- notifySocketHostAddr := net.UnixAddr{Name: notifySocket.host, Net: "unixgram"}
- client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
+ file, err := notifySocket.socket.File()
if err != nil {
logrus.Error(err)
return
}
- for {
- r, err := notifySocket.socket.Read(buf)
- if err != nil {
- break
+ defer file.Close()
+ defer notifySocket.socket.Close()
+
+ cmd := exec.Command("/proc/self/exe", "init")
+ cmd.ExtraFiles = []*os.File{file}
+ cmd.Env = append(cmd.Env, "_NOTIFY_SOCKET_FD=3",
+ fmt.Sprintf("_NOTIFY_SOCKET_PID=%d", pid1),
+ fmt.Sprintf("_NOTIFY_SOCKET_HOST=%s", notifySocket.host),
+ fmt.Sprintf("_NOTIFY_SOCKET_PATH=%s", notifySocket.socketPath))
+
+ if err := cmd.Start(); err != nil {
+ logrus.Fatal(err)
+ }
+ notifySocket.notifyNewPid(cmd.Process.Pid)
+ cmd.Process.Release()
+}
+
+func notifySocketInit(envFd string, envPid string, notifySocketHost string, notifySocketPath string) {
+ intFd, err := strconv.Atoi(envFd)
+ if err != nil {
+ return
+ }
+ pid1, err := strconv.Atoi(envPid)
+ if err != nil {
+ return
+ }
+
+ file := os.NewFile(uintptr(intFd), "unixgram")
+ defer file.Close()
+
+ fileChan := make(chan []byte)
+ exitChan := make(chan bool)
+
+ go func() {
+ for {
+ buf := make([]byte, 512)
+ r, err := file.Read(buf)
+ if err != nil {
+ return
+ }
+ fileChan <- buf[0:r]
}
- var out bytes.Buffer
- for _, line := range bytes.Split(buf[0:r], []byte{'\n'}) {
- if bytes.HasPrefix(line, []byte("READY=")) {
+ }()
+ go func() {
+ for {
+ if _, err := os.Stat(notifySocketPath); os.IsNotExist(err) {
+ exitChan <- true
+ return
+ }
+ time.Sleep(time.Second)
+ }
+ }()
+
+ notifySocketHostAddr := net.UnixAddr{Name: notifySocketHost, Net: "unixgram"}
+ client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
+ if err != nil {
+ return
+ }
+
+ for {
+ select {
+ case <-exitChan:
+ return
+ case b := <-fileChan:
+ for _, line := range bytes.Split(b, []byte{'\n'}) {
+ if !bytes.HasPrefix(line, []byte("READY=")) {
+ continue
+ }
+
+ var out bytes.Buffer
_, err = out.Write(line)
if err != nil {
return
@@ -98,10 +171,8 @@ func (notifySocket *notifySocket) run(pid1 int) {
}
// now we can inform systemd to use pid1 as the pid to monitor
- if pid1 > 0 {
- newPid := fmt.Sprintf("MAINPID=%d\n", pid1)
- client.Write([]byte(newPid))
- }
+ newPid := fmt.Sprintf("MAINPID=%d\n", pid1)
+ client.Write([]byte(newPid))
return
}
}
diff --git a/signals.go b/signals.go
index 1811de837..d0988cb39 100644
--- a/signals.go
+++ b/signals.go
@@ -70,7 +70,7 @@ func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach
h.notifySocket.run(pid1)
return 0, nil
} else {
- go h.notifySocket.run(0)
+ h.notifySocket.run(os.Getpid())
}
}
@@ -98,9 +98,6 @@ func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach
// status because we must ensure that any of the go specific process
// fun such as flushing pipes are complete before we return.
process.Wait()
- if h.notifySocket != nil {
- h.notifySocket.Close()
- }
return e.status, nil
}
}

View File

@ -1 +0,0 @@
fs.may_detach_mounts=1

View File

@ -1,61 +0,0 @@
diff --git a/list.go b/list.go
index 0313d8c..328798b 100644
--- a/list.go
+++ b/list.go
@@ -50,7 +50,7 @@ var listCommand = cli.Command{
ArgsUsage: `
Where the given root is specified via the global option "--root"
-(default: "/run/runc").
+(default: "/run/runc-ctrs").
EXAMPLE 1:
To list containers created via the default "--root":
diff --git a/main.go b/main.go
index 278399a..0f49fce 100644
--- a/main.go
+++ b/main.go
@@ -62,7 +62,7 @@ func main() {
v = append(v, fmt.Sprintf("spec: %s", specs.Version))
app.Version = strings.Join(v, "\n")
- root := "/run/runc"
+ root := "/run/runc-ctrs"
rootless, err := isRootless(nil)
if err != nil {
fatal(err)
@@ -70,7 +70,7 @@ func main() {
if rootless {
runtimeDir := os.Getenv("XDG_RUNTIME_DIR")
if runtimeDir != "" {
- root = runtimeDir + "/runc"
+ root = runtimeDir + "/runc-ctrs"
// According to the XDG specification, we need to set anything in
// XDG_RUNTIME_DIR to have a sticky bit if we don't want it to get
// auto-pruned.
diff --git a/man/runc-list.8.md b/man/runc-list.8.md
index f737424..107220e 100644
--- a/man/runc-list.8.md
+++ b/man/runc-list.8.md
@@ -6,7 +6,7 @@
# EXAMPLE
Where the given root is specified via the global option "--root"
-(default: "/run/runc").
+(default: "/run/runc-ctrs").
To list containers created via the default "--root":
# runc list
diff --git a/man/runc.8.md b/man/runc.8.md
index 6d0ddff..337bc73 100644
--- a/man/runc.8.md
+++ b/man/runc.8.md
@@ -51,7 +51,7 @@ value for "bundle" is the current directory.
--debug enable debug output for logging
--log value set the log file path where internal debug information is written (default: "/dev/null")
--log-format value set the format used by logs ('text' (default), or 'json') (default: "text")
- --root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc" or $XDG_RUNTIME_DIR/runc for rootless containers)
+ --root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc-ctrs" or $XDG_RUNTIME_DIR/runc-ctrs for rootless containers)
--criu value path to the criu binary used for checkpoint and restore (default: "criu")
--systemd-cgroup enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234"
--rootless value enable rootless mode ('true', 'false', or 'auto') (default: "auto")

View File

@ -1,72 +0,0 @@
From 28a697cce3e4f905dca700eda81d681a30eef9cd Mon Sep 17 00:00:00 2001
From: Giuseppe Scrivano <gscrivan@redhat.com>
Date: Fri, 11 Jan 2019 21:53:45 +0100
Subject: [PATCH] rootfs: umount all procfs and sysfs with --no-pivot
When creating a new user namespace, the kernel doesn't allow to mount
a new procfs or sysfs file system if there is not already one instance
fully visible in the current mount namespace.
When using --no-pivot we were effectively inhibiting this protection
from the kernel, as /proc and /sys from the host are still present in
the container mount namespace.
A container without full access to /proc could then create a new user
namespace, and from there able to mount a fully visible /proc, bypassing
the limitations in the container.
A simple reproducer for this issue is:
unshare -mrfp sh -c "mount -t proc none /proc && echo c > /proc/sysrq-trigger"
Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
libcontainer/rootfs_linux.go | 35 +++++++++++++++++++++++++++++++++++
1 file changed, 35 insertions(+)
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index e7c2f8ada..6bd6da74a 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -748,6 +748,41 @@ func pivotRoot(rootfs string) error {
}
func msMoveRoot(rootfs string) error {
+ mountinfos, err := mount.GetMounts()
+ if err != nil {
+ return err
+ }
+
+ absRootfs, err := filepath.Abs(rootfs)
+ if err != nil {
+ return err
+ }
+
+ for _, info := range mountinfos {
+ p, err := filepath.Abs(info.Mountpoint)
+ if err != nil {
+ return err
+ }
+ // Umount every syfs and proc file systems, except those under the container rootfs
+ if (info.Fstype != "proc" && info.Fstype != "sysfs") || filepath.HasPrefix(p, absRootfs) {
+ continue
+ }
+ // Be sure umount events are not propagated to the host.
+ if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
+ return err
+ }
+ if err := unix.Unmount(p, unix.MNT_DETACH); err != nil {
+ if err != unix.EINVAL && err != unix.EPERM {
+ return err
+ } else {
+ // If we have not privileges for umounting (e.g. rootless), then
+ // cover the path.
+ if err := unix.Mount("tmpfs", p, "tmpfs", 0, ""); err != nil {
+ return err
+ }
+ }
+ }
+ }
if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil {
return err
}

File diff suppressed because it is too large Load Diff