Compare commits
No commits in common. "c8-stream-1.0" and "c9" have entirely different histories.
c8-stream-
...
c9
2
.gitignore
vendored
2
.gitignore
vendored
@ -1 +1 @@
|
||||
SOURCES/runc-2abd837.tar.gz
|
||||
SOURCES/v1.3.0.tar.gz
|
||||
|
||||
@ -1 +1 @@
|
||||
cf7119a838db2963e7af6ecdba90a2cc95ec0d56 SOURCES/runc-2abd837.tar.gz
|
||||
0ea2488912e9ae562782f5980971f7fb0d73df38 SOURCES/v1.3.0.tar.gz
|
||||
|
||||
@ -0,0 +1,416 @@
|
||||
From 2df42d4db6bc57ee914fa9cc4455ad3b8daff1d9 Mon Sep 17 00:00:00 2001
|
||||
From: Aleksa Sarai <cyphar@cyphar.com>
|
||||
Date: Sat, 1 Nov 2025 17:21:36 +1100
|
||||
Subject: [PATCH 1/2] [1.3] openat2: improve resilience on busy systems
|
||||
|
||||
Previously, we would see a ~3% failure rate when starting containers
|
||||
with mounts that contain ".." (which can trigger -EAGAIN). To counteract
|
||||
this, filepath-securejoin v0.5.1 includes a bump of the internal retry
|
||||
limit from 32 to 128, which lowers the failure rate to 0.12%.
|
||||
|
||||
However, there is still a risk of spurious failure on regular systems.
|
||||
In order to try to provide more resilience (while avoiding DoS attacks),
|
||||
this patch also includes an additional retry loop that terminates based
|
||||
on a deadline rather than retry count. The deadline is 2ms, as my
|
||||
testing found that ~800us for a single pathrs operation was the longest
|
||||
latency due to -EAGAIN retries, and that was an outlier compared to the
|
||||
more common ~400us latencies -- so 2ms should be more than enough for
|
||||
any real system.
|
||||
|
||||
The failure rates above were based on more than 50k runs of runc with an
|
||||
attack script (from libpathrs) running a rename attack on all cores of a
|
||||
16-core system, which is arguably a worst-case but heavily utilised
|
||||
servers could likely approach similar results.
|
||||
|
||||
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
|
||||
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||
---
|
||||
go.mod | 2 +-
|
||||
go.sum | 4 +-
|
||||
internal/pathrs/mkdirall_pathrslite.go | 4 +-
|
||||
internal/pathrs/procfs_pathrslite.go | 22 ++++---
|
||||
internal/pathrs/retry.go | 66 +++++++++++++++++++
|
||||
internal/pathrs/root_pathrslite.go | 7 +-
|
||||
.../cyphar/filepath-securejoin/CHANGELOG.md | 34 +++++++++-
|
||||
.../cyphar/filepath-securejoin/VERSION | 2 +-
|
||||
.../internal/{errors.go => errors_linux.go} | 15 ++++-
|
||||
.../pathrs-lite/internal/fd/openat2_linux.go | 12 ++--
|
||||
vendor/modules.txt | 2 +-
|
||||
11 files changed, 144 insertions(+), 26 deletions(-)
|
||||
create mode 100644 internal/pathrs/retry.go
|
||||
rename vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/{errors.go => errors_linux.go} (70%)
|
||||
|
||||
diff --git a/go.mod b/go.mod
|
||||
index f2deafc3..a551a4ec 100644
|
||||
--- a/go.mod
|
||||
+++ b/go.mod
|
||||
@@ -6,7 +6,7 @@ require (
|
||||
github.com/checkpoint-restore/go-criu/v6 v6.3.0
|
||||
github.com/containerd/console v1.0.5
|
||||
github.com/coreos/go-systemd/v22 v22.5.0
|
||||
- github.com/cyphar/filepath-securejoin v0.5.0
|
||||
+ github.com/cyphar/filepath-securejoin v0.5.1
|
||||
github.com/docker/go-units v0.5.0
|
||||
github.com/godbus/dbus/v5 v5.1.0
|
||||
github.com/moby/sys/capability v0.4.0
|
||||
diff --git a/go.sum b/go.sum
|
||||
index ba395bf0..fb357b43 100644
|
||||
--- a/go.sum
|
||||
+++ b/go.sum
|
||||
@@ -10,8 +10,8 @@ github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSV
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
|
||||
-github.com/cyphar/filepath-securejoin v0.5.0 h1:hIAhkRBMQ8nIeuVwcAoymp7MY4oherZdAxD+m0u9zaw=
|
||||
-github.com/cyphar/filepath-securejoin v0.5.0/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
|
||||
+github.com/cyphar/filepath-securejoin v0.5.1 h1:eYgfMq5yryL4fbWfkLpFFy2ukSELzaJOTaUTuh+oF48=
|
||||
+github.com/cyphar/filepath-securejoin v0.5.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
diff --git a/internal/pathrs/mkdirall_pathrslite.go b/internal/pathrs/mkdirall_pathrslite.go
|
||||
index fb4f7842..a9a0157c 100644
|
||||
--- a/internal/pathrs/mkdirall_pathrslite.go
|
||||
+++ b/internal/pathrs/mkdirall_pathrslite.go
|
||||
@@ -83,7 +83,9 @@ func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (*os.File, er
|
||||
}
|
||||
defer rootDir.Close()
|
||||
|
||||
- return pathrs.MkdirAllHandle(rootDir, unsafePath, mode)
|
||||
+ return retryEAGAIN(func() (*os.File, error) {
|
||||
+ return pathrs.MkdirAllHandle(rootDir, unsafePath, mode)
|
||||
+ })
|
||||
}
|
||||
|
||||
// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the
|
||||
diff --git a/internal/pathrs/procfs_pathrslite.go b/internal/pathrs/procfs_pathrslite.go
|
||||
index a02b0d39..37450a0e 100644
|
||||
--- a/internal/pathrs/procfs_pathrslite.go
|
||||
+++ b/internal/pathrs/procfs_pathrslite.go
|
||||
@@ -27,13 +27,15 @@ import (
|
||||
)
|
||||
|
||||
func procOpenReopen(openFn func(subpath string) (*os.File, error), subpath string, flags int) (*os.File, error) {
|
||||
- handle, err := openFn(subpath)
|
||||
+ handle, err := retryEAGAIN(func() (*os.File, error) {
|
||||
+ return openFn(subpath)
|
||||
+ })
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer handle.Close()
|
||||
|
||||
- f, err := pathrs.Reopen(handle, flags)
|
||||
+ f, err := Reopen(handle, flags)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reopen %s: %w", handle.Name(), err)
|
||||
}
|
||||
@@ -44,7 +46,7 @@ func procOpenReopen(openFn func(subpath string) (*os.File, error), subpath strin
|
||||
// [pathrs.Reopen], to let you one-shot open a procfs file with the given
|
||||
// flags.
|
||||
func ProcSelfOpen(subpath string, flags int) (*os.File, error) {
|
||||
- proc, err := procfs.OpenProcRoot()
|
||||
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -55,7 +57,7 @@ func ProcSelfOpen(subpath string, flags int) (*os.File, error) {
|
||||
// ProcPidOpen is a wrapper around [procfs.Handle.OpenPid] and [pathrs.Reopen],
|
||||
// to let you one-shot open a procfs file with the given flags.
|
||||
func ProcPidOpen(pid int, subpath string, flags int) (*os.File, error) {
|
||||
- proc, err := procfs.OpenProcRoot()
|
||||
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -70,13 +72,15 @@ func ProcPidOpen(pid int, subpath string, flags int) (*os.File, error) {
|
||||
// flags. The returned [procfs.ProcThreadSelfCloser] needs the same handling as
|
||||
// when using pathrs-lite.
|
||||
func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThreadSelfCloser, Err error) {
|
||||
- proc, err := procfs.OpenProcRoot()
|
||||
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
defer proc.Close()
|
||||
|
||||
- handle, closer, err := proc.OpenThreadSelf(subpath)
|
||||
+ handle, closer, err := retryEAGAIN2(func() (*os.File, procfs.ProcThreadSelfCloser, error) {
|
||||
+ return proc.OpenThreadSelf(subpath)
|
||||
+ })
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
@@ -89,7 +93,7 @@ func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThr
|
||||
}
|
||||
defer handle.Close()
|
||||
|
||||
- f, err := pathrs.Reopen(handle, flags)
|
||||
+ f, err := Reopen(handle, flags)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("reopen %s: %w", handle.Name(), err)
|
||||
}
|
||||
@@ -98,5 +102,7 @@ func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThr
|
||||
|
||||
// Reopen is a wrapper around pathrs.Reopen.
|
||||
func Reopen(file *os.File, flags int) (*os.File, error) {
|
||||
- return pathrs.Reopen(file, flags)
|
||||
+ return retryEAGAIN(func() (*os.File, error) {
|
||||
+ return pathrs.Reopen(file, flags)
|
||||
+ })
|
||||
}
|
||||
diff --git a/internal/pathrs/retry.go b/internal/pathrs/retry.go
|
||||
new file mode 100644
|
||||
index 00000000..a51d335c
|
||||
--- /dev/null
|
||||
+++ b/internal/pathrs/retry.go
|
||||
@@ -0,0 +1,66 @@
|
||||
+// SPDX-License-Identifier: Apache-2.0
|
||||
+/*
|
||||
+ * Copyright (C) 2024-2025 Aleksa Sarai <cyphar@cyphar.com>
|
||||
+ * Copyright (C) 2024-2025 SUSE LLC
|
||||
+ *
|
||||
+ * Licensed under the Apache License, Version 2.0 (the "License");
|
||||
+ * you may not use this file except in compliance with the License.
|
||||
+ * You may obtain a copy of the License at
|
||||
+ *
|
||||
+ * http://www.apache.org/licenses/LICENSE-2.0
|
||||
+ *
|
||||
+ * Unless required by applicable law or agreed to in writing, software
|
||||
+ * distributed under the License is distributed on an "AS IS" BASIS,
|
||||
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
+ * See the License for the specific language governing permissions and
|
||||
+ * limitations under the License.
|
||||
+ */
|
||||
+
|
||||
+package pathrs
|
||||
+
|
||||
+import (
|
||||
+ "errors"
|
||||
+ "fmt"
|
||||
+ "time"
|
||||
+
|
||||
+ "golang.org/x/sys/unix"
|
||||
+)
|
||||
+
|
||||
+// Based on >50k tests running "runc run" on a 16-core system with very heavy
|
||||
+// rename(2) load, the single longest latency caused by -EAGAIN retries was
|
||||
+// ~800us (with the vast majority being closer to 400us). So, a 2ms limit
|
||||
+// should give more than enough headroom for any real system in practice.
|
||||
+const retryDeadline = 2 * time.Millisecond
|
||||
+
|
||||
+// retryEAGAIN is a top-level retry loop for pathrs to try to avoid returning
|
||||
+// spurious errors in most normal user cases when using openat2 (libpathrs
|
||||
+// itself does up to 128 retries already, but this method takes a
|
||||
+// wallclock-deadline approach to simply retry until a timer elapses).
|
||||
+func retryEAGAIN[T any](fn func() (T, error)) (T, error) {
|
||||
+ deadline := time.After(retryDeadline)
|
||||
+ for {
|
||||
+ v, err := fn()
|
||||
+ if !errors.Is(err, unix.EAGAIN) {
|
||||
+ return v, err
|
||||
+ }
|
||||
+ select {
|
||||
+ case <-deadline:
|
||||
+ return *new(T), fmt.Errorf("%v retry deadline exceeded: %w", retryDeadline, err)
|
||||
+ default:
|
||||
+ // retry
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+// retryEAGAIN2 is like retryEAGAIN except it returns two values.
|
||||
+func retryEAGAIN2[T1, T2 any](fn func() (T1, T2, error)) (T1, T2, error) {
|
||||
+ type ret struct {
|
||||
+ v1 T1
|
||||
+ v2 T2
|
||||
+ }
|
||||
+ v, err := retryEAGAIN(func() (ret, error) {
|
||||
+ v1, v2, err := fn()
|
||||
+ return ret{v1: v1, v2: v2}, err
|
||||
+ })
|
||||
+ return v.v1, v.v2, err
|
||||
+}
|
||||
diff --git a/internal/pathrs/root_pathrslite.go b/internal/pathrs/root_pathrslite.go
|
||||
index 0ef81fae..899af270 100644
|
||||
--- a/internal/pathrs/root_pathrslite.go
|
||||
+++ b/internal/pathrs/root_pathrslite.go
|
||||
@@ -31,12 +31,15 @@ import (
|
||||
// is effectively shorthand for [securejoin.OpenInRoot] followed by
|
||||
// [securejoin.Reopen].
|
||||
func OpenInRoot(root, subpath string, flags int) (*os.File, error) {
|
||||
- handle, err := pathrs.OpenInRoot(root, subpath)
|
||||
+ handle, err := retryEAGAIN(func() (*os.File, error) {
|
||||
+ return pathrs.OpenInRoot(root, subpath)
|
||||
+ })
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer handle.Close()
|
||||
- return pathrs.Reopen(handle, flags)
|
||||
+
|
||||
+ return Reopen(handle, flags)
|
||||
}
|
||||
|
||||
// CreateInRoot creates a new file inside a root (as well as any missing parent
|
||||
diff --git a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
|
||||
index 6862467c..3faee0bc 100644
|
||||
--- a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
|
||||
+++ b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
|
||||
@@ -4,7 +4,36 @@ All notable changes to this project will be documented in this file.
|
||||
The format is based on [Keep a Changelog](http://keepachangelog.com/)
|
||||
and this project adheres to [Semantic Versioning](http://semver.org/).
|
||||
|
||||
-## [Unreleased] ##
|
||||
+## [Unreleased 0.5.z] ##
|
||||
+
|
||||
+## [0.5.1] - 2025-10-31 ##
|
||||
+
|
||||
+> Spooky scary skeletons send shivers down your spine!
|
||||
+
|
||||
+### Changed ###
|
||||
+- `openat2` can return `-EAGAIN` if it detects a possible attack in certain
|
||||
+ scenarios (namely if there was a rename or mount while walking a path with a
|
||||
+ `..` component). While this is necessary to avoid a denial-of-service in the
|
||||
+ kernel, it does require retry loops in userspace.
|
||||
+
|
||||
+ In previous versions, `pathrs-lite` would retry `openat2` 32 times before
|
||||
+ returning an error, but we've received user reports that this limit can be
|
||||
+ hit on systems with very heavy load. In some synthetic benchmarks (testing
|
||||
+ the worst-case of an attacker doing renames in a tight loop on every core of
|
||||
+ a 16-core machine) we managed to get a ~3% failure rate in runc. We have
|
||||
+ improved this situation in two ways:
|
||||
+
|
||||
+ * We have now increased this limit to 128, which should be good enough for
|
||||
+ most use-cases without becoming a denial-of-service vector (the number of
|
||||
+ syscalls called by the `O_PATH` resolver in a typical case is within the
|
||||
+ same ballpark). The same benchmarks show a failure rate of ~0.12% which
|
||||
+ (while not zero) is probably sufficient for most users.
|
||||
+
|
||||
+ * In addition, we now return a `unix.EAGAIN` error that is bubbled up and can
|
||||
+ be detected by callers. This means that callers with stricter requirements
|
||||
+ to avoid spurious errors can choose to do their own infinite `EAGAIN` retry
|
||||
+ loop (though we would strongly recommend users use time-based deadlines in
|
||||
+ such retry loops to avoid potentially unbounded denials-of-service).
|
||||
|
||||
## [0.5.0] - 2025-09-26 ##
|
||||
|
||||
@@ -354,7 +383,8 @@ This is our first release of `github.com/cyphar/filepath-securejoin`,
|
||||
containing a full implementation with a coverage of 93.5% (the only missing
|
||||
cases are the error cases, which are hard to mocktest at the moment).
|
||||
|
||||
-[Unreleased]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.0...HEAD
|
||||
+[Unreleased 0.5.z]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.1...release-0.5
|
||||
+[0.5.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.0...v0.5.1
|
||||
[0.5.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.1...v0.5.0
|
||||
[0.4.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.0...v0.4.1
|
||||
[0.4.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.6...v0.4.0
|
||||
diff --git a/vendor/github.com/cyphar/filepath-securejoin/VERSION b/vendor/github.com/cyphar/filepath-securejoin/VERSION
|
||||
index 8f0916f7..4b9fcbec 100644
|
||||
--- a/vendor/github.com/cyphar/filepath-securejoin/VERSION
|
||||
+++ b/vendor/github.com/cyphar/filepath-securejoin/VERSION
|
||||
@@ -1 +1 @@
|
||||
-0.5.0
|
||||
+0.5.1
|
||||
diff --git a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
|
||||
similarity index 70%
|
||||
rename from vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go
|
||||
rename to vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
|
||||
index c26e440e..d0b200f4 100644
|
||||
--- a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go
|
||||
+++ b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
|
||||
@@ -1,5 +1,7 @@
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
+//go:build linux
|
||||
+
|
||||
// Copyright (C) 2024-2025 Aleksa Sarai <cyphar@cyphar.com>
|
||||
// Copyright (C) 2024-2025 SUSE LLC
|
||||
//
|
||||
@@ -12,15 +14,24 @@ package internal
|
||||
|
||||
import (
|
||||
"errors"
|
||||
+
|
||||
+ "golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
+type xdevErrorish struct {
|
||||
+ description string
|
||||
+}
|
||||
+
|
||||
+func (err xdevErrorish) Error() string { return err.description }
|
||||
+func (err xdevErrorish) Is(target error) bool { return target == unix.EXDEV }
|
||||
+
|
||||
var (
|
||||
// ErrPossibleAttack indicates that some attack was detected.
|
||||
- ErrPossibleAttack = errors.New("possible attack detected")
|
||||
+ ErrPossibleAttack error = xdevErrorish{"possible attack detected"}
|
||||
|
||||
// ErrPossibleBreakout indicates that during an operation we ended up in a
|
||||
// state that could be a breakout but we detected it.
|
||||
- ErrPossibleBreakout = errors.New("possible breakout detected")
|
||||
+ ErrPossibleBreakout error = xdevErrorish{"possible breakout detected"}
|
||||
|
||||
// ErrInvalidDirectory indicates an unlinked directory.
|
||||
ErrInvalidDirectory = errors.New("wandered into deleted directory")
|
||||
diff --git a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
|
||||
index 23053083..3e937fe3 100644
|
||||
--- a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
|
||||
+++ b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
|
||||
@@ -17,8 +17,6 @@ import (
|
||||
"runtime"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
-
|
||||
- "github.com/cyphar/filepath-securejoin/pathrs-lite/internal"
|
||||
)
|
||||
|
||||
func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
|
||||
@@ -34,7 +32,10 @@ func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
|
||||
(errors.Is(err, unix.EAGAIN) || errors.Is(err, unix.EXDEV))
|
||||
}
|
||||
|
||||
-const scopedLookupMaxRetries = 32
|
||||
+// This is a fairly arbitrary limit we have just to avoid an attacker being
|
||||
+// able to make us spin in an infinite retry loop -- callers can choose to
|
||||
+// retry on EAGAIN if they prefer.
|
||||
+const scopedLookupMaxRetries = 128
|
||||
|
||||
// Openat2 is an [Fd]-based wrapper around unix.Openat2, but with some retry
|
||||
// logic in case of EAGAIN errors.
|
||||
@@ -43,10 +44,10 @@ func Openat2(dir Fd, path string, how *unix.OpenHow) (*os.File, error) {
|
||||
// Make sure we always set O_CLOEXEC.
|
||||
how.Flags |= unix.O_CLOEXEC
|
||||
var tries int
|
||||
- for tries < scopedLookupMaxRetries {
|
||||
+ for {
|
||||
fd, err := unix.Openat2(dirFd, path, how)
|
||||
if err != nil {
|
||||
- if scopedLookupShouldRetry(how, err) {
|
||||
+ if scopedLookupShouldRetry(how, err) && tries < scopedLookupMaxRetries {
|
||||
// We retry a couple of times to avoid the spurious errors, and
|
||||
// if we are being attacked then returning -EAGAIN is the best
|
||||
// we can do.
|
||||
@@ -58,5 +59,4 @@ func Openat2(dir Fd, path string, how *unix.OpenHow) (*os.File, error) {
|
||||
runtime.KeepAlive(dir)
|
||||
return os.NewFile(uintptr(fd), fullPath), nil
|
||||
}
|
||||
- return nil, &os.PathError{Op: "openat2", Path: fullPath, Err: internal.ErrPossibleAttack}
|
||||
}
|
||||
diff --git a/vendor/modules.txt b/vendor/modules.txt
|
||||
index f22001c8..18276b61 100644
|
||||
--- a/vendor/modules.txt
|
||||
+++ b/vendor/modules.txt
|
||||
@@ -27,7 +27,7 @@ github.com/coreos/go-systemd/v22/dbus
|
||||
# github.com/cpuguy83/go-md2man/v2 v2.0.5
|
||||
## explicit; go 1.11
|
||||
github.com/cpuguy83/go-md2man/v2/md2man
|
||||
-# github.com/cyphar/filepath-securejoin v0.5.0
|
||||
+# github.com/cyphar/filepath-securejoin v0.5.1
|
||||
## explicit; go 1.18
|
||||
github.com/cyphar/filepath-securejoin
|
||||
github.com/cyphar/filepath-securejoin/internal/consts
|
||||
--
|
||||
2.51.1
|
||||
|
||||
13709
SOURCES/0001-1.3.0-CVEs-mega-patch.patch
Normal file
13709
SOURCES/0001-1.3.0-CVEs-mega-patch.patch
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,62 +0,0 @@
|
||||
From dfb3496c174377b860b62872ce6af951364cc3ac Mon Sep 17 00:00:00 2001
|
||||
From: Lokesh Mandvekar <lsm5@fedoraproject.org>
|
||||
Date: Tue, 12 Dec 2017 13:22:42 +0530
|
||||
Subject: [PATCH] Revert "Apply cgroups earlier"
|
||||
|
||||
This reverts commit 7062c7556b71188abc18d7516441ff4b03fbc1fc.
|
||||
---
|
||||
libcontainer/process_linux.go | 31 ++++++++++++++-----------------
|
||||
1 file changed, 14 insertions(+), 17 deletions(-)
|
||||
|
||||
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
|
||||
index 149b1126..b8a395af 100644
|
||||
--- a/libcontainer/process_linux.go
|
||||
+++ b/libcontainer/process_linux.go
|
||||
@@ -272,6 +272,20 @@ func (p *initProcess) start() error {
|
||||
p.process.ops = nil
|
||||
return newSystemErrorWithCause(err, "starting init process command")
|
||||
}
|
||||
+ if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
|
||||
+ return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
|
||||
+ }
|
||||
+ if err := p.execSetns(); err != nil {
|
||||
+ return newSystemErrorWithCause(err, "running exec setns process for init")
|
||||
+ }
|
||||
+ // Save the standard descriptor names before the container process
|
||||
+ // can potentially move them (e.g., via dup2()). If we don't do this now,
|
||||
+ // we won't know at checkpoint time which file descriptor to look up.
|
||||
+ fds, err := getPipeFds(p.pid())
|
||||
+ if err != nil {
|
||||
+ return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
|
||||
+ }
|
||||
+ p.setExternalDescriptors(fds)
|
||||
// Do this before syncing with child so that no children can escape the
|
||||
// cgroup. We don't need to worry about not doing this and not being root
|
||||
// because we'd be using the rootless cgroup manager in that case.
|
||||
@@ -292,23 +306,6 @@ func (p *initProcess) start() error {
|
||||
}
|
||||
}
|
||||
}()
|
||||
-
|
||||
- if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
|
||||
- return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
|
||||
- }
|
||||
-
|
||||
- if err := p.execSetns(); err != nil {
|
||||
- return newSystemErrorWithCause(err, "running exec setns process for init")
|
||||
- }
|
||||
-
|
||||
- // Save the standard descriptor names before the container process
|
||||
- // can potentially move them (e.g., via dup2()). If we don't do this now,
|
||||
- // we won't know at checkpoint time which file descriptor to look up.
|
||||
- fds, err := getPipeFds(p.pid())
|
||||
- if err != nil {
|
||||
- return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
|
||||
- }
|
||||
- p.setExternalDescriptors(fds)
|
||||
if err := p.createNetworkInterfaces(); err != nil {
|
||||
return newSystemErrorWithCause(err, "creating network interfaces")
|
||||
}
|
||||
--
|
||||
2.14.3
|
||||
|
||||
@ -1,290 +0,0 @@
|
||||
From bf6405284aa3870a39b402309003633a1c230ed9 Mon Sep 17 00:00:00 2001
|
||||
From: Aleksa Sarai <asarai@suse.de>
|
||||
Date: Wed, 9 Jan 2019 13:40:01 +1100
|
||||
Subject: [PATCH 1/1] nsenter: clone /proc/self/exe to avoid exposing host
|
||||
binary to container
|
||||
|
||||
There are quite a few circumstances where /proc/self/exe pointing to a
|
||||
pretty important container binary is a _bad_ thing, so to avoid this we
|
||||
have to make a copy (preferably doing self-clean-up and not being
|
||||
writeable).
|
||||
|
||||
As a hotfix we require memfd_create(2), but we can always extend this to
|
||||
use a scratch MNT_DETACH overlayfs or tmpfs. The main downside to this
|
||||
approach is no page-cache sharing for the runc binary (which overlayfs
|
||||
would give us) but this is far less complicated.
|
||||
|
||||
This is only done during nsenter so that it happens transparently to the
|
||||
Go code, and any libcontainer users benefit from it. This also makes
|
||||
ExtraFiles and --preserve-fds handling trivial (because we don't need to
|
||||
worry about it).
|
||||
|
||||
Fixes: CVE-2019-5736
|
||||
Co-developed-by: Christian Brauner <christian.brauner@ubuntu.com>
|
||||
Signed-off-by: Aleksa Sarai <asarai@suse.de>
|
||||
Signed-off-by: Mrunal Patel <mrunalp@gmail.com>
|
||||
---
|
||||
libcontainer/nsenter/cloned_binary.c | 221 +++++++++++++++++++++++++++
|
||||
libcontainer/nsenter/nsexec.c | 11 ++
|
||||
2 files changed, 232 insertions(+)
|
||||
create mode 100644 libcontainer/nsenter/cloned_binary.c
|
||||
|
||||
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
|
||||
new file mode 100644
|
||||
index 00000000..d9f6093a
|
||||
--- /dev/null
|
||||
+++ b/libcontainer/nsenter/cloned_binary.c
|
||||
@@ -0,0 +1,221 @@
|
||||
+#define _GNU_SOURCE
|
||||
+#include <unistd.h>
|
||||
+#include <stdio.h>
|
||||
+#include <stdlib.h>
|
||||
+#include <stdbool.h>
|
||||
+#include <string.h>
|
||||
+#include <limits.h>
|
||||
+#include <fcntl.h>
|
||||
+#include <errno.h>
|
||||
+
|
||||
+#include <sys/types.h>
|
||||
+#include <sys/stat.h>
|
||||
+#include <sys/vfs.h>
|
||||
+#include <sys/mman.h>
|
||||
+#include <sys/sendfile.h>
|
||||
+#include <sys/syscall.h>
|
||||
+
|
||||
+#include <linux/magic.h>
|
||||
+#include <linux/memfd.h>
|
||||
+
|
||||
+/* Use our own wrapper for memfd_create. */
|
||||
+#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
|
||||
+# define SYS_memfd_create __NR_memfd_create
|
||||
+#endif
|
||||
+#ifndef SYS_memfd_create
|
||||
+# error "memfd_create(2) syscall not supported by this glibc version"
|
||||
+#endif
|
||||
+int memfd_create(const char *name, unsigned int flags)
|
||||
+{
|
||||
+ return syscall(SYS_memfd_create, name, flags);
|
||||
+}
|
||||
+
|
||||
+/* This comes directly from <linux/fcntl.h>. */
|
||||
+#ifndef F_LINUX_SPECIFIC_BASE
|
||||
+# define F_LINUX_SPECIFIC_BASE 1024
|
||||
+#endif
|
||||
+#ifndef F_ADD_SEALS
|
||||
+# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
|
||||
+# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
|
||||
+#endif
|
||||
+#ifndef F_SEAL_SEAL
|
||||
+# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
|
||||
+# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
|
||||
+# define F_SEAL_GROW 0x0004 /* prevent file from growing */
|
||||
+# define F_SEAL_WRITE 0x0008 /* prevent writes */
|
||||
+#endif
|
||||
+
|
||||
+
|
||||
+#define OUR_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
|
||||
+#define OUR_MEMFD_SEALS \
|
||||
+ (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
|
||||
+
|
||||
+static void *must_realloc(void *ptr, size_t size)
|
||||
+{
|
||||
+ void *old = ptr;
|
||||
+ do {
|
||||
+ ptr = realloc(old, size);
|
||||
+ } while(!ptr);
|
||||
+ return ptr;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Verify whether we are currently in a self-cloned program (namely, is
|
||||
+ * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
|
||||
+ * for shmem files), and we want to be sure it's actually sealed.
|
||||
+ */
|
||||
+static int is_self_cloned(void)
|
||||
+{
|
||||
+ int fd, seals;
|
||||
+
|
||||
+ fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
|
||||
+ if (fd < 0)
|
||||
+ return -ENOTRECOVERABLE;
|
||||
+
|
||||
+ seals = fcntl(fd, F_GET_SEALS);
|
||||
+ close(fd);
|
||||
+ return seals == OUR_MEMFD_SEALS;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Basic wrapper around read(2) that gives you the file length so you can
|
||||
+ * safely treat it as an ordinary buffer. Only gives you read access.
|
||||
+ */
|
||||
+static char *read_file(char *path, size_t *length)
|
||||
+{
|
||||
+ int fd;
|
||||
+ char buf[4096], *copy = NULL;
|
||||
+
|
||||
+ if (!length)
|
||||
+ return NULL;
|
||||
+
|
||||
+ fd = open(path, O_RDONLY | O_CLOEXEC);
|
||||
+ if (fd < 0)
|
||||
+ return NULL;
|
||||
+
|
||||
+ *length = 0;
|
||||
+ for (;;) {
|
||||
+ int n;
|
||||
+
|
||||
+ n = read(fd, buf, sizeof(buf));
|
||||
+ if (n < 0)
|
||||
+ goto error;
|
||||
+ if (!n)
|
||||
+ break;
|
||||
+
|
||||
+ copy = must_realloc(copy, (*length + n) * sizeof(*copy));
|
||||
+ memcpy(copy + *length, buf, n);
|
||||
+ *length += n;
|
||||
+ }
|
||||
+ close(fd);
|
||||
+ return copy;
|
||||
+
|
||||
+error:
|
||||
+ close(fd);
|
||||
+ free(copy);
|
||||
+ return NULL;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * A poor-man's version of "xargs -0". Basically parses a given block of
|
||||
+ * NUL-delimited data, within the given length and adds a pointer to each entry
|
||||
+ * to the array of pointers.
|
||||
+ */
|
||||
+static int parse_xargs(char *data, int data_length, char ***output)
|
||||
+{
|
||||
+ int num = 0;
|
||||
+ char *cur = data;
|
||||
+
|
||||
+ if (!data || *output != NULL)
|
||||
+ return -1;
|
||||
+
|
||||
+ while (cur < data + data_length) {
|
||||
+ num++;
|
||||
+ *output = must_realloc(*output, (num + 1) * sizeof(**output));
|
||||
+ (*output)[num - 1] = cur;
|
||||
+ cur += strlen(cur) + 1;
|
||||
+ }
|
||||
+ (*output)[num] = NULL;
|
||||
+ return num;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * "Parse" out argv and envp from /proc/self/cmdline and /proc/self/environ.
|
||||
+ * This is necessary because we are running in a context where we don't have a
|
||||
+ * main() that we can just get the arguments from.
|
||||
+ */
|
||||
+static int fetchve(char ***argv, char ***envp)
|
||||
+{
|
||||
+ char *cmdline = NULL, *environ = NULL;
|
||||
+ size_t cmdline_size, environ_size;
|
||||
+
|
||||
+ cmdline = read_file("/proc/self/cmdline", &cmdline_size);
|
||||
+ if (!cmdline)
|
||||
+ goto error;
|
||||
+ environ = read_file("/proc/self/environ", &environ_size);
|
||||
+ if (!environ)
|
||||
+ goto error;
|
||||
+
|
||||
+ if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
|
||||
+ goto error;
|
||||
+ if (parse_xargs(environ, environ_size, envp) <= 0)
|
||||
+ goto error;
|
||||
+
|
||||
+ return 0;
|
||||
+
|
||||
+error:
|
||||
+ free(environ);
|
||||
+ free(cmdline);
|
||||
+ return -EINVAL;
|
||||
+}
|
||||
+
|
||||
+#define SENDFILE_MAX 0x7FFFF000 /* sendfile(2) is limited to 2GB. */
|
||||
+static int clone_binary(void)
|
||||
+{
|
||||
+ int binfd, memfd, err;
|
||||
+ ssize_t sent = 0;
|
||||
+
|
||||
+ memfd = memfd_create(OUR_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
|
||||
+ if (memfd < 0)
|
||||
+ return -ENOTRECOVERABLE;
|
||||
+
|
||||
+ binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
|
||||
+ if (binfd < 0)
|
||||
+ goto error;
|
||||
+
|
||||
+ sent = sendfile(memfd, binfd, NULL, SENDFILE_MAX);
|
||||
+ close(binfd);
|
||||
+ if (sent < 0)
|
||||
+ goto error;
|
||||
+
|
||||
+ err = fcntl(memfd, F_ADD_SEALS, OUR_MEMFD_SEALS);
|
||||
+ if (err < 0)
|
||||
+ goto error;
|
||||
+
|
||||
+ return memfd;
|
||||
+
|
||||
+error:
|
||||
+ close(memfd);
|
||||
+ return -EIO;
|
||||
+}
|
||||
+
|
||||
+int ensure_cloned_binary(void)
|
||||
+{
|
||||
+ int execfd;
|
||||
+ char **argv = NULL, **envp = NULL;
|
||||
+
|
||||
+ /* Check that we're not self-cloned, and if we are then bail. */
|
||||
+ int cloned = is_self_cloned();
|
||||
+ if (cloned > 0 || cloned == -ENOTRECOVERABLE)
|
||||
+ return cloned;
|
||||
+
|
||||
+ if (fetchve(&argv, &envp) < 0)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ execfd = clone_binary();
|
||||
+ if (execfd < 0)
|
||||
+ return -EIO;
|
||||
+
|
||||
+ fexecve(execfd, argv, envp);
|
||||
+ return -ENOEXEC;
|
||||
+}
|
||||
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
|
||||
index cb224314..784fd9b0 100644
|
||||
--- a/libcontainer/nsenter/nsexec.c
|
||||
+++ b/libcontainer/nsenter/nsexec.c
|
||||
@@ -528,6 +528,9 @@ void join_namespaces(char *nslist)
|
||||
free(namespaces);
|
||||
}
|
||||
|
||||
+/* Defined in cloned_binary.c. */
|
||||
+int ensure_cloned_binary(void);
|
||||
+
|
||||
void nsexec(void)
|
||||
{
|
||||
int pipenum;
|
||||
@@ -543,6 +546,14 @@ void nsexec(void)
|
||||
if (pipenum == -1)
|
||||
return;
|
||||
|
||||
+ /*
|
||||
+ * We need to re-exec if we are not in a cloned binary. This is necessary
|
||||
+ * to ensure that containers won't be able to access the host binary
|
||||
+ * through /proc/self/exe. See CVE-2019-5736.
|
||||
+ */
|
||||
+ if (ensure_cloned_binary() < 0)
|
||||
+ bail("could not ensure we are a cloned binary");
|
||||
+
|
||||
/* Parse all of the netlink configuration. */
|
||||
nl_parse(pipenum, &config);
|
||||
|
||||
--
|
||||
2.20.1
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
From 2a9b44aabfa52bb071ff2e3564427da0bb82312e Mon Sep 17 00:00:00 2001
|
||||
From: Aleksa Sarai <cyphar@cyphar.com>
|
||||
Date: Wed, 5 Nov 2025 02:04:02 +1100
|
||||
Subject: [PATCH 2/2] [1.3] rootfs: re-allow dangling symlinks in mount targets
|
||||
|
||||
It seems there are a fair few images where dangling symlinks are used as
|
||||
path components for mount targets, which pathrs-lite does not support
|
||||
(and it would be difficult to fully support this in a race-free way).
|
||||
|
||||
This was actually meant to be blocked by commit 63c2908164f3 ("rootfs:
|
||||
try to scope MkdirAll to stay inside the rootfs"), followed by commit
|
||||
dd827f7b715a ("utils: switch to securejoin.MkdirAllHandle"). However, we
|
||||
still used SecureJoin to construct mountpoint targets, which means that
|
||||
dangling symlinks were "resolved" before reaching pathrs-lite.
|
||||
|
||||
This patch basically re-adds this hack in order to reduce the breakages
|
||||
we've seen so far.
|
||||
|
||||
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
|
||||
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||
---
|
||||
libcontainer/rootfs_linux.go | 11 +++++++++++
|
||||
1 file changed, 11 insertions(+)
|
||||
|
||||
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
|
||||
index d85e7321..2fda3c9d 100644
|
||||
--- a/libcontainer/rootfs_linux.go
|
||||
+++ b/libcontainer/rootfs_linux.go
|
||||
@@ -519,6 +519,17 @@ func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) {
|
||||
dstIsFile = !fi.IsDir()
|
||||
}
|
||||
|
||||
+ // In previous runc versions, we would tolerate nonsense paths with
|
||||
+ // dangling symlinks as path components. pathrs-lite does not support
|
||||
+ // this, so instead we have to emulate this behaviour by doing
|
||||
+ // SecureJoin *purely to get a semi-reasonable path to use* and then we
|
||||
+ // use pathrs-lite to operate on the path safely.
|
||||
+ newUnsafePath, err := securejoin.SecureJoin(rootfs, unsafePath)
|
||||
+ if err != nil {
|
||||
+ return err
|
||||
+ }
|
||||
+ unsafePath = utils.StripRoot(rootfs, newUnsafePath)
|
||||
+
|
||||
if dstIsFile {
|
||||
dstFile, err = pathrs.CreateInRoot(rootfs, unsafePath, unix.O_CREAT|unix.O_EXCL|unix.O_NOFOLLOW, 0o644)
|
||||
} else {
|
||||
--
|
||||
2.51.1
|
||||
|
||||
@ -1,200 +0,0 @@
|
||||
From ecf53c23545092019602578583031c28fde4d2a1 Mon Sep 17 00:00:00 2001
|
||||
From: Giuseppe Scrivano <gscrivan@redhat.com>
|
||||
Date: Fri, 25 May 2018 18:04:06 +0200
|
||||
Subject: [PATCH] sd-notify: do not hang when NOTIFY_SOCKET is used with create
|
||||
|
||||
if NOTIFY_SOCKET is used, do not block the main runc process waiting
|
||||
for events on the notify socket. Change the logic to create a new
|
||||
process that monitors exclusively the notify socket until an event is
|
||||
received.
|
||||
|
||||
Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
|
||||
---
|
||||
init.go | 12 +++++++
|
||||
notify_socket.go | 101 ++++++++++++++++++++++++++++++++++++++++++++++---------
|
||||
signals.go | 5 +--
|
||||
3 files changed, 99 insertions(+), 19 deletions(-)
|
||||
|
||||
diff --git a/init.go b/init.go
|
||||
index c8f453192..6a3d9e91c 100644
|
||||
--- a/init.go
|
||||
+++ b/init.go
|
||||
@@ -20,6 +20,18 @@ var initCommand = cli.Command{
|
||||
Name: "init",
|
||||
Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
|
||||
Action: func(context *cli.Context) error {
|
||||
+ // If NOTIFY_SOCKET is used create a new process that stays around
|
||||
+ // so to not block "runc start". It will automatically exit when the
|
||||
+ // container notifies that it is ready, or when the container is deleted
|
||||
+ if os.Getenv("_NOTIFY_SOCKET_FD") != "" {
|
||||
+ fd := os.Getenv("_NOTIFY_SOCKET_FD")
|
||||
+ pid := os.Getenv("_NOTIFY_SOCKET_PID")
|
||||
+ hostNotifySocket := os.Getenv("_NOTIFY_SOCKET_HOST")
|
||||
+ notifySocketPath := os.Getenv("_NOTIFY_SOCKET_PATH")
|
||||
+ notifySocketInit(fd, pid, hostNotifySocket, notifySocketPath)
|
||||
+ os.Exit(0)
|
||||
+ }
|
||||
+
|
||||
factory, _ := libcontainer.New("")
|
||||
if err := factory.StartInitialization(); err != nil {
|
||||
// as the error is sent back to the parent there is no need to log
|
||||
diff --git a/notify_socket.go b/notify_socket.go
|
||||
index cd6c0a989..e04e9d660 100644
|
||||
--- a/notify_socket.go
|
||||
+++ b/notify_socket.go
|
||||
@@ -6,10 +6,13 @@ import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"net"
|
||||
+ "os"
|
||||
+ "os/exec"
|
||||
"path/filepath"
|
||||
+ "strconv"
|
||||
+ "time"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
-
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/urfave/cli"
|
||||
)
|
||||
@@ -64,24 +67,94 @@ func (s *notifySocket) setupSocket() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
+func (notifySocket *notifySocket) notifyNewPid(pid int) {
|
||||
+ notifySocketHostAddr := net.UnixAddr{Name: notifySocket.host, Net: "unixgram"}
|
||||
+ client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
|
||||
+ if err != nil {
|
||||
+ return
|
||||
+ }
|
||||
+ newPid := fmt.Sprintf("MAINPID=%d\n", pid)
|
||||
+ client.Write([]byte(newPid))
|
||||
+}
|
||||
+
|
||||
// pid1 must be set only with -d, as it is used to set the new process as the main process
|
||||
// for the service in systemd
|
||||
func (notifySocket *notifySocket) run(pid1 int) {
|
||||
- buf := make([]byte, 512)
|
||||
- notifySocketHostAddr := net.UnixAddr{Name: notifySocket.host, Net: "unixgram"}
|
||||
- client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
|
||||
+ file, err := notifySocket.socket.File()
|
||||
if err != nil {
|
||||
logrus.Error(err)
|
||||
return
|
||||
}
|
||||
- for {
|
||||
- r, err := notifySocket.socket.Read(buf)
|
||||
- if err != nil {
|
||||
- break
|
||||
+ defer file.Close()
|
||||
+ defer notifySocket.socket.Close()
|
||||
+
|
||||
+ cmd := exec.Command("/proc/self/exe", "init")
|
||||
+ cmd.ExtraFiles = []*os.File{file}
|
||||
+ cmd.Env = append(cmd.Env, "_NOTIFY_SOCKET_FD=3",
|
||||
+ fmt.Sprintf("_NOTIFY_SOCKET_PID=%d", pid1),
|
||||
+ fmt.Sprintf("_NOTIFY_SOCKET_HOST=%s", notifySocket.host),
|
||||
+ fmt.Sprintf("_NOTIFY_SOCKET_PATH=%s", notifySocket.socketPath))
|
||||
+
|
||||
+ if err := cmd.Start(); err != nil {
|
||||
+ logrus.Fatal(err)
|
||||
+ }
|
||||
+ notifySocket.notifyNewPid(cmd.Process.Pid)
|
||||
+ cmd.Process.Release()
|
||||
+}
|
||||
+
|
||||
+func notifySocketInit(envFd string, envPid string, notifySocketHost string, notifySocketPath string) {
|
||||
+ intFd, err := strconv.Atoi(envFd)
|
||||
+ if err != nil {
|
||||
+ return
|
||||
+ }
|
||||
+ pid1, err := strconv.Atoi(envPid)
|
||||
+ if err != nil {
|
||||
+ return
|
||||
+ }
|
||||
+
|
||||
+ file := os.NewFile(uintptr(intFd), "unixgram")
|
||||
+ defer file.Close()
|
||||
+
|
||||
+ fileChan := make(chan []byte)
|
||||
+ exitChan := make(chan bool)
|
||||
+
|
||||
+ go func() {
|
||||
+ for {
|
||||
+ buf := make([]byte, 512)
|
||||
+ r, err := file.Read(buf)
|
||||
+ if err != nil {
|
||||
+ return
|
||||
+ }
|
||||
+ fileChan <- buf[0:r]
|
||||
}
|
||||
- var out bytes.Buffer
|
||||
- for _, line := range bytes.Split(buf[0:r], []byte{'\n'}) {
|
||||
- if bytes.HasPrefix(line, []byte("READY=")) {
|
||||
+ }()
|
||||
+ go func() {
|
||||
+ for {
|
||||
+ if _, err := os.Stat(notifySocketPath); os.IsNotExist(err) {
|
||||
+ exitChan <- true
|
||||
+ return
|
||||
+ }
|
||||
+ time.Sleep(time.Second)
|
||||
+ }
|
||||
+ }()
|
||||
+
|
||||
+ notifySocketHostAddr := net.UnixAddr{Name: notifySocketHost, Net: "unixgram"}
|
||||
+ client, err := net.DialUnix("unixgram", nil, &notifySocketHostAddr)
|
||||
+ if err != nil {
|
||||
+ return
|
||||
+ }
|
||||
+
|
||||
+ for {
|
||||
+ select {
|
||||
+ case <-exitChan:
|
||||
+ return
|
||||
+ case b := <-fileChan:
|
||||
+ for _, line := range bytes.Split(b, []byte{'\n'}) {
|
||||
+ if !bytes.HasPrefix(line, []byte("READY=")) {
|
||||
+ continue
|
||||
+ }
|
||||
+
|
||||
+ var out bytes.Buffer
|
||||
_, err = out.Write(line)
|
||||
if err != nil {
|
||||
return
|
||||
@@ -98,10 +171,8 @@ func (notifySocket *notifySocket) run(pid1 int) {
|
||||
}
|
||||
|
||||
// now we can inform systemd to use pid1 as the pid to monitor
|
||||
- if pid1 > 0 {
|
||||
- newPid := fmt.Sprintf("MAINPID=%d\n", pid1)
|
||||
- client.Write([]byte(newPid))
|
||||
- }
|
||||
+ newPid := fmt.Sprintf("MAINPID=%d\n", pid1)
|
||||
+ client.Write([]byte(newPid))
|
||||
return
|
||||
}
|
||||
}
|
||||
diff --git a/signals.go b/signals.go
|
||||
index 1811de837..d0988cb39 100644
|
||||
--- a/signals.go
|
||||
+++ b/signals.go
|
||||
@@ -70,7 +70,7 @@ func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach
|
||||
h.notifySocket.run(pid1)
|
||||
return 0, nil
|
||||
} else {
|
||||
- go h.notifySocket.run(0)
|
||||
+ h.notifySocket.run(os.Getpid())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,9 +98,6 @@ func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach
|
||||
// status because we must ensure that any of the go specific process
|
||||
// fun such as flushing pipes are complete before we return.
|
||||
process.Wait()
|
||||
- if h.notifySocket != nil {
|
||||
- h.notifySocket.Close()
|
||||
- }
|
||||
return e.status, nil
|
||||
}
|
||||
}
|
||||
@ -1 +0,0 @@
|
||||
fs.may_detach_mounts=1
|
||||
@ -1,61 +0,0 @@
|
||||
diff --git a/list.go b/list.go
|
||||
index 0313d8c..328798b 100644
|
||||
--- a/list.go
|
||||
+++ b/list.go
|
||||
@@ -50,7 +50,7 @@ var listCommand = cli.Command{
|
||||
ArgsUsage: `
|
||||
|
||||
Where the given root is specified via the global option "--root"
|
||||
-(default: "/run/runc").
|
||||
+(default: "/run/runc-ctrs").
|
||||
|
||||
EXAMPLE 1:
|
||||
To list containers created via the default "--root":
|
||||
diff --git a/main.go b/main.go
|
||||
index 278399a..0f49fce 100644
|
||||
--- a/main.go
|
||||
+++ b/main.go
|
||||
@@ -62,7 +62,7 @@ func main() {
|
||||
v = append(v, fmt.Sprintf("spec: %s", specs.Version))
|
||||
app.Version = strings.Join(v, "\n")
|
||||
|
||||
- root := "/run/runc"
|
||||
+ root := "/run/runc-ctrs"
|
||||
rootless, err := isRootless(nil)
|
||||
if err != nil {
|
||||
fatal(err)
|
||||
@@ -70,7 +70,7 @@ func main() {
|
||||
if rootless {
|
||||
runtimeDir := os.Getenv("XDG_RUNTIME_DIR")
|
||||
if runtimeDir != "" {
|
||||
- root = runtimeDir + "/runc"
|
||||
+ root = runtimeDir + "/runc-ctrs"
|
||||
// According to the XDG specification, we need to set anything in
|
||||
// XDG_RUNTIME_DIR to have a sticky bit if we don't want it to get
|
||||
// auto-pruned.
|
||||
diff --git a/man/runc-list.8.md b/man/runc-list.8.md
|
||||
index f737424..107220e 100644
|
||||
--- a/man/runc-list.8.md
|
||||
+++ b/man/runc-list.8.md
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
# EXAMPLE
|
||||
Where the given root is specified via the global option "--root"
|
||||
-(default: "/run/runc").
|
||||
+(default: "/run/runc-ctrs").
|
||||
|
||||
To list containers created via the default "--root":
|
||||
# runc list
|
||||
diff --git a/man/runc.8.md b/man/runc.8.md
|
||||
index 6d0ddff..337bc73 100644
|
||||
--- a/man/runc.8.md
|
||||
+++ b/man/runc.8.md
|
||||
@@ -51,7 +51,7 @@ value for "bundle" is the current directory.
|
||||
--debug enable debug output for logging
|
||||
--log value set the log file path where internal debug information is written (default: "/dev/null")
|
||||
--log-format value set the format used by logs ('text' (default), or 'json') (default: "text")
|
||||
- --root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc" or $XDG_RUNTIME_DIR/runc for rootless containers)
|
||||
+ --root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc-ctrs" or $XDG_RUNTIME_DIR/runc-ctrs for rootless containers)
|
||||
--criu value path to the criu binary used for checkpoint and restore (default: "criu")
|
||||
--systemd-cgroup enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234"
|
||||
--rootless value enable rootless mode ('true', 'false', or 'auto') (default: "auto")
|
||||
@ -1,72 +0,0 @@
|
||||
From 28a697cce3e4f905dca700eda81d681a30eef9cd Mon Sep 17 00:00:00 2001
|
||||
From: Giuseppe Scrivano <gscrivan@redhat.com>
|
||||
Date: Fri, 11 Jan 2019 21:53:45 +0100
|
||||
Subject: [PATCH] rootfs: umount all procfs and sysfs with --no-pivot
|
||||
|
||||
When creating a new user namespace, the kernel doesn't allow to mount
|
||||
a new procfs or sysfs file system if there is not already one instance
|
||||
fully visible in the current mount namespace.
|
||||
|
||||
When using --no-pivot we were effectively inhibiting this protection
|
||||
from the kernel, as /proc and /sys from the host are still present in
|
||||
the container mount namespace.
|
||||
|
||||
A container without full access to /proc could then create a new user
|
||||
namespace, and from there able to mount a fully visible /proc, bypassing
|
||||
the limitations in the container.
|
||||
|
||||
A simple reproducer for this issue is:
|
||||
|
||||
unshare -mrfp sh -c "mount -t proc none /proc && echo c > /proc/sysrq-trigger"
|
||||
|
||||
Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
|
||||
---
|
||||
libcontainer/rootfs_linux.go | 35 +++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 35 insertions(+)
|
||||
|
||||
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
|
||||
index e7c2f8ada..6bd6da74a 100644
|
||||
--- a/libcontainer/rootfs_linux.go
|
||||
+++ b/libcontainer/rootfs_linux.go
|
||||
@@ -748,6 +748,41 @@ func pivotRoot(rootfs string) error {
|
||||
}
|
||||
|
||||
func msMoveRoot(rootfs string) error {
|
||||
+ mountinfos, err := mount.GetMounts()
|
||||
+ if err != nil {
|
||||
+ return err
|
||||
+ }
|
||||
+
|
||||
+ absRootfs, err := filepath.Abs(rootfs)
|
||||
+ if err != nil {
|
||||
+ return err
|
||||
+ }
|
||||
+
|
||||
+ for _, info := range mountinfos {
|
||||
+ p, err := filepath.Abs(info.Mountpoint)
|
||||
+ if err != nil {
|
||||
+ return err
|
||||
+ }
|
||||
+ // Umount every sysfs and proc file system, except those under the container rootfs
|
||||
+ if (info.Fstype != "proc" && info.Fstype != "sysfs") || filepath.HasPrefix(p, absRootfs) {
|
||||
+ continue
|
||||
+ }
|
||||
+ // Be sure umount events are not propagated to the host.
|
||||
+ if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
|
||||
+ return err
|
||||
+ }
|
||||
+ if err := unix.Unmount(p, unix.MNT_DETACH); err != nil {
|
||||
+ if err != unix.EINVAL && err != unix.EPERM {
|
||||
+ return err
|
||||
+ } else {
|
||||
+ // If we have not privileges for umounting (e.g. rootless), then
|
||||
+ // cover the path.
|
||||
+ if err := unix.Mount("tmpfs", p, "tmpfs", 0, ""); err != nil {
|
||||
+ return err
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil {
|
||||
return err
|
||||
}
|
||||
1418
SPECS/runc.spec
1418
SPECS/runc.spec
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user