import UBI runc-1.2.9-2.module+el8.10.0+23820+ae6deecc
This commit is contained in:
parent
576dd3cde2
commit
894fc8b95b
2
.gitignore
vendored
2
.gitignore
vendored
@ -1 +1 @@
|
||||
SOURCES/v1.2.5.tar.gz
|
||||
SOURCES/v1.2.9.tar.gz
|
||||
|
||||
@ -1 +1 @@
|
||||
35e5289a5b1ac1a12a35c3475b7d0bee2232ef39 SOURCES/v1.2.5.tar.gz
|
||||
537b121ab5e611e865bae05e9a85568a9a2ac85b SOURCES/v1.2.9.tar.gz
|
||||
|
||||
@ -1,416 +0,0 @@
|
||||
From 4ad5d01eeda006ba9ae067cbf999a77fe096fe00 Mon Sep 17 00:00:00 2001
|
||||
From: Aleksa Sarai <cyphar@cyphar.com>
|
||||
Date: Sat, 1 Nov 2025 17:21:36 +1100
|
||||
Subject: [PATCH 1/2] [1.2] openat2: improve resilience on busy systems
|
||||
|
||||
Previously, we would see a ~3% failure rate when starting containers
|
||||
with mounts that contain ".." (which can trigger -EAGAIN). To counteract
|
||||
this, filepath-securejoin v0.5.1 includes a bump of the internal retry
|
||||
limit from 32 to 128, which lowers the failure rate to 0.12%.
|
||||
|
||||
However, there is still a risk of spurious failure on regular systems.
|
||||
In order to try to provide more resilience (while avoiding DoS attacks),
|
||||
this patch also includes an additional retry loop that terminates based
|
||||
on a deadline rather than retry count. The deadline is 2ms, as my
|
||||
testing found that ~800us for a single pathrs operation was the longest
|
||||
latency due to -EAGAIN retries, and that was an outlier compared to the
|
||||
more common ~400us latencies -- so 2ms should be more than enough for
|
||||
any real system.
|
||||
|
||||
The failure rates above were based on more than 50k runs of runc with an
|
||||
attack script (from libpathrs) running a rename attack on all cores of a
|
||||
16-core system, which is arguably a worst-case but heavily utilised
|
||||
servers could likely approach similar results.
|
||||
|
||||
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
|
||||
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||
---
|
||||
go.mod | 2 +-
|
||||
go.sum | 4 +-
|
||||
internal/pathrs/mkdirall_pathrslite.go | 4 +-
|
||||
internal/pathrs/procfs_pathrslite.go | 22 ++++---
|
||||
internal/pathrs/retry.go | 66 +++++++++++++++++++
|
||||
internal/pathrs/root_pathrslite.go | 7 +-
|
||||
.../cyphar/filepath-securejoin/CHANGELOG.md | 34 +++++++++-
|
||||
.../cyphar/filepath-securejoin/VERSION | 2 +-
|
||||
.../internal/{errors.go => errors_linux.go} | 15 ++++-
|
||||
.../pathrs-lite/internal/fd/openat2_linux.go | 12 ++--
|
||||
vendor/modules.txt | 2 +-
|
||||
11 files changed, 144 insertions(+), 26 deletions(-)
|
||||
create mode 100644 internal/pathrs/retry.go
|
||||
rename vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/{errors.go => errors_linux.go} (70%)
|
||||
|
||||
diff --git a/go.mod b/go.mod
|
||||
index 5f00a576..90fa2e5b 100644
|
||||
--- a/go.mod
|
||||
+++ b/go.mod
|
||||
@@ -12,7 +12,7 @@ require (
|
||||
github.com/cilium/ebpf v0.16.0
|
||||
github.com/containerd/console v1.0.5
|
||||
github.com/coreos/go-systemd/v22 v22.5.0
|
||||
- github.com/cyphar/filepath-securejoin v0.5.0
|
||||
+ github.com/cyphar/filepath-securejoin v0.5.1
|
||||
github.com/docker/go-units v0.5.0
|
||||
github.com/godbus/dbus/v5 v5.1.0
|
||||
github.com/moby/sys/mountinfo v0.7.1
|
||||
diff --git a/go.sum b/go.sum
|
||||
index 1f930ce4..049597b6 100644
|
||||
--- a/go.sum
|
||||
+++ b/go.sum
|
||||
@@ -9,8 +9,8 @@ github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8
|
||||
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
|
||||
-github.com/cyphar/filepath-securejoin v0.5.0 h1:hIAhkRBMQ8nIeuVwcAoymp7MY4oherZdAxD+m0u9zaw=
|
||||
-github.com/cyphar/filepath-securejoin v0.5.0/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
|
||||
+github.com/cyphar/filepath-securejoin v0.5.1 h1:eYgfMq5yryL4fbWfkLpFFy2ukSELzaJOTaUTuh+oF48=
|
||||
+github.com/cyphar/filepath-securejoin v0.5.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
diff --git a/internal/pathrs/mkdirall_pathrslite.go b/internal/pathrs/mkdirall_pathrslite.go
|
||||
index fb4f7842..a9a0157c 100644
|
||||
--- a/internal/pathrs/mkdirall_pathrslite.go
|
||||
+++ b/internal/pathrs/mkdirall_pathrslite.go
|
||||
@@ -83,7 +83,9 @@ func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (*os.File, er
|
||||
}
|
||||
defer rootDir.Close()
|
||||
|
||||
- return pathrs.MkdirAllHandle(rootDir, unsafePath, mode)
|
||||
+ return retryEAGAIN(func() (*os.File, error) {
|
||||
+ return pathrs.MkdirAllHandle(rootDir, unsafePath, mode)
|
||||
+ })
|
||||
}
|
||||
|
||||
// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the
|
||||
diff --git a/internal/pathrs/procfs_pathrslite.go b/internal/pathrs/procfs_pathrslite.go
|
||||
index a02b0d39..37450a0e 100644
|
||||
--- a/internal/pathrs/procfs_pathrslite.go
|
||||
+++ b/internal/pathrs/procfs_pathrslite.go
|
||||
@@ -27,13 +27,15 @@ import (
|
||||
)
|
||||
|
||||
func procOpenReopen(openFn func(subpath string) (*os.File, error), subpath string, flags int) (*os.File, error) {
|
||||
- handle, err := openFn(subpath)
|
||||
+ handle, err := retryEAGAIN(func() (*os.File, error) {
|
||||
+ return openFn(subpath)
|
||||
+ })
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer handle.Close()
|
||||
|
||||
- f, err := pathrs.Reopen(handle, flags)
|
||||
+ f, err := Reopen(handle, flags)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reopen %s: %w", handle.Name(), err)
|
||||
}
|
||||
@@ -44,7 +46,7 @@ func procOpenReopen(openFn func(subpath string) (*os.File, error), subpath strin
|
||||
// [pathrs.Reopen], to let you one-shot open a procfs file with the given
|
||||
// flags.
|
||||
func ProcSelfOpen(subpath string, flags int) (*os.File, error) {
|
||||
- proc, err := procfs.OpenProcRoot()
|
||||
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -55,7 +57,7 @@ func ProcSelfOpen(subpath string, flags int) (*os.File, error) {
|
||||
// ProcPidOpen is a wrapper around [procfs.Handle.OpenPid] and [pathrs.Reopen],
|
||||
// to let you one-shot open a procfs file with the given flags.
|
||||
func ProcPidOpen(pid int, subpath string, flags int) (*os.File, error) {
|
||||
- proc, err := procfs.OpenProcRoot()
|
||||
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -70,13 +72,15 @@ func ProcPidOpen(pid int, subpath string, flags int) (*os.File, error) {
|
||||
// flags. The returned [procfs.ProcThreadSelfCloser] needs the same handling as
|
||||
// when using pathrs-lite.
|
||||
func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThreadSelfCloser, Err error) {
|
||||
- proc, err := procfs.OpenProcRoot()
|
||||
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
defer proc.Close()
|
||||
|
||||
- handle, closer, err := proc.OpenThreadSelf(subpath)
|
||||
+ handle, closer, err := retryEAGAIN2(func() (*os.File, procfs.ProcThreadSelfCloser, error) {
|
||||
+ return proc.OpenThreadSelf(subpath)
|
||||
+ })
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
@@ -89,7 +93,7 @@ func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThr
|
||||
}
|
||||
defer handle.Close()
|
||||
|
||||
- f, err := pathrs.Reopen(handle, flags)
|
||||
+ f, err := Reopen(handle, flags)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("reopen %s: %w", handle.Name(), err)
|
||||
}
|
||||
@@ -98,5 +102,7 @@ func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThr
|
||||
|
||||
// Reopen is a wrapper around pathrs.Reopen.
|
||||
func Reopen(file *os.File, flags int) (*os.File, error) {
|
||||
- return pathrs.Reopen(file, flags)
|
||||
+ return retryEAGAIN(func() (*os.File, error) {
|
||||
+ return pathrs.Reopen(file, flags)
|
||||
+ })
|
||||
}
|
||||
diff --git a/internal/pathrs/retry.go b/internal/pathrs/retry.go
|
||||
new file mode 100644
|
||||
index 00000000..a51d335c
|
||||
--- /dev/null
|
||||
+++ b/internal/pathrs/retry.go
|
||||
@@ -0,0 +1,66 @@
|
||||
+// SPDX-License-Identifier: Apache-2.0
|
||||
+/*
|
||||
+ * Copyright (C) 2024-2025 Aleksa Sarai <cyphar@cyphar.com>
|
||||
+ * Copyright (C) 2024-2025 SUSE LLC
|
||||
+ *
|
||||
+ * Licensed under the Apache License, Version 2.0 (the "License");
|
||||
+ * you may not use this file except in compliance with the License.
|
||||
+ * You may obtain a copy of the License at
|
||||
+ *
|
||||
+ * http://www.apache.org/licenses/LICENSE-2.0
|
||||
+ *
|
||||
+ * Unless required by applicable law or agreed to in writing, software
|
||||
+ * distributed under the License is distributed on an "AS IS" BASIS,
|
||||
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
+ * See the License for the specific language governing permissions and
|
||||
+ * limitations under the License.
|
||||
+ */
|
||||
+
|
||||
+package pathrs
|
||||
+
|
||||
+import (
|
||||
+ "errors"
|
||||
+ "fmt"
|
||||
+ "time"
|
||||
+
|
||||
+ "golang.org/x/sys/unix"
|
||||
+)
|
||||
+
|
||||
+// Based on >50k tests running "runc run" on a 16-core system with very heavy
|
||||
+// rename(2) load, the single longest latency caused by -EAGAIN retries was
|
||||
+// ~800us (with the vast majority being closer to 400us). So, a 2ms limit
|
||||
+// should give more than enough headroom for any real system in practice.
|
||||
+const retryDeadline = 2 * time.Millisecond
|
||||
+
|
||||
+// retryEAGAIN is a top-level retry loop for pathrs to try to avoid returning
|
||||
+// spurious errors in most normal user cases when using openat2 (libpathrs
|
||||
+// itself does up to 128 retries already, but this method takes a
|
||||
+// wallclock-deadline approach to simply retry until a timer elapses).
|
||||
+func retryEAGAIN[T any](fn func() (T, error)) (T, error) {
|
||||
+ deadline := time.After(retryDeadline)
|
||||
+ for {
|
||||
+ v, err := fn()
|
||||
+ if !errors.Is(err, unix.EAGAIN) {
|
||||
+ return v, err
|
||||
+ }
|
||||
+ select {
|
||||
+ case <-deadline:
|
||||
+ return *new(T), fmt.Errorf("%v retry deadline exceeded: %w", retryDeadline, err)
|
||||
+ default:
|
||||
+ // retry
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+// retryEAGAIN2 is like retryEAGAIN except it returns two values.
|
||||
+func retryEAGAIN2[T1, T2 any](fn func() (T1, T2, error)) (T1, T2, error) {
|
||||
+ type ret struct {
|
||||
+ v1 T1
|
||||
+ v2 T2
|
||||
+ }
|
||||
+ v, err := retryEAGAIN(func() (ret, error) {
|
||||
+ v1, v2, err := fn()
|
||||
+ return ret{v1: v1, v2: v2}, err
|
||||
+ })
|
||||
+ return v.v1, v.v2, err
|
||||
+}
|
||||
diff --git a/internal/pathrs/root_pathrslite.go b/internal/pathrs/root_pathrslite.go
|
||||
index 0ef81fae..899af270 100644
|
||||
--- a/internal/pathrs/root_pathrslite.go
|
||||
+++ b/internal/pathrs/root_pathrslite.go
|
||||
@@ -31,12 +31,15 @@ import (
|
||||
// is effectively shorthand for [securejoin.OpenInRoot] followed by
|
||||
// [securejoin.Reopen].
|
||||
func OpenInRoot(root, subpath string, flags int) (*os.File, error) {
|
||||
- handle, err := pathrs.OpenInRoot(root, subpath)
|
||||
+ handle, err := retryEAGAIN(func() (*os.File, error) {
|
||||
+ return pathrs.OpenInRoot(root, subpath)
|
||||
+ })
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer handle.Close()
|
||||
- return pathrs.Reopen(handle, flags)
|
||||
+
|
||||
+ return Reopen(handle, flags)
|
||||
}
|
||||
|
||||
// CreateInRoot creates a new file inside a root (as well as any missing parent
|
||||
diff --git a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
|
||||
index 6862467c..3faee0bc 100644
|
||||
--- a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
|
||||
+++ b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
|
||||
@@ -4,7 +4,36 @@ All notable changes to this project will be documented in this file.
|
||||
The format is based on [Keep a Changelog](http://keepachangelog.com/)
|
||||
and this project adheres to [Semantic Versioning](http://semver.org/).
|
||||
|
||||
-## [Unreleased] ##
|
||||
+## [Unreleased 0.5.z] ##
|
||||
+
|
||||
+## [0.5.1] - 2025-10-31 ##
|
||||
+
|
||||
+> Spooky scary skeletons send shivers down your spine!
|
||||
+
|
||||
+### Changed ###
|
||||
+- `openat2` can return `-EAGAIN` if it detects a possible attack in certain
|
||||
+ scenarios (namely if there was a rename or mount while walking a path with a
|
||||
+ `..` component). While this is necessary to avoid a denial-of-service in the
|
||||
+ kernel, it does require retry loops in userspace.
|
||||
+
|
||||
+ In previous versions, `pathrs-lite` would retry `openat2` 32 times before
|
||||
+ returning an error, but we've received user reports that this limit can be
|
||||
+ hit on systems with very heavy load. In some synthetic benchmarks (testing
|
||||
+ the worst-case of an attacker doing renames in a tight loop on every core of
|
||||
+ a 16-core machine) we managed to get a ~3% failure rate in runc. We have
|
||||
+ improved this situation in two ways:
|
||||
+
|
||||
+ * We have now increased this limit to 128, which should be good enough for
|
||||
+ most use-cases without becoming a denial-of-service vector (the number of
|
||||
+ syscalls called by the `O_PATH` resolver in a typical case is within the
|
||||
+ same ballpark). The same benchmarks show a failure rate of ~0.12% which
|
||||
+ (while not zero) is probably sufficient for most users.
|
||||
+
|
||||
+ * In addition, we now return a `unix.EAGAIN` error that is bubbled up and can
|
||||
+ be detected by callers. This means that callers with stricter requirements
|
||||
+ to avoid spurious errors can choose to do their own infinite `EAGAIN` retry
|
||||
+ loop (though we would strongly recommend users use time-based deadlines in
|
||||
+ such retry loops to avoid potentially unbounded denials-of-service).
|
||||
|
||||
## [0.5.0] - 2025-09-26 ##
|
||||
|
||||
@@ -354,7 +383,8 @@ This is our first release of `github.com/cyphar/filepath-securejoin`,
|
||||
containing a full implementation with a coverage of 93.5% (the only missing
|
||||
cases are the error cases, which are hard to mocktest at the moment).
|
||||
|
||||
-[Unreleased]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.0...HEAD
|
||||
+[Unreleased 0.5.z]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.1...release-0.5
|
||||
+[0.5.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.0...v0.5.1
|
||||
[0.5.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.1...v0.5.0
|
||||
[0.4.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.0...v0.4.1
|
||||
[0.4.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.6...v0.4.0
|
||||
diff --git a/vendor/github.com/cyphar/filepath-securejoin/VERSION b/vendor/github.com/cyphar/filepath-securejoin/VERSION
|
||||
index 8f0916f7..4b9fcbec 100644
|
||||
--- a/vendor/github.com/cyphar/filepath-securejoin/VERSION
|
||||
+++ b/vendor/github.com/cyphar/filepath-securejoin/VERSION
|
||||
@@ -1 +1 @@
|
||||
-0.5.0
|
||||
+0.5.1
|
||||
diff --git a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
|
||||
similarity index 70%
|
||||
rename from vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go
|
||||
rename to vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
|
||||
index c26e440e..d0b200f4 100644
|
||||
--- a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go
|
||||
+++ b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
|
||||
@@ -1,5 +1,7 @@
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
+//go:build linux
|
||||
+
|
||||
// Copyright (C) 2024-2025 Aleksa Sarai <cyphar@cyphar.com>
|
||||
// Copyright (C) 2024-2025 SUSE LLC
|
||||
//
|
||||
@@ -12,15 +14,24 @@ package internal
|
||||
|
||||
import (
|
||||
"errors"
|
||||
+
|
||||
+ "golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
+type xdevErrorish struct {
|
||||
+ description string
|
||||
+}
|
||||
+
|
||||
+func (err xdevErrorish) Error() string { return err.description }
|
||||
+func (err xdevErrorish) Is(target error) bool { return target == unix.EXDEV }
|
||||
+
|
||||
var (
|
||||
// ErrPossibleAttack indicates that some attack was detected.
|
||||
- ErrPossibleAttack = errors.New("possible attack detected")
|
||||
+ ErrPossibleAttack error = xdevErrorish{"possible attack detected"}
|
||||
|
||||
// ErrPossibleBreakout indicates that during an operation we ended up in a
|
||||
// state that could be a breakout but we detected it.
|
||||
- ErrPossibleBreakout = errors.New("possible breakout detected")
|
||||
+ ErrPossibleBreakout error = xdevErrorish{"possible breakout detected"}
|
||||
|
||||
// ErrInvalidDirectory indicates an unlinked directory.
|
||||
ErrInvalidDirectory = errors.New("wandered into deleted directory")
|
||||
diff --git a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
|
||||
index 23053083..3e937fe3 100644
|
||||
--- a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
|
||||
+++ b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
|
||||
@@ -17,8 +17,6 @@ import (
|
||||
"runtime"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
-
|
||||
- "github.com/cyphar/filepath-securejoin/pathrs-lite/internal"
|
||||
)
|
||||
|
||||
func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
|
||||
@@ -34,7 +32,10 @@ func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
|
||||
(errors.Is(err, unix.EAGAIN) || errors.Is(err, unix.EXDEV))
|
||||
}
|
||||
|
||||
-const scopedLookupMaxRetries = 32
|
||||
+// This is a fairly arbitrary limit we have just to avoid an attacker being
|
||||
+// able to make us spin in an infinite retry loop -- callers can choose to
|
||||
+// retry on EAGAIN if they prefer.
|
||||
+const scopedLookupMaxRetries = 128
|
||||
|
||||
// Openat2 is an [Fd]-based wrapper around unix.Openat2, but with some retry
|
||||
// logic in case of EAGAIN errors.
|
||||
@@ -43,10 +44,10 @@ func Openat2(dir Fd, path string, how *unix.OpenHow) (*os.File, error) {
|
||||
// Make sure we always set O_CLOEXEC.
|
||||
how.Flags |= unix.O_CLOEXEC
|
||||
var tries int
|
||||
- for tries < scopedLookupMaxRetries {
|
||||
+ for {
|
||||
fd, err := unix.Openat2(dirFd, path, how)
|
||||
if err != nil {
|
||||
- if scopedLookupShouldRetry(how, err) {
|
||||
+ if scopedLookupShouldRetry(how, err) && tries < scopedLookupMaxRetries {
|
||||
// We retry a couple of times to avoid the spurious errors, and
|
||||
// if we are being attacked then returning -EAGAIN is the best
|
||||
// we can do.
|
||||
@@ -58,5 +59,4 @@ func Openat2(dir Fd, path string, how *unix.OpenHow) (*os.File, error) {
|
||||
runtime.KeepAlive(dir)
|
||||
return os.NewFile(uintptr(fd), fullPath), nil
|
||||
}
|
||||
- return nil, &os.PathError{Op: "openat2", Path: fullPath, Err: internal.ErrPossibleAttack}
|
||||
}
|
||||
diff --git a/vendor/modules.txt b/vendor/modules.txt
|
||||
index 4e7e0ef8..64524598 100644
|
||||
--- a/vendor/modules.txt
|
||||
+++ b/vendor/modules.txt
|
||||
@@ -25,7 +25,7 @@ github.com/coreos/go-systemd/v22/dbus
|
||||
# github.com/cpuguy83/go-md2man/v2 v2.0.2
|
||||
## explicit; go 1.11
|
||||
github.com/cpuguy83/go-md2man/v2/md2man
|
||||
-# github.com/cyphar/filepath-securejoin v0.5.0
|
||||
+# github.com/cyphar/filepath-securejoin v0.5.1
|
||||
## explicit; go 1.18
|
||||
github.com/cyphar/filepath-securejoin
|
||||
github.com/cyphar/filepath-securejoin/internal/consts
|
||||
--
|
||||
2.51.1
|
||||
|
||||
@ -1,161 +0,0 @@
|
||||
From c8588560cdebd80e9d1823a4a8e39172ee4650bb Mon Sep 17 00:00:00 2001
|
||||
From: Aleksa Sarai <cyphar@cyphar.com>
|
||||
Date: Fri, 7 Nov 2025 14:52:09 +1100
|
||||
Subject: [PATCH] rootfs: only set mode= for tmpfs mount if target already
|
||||
existed
|
||||
|
||||
This was always the intended behaviour but commit 72fbb34f5006 ("rootfs:
|
||||
switch to fd-based handling of mountpoint targets") regressed it when
|
||||
adding a mechanism to create a file handle to the target if it didn't
|
||||
already exist (causing the later stat to always succeed).
|
||||
|
||||
A lot of people depend on this functionality, so add some tests to make
|
||||
sure we don't break it in the future.
|
||||
|
||||
Fixes: 72fbb34f5006 ("rootfs: switch to fd-based handling of mountpoint targets")
|
||||
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
|
||||
(cherry picked from commit 9a9719eeb4978e73c64740b3fc796c1b12987b05)
|
||||
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
|
||||
---
|
||||
libcontainer/rootfs_linux.go | 25 ++++++-----
|
||||
tests/integration/mounts.bats | 81 +++++++++++++++++++++++++++++++++++
|
||||
2 files changed, 93 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
|
||||
index 204e6a80..ab5a260d 100644
|
||||
--- a/libcontainer/rootfs_linux.go
|
||||
+++ b/libcontainer/rootfs_linux.go
|
||||
@@ -511,6 +511,18 @@ func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) {
|
||||
_ = dstFile.Close()
|
||||
}
|
||||
}()
|
||||
+ if err == nil && m.Device == "tmpfs" {
|
||||
+ // If the original target exists, copy the mode for the tmpfs mount.
|
||||
+ stat, err := dstFile.Stat()
|
||||
+ if err != nil {
|
||||
+ return fmt.Errorf("check tmpfs source mode: %w", err)
|
||||
+ }
|
||||
+ dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
|
||||
+ if m.Data != "" {
|
||||
+ dt = dt + "," + m.Data
|
||||
+ }
|
||||
+ m.Data = dt
|
||||
+ }
|
||||
if err != nil {
|
||||
if !errors.Is(err, unix.ENOENT) {
|
||||
return fmt.Errorf("lookup mountpoint target: %w", err)
|
||||
@@ -551,19 +563,6 @@ func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) {
|
||||
}
|
||||
}
|
||||
|
||||
- if m.Device == "tmpfs" {
|
||||
- // If the original target exists, copy the mode for the tmpfs mount.
|
||||
- stat, err := dstFile.Stat()
|
||||
- if err != nil {
|
||||
- return fmt.Errorf("check tmpfs source mode: %w", err)
|
||||
- }
|
||||
- dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
|
||||
- if m.Data != "" {
|
||||
- dt = dt + "," + m.Data
|
||||
- }
|
||||
- m.Data = dt
|
||||
- }
|
||||
-
|
||||
dstFullPath, err := procfs.ProcSelfFdReadlink(dstFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("get mount destination real path: %w", err)
|
||||
diff --git a/tests/integration/mounts.bats b/tests/integration/mounts.bats
|
||||
index 11fb2cfc..b60c88ae 100644
|
||||
--- a/tests/integration/mounts.bats
|
||||
+++ b/tests/integration/mounts.bats
|
||||
@@ -234,6 +234,87 @@ function test_mount_order() {
|
||||
[[ "$(stat -c %a rootfs/setgid/a/b/c)" == 2755 ]]
|
||||
}
|
||||
|
||||
+# https://github.com/opencontainers/runc/issues/4971
|
||||
+@test "runc run [tmpfs mount mode= inherit]" {
|
||||
+ mkdir rootfs/tmpfs
|
||||
+ chmod "=0710" rootfs/tmpfs
|
||||
+
|
||||
+ update_config '.mounts += [{
|
||||
+ type: "tmpfs",
|
||||
+ source: "tmpfs",
|
||||
+ destination: "/tmpfs",
|
||||
+ options: ["rw", "nodev", "nosuid"]
|
||||
+ }]'
|
||||
+ update_config '.process.args = ["stat", "-c", "%a", "/tmpfs"]'
|
||||
+
|
||||
+ runc run test_busybox
|
||||
+ [ "$status" -eq 0 ]
|
||||
+ [[ "$output" == "710" ]]
|
||||
+
|
||||
+ update_config '.process.args = ["cat", "/proc/self/mounts"]'
|
||||
+ runc run test_busybox
|
||||
+ [ "$status" -eq 0 ]
|
||||
+ grep -Ex "tmpfs /tmpfs tmpfs [^ ]*\bmode=710\b[^ ]* .*" <<<"$output"
|
||||
+}
|
||||
+
|
||||
+# https://github.com/opencontainers/runc/issues/4971
|
||||
+@test "runc run [tmpfs mount explicit mode=]" {
|
||||
+ mkdir rootfs/tmpfs
|
||||
+ chmod "=0710" rootfs/tmpfs
|
||||
+
|
||||
+ update_config '.mounts += [{
|
||||
+ type: "tmpfs",
|
||||
+ source: "tmpfs",
|
||||
+ destination: "/tmpfs",
|
||||
+ options: ["rw", "nodev", "nosuid", "mode=1500"]
|
||||
+ }]'
|
||||
+ update_config '.process.args = ["stat", "-c", "%a", "/tmpfs"]'
|
||||
+
|
||||
+ # Explicitly setting mode= overrides whatever mode we would've inherited.
|
||||
+ runc run test_busybox
|
||||
+ [ "$status" -eq 0 ]
|
||||
+ [[ "$output" == "1500" ]]
|
||||
+
|
||||
+ update_config '.process.args = ["cat", "/proc/self/mounts"]'
|
||||
+ runc run test_busybox
|
||||
+ [ "$status" -eq 0 ]
|
||||
+ grep -Ex "tmpfs /tmpfs tmpfs [^ ]*\bmode=1500\b[^ ]* .*" <<<"$output"
|
||||
+
|
||||
+ # Verify that the actual directory was not chmod-ed.
|
||||
+ [[ "$(stat -c %a rootfs/tmpfs)" == 710 ]]
|
||||
+}
|
||||
+
|
||||
+# https://github.com/opencontainers/runc/issues/4971
|
||||
+@test "runc run [tmpfs mount mode=1777 default]" {
|
||||
+ update_config '.mounts += [{
|
||||
+ type: "tmpfs",
|
||||
+ source: "tmpfs",
|
||||
+ destination: "/non-existent/foo/bar/baz",
|
||||
+ options: ["rw", "nodev", "nosuid"]
|
||||
+ }]'
|
||||
+ update_config '.process.args = ["stat", "-c", "%a", "/non-existent/foo/bar/baz"]'
|
||||
+
|
||||
+ rm -rf rootfs/non-existent
|
||||
+ runc run test_busybox
|
||||
+ [ "$status" -eq 0 ]
|
||||
+ [[ "$output" == "1777" ]]
|
||||
+
|
||||
+ update_config '.process.args = ["cat", "/proc/self/mounts"]'
|
||||
+
|
||||
+ rm -rf rootfs/non-existent
|
||||
+ runc run test_busybox
|
||||
+ [ "$status" -eq 0 ]
|
||||
+ # We don't explicitly set a mode= in this case, it is just the tmpfs default.
|
||||
+ grep -Ex "tmpfs /non-existent/foo/bar/baz tmpfs .*" <<<"$output"
|
||||
+ run ! grep -Ex "tmpfs /non-existent/foo/bar/baz tmpfs [^ ]*\bmode=[0-7]+\b[^ ]* .*" <<<"$output"
|
||||
+
|
||||
+ # Verify that the actual modes are *not* 1777.
|
||||
+ [[ "$(stat -c %a rootfs/non-existent)" == 755 ]]
|
||||
+ [[ "$(stat -c %a rootfs/non-existent/foo)" == 755 ]]
|
||||
+ [[ "$(stat -c %a rootfs/non-existent/foo/bar)" == 755 ]]
|
||||
+ [[ "$(stat -c %a rootfs/non-existent/foo/bar/baz)" == 755 ]]
|
||||
+}
|
||||
+
|
||||
@test "runc run [ro /sys/fs/cgroup mounts]" {
|
||||
# Without cgroup namespace.
|
||||
update_config '.linux.namespaces -= [{"type": "cgroup"}]'
|
||||
--
|
||||
2.51.1
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,103 +0,0 @@
|
||||
From c6dad73d617864f3a281ac1fdaacd5ed971fa317 Mon Sep 17 00:00:00 2001
|
||||
From: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||
Date: Thu, 27 Jun 2024 09:00:51 -0700
|
||||
Subject: [PATCH 1/2] Bump runtime-spec to latest git HEAD
|
||||
|
||||
This is to include
|
||||
- https://github.com/opencontainers/runtime-spec/pull/1261
|
||||
- https://github.com/opencontainers/runtime-spec/pull/1253
|
||||
|
||||
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||
(cherry picked from commit 2cac22b1e29e6be4c004f35ce582aa2b7e1c2fda)
|
||||
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||
---
|
||||
go.mod | 2 +-
|
||||
go.sum | 4 ++--
|
||||
.../opencontainers/runtime-spec/specs-go/config.go | 8 ++++++++
|
||||
.../opencontainers/runtime-spec/specs-go/version.go | 2 +-
|
||||
vendor/modules.txt | 2 +-
|
||||
5 files changed, 13 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/go.mod b/go.mod
|
||||
index 348bc9c6..db2d7ef1 100644
|
||||
--- a/go.mod
|
||||
+++ b/go.mod
|
||||
@@ -19,7 +19,7 @@ require (
|
||||
github.com/moby/sys/user v0.3.0
|
||||
github.com/moby/sys/userns v0.1.0
|
||||
github.com/mrunalp/fileutils v0.5.1
|
||||
- github.com/opencontainers/runtime-spec v1.2.0
|
||||
+ github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
|
||||
github.com/opencontainers/selinux v1.11.0
|
||||
github.com/seccomp/libseccomp-golang v0.10.0
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
diff --git a/go.sum b/go.sum
|
||||
index 225d5860..4c863cc9 100644
|
||||
--- a/go.sum
|
||||
+++ b/go.sum
|
||||
@@ -46,8 +46,8 @@ github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g
|
||||
github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28=
|
||||
github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
|
||||
github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
|
||||
-github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
|
||||
-github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
|
||||
+github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95 h1:Ghl8Z3l+yPQUDSxAp7Kg7fJLRNNXjOsR6ooDcca7PjU=
|
||||
+github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
|
||||
github.com/opencontainers/selinux v1.11.0 h1:+5Zbo97w3Lbmb3PeqQtpmTkMwsW5nRI3YaLpt7tQ7oU=
|
||||
github.com/opencontainers/selinux v1.11.0/go.mod h1:E5dMC3VPuVvVHDYmi78qvhJp8+M586T4DlDRYpFkyec=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
|
||||
index d1236ba7..671f0d01 100644
|
||||
--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
|
||||
+++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
|
||||
@@ -94,6 +94,8 @@ type Process struct {
|
||||
SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
|
||||
// IOPriority contains the I/O priority settings for the cgroup.
|
||||
IOPriority *LinuxIOPriority `json:"ioPriority,omitempty" platform:"linux"`
|
||||
+ // ExecCPUAffinity specifies CPU affinity for exec processes.
|
||||
+ ExecCPUAffinity *CPUAffinity `json:"execCPUAffinity,omitempty" platform:"linux"`
|
||||
}
|
||||
|
||||
// LinuxCapabilities specifies the list of allowed capabilities that are kept for a process.
|
||||
@@ -127,6 +129,12 @@ const (
|
||||
IOPRIO_CLASS_IDLE IOPriorityClass = "IOPRIO_CLASS_IDLE"
|
||||
)
|
||||
|
||||
+// CPUAffinity specifies process' CPU affinity.
|
||||
+type CPUAffinity struct {
|
||||
+ Initial string `json:"initial,omitempty"`
|
||||
+ Final string `json:"final,omitempty"`
|
||||
+}
|
||||
+
|
||||
// Box specifies dimensions of a rectangle. Used for specifying the size of a console.
|
||||
type Box struct {
|
||||
// Height is the vertical dimension of a box.
|
||||
diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
|
||||
index 503971e0..f6c15f6c 100644
|
||||
--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
|
||||
+++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
|
||||
@@ -11,7 +11,7 @@ const (
|
||||
VersionPatch = 0
|
||||
|
||||
// VersionDev indicates development branch. Releases will be empty string.
|
||||
- VersionDev = ""
|
||||
+ VersionDev = "+dev"
|
||||
)
|
||||
|
||||
// Version is the specification version that the package types support.
|
||||
diff --git a/vendor/modules.txt b/vendor/modules.txt
|
||||
index 3b245e0d..df520923 100644
|
||||
--- a/vendor/modules.txt
|
||||
+++ b/vendor/modules.txt
|
||||
@@ -46,7 +46,7 @@ github.com/moby/sys/userns
|
||||
# github.com/mrunalp/fileutils v0.5.1
|
||||
## explicit; go 1.13
|
||||
github.com/mrunalp/fileutils
|
||||
-# github.com/opencontainers/runtime-spec v1.2.0
|
||||
+# github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
|
||||
## explicit
|
||||
github.com/opencontainers/runtime-spec/specs-go
|
||||
github.com/opencontainers/runtime-spec/specs-go/features
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -1,49 +0,0 @@
|
||||
From e949092d469c3ee3ea9bf1002649b6a692895da9 Mon Sep 17 00:00:00 2001
|
||||
From: Aleksa Sarai <cyphar@cyphar.com>
|
||||
Date: Wed, 5 Nov 2025 02:04:02 +1100
|
||||
Subject: [PATCH 2/2] [1.2] rootfs: re-allow dangling symlinks in mount targets
|
||||
|
||||
It seems there are a fair few images where dangling symlinks are used as
|
||||
path components for mount targets, which pathrs-lite does not support
|
||||
(and it would be difficult to fully support this in a race-free way).
|
||||
|
||||
This was actually meant to be blocked by commit 63c2908164f3 ("rootfs:
|
||||
try to scope MkdirAll to stay inside the rootfs"), followed by commit
|
||||
dd827f7b715a ("utils: switch to securejoin.MkdirAllHandle"). However, we
|
||||
still used SecureJoin to construct mountpoint targets, which means that
|
||||
dangling symlinks were "resolved" before reaching pathrs-lite.
|
||||
|
||||
This patch basically re-adds this hack in order to reduce the breakages
|
||||
we've seen so far.
|
||||
|
||||
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
|
||||
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||
---
|
||||
libcontainer/rootfs_linux.go | 11 +++++++++++
|
||||
1 file changed, 11 insertions(+)
|
||||
|
||||
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
|
||||
index 377642c9..6ea7cd47 100644
|
||||
--- a/libcontainer/rootfs_linux.go
|
||||
+++ b/libcontainer/rootfs_linux.go
|
||||
@@ -518,6 +518,17 @@ func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) {
|
||||
dstIsFile = !fi.IsDir()
|
||||
}
|
||||
|
||||
+ // In previous runc versions, we would tolerate nonsense paths with
|
||||
+ // dangling symlinks as path components. pathrs-lite does not support
|
||||
+ // this, so instead we have to emulate this behaviour by doing
|
||||
+ // SecureJoin *purely to get a semi-reasonable path to use* and then we
|
||||
+ // use pathrs-lite to operate on the path safely.
|
||||
+ newUnsafePath, err := securejoin.SecureJoin(rootfs, unsafePath)
|
||||
+ if err != nil {
|
||||
+ return err
|
||||
+ }
|
||||
+ unsafePath = utils.StripRoot(rootfs, newUnsafePath)
|
||||
+
|
||||
if dstIsFile {
|
||||
dstFile, err = pathrs.CreateInRoot(rootfs, unsafePath, unix.O_CREAT|unix.O_EXCL|unix.O_NOFOLLOW, 0o644)
|
||||
} else {
|
||||
--
|
||||
2.51.1
|
||||
|
||||
@ -1,521 +0,0 @@
|
||||
From 73786942b7176eae1e676cf2f78af548f090e418 Mon Sep 17 00:00:00 2001
|
||||
From: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||
Date: Mon, 21 Oct 2024 15:50:38 -0700
|
||||
Subject: [PATCH 2/2] runc exec: implement CPU affinity
|
||||
|
||||
As per
|
||||
- https://github.com/opencontainers/runtime-spec/pull/1253
|
||||
- https://github.com/opencontainers/runtime-spec/pull/1261
|
||||
|
||||
CPU affinity can be set in two ways:
|
||||
1. When creating/starting a container, in config.json's
|
||||
Process.ExecCPUAffinity, which is then applied to all execs.
|
||||
2. When running an exec, in process.json's ExecCPUAffinity, which
|
||||
is applied to a given exec and overrides the value from (1).
|
||||
|
||||
Add some basic tests.
|
||||
|
||||
Note that older kernels (RHEL8, Ubuntu 20.04) change CPU affinity of a
|
||||
process to that of a container's cgroup, as soon as it is moved to that
|
||||
cgroup, while newer kernels (Ubuntu 24.04, Fedora 41) don't do that.
|
||||
|
||||
Because of the above,
|
||||
- it's impossible to really test initial CPU affinity without adding
|
||||
debug logging to libcontainer/nsenter;
|
||||
- for older kernels, there can be a brief moment when exec's affinity
|
||||
is different than either initial or final affinity being set;
|
||||
- exec's final CPU affinity, if not specified, can be different
|
||||
depending on the kernel, therefore we don't test it.
|
||||
|
||||
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||
(cherry picked from commit 57237b31de367a722c5d49088912d57c28c6fb46)
|
||||
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
|
||||
---
|
||||
libcontainer/configs/config.go | 72 ++++++++++++++++++++
|
||||
libcontainer/container_linux.go | 4 ++
|
||||
libcontainer/init_linux.go | 3 +-
|
||||
libcontainer/nsenter/log.c | 9 ++-
|
||||
libcontainer/nsenter/log.h | 3 +
|
||||
libcontainer/nsenter/nsexec.c | 29 ++++++++
|
||||
libcontainer/process.go | 2 +
|
||||
libcontainer/process_linux.go | 49 +++++++++++++-
|
||||
libcontainer/specconv/spec_linux.go | 5 ++
|
||||
tests/integration/cpu_affinity.bats | 101 ++++++++++++++++++++++++++++
|
||||
utils_linux.go | 6 ++
|
||||
11 files changed, 277 insertions(+), 6 deletions(-)
|
||||
create mode 100644 tests/integration/cpu_affinity.bats
|
||||
|
||||
diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go
|
||||
index 22fe0f9b..daffd130 100644
|
||||
--- a/libcontainer/configs/config.go
|
||||
+++ b/libcontainer/configs/config.go
|
||||
@@ -3,8 +3,11 @@ package configs
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
+ "errors"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
+ "strconv"
|
||||
+ "strings"
|
||||
"time"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
@@ -225,6 +228,9 @@ type Config struct {
|
||||
|
||||
// IOPriority is the container's I/O priority.
|
||||
IOPriority *IOPriority `json:"io_priority,omitempty"`
|
||||
+
|
||||
+ // ExecCPUAffinity is CPU affinity for a non-init process to be run in the container.
|
||||
+ ExecCPUAffinity *CPUAffinity `json:"exec_cpu_affinity,omitempty"`
|
||||
}
|
||||
|
||||
// Scheduler is based on the Linux sched_setattr(2) syscall.
|
||||
@@ -294,6 +300,72 @@ var IOPrioClassMapping = map[specs.IOPriorityClass]int{
|
||||
|
||||
type IOPriority = specs.LinuxIOPriority
|
||||
|
||||
+type CPUAffinity struct {
|
||||
+ Initial, Final *unix.CPUSet
|
||||
+}
|
||||
+
|
||||
+func toCPUSet(str string) (*unix.CPUSet, error) {
|
||||
+ if str == "" {
|
||||
+ return nil, nil
|
||||
+ }
|
||||
+ s := new(unix.CPUSet)
|
||||
+ for _, r := range strings.Split(str, ",") {
|
||||
+ // Allow extra spaces around.
|
||||
+ r = strings.TrimSpace(r)
|
||||
+ // Allow empty elements (extra commas).
|
||||
+ if r == "" {
|
||||
+ continue
|
||||
+ }
|
||||
+ if r0, r1, found := strings.Cut(r, "-"); found {
|
||||
+ start, err := strconv.ParseUint(r0, 10, 32)
|
||||
+ if err != nil {
|
||||
+ return nil, err
|
||||
+ }
|
||||
+ end, err := strconv.ParseUint(r1, 10, 32)
|
||||
+ if err != nil {
|
||||
+ return nil, err
|
||||
+ }
|
||||
+ if start > end {
|
||||
+ return nil, errors.New("invalid range: " + r)
|
||||
+ }
|
||||
+ for i := int(start); i <= int(end); i++ {
|
||||
+ s.Set(i)
|
||||
+ }
|
||||
+ } else {
|
||||
+ val, err := strconv.ParseUint(r, 10, 32)
|
||||
+ if err != nil {
|
||||
+ return nil, err
|
||||
+ }
|
||||
+ s.Set(int(val))
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return s, nil
|
||||
+}
|
||||
+
|
||||
+// ConvertCPUAffinity converts [specs.CPUAffinity] to [CPUAffinity].
|
||||
+func ConvertCPUAffinity(sa *specs.CPUAffinity) (*CPUAffinity, error) {
|
||||
+ if sa == nil {
|
||||
+ return nil, nil
|
||||
+ }
|
||||
+ initial, err := toCPUSet(sa.Initial)
|
||||
+ if err != nil {
|
||||
+ return nil, fmt.Errorf("bad CPUAffinity.Initial: %w", err)
|
||||
+ }
|
||||
+ final, err := toCPUSet(sa.Final)
|
||||
+ if err != nil {
|
||||
+ return nil, fmt.Errorf("bad CPUAffinity.Final: %w", err)
|
||||
+ }
|
||||
+ if initial == nil && final == nil {
|
||||
+ return nil, nil
|
||||
+ }
|
||||
+
|
||||
+ return &CPUAffinity{
|
||||
+ Initial: initial,
|
||||
+ Final: final,
|
||||
+ }, nil
|
||||
+}
|
||||
+
|
||||
type (
|
||||
HookName string
|
||||
HookList []Hook
|
||||
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
|
||||
index c0211617..1fc590a5 100644
|
||||
--- a/libcontainer/container_linux.go
|
||||
+++ b/libcontainer/container_linux.go
|
||||
@@ -692,6 +692,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
|
||||
AppArmorProfile: c.config.AppArmorProfile,
|
||||
ProcessLabel: c.config.ProcessLabel,
|
||||
Rlimits: c.config.Rlimits,
|
||||
+ CPUAffinity: c.config.ExecCPUAffinity,
|
||||
CreateConsole: process.ConsoleSocket != nil,
|
||||
ConsoleWidth: process.ConsoleWidth,
|
||||
ConsoleHeight: process.ConsoleHeight,
|
||||
@@ -708,6 +709,9 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
|
||||
if len(process.Rlimits) > 0 {
|
||||
cfg.Rlimits = process.Rlimits
|
||||
}
|
||||
+ if process.CPUAffinity != nil {
|
||||
+ cfg.CPUAffinity = process.CPUAffinity
|
||||
+ }
|
||||
if cgroups.IsCgroup2UnifiedMode() {
|
||||
cfg.Cgroup2Path = c.cgroupManager.Path("")
|
||||
}
|
||||
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
|
||||
index 1eb0279d..eddbfba6 100644
|
||||
--- a/libcontainer/init_linux.go
|
||||
+++ b/libcontainer/init_linux.go
|
||||
@@ -72,6 +72,7 @@ type initConfig struct {
|
||||
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
|
||||
SpecState *specs.State `json:"spec_state,omitempty"`
|
||||
Cgroup2Path string `json:"cgroup2_path,omitempty"`
|
||||
+ CPUAffinity *configs.CPUAffinity `json:"cpu_affinity,omitempty"`
|
||||
}
|
||||
|
||||
// Init is part of "runc init" implementation.
|
||||
@@ -151,7 +152,7 @@ func startInitialization() (retErr error) {
|
||||
|
||||
logrus.SetOutput(logPipe)
|
||||
logrus.SetFormatter(new(logrus.JSONFormatter))
|
||||
- logrus.Debug("child process in init()")
|
||||
+ logrus.Debugf("child process in init()")
|
||||
|
||||
// Only init processes have FIFOFD.
|
||||
var fifoFile *os.File
|
||||
diff --git a/libcontainer/nsenter/log.c b/libcontainer/nsenter/log.c
|
||||
index 086b5398..72774cb0 100644
|
||||
--- a/libcontainer/nsenter/log.c
|
||||
+++ b/libcontainer/nsenter/log.c
|
||||
@@ -31,6 +31,11 @@ void setup_logpipe(void)
|
||||
loglevel = i;
|
||||
}
|
||||
|
||||
+bool log_enabled_for(int level)
|
||||
+{
|
||||
+ return (logfd >= 0 && level <= loglevel);
|
||||
+}
|
||||
+
|
||||
/* Defined in nsexec.c */
|
||||
extern int current_stage;
|
||||
|
||||
@@ -40,8 +45,8 @@ void write_log(int level, const char *format, ...)
|
||||
va_list args;
|
||||
int ret;
|
||||
|
||||
- if (logfd < 0 || level > loglevel)
|
||||
- goto out;
|
||||
+ if (!log_enabled_for(level))
|
||||
+ return;
|
||||
|
||||
va_start(args, format);
|
||||
ret = vasprintf(&message, format, args);
|
||||
diff --git a/libcontainer/nsenter/log.h b/libcontainer/nsenter/log.h
|
||||
index 1fe95a11..3e18de68 100644
|
||||
--- a/libcontainer/nsenter/log.h
|
||||
+++ b/libcontainer/nsenter/log.h
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef NSENTER_LOG_H
|
||||
#define NSENTER_LOG_H
|
||||
|
||||
+#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/*
|
||||
@@ -20,6 +21,8 @@
|
||||
*/
|
||||
void setup_logpipe(void);
|
||||
|
||||
+bool log_enabled_for(int level);
|
||||
+
|
||||
void write_log(int level, const char *format, ...) __attribute__((format(printf, 2, 3)));
|
||||
|
||||
extern int logfd;
|
||||
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
|
||||
index 565b2ca2..aa4976d6 100644
|
||||
--- a/libcontainer/nsenter/nsexec.c
|
||||
+++ b/libcontainer/nsenter/nsexec.c
|
||||
@@ -558,6 +558,25 @@ static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
|
||||
bail("failed to update /proc/%d/timens_offsets", pid);
|
||||
}
|
||||
|
||||
+void print_cpu_affinity()
|
||||
+{
|
||||
+ cpu_set_t cpus = { };
|
||||
+ size_t i, mask = 0;
|
||||
+
|
||||
+ if (sched_getaffinity(0, sizeof(cpus), &cpus) < 0) {
|
||||
+ write_log(WARNING, "sched_getaffinity: %m");
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ /* Do not print the complete mask, we only need a few first CPUs. */
|
||||
+ for (i = 0; i < sizeof(mask) * 8; i++) {
|
||||
+ if (CPU_ISSET(i, &cpus))
|
||||
+ mask |= 1 << i;
|
||||
+ }
|
||||
+
|
||||
+ write_log(DEBUG, "affinity: 0x%zx", mask);
|
||||
+}
|
||||
+
|
||||
void nsexec(void)
|
||||
{
|
||||
int pipenum;
|
||||
@@ -584,6 +603,16 @@ void nsexec(void)
|
||||
|
||||
write_log(DEBUG, "=> nsexec container setup");
|
||||
|
||||
+ /* This is for ../../tests/integration/cpu_affinity.bats test only.
|
||||
+ *
|
||||
+ * Printing this from Go code might be too late as some kernels
|
||||
+ * change the process' CPU affinity to that of container's cpuset
|
||||
+ * as soon as the process is moved into container's cgroup.
|
||||
+ */
|
||||
+ if (log_enabled_for(DEBUG)) {
|
||||
+ print_cpu_affinity();
|
||||
+ }
|
||||
+
|
||||
/* Parse all of the netlink configuration. */
|
||||
nl_parse(pipenum, &config);
|
||||
|
||||
diff --git a/libcontainer/process.go b/libcontainer/process.go
|
||||
index 114b3f2b..5339583f 100644
|
||||
--- a/libcontainer/process.go
|
||||
+++ b/libcontainer/process.go
|
||||
@@ -102,6 +102,8 @@ type Process struct {
|
||||
Scheduler *configs.Scheduler
|
||||
|
||||
IOPriority *configs.IOPriority
|
||||
+
|
||||
+ CPUAffinity *configs.CPUAffinity
|
||||
}
|
||||
|
||||
// Wait waits for the process to exit.
|
||||
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
|
||||
index fcbb54a3..477c8a77 100644
|
||||
--- a/libcontainer/process_linux.go
|
||||
+++ b/libcontainer/process_linux.go
|
||||
@@ -122,6 +122,46 @@ func (p *setnsProcess) signal(sig os.Signal) error {
|
||||
return unix.Kill(p.pid(), s)
|
||||
}
|
||||
|
||||
+// Starts setns process with specified initial CPU affinity.
|
||||
+func (p *setnsProcess) startWithCPUAffinity() error {
|
||||
+ aff := p.config.CPUAffinity
|
||||
+ if aff == nil || aff.Initial == nil {
|
||||
+ return p.cmd.Start()
|
||||
+ }
|
||||
+ errCh := make(chan error)
|
||||
+ defer close(errCh)
|
||||
+
|
||||
+ // Use a goroutine to dedicate an OS thread.
|
||||
+ go func() {
|
||||
+ runtime.LockOSThread()
|
||||
+ // Command inherits the CPU affinity.
|
||||
+ if err := unix.SchedSetaffinity(unix.Gettid(), aff.Initial); err != nil {
|
||||
+ runtime.UnlockOSThread()
|
||||
+ errCh <- fmt.Errorf("error setting initial CPU affinity: %w", err)
|
||||
+ return
|
||||
+ }
|
||||
+
|
||||
+ errCh <- p.cmd.Start()
|
||||
+ // Deliberately omit runtime.UnlockOSThread here.
|
||||
+ // https://pkg.go.dev/runtime#LockOSThread says:
|
||||
+ // "If the calling goroutine exits without unlocking the
|
||||
+ // thread, the thread will be terminated".
|
||||
+ }()
|
||||
+
|
||||
+ return <-errCh
|
||||
+}
|
||||
+
|
||||
+func (p *setnsProcess) setFinalCPUAffinity() error {
|
||||
+ aff := p.config.CPUAffinity
|
||||
+ if aff == nil || aff.Final == nil {
|
||||
+ return nil
|
||||
+ }
|
||||
+ if err := unix.SchedSetaffinity(p.pid(), aff.Final); err != nil {
|
||||
+ return fmt.Errorf("error setting final CPU affinity: %w", err)
|
||||
+ }
|
||||
+ return nil
|
||||
+}
|
||||
+
|
||||
func (p *setnsProcess) start() (retErr error) {
|
||||
defer p.comm.closeParent()
|
||||
|
||||
@@ -133,8 +173,8 @@ func (p *setnsProcess) start() (retErr error) {
|
||||
|
||||
// get the "before" value of oom kill count
|
||||
oom, _ := p.manager.OOMKillCount()
|
||||
- err := p.cmd.Start()
|
||||
- // close the child-side of the pipes (controlled by child)
|
||||
+ err := p.startWithCPUAffinity()
|
||||
+ // Close the child-side of the pipes (controlled by child).
|
||||
p.comm.closeChild()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error starting setns process: %w", err)
|
||||
@@ -184,6 +224,10 @@ func (p *setnsProcess) start() (retErr error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
+ // Set final CPU affinity right after the process is moved into container's cgroup.
|
||||
+ if err := p.setFinalCPUAffinity(); err != nil {
|
||||
+ return err
|
||||
+ }
|
||||
if p.intelRdtPath != "" {
|
||||
// if Intel RDT "resource control" filesystem path exists
|
||||
_, err := os.Stat(p.intelRdtPath)
|
||||
@@ -193,7 +237,6 @@ func (p *setnsProcess) start() (retErr error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
-
|
||||
if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
|
||||
return fmt.Errorf("error writing config to pipe: %w", err)
|
||||
}
|
||||
diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
|
||||
index 95ada499..2d0db342 100644
|
||||
--- a/libcontainer/specconv/spec_linux.go
|
||||
+++ b/libcontainer/specconv/spec_linux.go
|
||||
@@ -556,6 +556,11 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
|
||||
ioPriority := *spec.Process.IOPriority
|
||||
config.IOPriority = &ioPriority
|
||||
}
|
||||
+ config.ExecCPUAffinity, err = configs.ConvertCPUAffinity(spec.Process.ExecCPUAffinity)
|
||||
+ if err != nil {
|
||||
+ return nil, err
|
||||
+ }
|
||||
+
|
||||
}
|
||||
createHooks(spec, config)
|
||||
config.Version = specs.Version
|
||||
diff --git a/tests/integration/cpu_affinity.bats b/tests/integration/cpu_affinity.bats
|
||||
new file mode 100644
|
||||
index 00000000..f6adfa2a
|
||||
--- /dev/null
|
||||
+++ b/tests/integration/cpu_affinity.bats
|
||||
@@ -0,0 +1,101 @@
|
||||
+#!/usr/bin/env bats
|
||||
+# Exec CPU affinity tests. For more details, see:
|
||||
+# - https://github.com/opencontainers/runtime-spec/pull/1253
|
||||
+
|
||||
+load helpers
|
||||
+
|
||||
+function setup() {
|
||||
+ requires smp cgroups_cpuset
|
||||
+ setup_busybox
|
||||
+}
|
||||
+
|
||||
+function teardown() {
|
||||
+ teardown_bundle
|
||||
+}
|
||||
+
|
||||
+function first_cpu() {
|
||||
+ sed 's/[-,].*//g' </sys/devices/system/cpu/online
|
||||
+}
|
||||
+
|
||||
+# Convert list of cpus ("0,1" or "0-1") to mask as printed by nsexec.
|
||||
+# NOTE the range conversion is not proper, merely sufficient for tests here.
|
||||
+function cpus_to_mask() {
|
||||
+ local cpus=$* mask=0
|
||||
+
|
||||
+ cpus=${cpus//,/-} # 1. "," --> "-".
|
||||
+ cpus=${cpus//-/ } # 2. "-" --> " ".
|
||||
+
|
||||
+ for c in $cpus; do
|
||||
+ mask=$((mask | 1 << c))
|
||||
+ done
|
||||
+
|
||||
+ printf "0x%x" $mask
|
||||
+}
|
||||
+
|
||||
+@test "runc exec [CPU affinity, only initial set from process.json]" {
|
||||
+ first="$(first_cpu)"
|
||||
+ second=$((first + 1)) # Hacky; might not work in all environments.
|
||||
+
|
||||
+ runc run -d --console-socket "$CONSOLE_SOCKET" ct1
|
||||
+ [ "$status" -eq 0 ]
|
||||
+
|
||||
+ for cpus in "$second" "$first-$second" "$first,$second" "$first"; do
|
||||
+ proc='
|
||||
+{
|
||||
+ "terminal": false,
|
||||
+ "execCPUAffinity": {
|
||||
+ "initial": "'$cpus'"
|
||||
+ },
|
||||
+ "args": [ "/bin/true" ],
|
||||
+ "cwd": "/"
|
||||
+}'
|
||||
+ mask=$(cpus_to_mask "$cpus")
|
||||
+ echo "CPUS: $cpus, mask: $mask"
|
||||
+ runc --debug exec --process <(echo "$proc") ct1
|
||||
+ [[ "$output" == *"nsexec"*": affinity: $mask"* ]]
|
||||
+ done
|
||||
+}
|
||||
+
|
||||
+@test "runc exec [CPU affinity, initial and final set from process.json]" {
|
||||
+ first="$(first_cpu)"
|
||||
+ second=$((first + 1)) # Hacky; might not work in all environments.
|
||||
+
|
||||
+ runc run -d --console-socket "$CONSOLE_SOCKET" ct1
|
||||
+ [ "$status" -eq 0 ]
|
||||
+
|
||||
+ for cpus in "$second" "$first-$second" "$first,$second" "$first"; do
|
||||
+ proc='
|
||||
+{
|
||||
+ "terminal": false,
|
||||
+ "execCPUAffinity": {
|
||||
+ "initial": "'$cpus'",
|
||||
+ "final": "'$cpus'"
|
||||
+ },
|
||||
+ "args": [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ],
|
||||
+ "cwd": "/"
|
||||
+}'
|
||||
+ mask=$(cpus_to_mask "$cpus")
|
||||
+ exp=${cpus//,/-} # "," --> "-".
|
||||
+ echo "CPUS: $cpus, mask: $mask, final: $exp"
|
||||
+ runc --debug exec --process <(echo "$proc") ct1
|
||||
+ [[ "$output" == *"nsexec"*": affinity: $mask"* ]]
|
||||
+ [[ "$output" == *"Cpus_allowed_list: $exp"* ]] # Mind the literal tab.
|
||||
+ done
|
||||
+}
|
||||
+
|
||||
+@test "runc exec [CPU affinity, initial and final set from config.json]" {
|
||||
+ initial="$(first_cpu)"
|
||||
+ final=$((initial + 1)) # Hacky; might not work in all environments.
|
||||
+
|
||||
+ update_config " .process.execCPUAffinity.initial = \"$initial\"
|
||||
+ | .process.execCPUAffinity.final = \"$final\""
|
||||
+
|
||||
+ runc run -d --console-socket "$CONSOLE_SOCKET" ct1
|
||||
+ [ "$status" -eq 0 ]
|
||||
+
|
||||
+ runc --debug exec ct1 grep "Cpus_allowed_list:" /proc/self/status
|
||||
+ [ "$status" -eq 0 ]
|
||||
+ mask=$(cpus_to_mask "$initial")
|
||||
+ [[ "$output" == *"nsexec"*": affinity: $mask"* ]]
|
||||
+ [[ "$output" == *"Cpus_allowed_list: $final"* ]] # Mind the literal tab.
|
||||
+}
|
||||
diff --git a/utils_linux.go b/utils_linux.go
|
||||
index feb6ef80..013dbcf4 100644
|
||||
--- a/utils_linux.go
|
||||
+++ b/utils_linux.go
|
||||
@@ -90,6 +90,12 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) {
|
||||
}
|
||||
lp.Rlimits = append(lp.Rlimits, rl)
|
||||
}
|
||||
+ aff, err := configs.ConvertCPUAffinity(p.ExecCPUAffinity)
|
||||
+ if err != nil {
|
||||
+ return nil, err
|
||||
+ }
|
||||
+ lp.CPUAffinity = aff
|
||||
+
|
||||
return lp, nil
|
||||
}
|
||||
|
||||
--
|
||||
2.47.1
|
||||
|
||||
@ -19,7 +19,7 @@ go build -buildmode pie -compiler gc -tags="rpm_crashtraceback libtrust_openssl
|
||||
|
||||
Epoch: 4
|
||||
Name: %{repo}
|
||||
Version: 1.2.5
|
||||
Version: 1.2.9
|
||||
Release: 2%{?dist}
|
||||
Summary: CLI for running Open Containers
|
||||
# https://fedoraproject.org/wiki/PackagingDrafts/Go#Go_Language_Architectures
|
||||
@ -30,12 +30,6 @@ ExcludeArch: %{ix86}
|
||||
License: ASL 2.0
|
||||
URL: %{git0}
|
||||
Source0: %{git0}/archive/v%{version}.tar.gz
|
||||
Patch0: 0001-Bump-runtime-spec-to-latest-git-HEAD.patch
|
||||
Patch1: 0002-runc-exec-implement-CPU-affinity.patch
|
||||
Patch2: 0001-1.2.5-1.el9-CVEs-mega-patch.patch
|
||||
Patch3: 0001-1.2-openat2-improve-resilience-on-busy-systems.patch
|
||||
Patch4: 0002-1.2-rootfs-re-allow-dangling-symlinks-in-mount-targe.patch
|
||||
Patch5: 0001-1.2-rootfs-only-set-mode-for-tmpfs-mount-if-target-alrea.patch
|
||||
Provides: oci-runtime
|
||||
BuildRequires: golang >= 1.22.4
|
||||
BuildRequires: git
|
||||
@ -90,6 +84,10 @@ make install install-man install-bash DESTDIR=$RPM_BUILD_ROOT PREFIX=%{_prefix}
|
||||
%{_datadir}/bash-completion/completions/%{name}
|
||||
|
||||
%changelog
|
||||
* Thu Dec 04 2025 Jindrich Novy <jnovy@redhat.com> - 4:1.2.9-2
|
||||
- update to https://github.com/opencontainers/runc/releases/tag/v1.2.9
|
||||
- Resolves: RHEL-132818
|
||||
|
||||
* Wed Nov 12 2025 Jindrich Novy <jnovy@redhat.com> - 4:1.2.5-2
|
||||
- fix permission regression
|
||||
- Related: RHEL-122384
|
||||
|
||||
Loading…
Reference in New Issue
Block a user