import UBI runc-1.2.9-2.module+el8.10.0+23820+ae6deecc

This commit is contained in:
eabdullin 2025-12-18 07:11:20 +00:00
parent 576dd3cde2
commit 894fc8b95b
9 changed files with 7 additions and 15184 deletions

2
.gitignore vendored
View File

@ -1 +1 @@
SOURCES/v1.2.5.tar.gz
SOURCES/v1.2.9.tar.gz

View File

@ -1 +1 @@
35e5289a5b1ac1a12a35c3475b7d0bee2232ef39 SOURCES/v1.2.5.tar.gz
537b121ab5e611e865bae05e9a85568a9a2ac85b SOURCES/v1.2.9.tar.gz

View File

@ -1,416 +0,0 @@
From 4ad5d01eeda006ba9ae067cbf999a77fe096fe00 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 1 Nov 2025 17:21:36 +1100
Subject: [PATCH 1/2] [1.2] openat2: improve resilience on busy systems
Previously, we would see a ~3% failure rate when starting containers
with mounts that contain ".." (which can trigger -EAGAIN). To counteract
this, filepath-securejoin v0.5.1 includes a bump of the internal retry
limit from 32 to 128, which lowers the failure rate to 0.12%.
However, there is still a risk of spurious failure on regular systems.
In order to try to provide more resilience (while avoiding DoS attacks),
this patch also includes an additional retry loop that terminates based
on a deadline rather than retry count. The deadline is 2ms, as my
testing found that ~800us for a single pathrs operation was the longest
latency due to -EAGAIN retries, and that was an outlier compared to the
more common ~400us latencies -- so 2ms should be more than enough for
any real system.
The failure rates above were based on more than 50k runs of runc with an
attack script (from libpathrs) running a rename attack on all cores of a
16-core system, which is arguably a worst-case but heavily utilised
servers could likely approach similar results.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
go.mod | 2 +-
go.sum | 4 +-
internal/pathrs/mkdirall_pathrslite.go | 4 +-
internal/pathrs/procfs_pathrslite.go | 22 ++++---
internal/pathrs/retry.go | 66 +++++++++++++++++++
internal/pathrs/root_pathrslite.go | 7 +-
.../cyphar/filepath-securejoin/CHANGELOG.md | 34 +++++++++-
.../cyphar/filepath-securejoin/VERSION | 2 +-
.../internal/{errors.go => errors_linux.go} | 15 ++++-
.../pathrs-lite/internal/fd/openat2_linux.go | 12 ++--
vendor/modules.txt | 2 +-
11 files changed, 144 insertions(+), 26 deletions(-)
create mode 100644 internal/pathrs/retry.go
rename vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/{errors.go => errors_linux.go} (70%)
diff --git a/go.mod b/go.mod
index 5f00a576..90fa2e5b 100644
--- a/go.mod
+++ b/go.mod
@@ -12,7 +12,7 @@ require (
github.com/cilium/ebpf v0.16.0
github.com/containerd/console v1.0.5
github.com/coreos/go-systemd/v22 v22.5.0
- github.com/cyphar/filepath-securejoin v0.5.0
+ github.com/cyphar/filepath-securejoin v0.5.1
github.com/docker/go-units v0.5.0
github.com/godbus/dbus/v5 v5.1.0
github.com/moby/sys/mountinfo v0.7.1
diff --git a/go.sum b/go.sum
index 1f930ce4..049597b6 100644
--- a/go.sum
+++ b/go.sum
@@ -9,8 +9,8 @@ github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
-github.com/cyphar/filepath-securejoin v0.5.0 h1:hIAhkRBMQ8nIeuVwcAoymp7MY4oherZdAxD+m0u9zaw=
-github.com/cyphar/filepath-securejoin v0.5.0/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
+github.com/cyphar/filepath-securejoin v0.5.1 h1:eYgfMq5yryL4fbWfkLpFFy2ukSELzaJOTaUTuh+oF48=
+github.com/cyphar/filepath-securejoin v0.5.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
diff --git a/internal/pathrs/mkdirall_pathrslite.go b/internal/pathrs/mkdirall_pathrslite.go
index fb4f7842..a9a0157c 100644
--- a/internal/pathrs/mkdirall_pathrslite.go
+++ b/internal/pathrs/mkdirall_pathrslite.go
@@ -83,7 +83,9 @@ func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (*os.File, er
}
defer rootDir.Close()
- return pathrs.MkdirAllHandle(rootDir, unsafePath, mode)
+ return retryEAGAIN(func() (*os.File, error) {
+ return pathrs.MkdirAllHandle(rootDir, unsafePath, mode)
+ })
}
// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the
diff --git a/internal/pathrs/procfs_pathrslite.go b/internal/pathrs/procfs_pathrslite.go
index a02b0d39..37450a0e 100644
--- a/internal/pathrs/procfs_pathrslite.go
+++ b/internal/pathrs/procfs_pathrslite.go
@@ -27,13 +27,15 @@ import (
)
func procOpenReopen(openFn func(subpath string) (*os.File, error), subpath string, flags int) (*os.File, error) {
- handle, err := openFn(subpath)
+ handle, err := retryEAGAIN(func() (*os.File, error) {
+ return openFn(subpath)
+ })
if err != nil {
return nil, err
}
defer handle.Close()
- f, err := pathrs.Reopen(handle, flags)
+ f, err := Reopen(handle, flags)
if err != nil {
return nil, fmt.Errorf("reopen %s: %w", handle.Name(), err)
}
@@ -44,7 +46,7 @@ func procOpenReopen(openFn func(subpath string) (*os.File, error), subpath strin
// [pathrs.Reopen], to let you one-shot open a procfs file with the given
// flags.
func ProcSelfOpen(subpath string, flags int) (*os.File, error) {
- proc, err := procfs.OpenProcRoot()
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
if err != nil {
return nil, err
}
@@ -55,7 +57,7 @@ func ProcSelfOpen(subpath string, flags int) (*os.File, error) {
// ProcPidOpen is a wrapper around [procfs.Handle.OpenPid] and [pathrs.Reopen],
// to let you one-shot open a procfs file with the given flags.
func ProcPidOpen(pid int, subpath string, flags int) (*os.File, error) {
- proc, err := procfs.OpenProcRoot()
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
if err != nil {
return nil, err
}
@@ -70,13 +72,15 @@ func ProcPidOpen(pid int, subpath string, flags int) (*os.File, error) {
// flags. The returned [procfs.ProcThreadSelfCloser] needs the same handling as
// when using pathrs-lite.
func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThreadSelfCloser, Err error) {
- proc, err := procfs.OpenProcRoot()
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
if err != nil {
return nil, nil, err
}
defer proc.Close()
- handle, closer, err := proc.OpenThreadSelf(subpath)
+ handle, closer, err := retryEAGAIN2(func() (*os.File, procfs.ProcThreadSelfCloser, error) {
+ return proc.OpenThreadSelf(subpath)
+ })
if err != nil {
return nil, nil, err
}
@@ -89,7 +93,7 @@ func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThr
}
defer handle.Close()
- f, err := pathrs.Reopen(handle, flags)
+ f, err := Reopen(handle, flags)
if err != nil {
return nil, nil, fmt.Errorf("reopen %s: %w", handle.Name(), err)
}
@@ -98,5 +102,7 @@ func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThr
// Reopen is a wrapper around pathrs.Reopen.
func Reopen(file *os.File, flags int) (*os.File, error) {
- return pathrs.Reopen(file, flags)
+ return retryEAGAIN(func() (*os.File, error) {
+ return pathrs.Reopen(file, flags)
+ })
}
diff --git a/internal/pathrs/retry.go b/internal/pathrs/retry.go
new file mode 100644
index 00000000..a51d335c
--- /dev/null
+++ b/internal/pathrs/retry.go
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Copyright (C) 2024-2025 Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2024-2025 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package pathrs
+
+import (
+ "errors"
+ "fmt"
+ "time"
+
+ "golang.org/x/sys/unix"
+)
+
+// Based on >50k tests running "runc run" on a 16-core system with very heavy
+// rename(2) load, the single longest latency caused by -EAGAIN retries was
+// ~800us (with the vast majority being closer to 400us). So, a 2ms limit
+// should give more than enough headroom for any real system in practice.
+const retryDeadline = 2 * time.Millisecond
+
+// retryEAGAIN is a top-level retry loop for pathrs to try to avoid returning
+// spurious errors in most normal user cases when using openat2 (libpathrs
+// itself does up to 128 retries already, but this method takes a
+// wallclock-deadline approach to simply retry until a timer elapses).
+func retryEAGAIN[T any](fn func() (T, error)) (T, error) {
+ deadline := time.After(retryDeadline)
+ for {
+ v, err := fn()
+ if !errors.Is(err, unix.EAGAIN) {
+ return v, err
+ }
+ select {
+ case <-deadline:
+ return *new(T), fmt.Errorf("%v retry deadline exceeded: %w", retryDeadline, err)
+ default:
+ // retry
+ }
+ }
+}
+
+// retryEAGAIN2 is like retryEAGAIN except it returns two values.
+func retryEAGAIN2[T1, T2 any](fn func() (T1, T2, error)) (T1, T2, error) {
+ type ret struct {
+ v1 T1
+ v2 T2
+ }
+ v, err := retryEAGAIN(func() (ret, error) {
+ v1, v2, err := fn()
+ return ret{v1: v1, v2: v2}, err
+ })
+ return v.v1, v.v2, err
+}
diff --git a/internal/pathrs/root_pathrslite.go b/internal/pathrs/root_pathrslite.go
index 0ef81fae..899af270 100644
--- a/internal/pathrs/root_pathrslite.go
+++ b/internal/pathrs/root_pathrslite.go
@@ -31,12 +31,15 @@ import (
// is effectively shorthand for [securejoin.OpenInRoot] followed by
// [securejoin.Reopen].
func OpenInRoot(root, subpath string, flags int) (*os.File, error) {
- handle, err := pathrs.OpenInRoot(root, subpath)
+ handle, err := retryEAGAIN(func() (*os.File, error) {
+ return pathrs.OpenInRoot(root, subpath)
+ })
if err != nil {
return nil, err
}
defer handle.Close()
- return pathrs.Reopen(handle, flags)
+
+ return Reopen(handle, flags)
}
// CreateInRoot creates a new file inside a root (as well as any missing parent
diff --git a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
index 6862467c..3faee0bc 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
+++ b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
@@ -4,7 +4,36 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).
-## [Unreleased] ##
+## [Unreleased 0.5.z] ##
+
+## [0.5.1] - 2025-10-31 ##
+
+> Spooky scary skeletons send shivers down your spine!
+
+### Changed ###
+- `openat2` can return `-EAGAIN` if it detects a possible attack in certain
+ scenarios (namely if there was a rename or mount while walking a path with a
+ `..` component). While this is necessary to avoid a denial-of-service in the
+ kernel, it does require retry loops in userspace.
+
+ In previous versions, `pathrs-lite` would retry `openat2` 32 times before
+ returning an error, but we've received user reports that this limit can be
+ hit on systems with very heavy load. In some synthetic benchmarks (testing
+ the worst-case of an attacker doing renames in a tight loop on every core of
+ a 16-core machine) we managed to get a ~3% failure rate in runc. We have
+ improved this situation in two ways:
+
+ * We have now increased this limit to 128, which should be good enough for
+ most use-cases without becoming a denial-of-service vector (the number of
+ syscalls called by the `O_PATH` resolver in a typical case is within the
+ same ballpark). The same benchmarks show a failure rate of ~0.12% which
+ (while not zero) is probably sufficient for most users.
+
+ * In addition, we now return a `unix.EAGAIN` error that is bubbled up and can
+ be detected by callers. This means that callers with stricter requirements
+ to avoid spurious errors can choose to do their own infinite `EAGAIN` retry
+ loop (though we would strongly recommend users use time-based deadlines in
+ such retry loops to avoid potentially unbounded denials-of-service).
## [0.5.0] - 2025-09-26 ##
@@ -354,7 +383,8 @@ This is our first release of `github.com/cyphar/filepath-securejoin`,
containing a full implementation with a coverage of 93.5% (the only missing
cases are the error cases, which are hard to mocktest at the moment).
-[Unreleased]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.0...HEAD
+[Unreleased 0.5.z]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.1...release-0.5
+[0.5.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.0...v0.5.1
[0.5.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.1...v0.5.0
[0.4.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.0...v0.4.1
[0.4.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.6...v0.4.0
diff --git a/vendor/github.com/cyphar/filepath-securejoin/VERSION b/vendor/github.com/cyphar/filepath-securejoin/VERSION
index 8f0916f7..4b9fcbec 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/VERSION
+++ b/vendor/github.com/cyphar/filepath-securejoin/VERSION
@@ -1 +1 @@
-0.5.0
+0.5.1
diff --git a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
similarity index 70%
rename from vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go
rename to vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
index c26e440e..d0b200f4 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go
+++ b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: MPL-2.0
+//go:build linux
+
// Copyright (C) 2024-2025 Aleksa Sarai <cyphar@cyphar.com>
// Copyright (C) 2024-2025 SUSE LLC
//
@@ -12,15 +14,24 @@ package internal
import (
"errors"
+
+ "golang.org/x/sys/unix"
)
+type xdevErrorish struct {
+ description string
+}
+
+func (err xdevErrorish) Error() string { return err.description }
+func (err xdevErrorish) Is(target error) bool { return target == unix.EXDEV }
+
var (
// ErrPossibleAttack indicates that some attack was detected.
- ErrPossibleAttack = errors.New("possible attack detected")
+ ErrPossibleAttack error = xdevErrorish{"possible attack detected"}
// ErrPossibleBreakout indicates that during an operation we ended up in a
// state that could be a breakout but we detected it.
- ErrPossibleBreakout = errors.New("possible breakout detected")
+ ErrPossibleBreakout error = xdevErrorish{"possible breakout detected"}
// ErrInvalidDirectory indicates an unlinked directory.
ErrInvalidDirectory = errors.New("wandered into deleted directory")
diff --git a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
index 23053083..3e937fe3 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
+++ b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
@@ -17,8 +17,6 @@ import (
"runtime"
"golang.org/x/sys/unix"
-
- "github.com/cyphar/filepath-securejoin/pathrs-lite/internal"
)
func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
@@ -34,7 +32,10 @@ func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
(errors.Is(err, unix.EAGAIN) || errors.Is(err, unix.EXDEV))
}
-const scopedLookupMaxRetries = 32
+// This is a fairly arbitrary limit we have just to avoid an attacker being
+// able to make us spin in an infinite retry loop -- callers can choose to
+// retry on EAGAIN if they prefer.
+const scopedLookupMaxRetries = 128
// Openat2 is an [Fd]-based wrapper around unix.Openat2, but with some retry
// logic in case of EAGAIN errors.
@@ -43,10 +44,10 @@ func Openat2(dir Fd, path string, how *unix.OpenHow) (*os.File, error) {
// Make sure we always set O_CLOEXEC.
how.Flags |= unix.O_CLOEXEC
var tries int
- for tries < scopedLookupMaxRetries {
+ for {
fd, err := unix.Openat2(dirFd, path, how)
if err != nil {
- if scopedLookupShouldRetry(how, err) {
+ if scopedLookupShouldRetry(how, err) && tries < scopedLookupMaxRetries {
// We retry a couple of times to avoid the spurious errors, and
// if we are being attacked then returning -EAGAIN is the best
// we can do.
@@ -58,5 +59,4 @@ func Openat2(dir Fd, path string, how *unix.OpenHow) (*os.File, error) {
runtime.KeepAlive(dir)
return os.NewFile(uintptr(fd), fullPath), nil
}
- return nil, &os.PathError{Op: "openat2", Path: fullPath, Err: internal.ErrPossibleAttack}
}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 4e7e0ef8..64524598 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -25,7 +25,7 @@ github.com/coreos/go-systemd/v22/dbus
# github.com/cpuguy83/go-md2man/v2 v2.0.2
## explicit; go 1.11
github.com/cpuguy83/go-md2man/v2/md2man
-# github.com/cyphar/filepath-securejoin v0.5.0
+# github.com/cyphar/filepath-securejoin v0.5.1
## explicit; go 1.18
github.com/cyphar/filepath-securejoin
github.com/cyphar/filepath-securejoin/internal/consts
--
2.51.1

View File

@ -1,161 +0,0 @@
From c8588560cdebd80e9d1823a4a8e39172ee4650bb Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 7 Nov 2025 14:52:09 +1100
Subject: [PATCH] rootfs: only set mode= for tmpfs mount if target already
existed
This was always the intended behaviour but commit 72fbb34f5006 ("rootfs:
switch to fd-based handling of mountpoint targets") regressed it when
adding a mechanism to create a file handle to the target if it didn't
already exist (causing the later stat to always succeed).
A lot of people depend on this functionality, so add some tests to make
sure we don't break it in the future.
Fixes: 72fbb34f5006 ("rootfs: switch to fd-based handling of mountpoint targets")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
(cherry picked from commit 9a9719eeb4978e73c64740b3fc796c1b12987b05)
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
libcontainer/rootfs_linux.go | 25 ++++++-----
tests/integration/mounts.bats | 81 +++++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+), 13 deletions(-)
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index 204e6a80..ab5a260d 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -511,6 +511,18 @@ func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) {
_ = dstFile.Close()
}
}()
+ if err == nil && m.Device == "tmpfs" {
+ // If the original target exists, copy the mode for the tmpfs mount.
+ stat, err := dstFile.Stat()
+ if err != nil {
+ return fmt.Errorf("check tmpfs source mode: %w", err)
+ }
+ dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
+ if m.Data != "" {
+ dt = dt + "," + m.Data
+ }
+ m.Data = dt
+ }
if err != nil {
if !errors.Is(err, unix.ENOENT) {
return fmt.Errorf("lookup mountpoint target: %w", err)
@@ -551,19 +563,6 @@ func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) {
}
}
- if m.Device == "tmpfs" {
- // If the original target exists, copy the mode for the tmpfs mount.
- stat, err := dstFile.Stat()
- if err != nil {
- return fmt.Errorf("check tmpfs source mode: %w", err)
- }
- dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
- if m.Data != "" {
- dt = dt + "," + m.Data
- }
- m.Data = dt
- }
-
dstFullPath, err := procfs.ProcSelfFdReadlink(dstFile)
if err != nil {
return fmt.Errorf("get mount destination real path: %w", err)
diff --git a/tests/integration/mounts.bats b/tests/integration/mounts.bats
index 11fb2cfc..b60c88ae 100644
--- a/tests/integration/mounts.bats
+++ b/tests/integration/mounts.bats
@@ -234,6 +234,87 @@ function test_mount_order() {
[[ "$(stat -c %a rootfs/setgid/a/b/c)" == 2755 ]]
}
+# https://github.com/opencontainers/runc/issues/4971
+@test "runc run [tmpfs mount mode= inherit]" {
+ mkdir rootfs/tmpfs
+ chmod "=0710" rootfs/tmpfs
+
+ update_config '.mounts += [{
+ type: "tmpfs",
+ source: "tmpfs",
+ destination: "/tmpfs",
+ options: ["rw", "nodev", "nosuid"]
+ }]'
+ update_config '.process.args = ["stat", "-c", "%a", "/tmpfs"]'
+
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ [[ "$output" == "710" ]]
+
+ update_config '.process.args = ["cat", "/proc/self/mounts"]'
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ grep -Ex "tmpfs /tmpfs tmpfs [^ ]*\bmode=710\b[^ ]* .*" <<<"$output"
+}
+
+# https://github.com/opencontainers/runc/issues/4971
+@test "runc run [tmpfs mount explicit mode=]" {
+ mkdir rootfs/tmpfs
+ chmod "=0710" rootfs/tmpfs
+
+ update_config '.mounts += [{
+ type: "tmpfs",
+ source: "tmpfs",
+ destination: "/tmpfs",
+ options: ["rw", "nodev", "nosuid", "mode=1500"]
+ }]'
+ update_config '.process.args = ["stat", "-c", "%a", "/tmpfs"]'
+
+ # Explicitly setting mode= overrides whatever mode we would've inherited.
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ [[ "$output" == "1500" ]]
+
+ update_config '.process.args = ["cat", "/proc/self/mounts"]'
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ grep -Ex "tmpfs /tmpfs tmpfs [^ ]*\bmode=1500\b[^ ]* .*" <<<"$output"
+
+ # Verify that the actual directory was not chmod-ed.
+ [[ "$(stat -c %a rootfs/tmpfs)" == 710 ]]
+}
+
+# https://github.com/opencontainers/runc/issues/4971
+@test "runc run [tmpfs mount mode=1777 default]" {
+ update_config '.mounts += [{
+ type: "tmpfs",
+ source: "tmpfs",
+ destination: "/non-existent/foo/bar/baz",
+ options: ["rw", "nodev", "nosuid"]
+ }]'
+ update_config '.process.args = ["stat", "-c", "%a", "/non-existent/foo/bar/baz"]'
+
+ rm -rf rootfs/non-existent
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ [[ "$output" == "1777" ]]
+
+ update_config '.process.args = ["cat", "/proc/self/mounts"]'
+
+ rm -rf rootfs/non-existent
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ # We don't explicitly set a mode= in this case, it is just the tmpfs default.
+ grep -Ex "tmpfs /non-existent/foo/bar/baz tmpfs .*" <<<"$output"
+ run ! grep -Ex "tmpfs /non-existent/foo/bar/baz tmpfs [^ ]*\bmode=[0-7]+\b[^ ]* .*" <<<"$output"
+
+ # Verify that the actual modes are *not* 1777.
+ [[ "$(stat -c %a rootfs/non-existent)" == 755 ]]
+ [[ "$(stat -c %a rootfs/non-existent/foo)" == 755 ]]
+ [[ "$(stat -c %a rootfs/non-existent/foo/bar)" == 755 ]]
+ [[ "$(stat -c %a rootfs/non-existent/foo/bar/baz)" == 755 ]]
+}
+
@test "runc run [ro /sys/fs/cgroup mounts]" {
# Without cgroup namespace.
update_config '.linux.namespaces -= [{"type": "cgroup"}]'
--
2.51.1

File diff suppressed because it is too large Load Diff

View File

@ -1,103 +0,0 @@
From c6dad73d617864f3a281ac1fdaacd5ed971fa317 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Thu, 27 Jun 2024 09:00:51 -0700
Subject: [PATCH 1/2] Bump runtime-spec to latest git HEAD
This is to include
- https://github.com/opencontainers/runtime-spec/pull/1261
- https://github.com/opencontainers/runtime-spec/pull/1253
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 2cac22b1e29e6be4c004f35ce582aa2b7e1c2fda)
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
go.mod | 2 +-
go.sum | 4 ++--
.../opencontainers/runtime-spec/specs-go/config.go | 8 ++++++++
.../opencontainers/runtime-spec/specs-go/version.go | 2 +-
vendor/modules.txt | 2 +-
5 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/go.mod b/go.mod
index 348bc9c6..db2d7ef1 100644
--- a/go.mod
+++ b/go.mod
@@ -19,7 +19,7 @@ require (
github.com/moby/sys/user v0.3.0
github.com/moby/sys/userns v0.1.0
github.com/mrunalp/fileutils v0.5.1
- github.com/opencontainers/runtime-spec v1.2.0
+ github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
github.com/opencontainers/selinux v1.11.0
github.com/seccomp/libseccomp-golang v0.10.0
github.com/sirupsen/logrus v1.9.3
diff --git a/go.sum b/go.sum
index 225d5860..4c863cc9 100644
--- a/go.sum
+++ b/go.sum
@@ -46,8 +46,8 @@ github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g
github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28=
github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
-github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
-github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
+github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95 h1:Ghl8Z3l+yPQUDSxAp7Kg7fJLRNNXjOsR6ooDcca7PjU=
+github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/selinux v1.11.0 h1:+5Zbo97w3Lbmb3PeqQtpmTkMwsW5nRI3YaLpt7tQ7oU=
github.com/opencontainers/selinux v1.11.0/go.mod h1:E5dMC3VPuVvVHDYmi78qvhJp8+M586T4DlDRYpFkyec=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
index d1236ba7..671f0d01 100644
--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
+++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
@@ -94,6 +94,8 @@ type Process struct {
SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
// IOPriority contains the I/O priority settings for the cgroup.
IOPriority *LinuxIOPriority `json:"ioPriority,omitempty" platform:"linux"`
+ // ExecCPUAffinity specifies CPU affinity for exec processes.
+ ExecCPUAffinity *CPUAffinity `json:"execCPUAffinity,omitempty" platform:"linux"`
}
// LinuxCapabilities specifies the list of allowed capabilities that are kept for a process.
@@ -127,6 +129,12 @@ const (
IOPRIO_CLASS_IDLE IOPriorityClass = "IOPRIO_CLASS_IDLE"
)
+// CPUAffinity specifies process' CPU affinity.
+type CPUAffinity struct {
+ Initial string `json:"initial,omitempty"`
+ Final string `json:"final,omitempty"`
+}
+
// Box specifies dimensions of a rectangle. Used for specifying the size of a console.
type Box struct {
// Height is the vertical dimension of a box.
diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
index 503971e0..f6c15f6c 100644
--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
+++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
@@ -11,7 +11,7 @@ const (
VersionPatch = 0
// VersionDev indicates development branch. Releases will be empty string.
- VersionDev = ""
+ VersionDev = "+dev"
)
// Version is the specification version that the package types support.
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 3b245e0d..df520923 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -46,7 +46,7 @@ github.com/moby/sys/userns
# github.com/mrunalp/fileutils v0.5.1
## explicit; go 1.13
github.com/mrunalp/fileutils
-# github.com/opencontainers/runtime-spec v1.2.0
+# github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
## explicit
github.com/opencontainers/runtime-spec/specs-go
github.com/opencontainers/runtime-spec/specs-go/features
--
2.47.1

View File

@ -1,49 +0,0 @@
From e949092d469c3ee3ea9bf1002649b6a692895da9 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 5 Nov 2025 02:04:02 +1100
Subject: [PATCH 2/2] [1.2] rootfs: re-allow dangling symlinks in mount targets
It seems there are a fair few images where dangling symlinks are used as
path components for mount targets, which pathrs-lite does not support
(and it would be difficult to fully support this in a race-free way).
This was actually meant to be blocked by commit 63c2908164f3 ("rootfs:
try to scope MkdirAll to stay inside the rootfs"), followed by commit
dd827f7b715a ("utils: switch to securejoin.MkdirAllHandle"). However, we
still used SecureJoin to construct mountpoint targets, which means that
dangling symlinks were "resolved" before reaching pathrs-lite.
This patch basically re-adds this hack in order to reduce the breakages
we've seen so far.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
libcontainer/rootfs_linux.go | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index 377642c9..6ea7cd47 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -518,6 +518,17 @@ func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) {
dstIsFile = !fi.IsDir()
}
+ // In previous runc versions, we would tolerate nonsense paths with
+ // dangling symlinks as path components. pathrs-lite does not support
+ // this, so instead we have to emulate this behaviour by doing
+ // SecureJoin *purely to get a semi-reasonable path to use* and then we
+ // use pathrs-lite to operate on the path safely.
+ newUnsafePath, err := securejoin.SecureJoin(rootfs, unsafePath)
+ if err != nil {
+ return err
+ }
+ unsafePath = utils.StripRoot(rootfs, newUnsafePath)
+
if dstIsFile {
dstFile, err = pathrs.CreateInRoot(rootfs, unsafePath, unix.O_CREAT|unix.O_EXCL|unix.O_NOFOLLOW, 0o644)
} else {
--
2.51.1

View File

@ -1,521 +0,0 @@
From 73786942b7176eae1e676cf2f78af548f090e418 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 21 Oct 2024 15:50:38 -0700
Subject: [PATCH 2/2] runc exec: implement CPU affinity
As per
- https://github.com/opencontainers/runtime-spec/pull/1253
- https://github.com/opencontainers/runtime-spec/pull/1261
CPU affinity can be set in two ways:
1. When creating/starting a container, in config.json's
Process.ExecCPUAffinity, which is then applied to all execs.
2. When running an exec, in process.json's CPUAffinity, which is
applied to a given exec and overrides the value from (1).
Add some basic tests.
Note that older kernels (RHEL8, Ubuntu 20.04) change CPU affinity of a
process to that of a container's cgroup, as soon as it is moved to that
cgroup, while newer kernels (Ubuntu 24.04, Fedora 41) don't do that.
Because of the above,
- it's impossible to really test initial CPU affinity without adding
debug logging to libcontainer/nsenter;
- for older kernels, there can be a brief moment when exec's affinity
is different than either initial or final affinity being set;
- exec's final CPU affinity, if not specified, can be different
depending on the kernel, therefore we don't test it.
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 57237b31de367a722c5d49088912d57c28c6fb46)
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
libcontainer/configs/config.go | 72 ++++++++++++++++++++
libcontainer/container_linux.go | 4 ++
libcontainer/init_linux.go | 3 +-
libcontainer/nsenter/log.c | 9 ++-
libcontainer/nsenter/log.h | 3 +
libcontainer/nsenter/nsexec.c | 29 ++++++++
libcontainer/process.go | 2 +
libcontainer/process_linux.go | 49 +++++++++++++-
libcontainer/specconv/spec_linux.go | 5 ++
tests/integration/cpu_affinity.bats | 101 ++++++++++++++++++++++++++++
utils_linux.go | 6 ++
11 files changed, 277 insertions(+), 6 deletions(-)
create mode 100644 tests/integration/cpu_affinity.bats
diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go
index 22fe0f9b..daffd130 100644
--- a/libcontainer/configs/config.go
+++ b/libcontainer/configs/config.go
@@ -3,8 +3,11 @@ package configs
import (
"bytes"
"encoding/json"
+ "errors"
"fmt"
"os/exec"
+ "strconv"
+ "strings"
"time"
"github.com/sirupsen/logrus"
@@ -225,6 +228,9 @@ type Config struct {
// IOPriority is the container's I/O priority.
IOPriority *IOPriority `json:"io_priority,omitempty"`
+
+ // ExecCPUAffinity is CPU affinity for a non-init process to be run in the container.
+ ExecCPUAffinity *CPUAffinity `json:"exec_cpu_affinity,omitempty"`
}
// Scheduler is based on the Linux sched_setattr(2) syscall.
@@ -294,6 +300,72 @@ var IOPrioClassMapping = map[specs.IOPriorityClass]int{
type IOPriority = specs.LinuxIOPriority
+type CPUAffinity struct {
+ Initial, Final *unix.CPUSet
+}
+
+// toCPUSet parses a CPU list such as "0-3,7" into a [unix.CPUSet].
+//
+// The list is comma-separated; every element is either a single CPU number
+// or an inclusive "start-end" range. Spaces around elements and empty
+// elements (extra commas) are allowed. An empty string returns (nil, nil).
+//
+// CPU numbers that do not fit into the set are rejected with an error
+// instead of being passed to Set: otherwise a mistyped value would be lost
+// without any diagnostics, and a huge range such as "0-4294967295" would
+// make the loop below spin through billions of useless iterations.
+func toCPUSet(str string) (*unix.CPUSet, error) {
+	if str == "" {
+		return nil, nil
+	}
+	s := new(unix.CPUSet)
+
+	// Number of CPUs the set can hold.
+	// NOTE(review): assumes unix.CPUSet is an array of pointer-sized words
+	// (as in golang.org/x/sys/unix today) -- confirm when bumping x/sys.
+	maxCPUs := uint64(len(s) * strconv.IntSize)
+
+	// parseCPU converts a single CPU number and checks it fits into the set.
+	parseCPU := func(v string) (int, error) {
+		n, err := strconv.ParseUint(v, 10, 32)
+		if err != nil {
+			return 0, err
+		}
+		if n >= maxCPUs {
+			return 0, fmt.Errorf("CPU %d is not supported (maximum is %d)", n, maxCPUs-1)
+		}
+		return int(n), nil
+	}
+
+	for _, r := range strings.Split(str, ",") {
+		// Allow extra spaces around.
+		r = strings.TrimSpace(r)
+		// Allow empty elements (extra commas).
+		if r == "" {
+			continue
+		}
+		if r0, r1, found := strings.Cut(r, "-"); found {
+			start, err := parseCPU(r0)
+			if err != nil {
+				return nil, err
+			}
+			end, err := parseCPU(r1)
+			if err != nil {
+				return nil, err
+			}
+			if start > end {
+				return nil, errors.New("invalid range: " + r)
+			}
+			for i := start; i <= end; i++ {
+				s.Set(i)
+			}
+		} else {
+			val, err := parseCPU(r)
+			if err != nil {
+				return nil, err
+			}
+			s.Set(val)
+		}
+	}
+
+	return s, nil
+}
+
+// ConvertCPUAffinity converts [specs.CPUAffinity] to [CPUAffinity].
+func ConvertCPUAffinity(sa *specs.CPUAffinity) (*CPUAffinity, error) {
+ if sa == nil {
+ return nil, nil
+ }
+ initial, err := toCPUSet(sa.Initial)
+ if err != nil {
+ return nil, fmt.Errorf("bad CPUAffinity.Initial: %w", err)
+ }
+ final, err := toCPUSet(sa.Final)
+ if err != nil {
+ return nil, fmt.Errorf("bad CPUAffinity.Final: %w", err)
+ }
+ if initial == nil && final == nil {
+ return nil, nil
+ }
+
+ return &CPUAffinity{
+ Initial: initial,
+ Final: final,
+ }, nil
+}
+
type (
HookName string
HookList []Hook
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index c0211617..1fc590a5 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -692,6 +692,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
+ CPUAffinity: c.config.ExecCPUAffinity,
CreateConsole: process.ConsoleSocket != nil,
ConsoleWidth: process.ConsoleWidth,
ConsoleHeight: process.ConsoleHeight,
@@ -708,6 +709,9 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits
}
+ if process.CPUAffinity != nil {
+ cfg.CPUAffinity = process.CPUAffinity
+ }
if cgroups.IsCgroup2UnifiedMode() {
cfg.Cgroup2Path = c.cgroupManager.Path("")
}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 1eb0279d..eddbfba6 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -72,6 +72,7 @@ type initConfig struct {
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
SpecState *specs.State `json:"spec_state,omitempty"`
Cgroup2Path string `json:"cgroup2_path,omitempty"`
+ CPUAffinity *configs.CPUAffinity `json:"cpu_affinity,omitempty"`
}
// Init is part of "runc init" implementation.
@@ -151,7 +152,7 @@ func startInitialization() (retErr error) {
logrus.SetOutput(logPipe)
logrus.SetFormatter(new(logrus.JSONFormatter))
- logrus.Debug("child process in init()")
+ logrus.Debugf("child process in init()")
// Only init processes have FIFOFD.
var fifoFile *os.File
diff --git a/libcontainer/nsenter/log.c b/libcontainer/nsenter/log.c
index 086b5398..72774cb0 100644
--- a/libcontainer/nsenter/log.c
+++ b/libcontainer/nsenter/log.c
@@ -31,6 +31,11 @@ void setup_logpipe(void)
loglevel = i;
}
+bool log_enabled_for(int level)
+{
+ return (logfd >= 0 && level <= loglevel);
+}
+
/* Defined in nsexec.c */
extern int current_stage;
@@ -40,8 +45,8 @@ void write_log(int level, const char *format, ...)
va_list args;
int ret;
- if (logfd < 0 || level > loglevel)
- goto out;
+ if (!log_enabled_for(level))
+ return;
va_start(args, format);
ret = vasprintf(&message, format, args);
diff --git a/libcontainer/nsenter/log.h b/libcontainer/nsenter/log.h
index 1fe95a11..3e18de68 100644
--- a/libcontainer/nsenter/log.h
+++ b/libcontainer/nsenter/log.h
@@ -1,6 +1,7 @@
#ifndef NSENTER_LOG_H
#define NSENTER_LOG_H
+#include <stdbool.h>
#include <stdio.h>
/*
@@ -20,6 +21,8 @@
*/
void setup_logpipe(void);
+bool log_enabled_for(int level);
+
void write_log(int level, const char *format, ...) __attribute__((format(printf, 2, 3)));
extern int logfd;
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 565b2ca2..aa4976d6 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -558,6 +558,25 @@ static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
bail("failed to update /proc/%d/timens_offsets", pid);
}
+/*
+ * Logs (at DEBUG level) the CPU affinity of the calling process as a hex
+ * bitmask, where bit N is set if CPU N is in the affinity set. If the
+ * affinity cannot be obtained, a warning is logged instead.
+ */
+void print_cpu_affinity()
+{
+	cpu_set_t cpus = { };
+	size_t i, mask = 0;
+
+	if (sched_getaffinity(0, sizeof(cpus), &cpus) < 0) {
+		write_log(WARNING, "sched_getaffinity: %m");
+		return;
+	}
+
+	/* Do not print the complete mask, we only need the first few CPUs. */
+	for (i = 0; i < sizeof(mask) * 8; i++) {
+		/*
+		 * The shifted value must be as wide as mask: a plain int 1
+		 * cannot be shifted by 31 or more bits without overflow.
+		 */
+		if (CPU_ISSET(i, &cpus))
+			mask |= (size_t)1 << i;
+	}
+
+	write_log(DEBUG, "affinity: 0x%zx", mask);
+}
+
void nsexec(void)
{
int pipenum;
@@ -584,6 +603,16 @@ void nsexec(void)
write_log(DEBUG, "=> nsexec container setup");
+ /* This is for ../../tests/integration/cpu_affinity.bats test only.
+ *
+ * Printing this from Go code might be too late as some kernels
+ * change the process' CPU affinity to that of container's cpuset
+ * as soon as the process is moved into container's cgroup.
+ */
+ if (log_enabled_for(DEBUG)) {
+ print_cpu_affinity();
+ }
+
/* Parse all of the netlink configuration. */
nl_parse(pipenum, &config);
diff --git a/libcontainer/process.go b/libcontainer/process.go
index 114b3f2b..5339583f 100644
--- a/libcontainer/process.go
+++ b/libcontainer/process.go
@@ -102,6 +102,8 @@ type Process struct {
Scheduler *configs.Scheduler
IOPriority *configs.IOPriority
+
+ CPUAffinity *configs.CPUAffinity
}
// Wait waits for the process to exit.
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index fcbb54a3..477c8a77 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -122,6 +122,46 @@ func (p *setnsProcess) signal(sig os.Signal) error {
return unix.Kill(p.pid(), s)
}
+// Starts setns process with specified initial CPU affinity.
+func (p *setnsProcess) startWithCPUAffinity() error {
+ aff := p.config.CPUAffinity
+ if aff == nil || aff.Initial == nil {
+ return p.cmd.Start()
+ }
+ errCh := make(chan error)
+ defer close(errCh)
+
+ // Use a goroutine to dedicate an OS thread.
+ go func() {
+ runtime.LockOSThread()
+ // Command inherits the CPU affinity.
+ if err := unix.SchedSetaffinity(unix.Gettid(), aff.Initial); err != nil {
+ runtime.UnlockOSThread()
+ errCh <- fmt.Errorf("error setting initial CPU affinity: %w", err)
+ return
+ }
+
+ errCh <- p.cmd.Start()
+ // Deliberately omit runtime.UnlockOSThread here.
+ // https://pkg.go.dev/runtime#LockOSThread says:
+ // "If the calling goroutine exits without unlocking the
+ // thread, the thread will be terminated".
+ }()
+
+ return <-errCh
+}
+
+func (p *setnsProcess) setFinalCPUAffinity() error {
+ aff := p.config.CPUAffinity
+ if aff == nil || aff.Final == nil {
+ return nil
+ }
+ if err := unix.SchedSetaffinity(p.pid(), aff.Final); err != nil {
+ return fmt.Errorf("error setting final CPU affinity: %w", err)
+ }
+ return nil
+}
+
func (p *setnsProcess) start() (retErr error) {
defer p.comm.closeParent()
@@ -133,8 +173,8 @@ func (p *setnsProcess) start() (retErr error) {
// get the "before" value of oom kill count
oom, _ := p.manager.OOMKillCount()
- err := p.cmd.Start()
- // close the child-side of the pipes (controlled by child)
+ err := p.startWithCPUAffinity()
+ // Close the child-side of the pipes (controlled by child).
p.comm.closeChild()
if err != nil {
return fmt.Errorf("error starting setns process: %w", err)
@@ -184,6 +224,10 @@ func (p *setnsProcess) start() (retErr error) {
}
}
}
+ // Set final CPU affinity right after the process is moved into container's cgroup.
+ if err := p.setFinalCPUAffinity(); err != nil {
+ return err
+ }
if p.intelRdtPath != "" {
// if Intel RDT "resource control" filesystem path exists
_, err := os.Stat(p.intelRdtPath)
@@ -193,7 +237,6 @@ func (p *setnsProcess) start() (retErr error) {
}
}
}
-
if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
return fmt.Errorf("error writing config to pipe: %w", err)
}
diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
index 95ada499..2d0db342 100644
--- a/libcontainer/specconv/spec_linux.go
+++ b/libcontainer/specconv/spec_linux.go
@@ -556,6 +556,11 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
ioPriority := *spec.Process.IOPriority
config.IOPriority = &ioPriority
}
+ config.ExecCPUAffinity, err = configs.ConvertCPUAffinity(spec.Process.ExecCPUAffinity)
+ if err != nil {
+ return nil, err
+ }
+
}
createHooks(spec, config)
config.Version = specs.Version
diff --git a/tests/integration/cpu_affinity.bats b/tests/integration/cpu_affinity.bats
new file mode 100644
index 00000000..f6adfa2a
--- /dev/null
+++ b/tests/integration/cpu_affinity.bats
@@ -0,0 +1,101 @@
+#!/usr/bin/env bats
+# Exec CPU affinity tests. For more details, see:
+# - https://github.com/opencontainers/runtime-spec/pull/1253
+
+load helpers
+
+function setup() {
+ requires smp cgroups_cpuset
+ setup_busybox
+}
+
+function teardown() {
+ teardown_bundle
+}
+
+function first_cpu() {
+ sed 's/[-,].*//g' </sys/devices/system/cpu/online
+}
+
+# Convert list of cpus ("0,1" or "0-1") to mask as printed by nsexec.
+# NOTE the range conversion is not proper, merely sufficient for tests here.
+function cpus_to_mask() {
+ local cpus=$* mask=0
+
+ cpus=${cpus//,/-} # 1. "," --> "-".
+ cpus=${cpus//-/ } # 2. "-" --> " ".
+
+ for c in $cpus; do
+ mask=$((mask | 1 << c))
+ done
+
+ printf "0x%x" $mask
+}
+
+@test "runc exec [CPU affinity, only initial set from process.json]" {
+ first="$(first_cpu)"
+ second=$((first + 1)) # Hacky; might not work in all environments.
+
+ runc run -d --console-socket "$CONSOLE_SOCKET" ct1
+ [ "$status" -eq 0 ]
+
+ for cpus in "$second" "$first-$second" "$first,$second" "$first"; do
+ proc='
+{
+ "terminal": false,
+ "execCPUAffinity": {
+ "initial": "'$cpus'"
+ },
+ "args": [ "/bin/true" ],
+ "cwd": "/"
+}'
+ mask=$(cpus_to_mask "$cpus")
+ echo "CPUS: $cpus, mask: $mask"
+ runc --debug exec --process <(echo "$proc") ct1
+ [[ "$output" == *"nsexec"*": affinity: $mask"* ]]
+ done
+}
+
+@test "runc exec [CPU affinity, initial and final set from process.json]" {
+ first="$(first_cpu)"
+ second=$((first + 1)) # Hacky; might not work in all environments.
+
+ runc run -d --console-socket "$CONSOLE_SOCKET" ct1
+ [ "$status" -eq 0 ]
+
+ for cpus in "$second" "$first-$second" "$first,$second" "$first"; do
+ proc='
+{
+ "terminal": false,
+ "execCPUAffinity": {
+ "initial": "'$cpus'",
+ "final": "'$cpus'"
+ },
+ "args": [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ],
+ "cwd": "/"
+}'
+ mask=$(cpus_to_mask "$cpus")
+ exp=${cpus//,/-} # "," --> "-".
+ echo "CPUS: $cpus, mask: $mask, final: $exp"
+ runc --debug exec --process <(echo "$proc") ct1
+ [[ "$output" == *"nsexec"*": affinity: $mask"* ]]
+ [[ "$output" == *"Cpus_allowed_list: $exp"* ]] # Mind the literal tab.
+ done
+}
+
+@test "runc exec [CPU affinity, initial and final set from config.json]" {
+ initial="$(first_cpu)"
+ final=$((initial + 1)) # Hacky; might not work in all environments.
+
+ update_config " .process.execCPUAffinity.initial = \"$initial\"
+ | .process.execCPUAffinity.final = \"$final\""
+
+ runc run -d --console-socket "$CONSOLE_SOCKET" ct1
+ [ "$status" -eq 0 ]
+
+ runc --debug exec ct1 grep "Cpus_allowed_list:" /proc/self/status
+ [ "$status" -eq 0 ]
+ mask=$(cpus_to_mask "$initial")
+ [[ "$output" == *"nsexec"*": affinity: $mask"* ]]
+ [[ "$output" == *"Cpus_allowed_list: $final"* ]] # Mind the literal tab.
+}
diff --git a/utils_linux.go b/utils_linux.go
index feb6ef80..013dbcf4 100644
--- a/utils_linux.go
+++ b/utils_linux.go
@@ -90,6 +90,12 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) {
}
lp.Rlimits = append(lp.Rlimits, rl)
}
+ aff, err := configs.ConvertCPUAffinity(p.ExecCPUAffinity)
+ if err != nil {
+ return nil, err
+ }
+ lp.CPUAffinity = aff
+
return lp, nil
}
--
2.47.1

View File

@ -19,7 +19,7 @@ go build -buildmode pie -compiler gc -tags="rpm_crashtraceback libtrust_openssl
Epoch: 4
Name: %{repo}
Version: 1.2.5
Version: 1.2.9
Release: 2%{?dist}
Summary: CLI for running Open Containers
# https://fedoraproject.org/wiki/PackagingDrafts/Go#Go_Language_Architectures
@ -30,12 +30,6 @@ ExcludeArch: %{ix86}
License: ASL 2.0
URL: %{git0}
Source0: %{git0}/archive/v%{version}.tar.gz
Patch0: 0001-Bump-runtime-spec-to-latest-git-HEAD.patch
Patch1: 0002-runc-exec-implement-CPU-affinity.patch
Patch2: 0001-1.2.5-1.el9-CVEs-mega-patch.patch
Patch3: 0001-1.2-openat2-improve-resilience-on-busy-systems.patch
Patch4: 0002-1.2-rootfs-re-allow-dangling-symlinks-in-mount-targe.patch
Patch5: 0001-1.2-rootfs-only-set-mode-for-tmpfs-mount-if-target-alrea.patch
Provides: oci-runtime
BuildRequires: golang >= 1.22.4
BuildRequires: git
@ -90,6 +84,10 @@ make install install-man install-bash DESTDIR=$RPM_BUILD_ROOT PREFIX=%{_prefix}
%{_datadir}/bash-completion/completions/%{name}
%changelog
* Thu Dec 04 2025 Jindrich Novy <jnovy@redhat.com> - 4:1.2.9-2
- update to https://github.com/opencontainers/runc/releases/tag/v1.2.9
- Resolves: RHEL-132818
* Wed Nov 12 2025 Jindrich Novy <jnovy@redhat.com> - 4:1.2.5-2
- fix permission regression
- Related: RHEL-122384