import UBI runc-1.2.5-2.module+el8.10.0+23647+cfd78660

This commit is contained in:
eabdullin 2025-11-17 21:36:27 +00:00
parent 984afe0af6
commit 576dd3cde2
10 changed files with 14791 additions and 588 deletions

2
.gitignore vendored
View File

@ -1 +1 @@
SOURCES/v1.1.12.tar.gz SOURCES/v1.2.5.tar.gz

View File

@ -1 +1 @@
3fac650358578b8694012a44b1d5b156523c3402 SOURCES/v1.1.12.tar.gz 35e5289a5b1ac1a12a35c3475b7d0bee2232ef39 SOURCES/v1.2.5.tar.gz

View File

@ -1,508 +0,0 @@
From 50f50245235097b0c87b31e97b86fd11685232a3 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Thu, 16 Jan 2025 15:40:28 -0800
Subject: [PATCH 1/2] [1.1] Bump runtime-spec to latest git HEAD
This is to include
- https://github.com/opencontainers/runtime-spec/pull/1261
- https://github.com/opencontainers/runtime-spec/pull/1253
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
go.mod | 2 +-
go.sum | 4 +-
.../runtime-spec/specs-go/config.go | 239 ++++++++++++++++--
.../runtime-spec/specs-go/version.go | 6 +-
vendor/modules.txt | 2 +-
5 files changed, 225 insertions(+), 28 deletions(-)
diff --git a/go.mod b/go.mod
index f51b6432..87c8d4b4 100644
--- a/go.mod
+++ b/go.mod
@@ -12,7 +12,7 @@ require (
github.com/godbus/dbus/v5 v5.0.6
github.com/moby/sys/mountinfo v0.5.0
github.com/mrunalp/fileutils v0.5.1
- github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
+ github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
github.com/opencontainers/selinux v1.10.0
github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646
github.com/sirupsen/logrus v1.8.1
diff --git a/go.sum b/go.sum
index ecabd398..9d3bedc0 100644
--- a/go.sum
+++ b/go.sum
@@ -33,8 +33,8 @@ github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9
github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU=
github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
-github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3snG66yBm59tKhhSPQrQ/0bCrv1LQbKt40LnUPiUxdc=
-github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
+github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95 h1:Ghl8Z3l+yPQUDSxAp7Kg7fJLRNNXjOsR6ooDcca7PjU=
+github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU=
github.com/opencontainers/selinux v1.10.0/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
index 6a7a91e5..671f0d01 100644
--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
+++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
@@ -12,10 +12,12 @@ type Spec struct {
Root *Root `json:"root,omitempty"`
// Hostname configures the container's hostname.
Hostname string `json:"hostname,omitempty"`
+ // Domainname configures the container's domainname.
+ Domainname string `json:"domainname,omitempty"`
// Mounts configures additional mounts (on top of Root).
Mounts []Mount `json:"mounts,omitempty"`
// Hooks configures callbacks for container lifecycle events.
- Hooks *Hooks `json:"hooks,omitempty" platform:"linux,solaris"`
+ Hooks *Hooks `json:"hooks,omitempty" platform:"linux,solaris,zos"`
// Annotations contains arbitrary metadata for the container.
Annotations map[string]string `json:"annotations,omitempty"`
@@ -27,6 +29,36 @@ type Spec struct {
Windows *Windows `json:"windows,omitempty" platform:"windows"`
// VM specifies configuration for virtual-machine-based containers.
VM *VM `json:"vm,omitempty" platform:"vm"`
+ // ZOS is platform-specific configuration for z/OS based containers.
+ ZOS *ZOS `json:"zos,omitempty" platform:"zos"`
+}
+
+// Scheduler represents the scheduling attributes for a process. It is based on
+// the Linux sched_setattr(2) syscall.
+type Scheduler struct {
+ // Policy represents the scheduling policy (e.g., SCHED_FIFO, SCHED_RR, SCHED_OTHER).
+ Policy LinuxSchedulerPolicy `json:"policy"`
+
+ // Nice is the nice value for the process, which affects its priority.
+ Nice int32 `json:"nice,omitempty"`
+
+ // Priority represents the static priority of the process.
+ Priority int32 `json:"priority,omitempty"`
+
+ // Flags is an array of scheduling flags.
+ Flags []LinuxSchedulerFlag `json:"flags,omitempty"`
+
+ // The following ones are used by the DEADLINE scheduler.
+
+ // Runtime is the amount of time in nanoseconds during which the process
+ // is allowed to run in a given period.
+ Runtime uint64 `json:"runtime,omitempty"`
+
+ // Deadline is the absolute deadline for the process to complete its execution.
+ Deadline uint64 `json:"deadline,omitempty"`
+
+ // Period is the length of the period in nanoseconds used for determining the process runtime.
+ Period uint64 `json:"period,omitempty"`
}
// Process contains information to start a specific application inside the container.
@@ -49,15 +81,21 @@ type Process struct {
// Capabilities are Linux capabilities that are kept for the process.
Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"`
// Rlimits specifies rlimit options to apply to the process.
- Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris"`
+ Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris,zos"`
// NoNewPrivileges controls whether additional privileges could be gained by processes in the container.
NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"`
// ApparmorProfile specifies the apparmor profile for the container.
ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"`
// Specify an oom_score_adj for the container.
OOMScoreAdj *int `json:"oomScoreAdj,omitempty" platform:"linux"`
+ // Scheduler specifies the scheduling attributes for a process
+ Scheduler *Scheduler `json:"scheduler,omitempty" platform:"linux"`
// SelinuxLabel specifies the selinux context that the container process is run as.
SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
+ // IOPriority contains the I/O priority settings for the cgroup.
+ IOPriority *LinuxIOPriority `json:"ioPriority,omitempty" platform:"linux"`
+ // ExecCPUAffinity specifies CPU affinity for exec processes.
+ ExecCPUAffinity *CPUAffinity `json:"execCPUAffinity,omitempty" platform:"linux"`
}
// LinuxCapabilities specifies the list of allowed capabilities that are kept for a process.
@@ -75,6 +113,28 @@ type LinuxCapabilities struct {
Ambient []string `json:"ambient,omitempty" platform:"linux"`
}
+// LinuxIOPriority represents I/O priority settings for the container's processes within the process group.
+type LinuxIOPriority struct {
+ Class IOPriorityClass `json:"class"`
+ Priority int `json:"priority"`
+}
+
+// IOPriorityClass represents an I/O scheduling class.
+type IOPriorityClass string
+
+// Possible values for IOPriorityClass.
+const (
+ IOPRIO_CLASS_RT IOPriorityClass = "IOPRIO_CLASS_RT"
+ IOPRIO_CLASS_BE IOPriorityClass = "IOPRIO_CLASS_BE"
+ IOPRIO_CLASS_IDLE IOPriorityClass = "IOPRIO_CLASS_IDLE"
+)
+
+// CPUAffinity specifies process' CPU affinity.
+type CPUAffinity struct {
+ Initial string `json:"initial,omitempty"`
+ Final string `json:"final,omitempty"`
+}
+
// Box specifies dimensions of a rectangle. Used for specifying the size of a console.
type Box struct {
// Height is the vertical dimension of a box.
@@ -86,11 +146,11 @@ type Box struct {
// User specifies specific user (and group) information for the container process.
type User struct {
// UID is the user id.
- UID uint32 `json:"uid" platform:"linux,solaris"`
+ UID uint32 `json:"uid" platform:"linux,solaris,zos"`
// GID is the group id.
- GID uint32 `json:"gid" platform:"linux,solaris"`
+ GID uint32 `json:"gid" platform:"linux,solaris,zos"`
// Umask is the umask for the init process.
- Umask *uint32 `json:"umask,omitempty" platform:"linux,solaris"`
+ Umask *uint32 `json:"umask,omitempty" platform:"linux,solaris,zos"`
// AdditionalGids are additional group ids set for the container's process.
AdditionalGids []uint32 `json:"additionalGids,omitempty" platform:"linux,solaris"`
// Username is the user name.
@@ -110,11 +170,16 @@ type Mount struct {
// Destination is the absolute path where the mount will be placed in the container.
Destination string `json:"destination"`
// Type specifies the mount kind.
- Type string `json:"type,omitempty" platform:"linux,solaris"`
+ Type string `json:"type,omitempty" platform:"linux,solaris,zos"`
// Source specifies the source path of the mount.
Source string `json:"source,omitempty"`
// Options are fstab style mount options.
Options []string `json:"options,omitempty"`
+
+ // UID/GID mappings used for changing file owners w/o calling chown, fs should support it.
+ // Every mount point could have its own mapping.
+ UIDMappings []LinuxIDMapping `json:"uidMappings,omitempty" platform:"linux"`
+ GIDMappings []LinuxIDMapping `json:"gidMappings,omitempty" platform:"linux"`
}
// Hook specifies a command that is run at a particular event in the lifecycle of a container
@@ -130,6 +195,10 @@ type Hook struct {
type Hooks struct {
// Prestart is Deprecated. Prestart is a list of hooks to be run before the container process is executed.
// It is called in the Runtime Namespace
+ //
+ // Deprecated: use [Hooks.CreateRuntime], [Hooks.CreateContainer], and
+ // [Hooks.StartContainer] instead, which allow more granular hook control
+ // during the create and start phase.
Prestart []Hook `json:"prestart,omitempty"`
// CreateRuntime is a list of hooks to be run after the container has been created but before pivot_root or any equivalent operation has been called
// It is called in the Runtime Namespace
@@ -178,10 +247,12 @@ type Linux struct {
// MountLabel specifies the selinux context for the mounts in the container.
MountLabel string `json:"mountLabel,omitempty"`
// IntelRdt contains Intel Resource Director Technology (RDT) information for
- // handling resource constraints (e.g., L3 cache, memory bandwidth) for the container
+ // handling resource constraints and monitoring metrics (e.g., L3 cache, memory bandwidth) for the container
IntelRdt *LinuxIntelRdt `json:"intelRdt,omitempty"`
// Personality contains configuration for the Linux personality syscall
Personality *LinuxPersonality `json:"personality,omitempty"`
+ // TimeOffsets specifies the offset for supporting time namespaces.
+ TimeOffsets map[string]LinuxTimeOffset `json:"timeOffsets,omitempty"`
}
// LinuxNamespace is the configuration for a Linux namespace
@@ -211,6 +282,8 @@ const (
UserNamespace LinuxNamespaceType = "user"
// CgroupNamespace for isolating cgroup hierarchies
CgroupNamespace LinuxNamespaceType = "cgroup"
+ // TimeNamespace for isolating the clocks
+ TimeNamespace LinuxNamespaceType = "time"
)
// LinuxIDMapping specifies UID/GID mappings
@@ -223,6 +296,14 @@ type LinuxIDMapping struct {
Size uint32 `json:"size"`
}
+// LinuxTimeOffset specifies the offset for Time Namespace
+type LinuxTimeOffset struct {
+ // Secs is the offset of clock (in secs) in the container
+ Secs int64 `json:"secs,omitempty"`
+ // Nanosecs is the additional offset for Secs (in nanosecs)
+ Nanosecs uint32 `json:"nanosecs,omitempty"`
+}
+
// POSIXRlimit type and restrictions
type POSIXRlimit struct {
// Type of the rlimit to set
@@ -233,12 +314,13 @@ type POSIXRlimit struct {
Soft uint64 `json:"soft"`
}
-// LinuxHugepageLimit structure corresponds to limiting kernel hugepages
+// LinuxHugepageLimit structure corresponds to limiting kernel hugepages.
+// Default to reservation limits if supported. Otherwise fallback to page fault limits.
type LinuxHugepageLimit struct {
- // Pagesize is the hugepage size
- // Format: "<size><unit-prefix>B' (e.g. 64KB, 2MB, 1GB, etc.)
+ // Pagesize is the hugepage size.
+ // Format: "<size><unit-prefix>B" (e.g. 64KB, 2MB, 1GB, etc.).
Pagesize string `json:"pageSize"`
- // Limit is the limit of "hugepagesize" hugetlb usage
+ // Limit is the limit of "hugepagesize" hugetlb reservations (if supported) or usage.
Limit uint64 `json:"limit"`
}
@@ -250,8 +332,8 @@ type LinuxInterfacePriority struct {
Priority uint32 `json:"priority"`
}
-// linuxBlockIODevice holds major:minor format supported in blkio cgroup
-type linuxBlockIODevice struct {
+// LinuxBlockIODevice holds major:minor format supported in blkio cgroup
+type LinuxBlockIODevice struct {
// Major is the device's major number.
Major int64 `json:"major"`
// Minor is the device's minor number.
@@ -260,7 +342,7 @@ type linuxBlockIODevice struct {
// LinuxWeightDevice struct holds a `major:minor weight` pair for weightDevice
type LinuxWeightDevice struct {
- linuxBlockIODevice
+ LinuxBlockIODevice
// Weight is the bandwidth rate for the device.
Weight *uint16 `json:"weight,omitempty"`
// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, CFQ scheduler only
@@ -269,7 +351,7 @@ type LinuxWeightDevice struct {
// LinuxThrottleDevice struct holds a `major:minor rate_per_second` pair
type LinuxThrottleDevice struct {
- linuxBlockIODevice
+ LinuxBlockIODevice
// Rate is the IO rate limit per cgroup per device
Rate uint64 `json:"rate"`
}
@@ -301,6 +383,12 @@ type LinuxMemory struct {
// Total memory limit (memory + swap).
Swap *int64 `json:"swap,omitempty"`
// Kernel memory limit (in bytes).
+ //
+ // Deprecated: kernel-memory limits are not supported in cgroups v2, and
+ // were obsoleted in [kernel v5.4]. This field should no longer be used,
+ // as it may be ignored by runtimes.
+ //
+ // [kernel v5.4]: https://github.com/torvalds/linux/commit/0158115f702b0ba208ab0
Kernel *int64 `json:"kernel,omitempty"`
// Kernel memory limit for tcp (in bytes)
KernelTCP *int64 `json:"kernelTCP,omitempty"`
@@ -310,6 +398,10 @@ type LinuxMemory struct {
DisableOOMKiller *bool `json:"disableOOMKiller,omitempty"`
// Enables hierarchical memory accounting
UseHierarchy *bool `json:"useHierarchy,omitempty"`
+ // CheckBeforeUpdate enables checking if a new memory limit is lower
+ // than the current usage during update, and if so, rejecting the new
+ // limit.
+ CheckBeforeUpdate *bool `json:"checkBeforeUpdate,omitempty"`
}
// LinuxCPU for Linux cgroup 'cpu' resource management
@@ -318,6 +410,9 @@ type LinuxCPU struct {
Shares *uint64 `json:"shares,omitempty"`
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
Quota *int64 `json:"quota,omitempty"`
+ // CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a
+ // given period.
+ Burst *uint64 `json:"burst,omitempty"`
// CPU period to be used for hardcapping (in usecs).
Period *uint64 `json:"period,omitempty"`
// How much time realtime scheduling may use (in usecs).
@@ -328,6 +423,8 @@ type LinuxCPU struct {
Cpus string `json:"cpus,omitempty"`
// List of memory nodes in the cpuset. Default is to use any available memory node.
Mems string `json:"mems,omitempty"`
+ // cgroups are configured with minimum weight, 0: default behavior, 1: SCHED_IDLE.
+ Idle *int64 `json:"idle,omitempty"`
}
// LinuxPids for Linux cgroup 'pids' resource management (Linux 4.3)
@@ -364,7 +461,7 @@ type LinuxResources struct {
Pids *LinuxPids `json:"pids,omitempty"`
// BlockIO restriction configuration
BlockIO *LinuxBlockIO `json:"blockIO,omitempty"`
- // Hugetlb limit (in bytes)
+ // Hugetlb limits (in bytes). Default to reservation limits if supported.
HugepageLimits []LinuxHugepageLimit `json:"hugepageLimits,omitempty"`
// Network restriction configuration
Network *LinuxNetwork `json:"network,omitempty"`
@@ -522,11 +619,21 @@ type WindowsMemoryResources struct {
// WindowsCPUResources contains CPU resource management settings.
type WindowsCPUResources struct {
- // Number of CPUs available to the container.
+ // Count is the number of CPUs available to the container. It represents the
+ // fraction of the configured processor `count` in a container in relation
+ // to the processors available in the host. The fraction ultimately
+ // determines the portion of processor cycles that the threads in a
+ // container can use during each scheduling interval, as the number of
+ // cycles per 10,000 cycles.
Count *uint64 `json:"count,omitempty"`
- // CPU shares (relative weight to other containers with cpu shares).
+ // Shares limits the share of processor time given to the container relative
+ // to other workloads on the processor. The processor `shares` (`weight` at
+ // the platform level) is a value between 0 and 10000.
Shares *uint16 `json:"shares,omitempty"`
- // Specifies the portion of processor cycles that this container can use as a percentage times 100.
+ // Maximum determines the portion of processor cycles that the threads in a
+ // container can use during each scheduling interval, as the number of
+ // cycles per 10,000 cycles. Set processor `maximum` to a percentage times
+ // 100.
Maximum *uint16 `json:"maximum,omitempty"`
}
@@ -613,6 +720,23 @@ type Arch string
// LinuxSeccompFlag is a flag to pass to seccomp(2).
type LinuxSeccompFlag string
+const (
+ // LinuxSeccompFlagLog is a seccomp flag to request all returned
+ // actions except SECCOMP_RET_ALLOW to be logged. An administrator may
+ // override this filter flag by preventing specific actions from being
+ // logged via the /proc/sys/kernel/seccomp/actions_logged file. (since
+ // Linux 4.14)
+ LinuxSeccompFlagLog LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_LOG"
+
+ // LinuxSeccompFlagSpecAllow can be used to disable Speculative Store
+ // Bypass mitigation. (since Linux 4.17)
+ LinuxSeccompFlagSpecAllow LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_SPEC_ALLOW"
+
+ // LinuxSeccompFlagWaitKillableRecv can be used to switch to the wait
+ // killable semantics. (since Linux 5.19)
+ LinuxSeccompFlagWaitKillableRecv LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV"
+)
+
// Additional architectures permitted to be used for system calls
// By default only the native architecture of the kernel is permitted
const (
@@ -683,8 +807,9 @@ type LinuxSyscall struct {
Args []LinuxSeccompArg `json:"args,omitempty"`
}
-// LinuxIntelRdt has container runtime resource constraints for Intel RDT
-// CAT and MBA features which introduced in Linux 4.10 and 4.12 kernel
+// LinuxIntelRdt has container runtime resource constraints for Intel RDT CAT and MBA
+// features and flags enabling Intel RDT CMT and MBM features.
+// Intel RDT features are available in Linux 4.14 and newer kernel versions.
type LinuxIntelRdt struct {
// The identity for RDT Class of Service
ClosID string `json:"closID,omitempty"`
@@ -697,4 +822,76 @@ type LinuxIntelRdt struct {
// The unit of memory bandwidth is specified in "percentages" by
// default, and in "MBps" if MBA Software Controller is enabled.
MemBwSchema string `json:"memBwSchema,omitempty"`
+
+ // EnableCMT is the flag to indicate if the Intel RDT CMT is enabled. CMT (Cache Monitoring Technology) supports monitoring of
+ // the last-level cache (LLC) occupancy for the container.
+ EnableCMT bool `json:"enableCMT,omitempty"`
+
+ // EnableMBM is the flag to indicate if the Intel RDT MBM is enabled. MBM (Memory Bandwidth Monitoring) supports monitoring of
+ // total and local memory bandwidth for the container.
+ EnableMBM bool `json:"enableMBM,omitempty"`
+}
+
+// ZOS contains platform-specific configuration for z/OS based containers.
+type ZOS struct {
+ // Devices are a list of device nodes that are created for the container
+ Devices []ZOSDevice `json:"devices,omitempty"`
+}
+
+// ZOSDevice represents the mknod information for a z/OS special device file
+type ZOSDevice struct {
+ // Path to the device.
+ Path string `json:"path"`
+ // Device type, block, char, etc.
+ Type string `json:"type"`
+ // Major is the device's major number.
+ Major int64 `json:"major"`
+ // Minor is the device's minor number.
+ Minor int64 `json:"minor"`
+ // FileMode permission bits for the device.
+ FileMode *os.FileMode `json:"fileMode,omitempty"`
+ // UID of the device.
+ UID *uint32 `json:"uid,omitempty"`
+ // Gid of the device.
+ GID *uint32 `json:"gid,omitempty"`
}
+
+// LinuxSchedulerPolicy represents different scheduling policies used with the Linux Scheduler
+type LinuxSchedulerPolicy string
+
+const (
+ // SchedOther is the default scheduling policy
+ SchedOther LinuxSchedulerPolicy = "SCHED_OTHER"
+ // SchedFIFO is the First-In-First-Out scheduling policy
+ SchedFIFO LinuxSchedulerPolicy = "SCHED_FIFO"
+ // SchedRR is the Round-Robin scheduling policy
+ SchedRR LinuxSchedulerPolicy = "SCHED_RR"
+ // SchedBatch is the Batch scheduling policy
+ SchedBatch LinuxSchedulerPolicy = "SCHED_BATCH"
+ // SchedISO is the Isolation scheduling policy
+ SchedISO LinuxSchedulerPolicy = "SCHED_ISO"
+ // SchedIdle is the Idle scheduling policy
+ SchedIdle LinuxSchedulerPolicy = "SCHED_IDLE"
+ // SchedDeadline is the Deadline scheduling policy
+ SchedDeadline LinuxSchedulerPolicy = "SCHED_DEADLINE"
+)
+
+// LinuxSchedulerFlag represents the flags used by the Linux Scheduler.
+type LinuxSchedulerFlag string
+
+const (
+ // SchedFlagResetOnFork represents the reset on fork scheduling flag
+ SchedFlagResetOnFork LinuxSchedulerFlag = "SCHED_FLAG_RESET_ON_FORK"
+ // SchedFlagReclaim represents the reclaim scheduling flag
+ SchedFlagReclaim LinuxSchedulerFlag = "SCHED_FLAG_RECLAIM"
+ // SchedFlagDLOverrun represents the deadline overrun scheduling flag
+ SchedFlagDLOverrun LinuxSchedulerFlag = "SCHED_FLAG_DL_OVERRUN"
+ // SchedFlagKeepPolicy represents the keep policy scheduling flag
+ SchedFlagKeepPolicy LinuxSchedulerFlag = "SCHED_FLAG_KEEP_POLICY"
+ // SchedFlagKeepParams represents the keep parameters scheduling flag
+ SchedFlagKeepParams LinuxSchedulerFlag = "SCHED_FLAG_KEEP_PARAMS"
+ // SchedFlagUtilClampMin represents the utilization clamp minimum scheduling flag
+ SchedFlagUtilClampMin LinuxSchedulerFlag = "SCHED_FLAG_UTIL_CLAMP_MIN"
+ // SchedFlagUtilClampMax represents the utilization clamp maximum scheduling flag
+ SchedFlagUtilClampMax LinuxSchedulerFlag = "SCHED_FLAG_UTIL_CLAMP_MAX"
+)
diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
index 596af0c2..f6c15f6c 100644
--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
+++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
@@ -6,12 +6,12 @@ const (
// VersionMajor is for an API incompatible changes
VersionMajor = 1
// VersionMinor is for functionality in a backwards-compatible manner
- VersionMinor = 0
+ VersionMinor = 2
// VersionPatch is for backwards-compatible bug fixes
- VersionPatch = 2
+ VersionPatch = 0
// VersionDev indicates development branch. Releases will be empty string.
- VersionDev = "-dev"
+ VersionDev = "+dev"
)
// Version is the specification version that the package types support.
diff --git a/vendor/modules.txt b/vendor/modules.txt
index a5537dfe..40089cd4 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -35,7 +35,7 @@ github.com/moby/sys/mountinfo
# github.com/mrunalp/fileutils v0.5.1
## explicit; go 1.13
github.com/mrunalp/fileutils
-# github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
+# github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
## explicit
github.com/opencontainers/runtime-spec/specs-go
# github.com/opencontainers/selinux v1.10.0
--
2.47.1

View File

@ -0,0 +1,416 @@
From 4ad5d01eeda006ba9ae067cbf999a77fe096fe00 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 1 Nov 2025 17:21:36 +1100
Subject: [PATCH 1/2] [1.2] openat2: improve resilience on busy systems
Previously, we would see a ~3% failure rate when starting containers
with mounts that contain ".." (which can trigger -EAGAIN). To counteract
this, filepath-securejoin v0.5.1 includes a bump of the internal retry
limit from 32 to 128, which lowers the failure rate to 0.12%.
However, there is still a risk of spurious failure on regular systems.
In order to try to provide more resilience (while avoiding DoS attacks),
this patch also includes an additional retry loop that terminates based
on a deadline rather than retry count. The deadline is 2ms, as my
testing found that ~800us for a single pathrs operation was the longest
latency due to -EAGAIN retries, and that was an outlier compared to the
more common ~400us latencies -- so 2ms should be more than enough for
any real system.
The failure rates above were based on more than 50k runs of runc with an
attack script (from libpathrs) running a rename attack on all cores of a
16-core system, which is arguably a worst-case but heavily utilised
servers could likely approach similar results.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
go.mod | 2 +-
go.sum | 4 +-
internal/pathrs/mkdirall_pathrslite.go | 4 +-
internal/pathrs/procfs_pathrslite.go | 22 ++++---
internal/pathrs/retry.go | 66 +++++++++++++++++++
internal/pathrs/root_pathrslite.go | 7 +-
.../cyphar/filepath-securejoin/CHANGELOG.md | 34 +++++++++-
.../cyphar/filepath-securejoin/VERSION | 2 +-
.../internal/{errors.go => errors_linux.go} | 15 ++++-
.../pathrs-lite/internal/fd/openat2_linux.go | 12 ++--
vendor/modules.txt | 2 +-
11 files changed, 144 insertions(+), 26 deletions(-)
create mode 100644 internal/pathrs/retry.go
rename vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/{errors.go => errors_linux.go} (70%)
diff --git a/go.mod b/go.mod
index 5f00a576..90fa2e5b 100644
--- a/go.mod
+++ b/go.mod
@@ -12,7 +12,7 @@ require (
github.com/cilium/ebpf v0.16.0
github.com/containerd/console v1.0.5
github.com/coreos/go-systemd/v22 v22.5.0
- github.com/cyphar/filepath-securejoin v0.5.0
+ github.com/cyphar/filepath-securejoin v0.5.1
github.com/docker/go-units v0.5.0
github.com/godbus/dbus/v5 v5.1.0
github.com/moby/sys/mountinfo v0.7.1
diff --git a/go.sum b/go.sum
index 1f930ce4..049597b6 100644
--- a/go.sum
+++ b/go.sum
@@ -9,8 +9,8 @@ github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
-github.com/cyphar/filepath-securejoin v0.5.0 h1:hIAhkRBMQ8nIeuVwcAoymp7MY4oherZdAxD+m0u9zaw=
-github.com/cyphar/filepath-securejoin v0.5.0/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
+github.com/cyphar/filepath-securejoin v0.5.1 h1:eYgfMq5yryL4fbWfkLpFFy2ukSELzaJOTaUTuh+oF48=
+github.com/cyphar/filepath-securejoin v0.5.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
diff --git a/internal/pathrs/mkdirall_pathrslite.go b/internal/pathrs/mkdirall_pathrslite.go
index fb4f7842..a9a0157c 100644
--- a/internal/pathrs/mkdirall_pathrslite.go
+++ b/internal/pathrs/mkdirall_pathrslite.go
@@ -83,7 +83,9 @@ func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (*os.File, er
}
defer rootDir.Close()
- return pathrs.MkdirAllHandle(rootDir, unsafePath, mode)
+ return retryEAGAIN(func() (*os.File, error) {
+ return pathrs.MkdirAllHandle(rootDir, unsafePath, mode)
+ })
}
// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the
diff --git a/internal/pathrs/procfs_pathrslite.go b/internal/pathrs/procfs_pathrslite.go
index a02b0d39..37450a0e 100644
--- a/internal/pathrs/procfs_pathrslite.go
+++ b/internal/pathrs/procfs_pathrslite.go
@@ -27,13 +27,15 @@ import (
)
func procOpenReopen(openFn func(subpath string) (*os.File, error), subpath string, flags int) (*os.File, error) {
- handle, err := openFn(subpath)
+ handle, err := retryEAGAIN(func() (*os.File, error) {
+ return openFn(subpath)
+ })
if err != nil {
return nil, err
}
defer handle.Close()
- f, err := pathrs.Reopen(handle, flags)
+ f, err := Reopen(handle, flags)
if err != nil {
return nil, fmt.Errorf("reopen %s: %w", handle.Name(), err)
}
@@ -44,7 +46,7 @@ func procOpenReopen(openFn func(subpath string) (*os.File, error), subpath strin
// [pathrs.Reopen], to let you one-shot open a procfs file with the given
// flags.
func ProcSelfOpen(subpath string, flags int) (*os.File, error) {
- proc, err := procfs.OpenProcRoot()
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
if err != nil {
return nil, err
}
@@ -55,7 +57,7 @@ func ProcSelfOpen(subpath string, flags int) (*os.File, error) {
// ProcPidOpen is a wrapper around [procfs.Handle.OpenPid] and [pathrs.Reopen],
// to let you one-shot open a procfs file with the given flags.
func ProcPidOpen(pid int, subpath string, flags int) (*os.File, error) {
- proc, err := procfs.OpenProcRoot()
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
if err != nil {
return nil, err
}
@@ -70,13 +72,15 @@ func ProcPidOpen(pid int, subpath string, flags int) (*os.File, error) {
// flags. The returned [procfs.ProcThreadSelfCloser] needs the same handling as
// when using pathrs-lite.
func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThreadSelfCloser, Err error) {
- proc, err := procfs.OpenProcRoot()
+ proc, err := retryEAGAIN(procfs.OpenProcRoot)
if err != nil {
return nil, nil, err
}
defer proc.Close()
- handle, closer, err := proc.OpenThreadSelf(subpath)
+ handle, closer, err := retryEAGAIN2(func() (*os.File, procfs.ProcThreadSelfCloser, error) {
+ return proc.OpenThreadSelf(subpath)
+ })
if err != nil {
return nil, nil, err
}
@@ -89,7 +93,7 @@ func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThr
}
defer handle.Close()
- f, err := pathrs.Reopen(handle, flags)
+ f, err := Reopen(handle, flags)
if err != nil {
return nil, nil, fmt.Errorf("reopen %s: %w", handle.Name(), err)
}
@@ -98,5 +102,7 @@ func ProcThreadSelfOpen(subpath string, flags int) (_ *os.File, _ procfs.ProcThr
// Reopen is a wrapper around pathrs.Reopen.
func Reopen(file *os.File, flags int) (*os.File, error) {
- return pathrs.Reopen(file, flags)
+ return retryEAGAIN(func() (*os.File, error) {
+ return pathrs.Reopen(file, flags)
+ })
}
diff --git a/internal/pathrs/retry.go b/internal/pathrs/retry.go
new file mode 100644
index 00000000..a51d335c
--- /dev/null
+++ b/internal/pathrs/retry.go
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Copyright (C) 2024-2025 Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2024-2025 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package pathrs
+
+import (
+ "errors"
+ "fmt"
+ "time"
+
+ "golang.org/x/sys/unix"
+)
+
+// Based on >50k tests running "runc run" on a 16-core system with very heavy
+// rename(2) load, the single longest latency caused by -EAGAIN retries was
+// ~800us (with the vast majority being closer to 400us). So, a 2ms limit
+// should give more than enough headroom for any real system in practice.
+const retryDeadline = 2 * time.Millisecond
+
+// retryEAGAIN is a top-level retry loop for pathrs to try to avoid returning
+// spurious errors in most normal user cases when using openat2 (libpathrs
+// itself does up to 128 retries already, but this method takes a
+// wallclock-deadline approach to simply retry until a timer elapses).
+func retryEAGAIN[T any](fn func() (T, error)) (T, error) {
+ deadline := time.After(retryDeadline)
+ for {
+ v, err := fn()
+ if !errors.Is(err, unix.EAGAIN) {
+ return v, err
+ }
+ select {
+ case <-deadline:
+ return *new(T), fmt.Errorf("%v retry deadline exceeded: %w", retryDeadline, err)
+ default:
+ // retry
+ }
+ }
+}
+
+// retryEAGAIN2 is like retryEAGAIN except it returns two values.
+func retryEAGAIN2[T1, T2 any](fn func() (T1, T2, error)) (T1, T2, error) {
+ type ret struct {
+ v1 T1
+ v2 T2
+ }
+ v, err := retryEAGAIN(func() (ret, error) {
+ v1, v2, err := fn()
+ return ret{v1: v1, v2: v2}, err
+ })
+ return v.v1, v.v2, err
+}
diff --git a/internal/pathrs/root_pathrslite.go b/internal/pathrs/root_pathrslite.go
index 0ef81fae..899af270 100644
--- a/internal/pathrs/root_pathrslite.go
+++ b/internal/pathrs/root_pathrslite.go
@@ -31,12 +31,15 @@ import (
// is effectively shorthand for [securejoin.OpenInRoot] followed by
// [securejoin.Reopen].
func OpenInRoot(root, subpath string, flags int) (*os.File, error) {
- handle, err := pathrs.OpenInRoot(root, subpath)
+ handle, err := retryEAGAIN(func() (*os.File, error) {
+ return pathrs.OpenInRoot(root, subpath)
+ })
if err != nil {
return nil, err
}
defer handle.Close()
- return pathrs.Reopen(handle, flags)
+
+ return Reopen(handle, flags)
}
// CreateInRoot creates a new file inside a root (as well as any missing parent
diff --git a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
index 6862467c..3faee0bc 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
+++ b/vendor/github.com/cyphar/filepath-securejoin/CHANGELOG.md
@@ -4,7 +4,36 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).
-## [Unreleased] ##
+## [Unreleased 0.5.z] ##
+
+## [0.5.1] - 2025-10-31 ##
+
+> Spooky scary skeletons send shivers down your spine!
+
+### Changed ###
+- `openat2` can return `-EAGAIN` if it detects a possible attack in certain
+ scenarios (namely if there was a rename or mount while walking a path with a
+ `..` component). While this is necessary to avoid a denial-of-service in the
+ kernel, it does require retry loops in userspace.
+
+ In previous versions, `pathrs-lite` would retry `openat2` 32 times before
+ returning an error, but we've received user reports that this limit can be
+ hit on systems with very heavy load. In some synthetic benchmarks (testing
+ the worst-case of an attacker doing renames in a tight loop on every core of
+ a 16-core machine) we managed to get a ~3% failure rate in runc. We have
+ improved this situation in two ways:
+
+ * We have now increased this limit to 128, which should be good enough for
+ most use-cases without becoming a denial-of-service vector (the number of
+ syscalls called by the `O_PATH` resolver in a typical case is within the
+ same ballpark). The same benchmarks show a failure rate of ~0.12% which
+ (while not zero) is probably sufficient for most users.
+
+ * In addition, we now return a `unix.EAGAIN` error that is bubbled up and can
+ be detected by callers. This means that callers with stricter requirements
+ to avoid spurious errors can choose to do their own infinite `EAGAIN` retry
+ loop (though we would strongly recommend users use time-based deadlines in
+ such retry loops to avoid potentially unbounded denials-of-service).
## [0.5.0] - 2025-09-26 ##
@@ -354,7 +383,8 @@ This is our first release of `github.com/cyphar/filepath-securejoin`,
containing a full implementation with a coverage of 93.5% (the only missing
cases are the error cases, which are hard to mocktest at the moment).
-[Unreleased]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.0...HEAD
+[Unreleased 0.5.z]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.1...release-0.5
+[0.5.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.5.0...v0.5.1
[0.5.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.1...v0.5.0
[0.4.1]: https://github.com/cyphar/filepath-securejoin/compare/v0.4.0...v0.4.1
[0.4.0]: https://github.com/cyphar/filepath-securejoin/compare/v0.3.6...v0.4.0
diff --git a/vendor/github.com/cyphar/filepath-securejoin/VERSION b/vendor/github.com/cyphar/filepath-securejoin/VERSION
index 8f0916f7..4b9fcbec 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/VERSION
+++ b/vendor/github.com/cyphar/filepath-securejoin/VERSION
@@ -1 +1 @@
-0.5.0
+0.5.1
diff --git a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
similarity index 70%
rename from vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go
rename to vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
index c26e440e..d0b200f4 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors.go
+++ b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/errors_linux.go
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: MPL-2.0
+//go:build linux
+
// Copyright (C) 2024-2025 Aleksa Sarai <cyphar@cyphar.com>
// Copyright (C) 2024-2025 SUSE LLC
//
@@ -12,15 +14,24 @@ package internal
import (
"errors"
+
+ "golang.org/x/sys/unix"
)
+type xdevErrorish struct {
+ description string
+}
+
+func (err xdevErrorish) Error() string { return err.description }
+func (err xdevErrorish) Is(target error) bool { return target == unix.EXDEV }
+
var (
// ErrPossibleAttack indicates that some attack was detected.
- ErrPossibleAttack = errors.New("possible attack detected")
+ ErrPossibleAttack error = xdevErrorish{"possible attack detected"}
// ErrPossibleBreakout indicates that during an operation we ended up in a
// state that could be a breakout but we detected it.
- ErrPossibleBreakout = errors.New("possible breakout detected")
+ ErrPossibleBreakout error = xdevErrorish{"possible breakout detected"}
// ErrInvalidDirectory indicates an unlinked directory.
ErrInvalidDirectory = errors.New("wandered into deleted directory")
diff --git a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
index 23053083..3e937fe3 100644
--- a/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
+++ b/vendor/github.com/cyphar/filepath-securejoin/pathrs-lite/internal/fd/openat2_linux.go
@@ -17,8 +17,6 @@ import (
"runtime"
"golang.org/x/sys/unix"
-
- "github.com/cyphar/filepath-securejoin/pathrs-lite/internal"
)
func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
@@ -34,7 +32,10 @@ func scopedLookupShouldRetry(how *unix.OpenHow, err error) bool {
(errors.Is(err, unix.EAGAIN) || errors.Is(err, unix.EXDEV))
}
-const scopedLookupMaxRetries = 32
+// This is a fairly arbitrary limit we have just to avoid an attacker being
+// able to make us spin in an infinite retry loop -- callers can choose to
+// retry on EAGAIN if they prefer.
+const scopedLookupMaxRetries = 128
// Openat2 is an [Fd]-based wrapper around unix.Openat2, but with some retry
// logic in case of EAGAIN errors.
@@ -43,10 +44,10 @@ func Openat2(dir Fd, path string, how *unix.OpenHow) (*os.File, error) {
// Make sure we always set O_CLOEXEC.
how.Flags |= unix.O_CLOEXEC
var tries int
- for tries < scopedLookupMaxRetries {
+ for {
fd, err := unix.Openat2(dirFd, path, how)
if err != nil {
- if scopedLookupShouldRetry(how, err) {
+ if scopedLookupShouldRetry(how, err) && tries < scopedLookupMaxRetries {
// We retry a couple of times to avoid the spurious errors, and
// if we are being attacked then returning -EAGAIN is the best
// we can do.
@@ -58,5 +59,4 @@ func Openat2(dir Fd, path string, how *unix.OpenHow) (*os.File, error) {
runtime.KeepAlive(dir)
return os.NewFile(uintptr(fd), fullPath), nil
}
- return nil, &os.PathError{Op: "openat2", Path: fullPath, Err: internal.ErrPossibleAttack}
}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 4e7e0ef8..64524598 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -25,7 +25,7 @@ github.com/coreos/go-systemd/v22/dbus
# github.com/cpuguy83/go-md2man/v2 v2.0.2
## explicit; go 1.11
github.com/cpuguy83/go-md2man/v2/md2man
-# github.com/cyphar/filepath-securejoin v0.5.0
+# github.com/cyphar/filepath-securejoin v0.5.1
## explicit; go 1.18
github.com/cyphar/filepath-securejoin
github.com/cyphar/filepath-securejoin/internal/consts
--
2.51.1

View File

@ -0,0 +1,161 @@
From c8588560cdebd80e9d1823a4a8e39172ee4650bb Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 7 Nov 2025 14:52:09 +1100
Subject: [PATCH] rootfs: only set mode= for tmpfs mount if target already
existed
This was always the intended behaviour but commit 72fbb34f5006 ("rootfs:
switch to fd-based handling of mountpoint targets") regressed it when
adding a mechanism to create a file handle to the target if it didn't
already exist (causing the later stat to always succeed).
A lot of people depend on this functionality, so add some tests to make
sure we don't break it in the future.
Fixes: 72fbb34f5006 ("rootfs: switch to fd-based handling of mountpoint targets")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
(cherry picked from commit 9a9719eeb4978e73c64740b3fc796c1b12987b05)
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
libcontainer/rootfs_linux.go | 25 ++++++-----
tests/integration/mounts.bats | 81 +++++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+), 13 deletions(-)
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index 204e6a80..ab5a260d 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -511,6 +511,18 @@ func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) {
_ = dstFile.Close()
}
}()
+ if err == nil && m.Device == "tmpfs" {
+ // If the original target exists, copy the mode for the tmpfs mount.
+ stat, err := dstFile.Stat()
+ if err != nil {
+ return fmt.Errorf("check tmpfs source mode: %w", err)
+ }
+ dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
+ if m.Data != "" {
+ dt = dt + "," + m.Data
+ }
+ m.Data = dt
+ }
if err != nil {
if !errors.Is(err, unix.ENOENT) {
return fmt.Errorf("lookup mountpoint target: %w", err)
@@ -551,19 +563,6 @@ func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) {
}
}
- if m.Device == "tmpfs" {
- // If the original target exists, copy the mode for the tmpfs mount.
- stat, err := dstFile.Stat()
- if err != nil {
- return fmt.Errorf("check tmpfs source mode: %w", err)
- }
- dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
- if m.Data != "" {
- dt = dt + "," + m.Data
- }
- m.Data = dt
- }
-
dstFullPath, err := procfs.ProcSelfFdReadlink(dstFile)
if err != nil {
return fmt.Errorf("get mount destination real path: %w", err)
diff --git a/tests/integration/mounts.bats b/tests/integration/mounts.bats
index 11fb2cfc..b60c88ae 100644
--- a/tests/integration/mounts.bats
+++ b/tests/integration/mounts.bats
@@ -234,6 +234,87 @@ function test_mount_order() {
[[ "$(stat -c %a rootfs/setgid/a/b/c)" == 2755 ]]
}
+# https://github.com/opencontainers/runc/issues/4971
+@test "runc run [tmpfs mount mode= inherit]" {
+ mkdir rootfs/tmpfs
+ chmod "=0710" rootfs/tmpfs
+
+ update_config '.mounts += [{
+ type: "tmpfs",
+ source: "tmpfs",
+ destination: "/tmpfs",
+ options: ["rw", "nodev", "nosuid"]
+ }]'
+ update_config '.process.args = ["stat", "-c", "%a", "/tmpfs"]'
+
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ [[ "$output" == "710" ]]
+
+ update_config '.process.args = ["cat", "/proc/self/mounts"]'
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ grep -Ex "tmpfs /tmpfs tmpfs [^ ]*\bmode=710\b[^ ]* .*" <<<"$output"
+}
+
+# https://github.com/opencontainers/runc/issues/4971
+@test "runc run [tmpfs mount explicit mode=]" {
+ mkdir rootfs/tmpfs
+ chmod "=0710" rootfs/tmpfs
+
+ update_config '.mounts += [{
+ type: "tmpfs",
+ source: "tmpfs",
+ destination: "/tmpfs",
+ options: ["rw", "nodev", "nosuid", "mode=1500"]
+ }]'
+ update_config '.process.args = ["stat", "-c", "%a", "/tmpfs"]'
+
+ # Explicitly setting mode= overrides whatever mode we would've inherited.
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ [[ "$output" == "1500" ]]
+
+ update_config '.process.args = ["cat", "/proc/self/mounts"]'
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ grep -Ex "tmpfs /tmpfs tmpfs [^ ]*\bmode=1500\b[^ ]* .*" <<<"$output"
+
+ # Verify that the actual directory was not chmod-ed.
+ [[ "$(stat -c %a rootfs/tmpfs)" == 710 ]]
+}
+
+# https://github.com/opencontainers/runc/issues/4971
+@test "runc run [tmpfs mount mode=1777 default]" {
+ update_config '.mounts += [{
+ type: "tmpfs",
+ source: "tmpfs",
+ destination: "/non-existent/foo/bar/baz",
+ options: ["rw", "nodev", "nosuid"]
+ }]'
+ update_config '.process.args = ["stat", "-c", "%a", "/non-existent/foo/bar/baz"]'
+
+ rm -rf rootfs/non-existent
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ [[ "$output" == "1777" ]]
+
+ update_config '.process.args = ["cat", "/proc/self/mounts"]'
+
+ rm -rf rootfs/non-existent
+ runc run test_busybox
+ [ "$status" -eq 0 ]
+ # We don't explicitly set a mode= in this case, it is just the tmpfs default.
+ grep -Ex "tmpfs /non-existent/foo/bar/baz tmpfs .*" <<<"$output"
+ run ! grep -Ex "tmpfs /non-existent/foo/bar/baz tmpfs [^ ]*\bmode=[0-7]+\b[^ ]* .*" <<<"$output"
+
+ # Verify that the actual modes are *not* 1777.
+ [[ "$(stat -c %a rootfs/non-existent)" == 755 ]]
+ [[ "$(stat -c %a rootfs/non-existent/foo)" == 755 ]]
+ [[ "$(stat -c %a rootfs/non-existent/foo/bar)" == 755 ]]
+ [[ "$(stat -c %a rootfs/non-existent/foo/bar/baz)" == 755 ]]
+}
+
@test "runc run [ro /sys/fs/cgroup mounts]" {
# Without cgroup namespace.
update_config '.linux.namespaces -= [{"type": "cgroup"}]'
--
2.51.1

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,103 @@
From c6dad73d617864f3a281ac1fdaacd5ed971fa317 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Thu, 27 Jun 2024 09:00:51 -0700
Subject: [PATCH 1/2] Bump runtime-spec to latest git HEAD
This is to include
- https://github.com/opencontainers/runtime-spec/pull/1261
- https://github.com/opencontainers/runtime-spec/pull/1253
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 2cac22b1e29e6be4c004f35ce582aa2b7e1c2fda)
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
go.mod | 2 +-
go.sum | 4 ++--
.../opencontainers/runtime-spec/specs-go/config.go | 8 ++++++++
.../opencontainers/runtime-spec/specs-go/version.go | 2 +-
vendor/modules.txt | 2 +-
5 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/go.mod b/go.mod
index 348bc9c6..db2d7ef1 100644
--- a/go.mod
+++ b/go.mod
@@ -19,7 +19,7 @@ require (
github.com/moby/sys/user v0.3.0
github.com/moby/sys/userns v0.1.0
github.com/mrunalp/fileutils v0.5.1
- github.com/opencontainers/runtime-spec v1.2.0
+ github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
github.com/opencontainers/selinux v1.11.0
github.com/seccomp/libseccomp-golang v0.10.0
github.com/sirupsen/logrus v1.9.3
diff --git a/go.sum b/go.sum
index 225d5860..4c863cc9 100644
--- a/go.sum
+++ b/go.sum
@@ -46,8 +46,8 @@ github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g
github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28=
github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
-github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
-github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
+github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95 h1:Ghl8Z3l+yPQUDSxAp7Kg7fJLRNNXjOsR6ooDcca7PjU=
+github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/selinux v1.11.0 h1:+5Zbo97w3Lbmb3PeqQtpmTkMwsW5nRI3YaLpt7tQ7oU=
github.com/opencontainers/selinux v1.11.0/go.mod h1:E5dMC3VPuVvVHDYmi78qvhJp8+M586T4DlDRYpFkyec=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
index d1236ba7..671f0d01 100644
--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
+++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
@@ -94,6 +94,8 @@ type Process struct {
SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
// IOPriority contains the I/O priority settings for the cgroup.
IOPriority *LinuxIOPriority `json:"ioPriority,omitempty" platform:"linux"`
+ // ExecCPUAffinity specifies CPU affinity for exec processes.
+ ExecCPUAffinity *CPUAffinity `json:"execCPUAffinity,omitempty" platform:"linux"`
}
// LinuxCapabilities specifies the list of allowed capabilities that are kept for a process.
@@ -127,6 +129,12 @@ const (
IOPRIO_CLASS_IDLE IOPriorityClass = "IOPRIO_CLASS_IDLE"
)
+// CPUAffinity specifies process' CPU affinity.
+type CPUAffinity struct {
+ Initial string `json:"initial,omitempty"`
+ Final string `json:"final,omitempty"`
+}
+
// Box specifies dimensions of a rectangle. Used for specifying the size of a console.
type Box struct {
// Height is the vertical dimension of a box.
diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
index 503971e0..f6c15f6c 100644
--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
+++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go
@@ -11,7 +11,7 @@ const (
VersionPatch = 0
// VersionDev indicates development branch. Releases will be empty string.
- VersionDev = ""
+ VersionDev = "+dev"
)
// Version is the specification version that the package types support.
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 3b245e0d..df520923 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -46,7 +46,7 @@ github.com/moby/sys/userns
# github.com/mrunalp/fileutils v0.5.1
## explicit; go 1.13
github.com/mrunalp/fileutils
-# github.com/opencontainers/runtime-spec v1.2.0
+# github.com/opencontainers/runtime-spec v1.2.1-0.20240625190033-701738418b95
## explicit
github.com/opencontainers/runtime-spec/specs-go
github.com/opencontainers/runtime-spec/specs-go/features
--
2.47.1

View File

@ -0,0 +1,49 @@
From e949092d469c3ee3ea9bf1002649b6a692895da9 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 5 Nov 2025 02:04:02 +1100
Subject: [PATCH 2/2] [1.2] rootfs: re-allow dangling symlinks in mount targets
It seems there are a fair few images where dangling symlinks are used as
path components for mount targets, which pathrs-lite does not support
(and it would be difficult to fully support this in a race-free way).
This was actually meant to be blocked by commit 63c2908164f3 ("rootfs:
try to scope MkdirAll to stay inside the rootfs"), followed by commit
dd827f7b715a ("utils: switch to securejoin.MkdirAllHandle"). However, we
still used SecureJoin to construct mountpoint targets, which means that
dangling symlinks were "resolved" before reaching pathrs-lite.
This patch basically re-adds this hack in order to reduce the breakages
we've seen so far.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
libcontainer/rootfs_linux.go | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index 377642c9..6ea7cd47 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -518,6 +518,17 @@ func (m *mountEntry) createOpenMountpoint(rootfs string) (Err error) {
dstIsFile = !fi.IsDir()
}
+ // In previous runc versions, we would tolerate nonsense paths with
+ // dangling symlinks as path components. pathrs-lite does not support
+ // this, so instead we have to emulate this behaviour by doing
+ // SecureJoin *purely to get a semi-reasonable path to use* and then we
+ // use pathrs-lite to operate on the path safely.
+ newUnsafePath, err := securejoin.SecureJoin(rootfs, unsafePath)
+ if err != nil {
+ return err
+ }
+ unsafePath = utils.StripRoot(rootfs, newUnsafePath)
+
if dstIsFile {
dstFile, err = pathrs.CreateInRoot(rootfs, unsafePath, unix.O_CREAT|unix.O_EXCL|unix.O_NOFOLLOW, 0o644)
} else {
--
2.51.1

View File

@ -1,7 +1,7 @@
From 1af672a2635628ca24ce3b5ed3344d316548f1ca Mon Sep 17 00:00:00 2001 From 73786942b7176eae1e676cf2f78af548f090e418 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com> From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 21 Oct 2024 15:50:38 -0700 Date: Mon, 21 Oct 2024 15:50:38 -0700
Subject: [PATCH 2/2] [1.1] runc exec: implement CPU affinity Subject: [PATCH 2/2] runc exec: implement CPU affinity
As per As per
- https://github.com/opencontainers/runtime-spec/pull/1253 - https://github.com/opencontainers/runtime-spec/pull/1253
@ -27,25 +27,29 @@ Because of the above,
- exec's final CPU affinity, if not specified, can be different - exec's final CPU affinity, if not specified, can be different
depending on the kernel, therefore we don't test it. depending on the kernel, therefore we don't test it.
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 57237b31de367a722c5d49088912d57c28c6fb46)
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com> Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
--- ---
libcontainer/configs/config.go | 73 ++++++++++++++++++++ libcontainer/configs/config.go | 72 ++++++++++++++++++++
libcontainer/container_linux.go | 4 ++ libcontainer/container_linux.go | 4 ++
libcontainer/init_linux.go | 1 + libcontainer/init_linux.go | 3 +-
libcontainer/nsenter/nsexec.c | 36 +++++++++- libcontainer/nsenter/log.c | 9 ++-
libcontainer/nsenter/log.h | 3 +
libcontainer/nsenter/nsexec.c | 29 ++++++++
libcontainer/process.go | 2 + libcontainer/process.go | 2 +
libcontainer/process_linux.go | 51 +++++++++++++- libcontainer/process_linux.go | 49 +++++++++++++-
libcontainer/specconv/spec_linux.go | 5 ++ libcontainer/specconv/spec_linux.go | 5 ++
tests/integration/cpu_affinity.bats | 101 ++++++++++++++++++++++++++++ tests/integration/cpu_affinity.bats | 101 ++++++++++++++++++++++++++++
utils_linux.go | 6 ++ utils_linux.go | 6 ++
9 files changed, 275 insertions(+), 4 deletions(-) 11 files changed, 277 insertions(+), 6 deletions(-)
create mode 100644 tests/integration/cpu_affinity.bats create mode 100644 tests/integration/cpu_affinity.bats
diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go
index 6ebf5ec7..997f2724 100644 index 22fe0f9b..daffd130 100644
--- a/libcontainer/configs/config.go --- a/libcontainer/configs/config.go
+++ b/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go
@@ -3,11 +3,15 @@ package configs @@ -3,8 +3,11 @@ package configs
import ( import (
"bytes" "bytes"
"encoding/json" "encoding/json"
@ -57,19 +61,20 @@ index 6ebf5ec7..997f2724 100644
"time" "time"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
+ "golang.org/x/sys/unix" @@ -225,6 +228,9 @@ type Config struct {
"github.com/opencontainers/runc/libcontainer/devices" // IOPriority is the container's I/O priority.
"github.com/opencontainers/runtime-spec/specs-go" IOPriority *IOPriority `json:"io_priority,omitempty"`
@@ -211,6 +215,75 @@ type Config struct {
// RootlessCgroups is set when unlikely to have the full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
+ +
+ // ExecCPUAffinity is CPU affinity for a non-init process to be run in the container. + // ExecCPUAffinity is CPU affinity for a non-init process to be run in the container.
+ ExecCPUAffinity *CPUAffinity `json:"exec_cpu_affinity,omitempty"` + ExecCPUAffinity *CPUAffinity `json:"exec_cpu_affinity,omitempty"`
+} }
+
// Scheduler is based on the Linux sched_setattr(2) syscall.
@@ -294,6 +300,72 @@ var IOPrioClassMapping = map[specs.IOPriorityClass]int{
type IOPriority = specs.LinuxIOPriority
+type CPUAffinity struct { +type CPUAffinity struct {
+ Initial, Final *unix.CPUSet + Initial, Final *unix.CPUSet
+} +}
@ -134,14 +139,16 @@ index 6ebf5ec7..997f2724 100644
+ Initial: initial, + Initial: initial,
+ Final: final, + Final: final,
+ }, nil + }, nil
} +}
+
type ( type (
HookName string
HookList []Hook
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 40b332f9..68b6a74f 100644 index c0211617..1fc590a5 100644
--- a/libcontainer/container_linux.go --- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go
@@ -692,6 +692,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { @@ -692,6 +692,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
AppArmorProfile: c.config.AppArmorProfile, AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel, ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits, Rlimits: c.config.Rlimits,
@ -149,7 +156,7 @@ index 40b332f9..68b6a74f 100644
CreateConsole: process.ConsoleSocket != nil, CreateConsole: process.ConsoleSocket != nil,
ConsoleWidth: process.ConsoleWidth, ConsoleWidth: process.ConsoleWidth,
ConsoleHeight: process.ConsoleHeight, ConsoleHeight: process.ConsoleHeight,
@@ -708,6 +709,9 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { @@ -708,6 +709,9 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
if len(process.Rlimits) > 0 { if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits cfg.Rlimits = process.Rlimits
} }
@ -160,43 +167,80 @@ index 40b332f9..68b6a74f 100644
cfg.Cgroup2Path = c.cgroupManager.Path("") cfg.Cgroup2Path = c.cgroupManager.Path("")
} }
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index d9f18139..1f8562ec 100644 index 1eb0279d..eddbfba6 100644
--- a/libcontainer/init_linux.go --- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go
@@ -70,6 +70,7 @@ type initConfig struct { @@ -72,6 +72,7 @@ type initConfig struct {
RootlessCgroups bool `json:"rootless_cgroups,omitempty"` RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
SpecState *specs.State `json:"spec_state,omitempty"` SpecState *specs.State `json:"spec_state,omitempty"`
Cgroup2Path string `json:"cgroup2_path,omitempty"` Cgroup2Path string `json:"cgroup2_path,omitempty"`
+ CPUAffinity *configs.CPUAffinity `json:"cpu_affinity,omitempty"` + CPUAffinity *configs.CPUAffinity `json:"cpu_affinity,omitempty"`
} }
type initer interface { // Init is part of "runc init" implementation.
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c @@ -151,7 +152,7 @@ func startInitialization() (retErr error) {
index 2d224bab..6f70aa87 100644
--- a/libcontainer/nsenter/nsexec.c logrus.SetOutput(logPipe)
+++ b/libcontainer/nsenter/nsexec.c logrus.SetFormatter(new(logrus.JSONFormatter))
@@ -149,13 +149,18 @@ int setns(int fd, int nstype) - logrus.Debug("child process in init()")
+ logrus.Debugf("child process in init()")
// Only init processes have FIFOFD.
var fifoFile *os.File
diff --git a/libcontainer/nsenter/log.c b/libcontainer/nsenter/log.c
index 086b5398..72774cb0 100644
--- a/libcontainer/nsenter/log.c
+++ b/libcontainer/nsenter/log.c
@@ -31,6 +31,11 @@ void setup_logpipe(void)
loglevel = i;
} }
#endif
+bool log_enabled_for(int level) +bool log_enabled_for(int level)
+{ +{
+ return (logfd >= 0 && level <= loglevel); + return (logfd >= 0 && level <= loglevel);
+} +}
+ +
static void write_log(int level, const char *format, ...) /* Defined in nsexec.c */
{ extern int current_stage;
char *message = NULL, *stage = NULL, *json = NULL;
@@ -40,8 +45,8 @@ void write_log(int level, const char *format, ...)
va_list args; va_list args;
int ret; int ret;
- if (logfd < 0 || level > loglevel) - if (logfd < 0 || level > loglevel)
- goto out;
+ if (!log_enabled_for(level)) + if (!log_enabled_for(level))
goto out; + return;
va_start(args, format); va_start(args, format);
@@ -851,6 +856,25 @@ void try_unshare(int flags, const char *msg) ret = vasprintf(&message, format, args);
bail("failed to unshare %s", msg); diff --git a/libcontainer/nsenter/log.h b/libcontainer/nsenter/log.h
index 1fe95a11..3e18de68 100644
--- a/libcontainer/nsenter/log.h
+++ b/libcontainer/nsenter/log.h
@@ -1,6 +1,7 @@
#ifndef NSENTER_LOG_H
#define NSENTER_LOG_H
+#include <stdbool.h>
#include <stdio.h>
/*
@@ -20,6 +21,8 @@
*/
void setup_logpipe(void);
+bool log_enabled_for(int level);
+
void write_log(int level, const char *format, ...) __attribute__((format(printf, 2, 3)));
extern int logfd;
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 565b2ca2..aa4976d6 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -558,6 +558,25 @@ static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
bail("failed to update /proc/%d/timens_offsets", pid);
} }
+void print_cpu_affinity() +void print_cpu_affinity()
@ -221,7 +265,7 @@ index 2d224bab..6f70aa87 100644
void nsexec(void) void nsexec(void)
{ {
int pipenum; int pipenum;
@@ -892,6 +916,16 @@ void nsexec(void) @@ -584,6 +603,16 @@ void nsexec(void)
write_log(DEBUG, "=> nsexec container setup"); write_log(DEBUG, "=> nsexec container setup");
@ -239,31 +283,23 @@ index 2d224bab..6f70aa87 100644
nl_parse(pipenum, &config); nl_parse(pipenum, &config);
diff --git a/libcontainer/process.go b/libcontainer/process.go diff --git a/libcontainer/process.go b/libcontainer/process.go
index 8a5d340d..99167274 100644 index 114b3f2b..5339583f 100644
--- a/libcontainer/process.go --- a/libcontainer/process.go
+++ b/libcontainer/process.go +++ b/libcontainer/process.go
@@ -89,6 +89,8 @@ type Process struct { @@ -102,6 +102,8 @@ type Process struct {
// Scheduler *configs.Scheduler
// For cgroup v2, the only key allowed is "".
SubCgroupPaths map[string]string IOPriority *configs.IOPriority
+ +
+ CPUAffinity *configs.CPUAffinity + CPUAffinity *configs.CPUAffinity
} }
// Wait waits for the process to exit. // Wait waits for the process to exit.
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 0d9ceb9c..3b48ae76 100644 index fcbb54a3..477c8a77 100644
--- a/libcontainer/process_linux.go --- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go
@@ -9,6 +9,7 @@ import ( @@ -122,6 +122,46 @@ func (p *setnsProcess) signal(sig os.Signal) error {
"os"
"os/exec"
"path/filepath"
+ "runtime"
"strconv"
"time"
@@ -78,12 +79,52 @@ func (p *setnsProcess) signal(sig os.Signal) error {
return unix.Kill(p.pid(), s) return unix.Kill(p.pid(), s)
} }
@ -308,18 +344,20 @@ index 0d9ceb9c..3b48ae76 100644
+} +}
+ +
func (p *setnsProcess) start() (retErr error) { func (p *setnsProcess) start() (retErr error) {
defer p.messageSockPair.parent.Close() defer p.comm.closeParent()
- // get the "before" value of oom kill count
+ // Get the "before" value of oom kill count. @@ -133,8 +173,8 @@ func (p *setnsProcess) start() (retErr error) {
// get the "before" value of oom kill count
oom, _ := p.manager.OOMKillCount() oom, _ := p.manager.OOMKillCount()
- err := p.cmd.Start() - err := p.cmd.Start()
- // close the write-side of the pipes (controlled by child) - // close the child-side of the pipes (controlled by child)
+ err := p.startWithCPUAffinity() + err := p.startWithCPUAffinity()
+ // Close the child-side of the pipes (controlled by child). + // Close the child-side of the pipes (controlled by child).
p.messageSockPair.child.Close() p.comm.closeChild()
p.logFilePair.child.Close()
if err != nil { if err != nil {
@@ -143,6 +184,10 @@ func (p *setnsProcess) start() (retErr error) { return fmt.Errorf("error starting setns process: %w", err)
@@ -184,6 +224,10 @@ func (p *setnsProcess) start() (retErr error) {
} }
} }
} }
@ -330,13 +368,21 @@ index 0d9ceb9c..3b48ae76 100644
if p.intelRdtPath != "" { if p.intelRdtPath != "" {
// if Intel RDT "resource control" filesystem path exists // if Intel RDT "resource control" filesystem path exists
_, err := os.Stat(p.intelRdtPath) _, err := os.Stat(p.intelRdtPath)
@@ -193,7 +237,6 @@ func (p *setnsProcess) start() (retErr error) {
}
}
}
-
if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
return fmt.Errorf("error writing config to pipe: %w", err)
}
diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
index 7dbfb869..b59e0d59 100644 index 95ada499..2d0db342 100644
--- a/libcontainer/specconv/spec_linux.go --- a/libcontainer/specconv/spec_linux.go
+++ b/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go
@@ -493,6 +493,11 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { @@ -556,6 +556,11 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
Ambient: spec.Process.Capabilities.Ambient, ioPriority := *spec.Process.IOPriority
} config.IOPriority = &ioPriority
} }
+ config.ExecCPUAffinity, err = configs.ConvertCPUAffinity(spec.Process.ExecCPUAffinity) + config.ExecCPUAffinity, err = configs.ConvertCPUAffinity(spec.Process.ExecCPUAffinity)
+ if err != nil { + if err != nil {
@ -454,10 +500,10 @@ index 00000000..f6adfa2a
+ [[ "$output" == *"Cpus_allowed_list: $final"* ]] # Mind the literal tab. + [[ "$output" == *"Cpus_allowed_list: $final"* ]] # Mind the literal tab.
+} +}
diff --git a/utils_linux.go b/utils_linux.go diff --git a/utils_linux.go b/utils_linux.go
index 60d534e8..30204133 100644 index feb6ef80..013dbcf4 100644
--- a/utils_linux.go --- a/utils_linux.go
+++ b/utils_linux.go +++ b/utils_linux.go
@@ -109,6 +109,12 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) { @@ -90,6 +90,12 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) {
} }
lp.Rlimits = append(lp.Rlimits, rl) lp.Rlimits = append(lp.Rlimits, rl)
} }

View File

@ -1,8 +1,5 @@
%global with_check 0 %global with_check 0
%global _find_debuginfo_dwz_opts %{nil}
%global _dwz_low_mem_die_limit 0
%if 0%{?rhel} > 7 && ! 0%{?fedora} %if 0%{?rhel} > 7 && ! 0%{?fedora}
%define gobuild(o:) \ %define gobuild(o:) \
go build -buildmode pie -compiler gc -tags="rpm_crashtraceback libtrust_openssl ${BUILDTAGS:-}" -ldflags "${LDFLAGS:-} -linkmode=external -compressdwarf=false -B 0x$(head -c20 /dev/urandom|od -An -tx1|tr -d ' \\n') -extldflags '%__global_ldflags'" -a -v %{?**}; go build -buildmode pie -compiler gc -tags="rpm_crashtraceback libtrust_openssl ${BUILDTAGS:-}" -ldflags "${LDFLAGS:-} -linkmode=external -compressdwarf=false -B 0x$(head -c20 /dev/urandom|od -An -tx1|tr -d ' \\n') -extldflags '%__global_ldflags'" -a -v %{?**};
@ -20,10 +17,10 @@ go build -buildmode pie -compiler gc -tags="rpm_crashtraceback libtrust_openssl
%global import_path %{provider}.%{provider_tld}/%{project}/%{repo} %global import_path %{provider}.%{provider_tld}/%{project}/%{repo}
%global git0 https://%{import_path} %global git0 https://%{import_path}
Epoch: 1 Epoch: 4
Name: %{repo} Name: %{repo}
Version: 1.1.12 Version: 1.2.5
Release: 6%{?dist} Release: 2%{?dist}
Summary: CLI for running Open Containers Summary: CLI for running Open Containers
# https://fedoraproject.org/wiki/PackagingDrafts/Go#Go_Language_Architectures # https://fedoraproject.org/wiki/PackagingDrafts/Go#Go_Language_Architectures
#ExclusiveArch: %%{go_arches} #ExclusiveArch: %%{go_arches}
@ -33,15 +30,21 @@ ExcludeArch: %{ix86}
License: ASL 2.0 License: ASL 2.0
URL: %{git0} URL: %{git0}
Source0: %{git0}/archive/v%{version}.tar.gz Source0: %{git0}/archive/v%{version}.tar.gz
Patch0: 0001-1.1-Bump-runtime-spec-to-latest-git-HEAD.patch Patch0: 0001-Bump-runtime-spec-to-latest-git-HEAD.patch
Patch1: 0002-1.1-runc-exec-implement-CPU-affinity.patch Patch1: 0002-runc-exec-implement-CPU-affinity.patch
Patch2: 0001-1.2.5-1.el9-CVEs-mega-patch.patch
Patch3: 0001-1.2-openat2-improve-resilience-on-busy-systems.patch
Patch4: 0002-1.2-rootfs-re-allow-dangling-symlinks-in-mount-targe.patch
Patch5: 0001-1.2-rootfs-only-set-mode-for-tmpfs-mount-if-target-alrea.patch
Provides: oci-runtime Provides: oci-runtime
BuildRequires: golang >= 1.21.4 BuildRequires: golang >= 1.22.4
BuildRequires: git BuildRequires: git
BuildRequires: /usr/bin/go-md2man BuildRequires: /usr/bin/go-md2man
BuildRequires: libseccomp-devel >= 2.5 BuildRequires: libseccomp-devel >= 2.5
BuildRequires: container-selinux >= 2.224.0
Requires: libseccomp >= 2.5 Requires: libseccomp >= 2.5
Requires: criu Recommends: criu
Requires: container-selinux >= 2.224.0
%description %description
The runc command can be used to start containers which are packaged The runc command can be used to start containers which are packaged
@ -63,7 +66,7 @@ pushd GOPATH/src/%{import_path}
export GO111MODULE=off export GO111MODULE=off
export GOPATH=%{gopath}:$(pwd)/GOPATH export GOPATH=%{gopath}:$(pwd)/GOPATH
export CGO_CFLAGS="%{optflags} -D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64" export CGO_CFLAGS="%{optflags} -D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64"
export BUILDTAGS="selinux seccomp no_openssl" export BUILDTAGS="selinux seccomp runc_dmz_selinux_nocompat no_openssl"
export LDFLAGS="-X main.gitCommit= -X main.version=%{version}" export LDFLAGS="-X main.gitCommit= -X main.version=%{version}"
%gobuild -o %{name} %{import_path} %gobuild -o %{name} %{import_path}
@ -87,6 +90,14 @@ make install install-man install-bash DESTDIR=$RPM_BUILD_ROOT PREFIX=%{_prefix}
%{_datadir}/bash-completion/completions/%{name} %{_datadir}/bash-completion/completions/%{name}
%changelog %changelog
* Wed Nov 12 2025 Jindrich Novy <jnovy@redhat.com> - 4:1.2.5-2
- fix permission regression
- Related: RHEL-122384
* Fri Nov 07 2025 Jindrich Novy <jnovy@redhat.com> - 4:1.2.5-1
- fix CVE-2025-31133 CVE-2025-52565 CVE-2025-52881
- Resolves: RHEL-122384
* Mon Jan 20 2025 Jindrich Novy <jnovy@redhat.com> - 1:1.1.12-6 * Mon Jan 20 2025 Jindrich Novy <jnovy@redhat.com> - 1:1.1.12-6
- Add CPU affinity feature from Kir Kolyshkin - Add CPU affinity feature from Kir Kolyshkin
- Resolves: RHEL-74865 - Resolves: RHEL-74865